Skip to content

Instantly share code, notes, and snippets.

@mmerce
Last active November 4, 2018 20:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mmerce/7e255b1875e322de1d29ad28a8d91be7 to your computer and use it in GitHub Desktop.
Save mmerce/7e255b1875e322de1d29ad28a8d91be7 to your computer and use it in GitHub Desktop.
truncating categories in dataset fields
{
"name": "Truncating text fields to categories",
"description": "Creates a dataset that keeps only the rows whose values are among the n most frequent categories of each of the given fields",
"inputs": [
{
"name": "source-id",
"type": "source-id",
"description": "Data source"
},
{
"name": "upd-fields",
"type": "list",
"description": "List of fields to be filtered"
},
{
"name": "limit",
"type": "number",
"default": 1000,
"description": "Maximum number of categories"
}
],
"outputs": [
{
"name": "filtered-dataset",
"type": "dataset-id",
"description": "Dataset containing rows that match the filtered categories."
}
]
}
;; Builds a dataset from `source-id` in which each field listed in
;; `upd-fields` is restricted to its `limit` most frequent values.
;;
;; Steps:
;;   1. Update the source so every listed field is parsed as text using
;;      whole-value tokens ("full_terms_only").
;;   2. Create a dataset from the updated source.
;;   3. For each field, create a helper dataset holding its top `limit`
;;      values (see category-instances).
;;   4. Join the dataset against each helper in turn (see
;;      filter-categories), feeding every join result into the next join.
;;   5. Delete all temporary datasets: the intermediate join results and
;;      the per-field helper datasets.
;;
;; Arguments:
;;   source-id   source to build the dataset from
;;   upd-fields  list of fields to truncate (resolved with find-field)
;;   limit       maximum number of distinct values kept per field
;;
;; Returns the last dataset produced by the chain of joins (or the plain
;; dataset created in step 2 when `upd-fields` is empty). The dataset
;; created in step 2 is not deleted.
(define (full-text source-id upd-fields limit)
  (let (source (fetch source-id)
        fields (source "fields" {})
        upd-field-ids (map (lambda (x) ((find-field fields x) "id")) upd-fields)
        ;; Map of field id -> new field settings, sent in one source update.
        changes (iterate (acc {} f upd-field-ids)
                  (assoc acc
                         f
                         {"optype" "text"
                          "term_analysis" {"token_mode" "full_terms_only"}}))
        upd-source (update-and-wait source-id {"fields" changes})
        dataset (create-and-wait-dataset upd-source)
        ;; One helper dataset per field, listing its most frequent values.
        ds-ids (map (lambda (f) (category-instances dataset f limit))
                    upd-fields)
        ;; Newest dataset is consed to the front, so ds-list reads
        ;; [final, ...intermediates..., dataset].
        ds-list (iterate (acc [dataset] ds-id ds-ids f upd-fields)
                  (cons (filter-categories (head acc) ds-id f) acc))
        final-ds (head ds-list))
    ;; Drop the intermediate joins: everything but the final result (head)
    ;; and the original dataset (last).
    (delete* (butlast (tail ds-list)))
    ;; The per-field helper datasets are temporary too; remove them so they
    ;; do not accumulate in the account.
    (delete* ds-ids)
    final-ds))
;; Inner-joins dataset `ds-id` (aliased A) with dataset `cat-ds-id`
;; (aliased B) on the column named `field`, keeping only A's columns.
;; Returns the result of create-and-wait-dataset for the joined dataset.
(define (filter-categories ds-id cat-ds-id field)
  (let (column (str "`" field "`")
        query (str "select A.* from A join B on A." column " = B." column)
        aliases (assoc {} ds-id "A" cat-ds-id "B"))
    (create-and-wait-dataset {"origin_datasets" [ds-id cat-ds-id]
                              "origin_dataset_names" aliases
                              "sql_query" query})))
;; Creates a dataset named "<field> categories" listing the `limit` most
;; frequent values of column `field` in dataset `ds-id`, with their row
;; counts in an `instances` column, ordered by descending count.
;;
;; The column name is backtick-quoted in the SQL (as filter-categories
;; does) so that field names containing spaces or punctuation still form
;; a valid query.
;;
;; Arguments:
;;   ds-id  dataset to aggregate (aliased A in the query)
;;   field  name of the column to group by
;;   limit  maximum number of rows (distinct values) to keep
;;
;; Returns the result of create-and-wait-dataset for the new dataset.
(define (category-instances ds-id field limit)
  (let (column (str "`" field "`"))
    (create-and-wait-dataset
      {"name" (str field " categories")
       "origin_datasets" [ds-id]
       "origin_dataset_names" (assoc {} ds-id "A")
       "sql_query" (str "select " column ", count(*) as instances from A"
                        " group by " column " order by instances desc"
                        " limit " limit)})))
;; Script result: runs full-text on the script inputs (source-id,
;; upd-fields, limit) and binds its return value to filtered-dataset.
(define filtered-dataset (full-text source-id upd-fields limit))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment