Last active
November 4, 2018 20:52
-
-
Save mmerce/7e255b1875e322de1d29ad28a8d91be7 to your computer and use it in GitHub Desktop.
truncating categories in dataset fields
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "Truncating text fields to categories", | |
"description": "Creates a dataset that keeps only the rows whose values in each of the given fields are among that field's n most frequent categories", | |
"inputs": [ | |
{ | |
"name": "source-id", | |
"type": "source-id", | |
"description": "Data source" | |
}, | |
{ | |
"name": "upd-fields", | |
"type": "list", | |
"description": "List of names of the fields whose categories will be truncated" | |
}, | |
{ | |
"name": "limit", | |
"type": "number", | |
"default": 1000, | |
"description": "Maximum number of categories" | |
} | |
], | |
"outputs": [ | |
{ | |
"name": "filtered_dataset", | |
"type": "dataset-id", | |
"description": "Dataset containing rows that match the filtered categories." | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Builds a dataset from `source-id` in which every field listed in
;; `upd-fields` is restricted to its `limit` most frequent values.
;;
;; Steps:
;;   1. Update the source so each listed field has optype "text" with
;;      token mode "full_terms_only".
;;   2. Create a dataset from the updated source.
;;   3. For each field, create a helper dataset holding its top `limit`
;;      values (see `category-instances`).
;;   4. Successively join the dataset against each helper dataset
;;      (see `filter-categories`).
;;
;; Arguments:
;;   source-id  - id of the source to update and build the dataset from
;;   upd-fields - list of field names to truncate
;;   limit      - maximum number of distinct values kept per field
;;
;; Returns the id of the final filtered dataset. The dataset created
;; directly from the source is kept; every other intermediate dataset
;; (partial joins and per-field category datasets) is deleted.
(define (full-text source-id upd-fields limit)
  (let (source (fetch source-id)
        fields (source "fields" {})
        ;; The source update below is keyed by field id, so resolve each
        ;; user-provided field to its id first.
        upd-field-ids (map (lambda (x) ((find-field fields x) "id")) upd-fields)
        changes (iterate (acc {} f upd-field-ids)
                  (assoc acc
                         f
                         {"optype" "text"
                          "term_analysis" {"token_mode" "full_terms_only"}}))
        upd-source (update-and-wait source-id {"fields" changes})
        dataset (create-and-wait-dataset upd-source)
        ;; One helper dataset per field, all computed from the unfiltered
        ;; dataset.
        ds-ids (map (lambda (f) (category-instances dataset f limit))
                    upd-fields)
        ;; Each step filters the most recent dataset (head of acc) and
        ;; pushes the result, so the list ends up newest-first with the
        ;; unfiltered dataset last.
        ds-list (iterate (acc [dataset] ds-id ds-ids f upd-fields)
                  (cons (filter-categories (head acc) ds-id f) acc))
        final-ds (head ds-list))
    ;; Drop the partial joins: everything between the final result (head)
    ;; and the unfiltered dataset (last element).
    (delete* (butlast (tail ds-list)))
    ;; The per-field category datasets are only needed for the joins above;
    ;; remove them too so they do not pile up in the account.
    (delete* ds-ids)
    final-ds))
;; Creates a dataset with the rows of `ds-id` whose value in column
;; `field` also appears in column `field` of `cat-ds-id`.
;;
;; Arguments:
;;   ds-id     - dataset to be filtered (aliased as A in the query)
;;   cat-ds-id - dataset holding the allowed values (aliased as B)
;;   field     - name of the column used as join key in both datasets
;;
;; Returns the id of the newly created dataset.
(define (filter-categories ds-id cat-ds-id field)
  (let (column (str "`" field "`")
        query (str "select A.* from A join B on A." column " = B." column)
        aliases (assoc {} ds-id "A" cat-ds-id "B"))
    (create-and-wait-dataset {"origin_datasets" [ds-id cat-ds-id]
                              "origin_dataset_names" aliases
                              "sql_query" query})))
;; Creates a dataset listing the `limit` most frequent values of `field`
;; in dataset `ds-id`, together with their row counts.
;;
;; Arguments:
;;   ds-id - dataset to aggregate (aliased as A in the query)
;;   field - name of the column whose values are counted
;;   limit - maximum number of values to keep
;;
;; Returns the id of the new dataset, which has two columns: `field` and
;; `instances` (the count), ordered by descending count.
(define (category-instances ds-id field limit)
  ;; Backtick-quote the column name, as filter-categories does, so that
  ;; field names containing spaces or punctuation yield a valid query.
  (let (column (str "`" field "`"))
    (create-and-wait-dataset
      {"name" (str field " categories")
       "origin_datasets" [ds-id]
       "origin_dataset_names" (assoc {} ds-id "A")
       "sql_query" (str "select " column ", count(*) as instances from A"
                        " group by " column " order by instances desc"
                        " limit " limit)})))
;; Script entry point: runs the whole workflow on the script inputs.
(define filtered-dataset (full-text source-id upd-fields limit))
;; The script metadata declares its output as "filtered_dataset"
;; (underscore), so expose the result under that exact name as well.
(define filtered_dataset filtered-dataset)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment