Skip to content

Instantly share code, notes, and snippets.

@JnBrymn
Created April 29, 2014 20:37
Show Gist options
  • Save JnBrymn/b643a3d7d65262eee753 to your computer and use it in GitHub Desktop.
Save JnBrymn/b643a3d7d65262eee753 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Three ways to group users as \"active\", \"fading\", and \"stale\" using Elasticsearch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, generate fake data and load it into Elasticsearch."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import calendar\n",
"import datetime\n",
"import json\n",
"import random\n",
"\n",
"import requests\n",
"\n",
"url = \"http://localhost:9200/biz-intel\"\n",
"#url = \"http://monitor1:9200/biz-intel\"\n",
" \n",
"def make_fake_doc(ts):\n",
"    \"\"\"Build one fake activity document stamped with `ts`.\n",
"\n",
"    `org_id` is drawn at random from 0-9 and `last_login` is `ts` minus a random\n",
"    0 to 52 weeks (in seconds); both datetimes are stored as ISO-8601 strings.\n",
"    \"\"\"\n",
"    max_age_seconds = 3600*24*7*52\n",
"    return {\n",
"        \"ts\": ts.isoformat(),\n",
"        \"org_id\": random.choice(range(10)),\n",
"        \"last_login\": (ts - datetime.timedelta(seconds=random.randint(0, max_age_seconds))).isoformat(),\n",
"    }\n",
" \n",
"def delete_index():\n",
"    \"\"\"Send an HTTP DELETE to `url`; the response is not returned.\"\"\"\n",
"    requests.delete(url)\n",
" \n",
"def send_to_es(doc):\n",
"    \"\"\"POST `doc`, serialized as JSON, to the /docs path under `url`; return the response.\"\"\"\n",
"    endpoint = url+\"/docs\"\n",
"    payload = json.dumps(doc)\n",
"    json_headers = {'content-type': 'application/json'}\n",
"    return requests.post(endpoint, data=payload, headers=json_headers)\n",
" \n",
"def simulate():\n",
"    \"\"\"Delete the index, then load 365 days of fake docs starting today at 00:00.\n",
"\n",
"    Each day gets between 5 and 10 docs, all stamped with that day's midnight;\n",
"    the day offset is printed as progress.\n",
"    \"\"\"\n",
"    first_day = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)\n",
"\n",
"    delete_index()\n",
"    for day in range(365):\n",
"        print day\n",
"        stamp = first_day + datetime.timedelta(days=day)\n",
"        docs_today = random.randint(5,10)\n",
"        for _ in range(docs_today):\n",
"            send_to_es(make_fake_doc(stamp))\n",
" \n",
"def query():\n",
"    \"\"\"Run a match_all search against `url` and return the response.\n",
"\n",
"    NOTE: later cells rebind the name `query` to a dict, which shadows this helper.\n",
"    \"\"\"\n",
"    endpoint = url+\"/_search?pretty\"\n",
"    match_all = {\"query\": {\"match_all\": {}}}\n",
"    json_headers = {'content-type': 'application/json'}\n",
"    return requests.post(endpoint, data=json.dumps(match_all), headers=json_headers)\n",
"\n",
"\n",
"\n",
"# Rebuild the index from scratch: simulate() deletes it and posts a year of fake docs.\n",
"simulate()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# active, fading, stale"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This uses script filters to define the three categories"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Thresholds applied to (ts - last_login). The scripts compare that difference\n",
"# against these values as milliseconds: 14 and 28 days respectively.\n",
"TWO_WEEKS_MS = 1209600000\n",
"FOUR_WEEKS_MS = 2419200000\n",
"\n",
"# Script expression for the gap between a doc's timestamp and its last_login.\n",
"LOGIN_AGE = \"(doc['ts'].value - doc['last_login'].value)\"\n",
"\n",
"\n",
"def _engagement_bucket(script):\n",
"    \"\"\"Return a filter aggregation: keep docs matching `script`, then take the\n",
"    cardinality of org_id among them (exposed as \"count\").\"\"\"\n",
"    return {\n",
"        \"filter\": {\"script\": {\"script\": script}},\n",
"        \"aggs\": {\n",
"            \"count\": {\n",
"                \"cardinality\": {\n",
"                    \"field\": \"org_id\",\n",
"                    \"precision_threshold\": 1000\n",
"                }\n",
"            }\n",
"        }\n",
"    }\n",
"\n",
"\n",
"# Monthly histogram on `ts`, with one filtered sub-aggregation per engagement group.\n",
"query = {\n",
"    \"aggs\": {\n",
"        \"dates\": {\n",
"            \"date_histogram\": {\n",
"                \"field\": \"ts\",\n",
"                \"interval\": \"month\"\n",
"            },\n",
"            \"aggs\": {\n",
"                # last_login is less than 2 weeks before ts\n",
"                \"active\": _engagement_bucket(\n",
"                    \"%s < %d\" % (LOGIN_AGE, TWO_WEEKS_MS)),\n",
"                # last_login is between 2 and 4 weeks before ts\n",
"                \"fading\": _engagement_bucket(\n",
"                    \"diff = %s; diff >= %d && diff < %d\" % (LOGIN_AGE, TWO_WEEKS_MS, FOUR_WEEKS_MS)),\n",
"                # last_login is 4 weeks or more before ts\n",
"                \"stale\": _engagement_bucket(\n",
"                    \"%s >= %d\" % (LOGIN_AGE, FOUR_WEEKS_MS))\n",
"            }\n",
"        }\n",
"    }\n",
"}\n",
"# Only the aggregations are read below; size=1 keeps the hits list short.\n",
"r = requests.post(\n",
"    url+\"/_search?pretty=true&size=1\",\n",
"    data=json.dumps(query),\n",
"    headers={'content-type': 'application/json'},\n",
")\n",
"j = r.json()\n",
"print \"date\\t\\t\\t\\tactive\\tfading\\tstale\"\n",
"# One output row per monthly bucket, one column per engagement group.\n",
"for month in j[\"aggregations\"][\"dates\"][\"buckets\"]:\n",
"    label = month[\"key_as_string\"]\n",
"    counts = [month[group][\"count\"][\"value\"] for group in (\"active\", \"fading\", \"stale\")]\n",
"    print \"{0}\\t{1}\\t{2}\\t{3}\".format(label, *counts)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"date\t\t\t\tactive\tfading\tstale\n",
"2014-04-01T00:00:00.000Z\t0\t2\t8\n",
"2014-05-01T00:00:00.000Z\t8\t7\t10\n",
"2014-06-01T00:00:00.000Z\t3\t4\t10\n",
"2014-07-01T00:00:00.000Z\t6\t4\t10\n",
"2014-08-01T00:00:00.000Z\t5\t5\t10\n",
"2014-09-01T00:00:00.000Z\t6\t6\t10\n",
"2014-10-01T00:00:00.000Z\t5\t5\t10\n",
"2014-11-01T00:00:00.000Z\t3\t5\t10\n",
"2014-12-01T00:00:00.000Z\t6\t8\t10\n",
"2015-01-01T00:00:00.000Z\t5\t6\t10\n",
"2015-02-01T00:00:00.000Z\t4\t4\t10\n",
"2015-03-01T00:00:00.000Z\t7\t6\t10\n",
"2015-04-01T00:00:00.000Z\t4\t6\t10\n"
]
}
],
"prompt_number": 635
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The query below uses a range aggregation over a scripted value to define the three groups."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Gap between a doc's timestamp and its last_login, divided by 604800000.0\n",
"# (the number of milliseconds in 7 days).\n",
"weeks_since_login = \"(doc['ts'].value - doc['last_login'].value)/604800000.0\"\n",
"\n",
"# Boundaries on that value; the print loop reads the resulting buckets by\n",
"# position as active, fading, stale.\n",
"engagement_ranges = [\n",
"    {\"to\": 2},\n",
"    {\"from\": 2, \"to\": 4},\n",
"    {\"from\": 4},\n",
"]\n",
"\n",
"distinct_orgs = {\"cardinality\": {\"field\": \"org_id\", \"precision_threshold\": 1000}}\n",
"\n",
"# Per-month histogram on `ts`, split by the ranges above, with a cardinality\n",
"# of org_id inside every range.\n",
"monthly = {\n",
"    \"date_histogram\": {\"field\": \"ts\", \"interval\": \"month\"},\n",
"    \"aggs\": {\n",
"        \"last_activity\": {\n",
"            \"range\": {\"script\": weeks_since_login, \"ranges\": engagement_ranges},\n",
"            \"aggs\": {\"count\": distinct_orgs},\n",
"        }\n",
"    },\n",
"}\n",
"\n",
"query = {\"aggs\": {\"dates\": monthly}}\n",
"# Only the aggregations are read below; size=1 keeps the hits list short.\n",
"r = requests.post(\n",
"    url+\"/_search?pretty=true&size=1\",\n",
"    data=json.dumps(query),\n",
"    headers={'content-type': 'application/json'},\n",
")\n",
"j = r.json()\n",
"print \"date\\t\\t\\t\\tactive\\tfading\\tstale\"\n",
"for month in j[\"aggregations\"][\"dates\"][\"buckets\"]:\n",
"    label = month[\"key_as_string\"]\n",
"    # Range buckets are read by position: 0 = active, 1 = fading, 2 = stale.\n",
"    ranges = month[\"last_activity\"][\"buckets\"]\n",
"    print \"{0}\\t{1}\\t{2}\\t{3}\".format(\n",
"        label,\n",
"        ranges[0][\"count\"][\"value\"],\n",
"        ranges[1][\"count\"][\"value\"],\n",
"        ranges[2][\"count\"][\"value\"])\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"date\t\t\t\tactive\tfading\tstale\n",
"2014-04-01T00:00:00.000Z\t0\t2\t8\n",
"2014-05-01T00:00:00.000Z\t8\t7\t10\n",
"2014-06-01T00:00:00.000Z\t3\t4\t10\n",
"2014-07-01T00:00:00.000Z\t6\t4\t10\n",
"2014-08-01T00:00:00.000Z\t5\t5\t10\n",
"2014-09-01T00:00:00.000Z\t6\t6\t10\n",
"2014-10-01T00:00:00.000Z\t5\t5\t10\n",
"2014-11-01T00:00:00.000Z\t3\t5\t10\n",
"2014-12-01T00:00:00.000Z\t6\t8\t10\n",
"2015-01-01T00:00:00.000Z\t5\t6\t10\n",
"2015-02-01T00:00:00.000Z\t4\t4\t10\n",
"2015-03-01T00:00:00.000Z\t7\t6\t10\n",
"2015-04-01T00:00:00.000Z\t4\t6\t10\n"
]
}
],
"prompt_number": 636
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The query below uses a terms aggregation to define the groups; a script computes the term for each document."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Monthly histogram on `ts`; inside each month a scripted terms aggregation labels\n",
"# every doc \"active\", \"fading\" or \"stale\", and a cardinality sub-aggregation on\n",
"# org_id is computed per label.\n",
"query = {\n",
" \"aggs\": {\n",
" \"dates\": {\n",
" \"date_histogram\": {\n",
" \"field\":\"ts\",\n",
" \"interval\":\"month\"\n",
" },\n",
" \"aggs\": {\n",
" \"last_activity\": {\n",
" \"terms\": { \n",
" # The script divides (ts - last_login) by 604800000.0 and returns the label:\n",
" # below 2 -> \"active\", from 2 up to (not including) 4 -> \"fading\", otherwise \"stale\".\n",
" # The script text is sent to the server as-is, so it is left untouched here.\n",
" \"script\": \"\"\"\n",
" weeks_ago = (doc['ts'].value - doc['last_login'].value)/604800000.0;\n",
" if (weeks_ago < 2.0) {\n",
" return \"active\";\n",
" } else if (weeks_ago >= 2.0 && weeks_ago < 4.0) {\n",
" return \"fading\";\n",
" } else {\n",
" return \"stale\"\n",
" }\n",
" \"\"\", \n",
" },\n",
" \"aggs\": {\n",
" # Read by the print loop as [\"count\"][\"value\"] for each label bucket.\n",
" \"count\": {\n",
" \"cardinality\": {\n",
" \"field\":\"org_id\",\n",
" \"precision_threshold\":1000\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
"}\n",
"# Only the aggregations are read below; size=1 keeps the hits list short.\n",
"r = requests.post(\n",
"    url+\"/_search?pretty=true&size=1\",\n",
"    data=json.dumps(query),\n",
"    headers={'content-type': 'application/json'},\n",
")\n",
"j = r.json()\n",
"print \"date\\t\\t\\t\\tactive\\tfading\\tstale\"\n",
"for month in j[\"aggregations\"][\"dates\"][\"buckets\"]:\n",
"    label = month[\"key_as_string\"]\n",
"    # A label with no bucket in this month keeps its default of 0.\n",
"    totals = dict.fromkeys((\"active\", \"fading\", \"stale\"), 0)\n",
"    totals.update(\n",
"        (term[\"key\"], term[\"count\"][\"value\"])\n",
"        for term in month[\"last_activity\"][\"buckets\"]\n",
"    )\n",
"    print \"{0}\\t{1}\\t{2}\\t{3}\".format(label, totals[\"active\"], totals[\"fading\"], totals[\"stale\"])\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"date\t\t\t\tactive\tfading\tstale\n",
"2014-04-01T00:00:00.000Z\t0\t2\t8\n",
"2014-05-01T00:00:00.000Z\t8\t7\t10\n",
"2014-06-01T00:00:00.000Z\t3\t4\t10\n",
"2014-07-01T00:00:00.000Z\t6\t4\t10\n",
"2014-08-01T00:00:00.000Z\t5\t5\t10\n",
"2014-09-01T00:00:00.000Z\t6\t6\t10\n",
"2014-10-01T00:00:00.000Z\t5\t5\t10\n",
"2014-11-01T00:00:00.000Z\t3\t5\t10\n",
"2014-12-01T00:00:00.000Z\t6\t8\t10\n",
"2015-01-01T00:00:00.000Z\t5\t6\t10\n",
"2015-02-01T00:00:00.000Z\t4\t4\t10\n",
"2015-03-01T00:00:00.000Z\t7\t6\t10\n",
"2015-04-01T00:00:00.000Z\t4\t6\t10\n"
]
}
],
"prompt_number": 637
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment