Skip to content

Instantly share code, notes, and snippets.

@rileycrane
Forked from kerryrodden/.block
Last active August 29, 2015 14:03
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save rileycrane/92a2c36eb932b4f99e51 to your computer and use it in GitHub Desktop.
Common Message Sequences

This example shows how it is possible to use a D3 sunburst visualization (partition layout) with data that describes sequences of events.

Tips for generating the CSV file (note: the visualization might complain if the counts in the data span too large a dynamic range):

  • no header is required (but it's OK if one is present)
  • use a hyphen to separate the steps in the sequence
  • the step names should be one word only, and ideally should be kept short. Non-alphanumeric characters will probably cause problems (I haven't tested this).
  • every sequence should have an "end" marker as the last element, unless it has been truncated because it is longer than the maximum sequence length (6, in the example). The purpose of the "end" marker is to distinguish a true end point (e.g. the user left the site) from an end point that has been forced by truncation.
  • each line should be a complete path from root to leaf - don't include counts for intermediate steps. For example, include "home-search-end" and "home-search-product-end" but not "home-search" - the latter is computed by the partition layout, by adding up the counts of all the sequences with that prefix.
  • to keep the number of permutations low, use a small number of unique step names, and a small maximum sequence length. Larger numbers of either of these will lead to a very large CSV that will be slow to process (and therefore require pre-processing into hierarchical JSON).

Copyright 2013 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

from communication.models import Discussion, Message
from collections import defaultdict, Counter
import random
from datetime import datetime, timedelta
from lib.common.utils import get_start_time_for_rate_limit_counting
# Build one message-type sequence per (discussion, day) over a 30-day window
# and print the 75 most common sequences as "sequence,count" lines.
start = get_start_time_for_rate_limit_counting(datetime.utcnow()) - timedelta(days=30)
today = get_start_time_for_rate_limit_counting(datetime.utcnow())
days = 30
data = []
discussion_sequence_list = []
entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'notice', 'notice': 'notice'}
discussion_sequence = defaultdict(list)
# GO THROUGH EACH DAY
for day in range(days):
    # The window is advanced before it is used, so the first day examined
    # starts one day after the initial `start`.
    start = start + timedelta(days=1)
    stop = start + timedelta(days=1)
    print('===========\n%s - %s' % (start, stop))
    # Discussions already turned into a sequence for this day. A set keeps
    # the membership test O(1) instead of scanning a growing list.
    discussions_counted = set()
    # GET MESSAGES THAT DAY
    messages = Message.objects.filter(created_at__gte=start).filter(created_at__lt=stop)
    messages_count = messages.count()
    print('msgs on %s: %s' % (day, messages_count))
    # GO THROUGH EACH DISCUSSION THAT HAD MSGS THAT DAY
    counter = 0
    for message in messages:
        counter += 1
        if message.discussion in discussions_counted:
            continue
        if counter % 500 == 0:
            print("\tOn %s of %s" % (counter, messages_count))
        msgs_in_disc_today = message.discussion.message_set.filter(created_at__gte=start).filter(created_at__lt=stop).order_by('created_at')
        sequence = []
        for msg in msgs_in_disc_today:
            # Best effort: skip messages whose sender or type cannot be
            # mapped. `except Exception` (rather than a bare except) keeps
            # Ctrl-C and SystemExit working during this long-running loop.
            try:
                etype = entity_type_map[msg.sender.entity_type]
                mtype = message_type_map[msg.message_type]
            except Exception:
                continue
            # SEQUENCE
            sequence.append('%s%s' % (etype, mtype))
        # Short discussions get an explicit "end" marker; longer ones are
        # truncated to the first 6 steps and carry no marker.
        if len(msgs_in_disc_today) <= 6:
            sequence.append('end')
        else:
            sequence = sequence[:6]
        # append sequence to data
        data.append('-'.join(sequence))
        discussions_counted.add(message.discussion)
print('counting')
cnt = Counter(data)
for seq, count in cnt.most_common(75):
    print('%s,%s' % (seq, count))
####
# 'duuid':['sequence']
class FindMatches(object):
    """
    Build message-type sequences for a set of discussions and find the
    discussions whose sequence contains a given sub-sequence.

    USAGE
    discussions = Discussion.objects.filter(created_at__gte='2014-06-01').filter(created_at__lt='2014-06-27')
    fm = FindMatches(discussions=discussions)
    fm.process()
    typec = fm.dmatches_sequence(fm.dseq, ['person-normal','place-auto','place-notice'])
    """

    def __init__(self, *args, **kwargs):
        """Store the discussions to analyse (keyword argument `discussions`)."""
        self.entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
        self.message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'notice', 'notice': 'notice'}
        self.discussions = kwargs.get('discussions')
        # Filled in by process(); defined up front so that reading it before
        # process() yields an empty mapping instead of an AttributeError.
        self.dseq = {}

    def process(self):
        """Compute and cache the sequences of every stored discussion in `self.dseq`."""
        self.dseq = self.get_all_sequences(self.discussions)

    def get_all_sequences(self, discussions):
        """
        Get all sequences for all discussions provided.

        `discussions` must support `.count()` and iteration; when it is None
        the discussions given to the constructor are used. Returns a
        defaultdict mapping each discussion uuid to its sequence list.
        """
        from collections import defaultdict
        if discussions is None:
            discussions = self.discussions
        dseq = defaultdict(list)
        dcount = discussions.count()
        for counter, d in enumerate(discussions, 1):
            if counter % 500 == 0:
                print('On %s of %s' % (counter, dcount))
            dseq[d.uuid] = self.build_sequence(d)
        return dseq

    def build_sequence(self, discussion, start=None, stop=None, truncate=None):
        """
        Build a sequence from a particular discussion.

        start / stop: optional bounds on `created_at` (start inclusive, stop
            exclusive); each bound is applied only when it is given.
        truncate: optional maximum number of steps. A sequence longer than
            this is cut to `truncate` steps and gets no 'end' marker.
        Returns a list of collapsed step names such as 'pnormal', normally
        terminated by 'end'.
        """
        messages = discussion.message_set.all()
        if start is not None:
            messages = messages.filter(created_at__gte=start)
        if stop is not None:
            messages = messages.filter(created_at__lt=stop)
        # Always order chronologically so that both the filtered and the
        # unfiltered case produce steps in the same, well-defined order.
        messages = messages.order_by('created_at')
        sequence = []
        for msg in messages:
            # Best effort: messages with an unmapped sender/type are skipped.
            # `except Exception` leaves KeyboardInterrupt/SystemExit alone.
            try:
                etype = self.entity_type_map[msg.sender.entity_type]
                mtype = self.message_type_map[msg.message_type]
            except Exception:
                continue
            sequence.append('%s%s' % (etype, mtype))
        if truncate is not None and len(sequence) > truncate:
            return sequence[:truncate]
        sequence.append('end')
        return sequence

    def dmatches_sequence(self, dseq, sequence):
        """
        Find discussions that match the given sequence.

        dseq: mapping of {'duuid': [sequence]...} as built by get_all_sequences.
        sequence: list like ['person-normal', 'place-normal']; each item is
            '<entity type>-<message type>'. Unknown names raise KeyError.
        Returns the list of Discussion objects whose joined sequence contains
        the collapsed form of `sequence` as a contiguous run.
        """
        from communication.models import Discussion
        # transform passed in sequence into the collapsed 'pnormal-bauto' form
        collapsed_steps = []
        for item in sequence:
            parts = item.split('-')
            collapsed_steps.append('%s%s' % (self.entity_type_map[parts[0]], self.message_type_map[parts[1]]))
        collapsed_seq = '-'.join(collapsed_steps)
        matches = []
        # iterate through dseq
        for duuid, seq in dseq.items():
            if collapsed_seq in '-'.join(seq):
                matches.append(Discussion.objects.get(uuid=duuid))
        return matches
# ########################################
# # Find discussions with particular sequence
# # sequence = 'normal-auto-agent'
# def discussions_with_sequence(discussions, sequence):
# entity_type_map = {'person':'p','place':'b','admin':'b','assistant':'b'}
# message_type_map = {'normal':'normal','agent':'agent','auto':'auto','update':'notice','notice':'notice'}
# matches = []
# for discussion in discussions:
# observed_sequence = '-'.join(["%s%s" % (entity_type_map[message.sender.entity_type], message_type_map[message.message_type]) for message in discussion.message_set.all()])
# if sequence in observed_sequence:
# matches.append(discussion)
# return matches
# GENERATE SEQUENCE DATA (with Counts)
# For every discussion, collapse its chronologically ordered messages into a
# hyphen-joined sequence, then print each distinct sequence with its count.
map_data = {'person':'p','place':'b', 'admin':'b', 'assistant':'b'}
prep_data = []
for discussion in discussions:
    sequence_list = []
    for message in discussion.message_set.all().order_by('created_at'):
        sequence_list.append("%s%s" % (map_data[message.sender.entity_type], message.message_type))
    # More than 6 messages: keep the first 6 steps and no marker; otherwise
    # close the sequence with the "end" marker.
    if discussion.message_set.count() > 6:
        sequence_list = sequence_list[:6]
    else:
        sequence_list.append("end")
    prep_data.append('-'.join(sequence_list))
cnt = Counter(prep_data)
for seq, count in cnt.items():
    print('%s,%s' % (seq, count))
# GENERATE HIERARCHICAL DATA
# Map every message type to the list of message types that directly followed
# it within a discussion. `data` must be a mapping of lists here: indexing a
# plain list with the string key `parent` would raise a TypeError.
data = defaultdict(list)
for discussion in discussions:
    parent = None
    for message in discussion.message_set.all().order_by('created_at'):
        message_type = '%s-%s' % (message.sender.entity_type, message.message_type)
        # The first message of a discussion has no predecessor to attach to.
        if parent is not None:
            data[parent].append(message_type)
        parent = message_type
# Build one message-type sequence per discussion created between 30 days
# before the rate-limit start time and 2014-06-27, then print the 75 most
# common sequences as "sequence,count" lines.
start = get_start_time_for_rate_limit_counting(datetime.utcnow()) - timedelta(days=30)
today = get_start_time_for_rate_limit_counting(datetime.utcnow())
days = 30
data = []
discussion_sequence_list = []
entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'notice', 'notice': 'notice'}
discussion_sequence = defaultdict(list)
# GO THROUGH EACH DISCUSSION
discussions = Discussion.objects.filter(created_at__gte=start).filter(created_at__lt='2014-06-27')
dcount = discussions.count()
counter = 0
for discussion in discussions:
    counter += 1
    if counter % 500 == 0:
        print('on %s of %s' % (counter, dcount))
    msgs_in_disc_today = discussion.message_set.all().order_by('created_at')
    sequence = []
    for msg in msgs_in_disc_today:
        # Best effort: skip messages whose sender or type cannot be mapped.
        # `except Exception` (rather than a bare except) keeps Ctrl-C and
        # SystemExit working while the loop runs.
        try:
            etype = entity_type_map[msg.sender.entity_type]
            mtype = message_type_map[msg.message_type]
        except Exception:
            continue
        # SEQUENCE
        sequence.append('%s%s' % (etype, mtype))
    # Short discussions get an explicit "end" marker; longer ones are
    # truncated to the first 6 steps and carry no marker.
    if len(msgs_in_disc_today) <= 6:
        sequence.append('end')
    else:
        sequence = sequence[:6]
    # append sequence to data
    data.append('-'.join(sequence))
print('counting')
cnt = Counter(data)
for seq, count in cnt.most_common(75):
    print('%s,%s' % (seq, count))
def build_sequence(discussion):
    """
    Build a sequence from a particular discussion.

    Each message of `discussion.message_set.all()` becomes one step named
    '<entity letter><message type>' (e.g. 'pnormal', 'bupdate'). Messages
    whose sender or message type cannot be mapped are skipped. The returned
    list always ends with the 'end' marker.
    """
    entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
    message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'update', 'notice': 'notice'}
    sequence = []
    # Get all messages in sequence
    for msg in discussion.message_set.all():
        # Best effort skip of unmappable messages. `except Exception` does not
        # swallow KeyboardInterrupt/SystemExit the way a bare except would.
        try:
            etype = entity_type_map[msg.sender.entity_type]
            mtype = message_type_map[msg.message_type]
        except Exception:
            continue
        sequence.append('%s%s' % (etype, mtype))
    sequence.append('end')
    return sequence
def dmatches_sequence(discussions, sequence):
    """
    Find discussions that match the given sequence.

    discussions: iterable of discussion objects exposing `message_set.all()`.
    sequence: list like ['person-normal', 'place-normal']; each item is
        '<entity type>-<message type>'. Unknown names raise KeyError.
    Returns the list of discussions (each at most once, in input order) whose
    collapsed message sequence contains `sequence` as a contiguous run.
    """
    entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
    message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'update', 'notice': 'notice'}
    # transform passed in sequence into the collapsed 'pnormal-bauto' form
    wanted_steps = []
    for item in sequence:
        parts = item.split('-')
        wanted_steps.append('%s%s' % (entity_type_map[parts[0]], message_type_map[parts[1]]))
    collapsed_seq = '-'.join(wanted_steps)
    matches = []
    for discussion in discussions:
        # BUILD SEQUENCE (kept in its own name so the `sequence` argument is
        # not overwritten while iterating)
        observed = []
        for msg in discussion.message_set.all():
            # Best effort skip of unmappable messages without swallowing
            # KeyboardInterrupt/SystemExit.
            try:
                etype = entity_type_map[msg.sender.entity_type]
                mtype = message_type_map[msg.message_type]
            except Exception:
                continue
            observed.append('%s%s' % (etype, mtype))
        # A match found on any prefix is also found on the full sequence, so
        # one containment test per discussion is sufficient.
        if observed and collapsed_seq in '-'.join(observed):
            matches.append(discussion)
    return matches
def print_discussion(discussion):
    """
    Render a discussion as plain ASCII text and return it.

    The output is the place name framed by two dashed rules, followed by one
    line per message ('user : ...' for a person, 'place (<type>): ...'
    otherwise, with the message type cut to 4 characters), and a trailing
    blank line. A '====== next day ======' marker is inserted once, before
    the first message dated after the discussion's creation date.
    """
    import unicodedata

    def _to_ascii(text):
        # Decompose accented characters and drop whatever is not ASCII.
        # Decoding back to text keeps the concatenation below working on
        # Python 3, where encode() returns bytes.
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    name = _to_ascii(discussion.place().name)
    rule = '-' * len(name)
    lines = [rule, name, rule]
    next_day = False
    for message in discussion.message_set.all():
        content = _to_ascii(message.content)
        if not next_day and message.created_at.date() > discussion.created_at.date():
            lines.append('\t====== next day ======')
            next_day = True
        if message.sender.entity_type == 'person':
            lines.append('user : %s' % content)
        else:
            lines.append('place (%s): %s' % (message.message_type[:4], content))
    # Empty last entry produces the blank line that separates discussions.
    lines.append('')
    return '\n'.join(lines) + '\n'
# ANALYSIS
# Write a readable dump of every June 2014 discussion that contains the
# person-normal -> place-auto -> place-notice sequence.
discussions = Discussion.objects.filter(created_at__gte='2014-06-01').filter(created_at__lt='2014-06-27')
# The result is bound to the same name that is iterated below.
type_c = dmatches_sequence(discussions, ['person-normal', 'place-auto', 'place-notice'])
# `with` guarantees the file is flushed and closed, even if rendering fails.
with open('/home/talkto/tmp/sample_discussions_type_c__agent.txt', 'w') as f:
    for d in type_c:
        f.write(print_discussion(d))
<!DOCTYPE html>
<!-- Page shell for the sequences sunburst: breadcrumb trail, chart with a
     centred percentage caption, and a sidebar holding the legend toggle. -->
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Sequences sunburst</title>
    <!-- Loaded over https so the script is not blocked as mixed content
         when the page itself is served over https. -->
    <script src="https://d3js.org/d3.v3.min.js"></script>
    <link rel="stylesheet" type="text/css"
      href="https://fonts.googleapis.com/css?family=Open+Sans:400,600">
    <link rel="stylesheet" type="text/css" href="sequences.css"/>
  </head>
  <body>
    <div id="main">
      <!-- Breadcrumb trail is appended here by sequences.js. -->
      <div id="sequence"></div>
      <div id="chart">
        <!-- Hidden until a segment is hovered. -->
        <div id="explanation" style="visibility: hidden;">
          <span id="percentage"></span><br/>
          of visits begin with this sequence of pages
        </div>
      </div>
    </div>
    <div id="sidebar">
      <input type="checkbox" id="togglelegend"> Legend<br/>
      <div id="legend" style="visibility: hidden;"></div>
    </div>
    <script type="text/javascript" src="sequences.js"></script>
    <script type="text/javascript">
      // Hack to make this example display correctly in an iframe on bl.ocks.org
      d3.select(self.frameElement).style("height", "700px");
    </script>
  </body>
</html>
/* Stylesheet for the sequences sunburst page. */
/* Fixed-size page using the Open Sans web font. */
body {
font-family: 'Open Sans', sans-serif;
font-size: 12px;
font-weight: 400;
background-color: #fff;
width: 960px;
height: 700px;
margin-top: 10px;
}
/* Left column: breadcrumb trail and chart. */
#main {
float: left;
width: 750px;
}
/* Right column: legend toggle and legend. */
#sidebar {
float: right;
width: 100px;
}
/* Container that receives the breadcrumb trail SVG. */
#sequence {
width: 600px;
height: 70px;
}
#legend {
padding: 10px 0 0 3px;
}
/* White, semi-bold labels inside breadcrumb and legend items. */
#sequence text, #legend text {
font-weight: 600;
fill: #fff;
}
/* Positioning context for the absolutely positioned #explanation. */
#chart {
position: relative;
}
/* White outline separating neighbouring sunburst segments. */
#chart path {
stroke: #fff;
}
/* Percentage caption placed over the chart area. The negative z-index stacks
   it below the other content of #chart.
   NOTE(review): presumably so the SVG keeps receiving mouse events - confirm. */
#explanation {
position: absolute;
top: 260px;
left: 305px;
width: 140px;
text-align: center;
color: #666;
z-index: -1;
}
#percentage {
font-size: 2.5em;
}
// Global configuration and shared d3 objects used by every function below.
// Dimensions of sunburst.
var width = 750;
var height = 600;
var radius = Math.min(width, height) / 2;
// Breadcrumb dimensions: width, height, spacing, width of tip/tail.
var b = {
w: 75, h: 30, s: 3, t: 10
};
// Mapping of step names to colors.
// Keys are the step names that appear in the hyphen-separated sequences.
var colors = {
"pnormal": "#5687d1",
"bnormal": "#94acd1",
"bauto": "#b56353",
"bagent": "#3bde45",
"bnotice": "#b96ab8",
"bupdate": "#d1cb73",
"end": "#bbbbbb"
};
// Total size of all segments; we set this later, after loading the data.
var totalSize = 0;
// Root SVG group, translated so that (0, 0) is the centre of the chart.
var vis = d3.select("#chart").append("svg:svg")
.attr("width", width)
.attr("height", height)
.append("svg:g")
.attr("id", "container")
.attr("transform", "translate(" + width / 2 + "," + height / 2 + ")");
// Partition layout: x spans the full circle (radians) and y spans
// radius squared; each node's value is its "size" property.
var partition = d3.layout.partition()
.size([2 * Math.PI, radius * radius])
.value(function(d) { return d.size; });
// Arc generator: angles come from x/dx, radii are the square roots of y and
// y + dy (the layout's y range is radius squared, see above).
var arc = d3.svg.arc()
.startAngle(function(d) { return d.x; })
.endAngle(function(d) { return d.x + d.dx; })
.innerRadius(function(d) { return Math.sqrt(d.y); })
.outerRadius(function(d) { return Math.sqrt(d.y + d.dy); });
// Use d3.text and d3.csv.parseRows so that we do not need to have a header
// row, and can receive the csv as an array of arrays.
// The two-argument callback form exposes the request error, so a missing or
// unreadable file is reported instead of failing inside the parser.
d3.text("visit-sequences.csv", function(error, text) {
  if (error || text == null) {
    console.error("Could not load visit-sequences.csv", error);
    return;
  }
  var csv = d3.csv.parseRows(text);
  var json = buildHierarchy(csv);
  createVisualization(json);
});
// Draw the sunburst for the given hierarchy and wire up its interactions.
// `json` is the root node produced by buildHierarchy.
function createVisualization(json) {
  // Page furniture first: breadcrumb trail, legend and the legend toggle.
  initializeBreadcrumbTrail();
  drawLegend();
  d3.select("#togglelegend").on("click", toggleLegend);

  // Invisible circle below the segments so that leaving the parent g is
  // easier to detect with the mouse.
  vis.append("svg:circle")
    .attr("r", radius)
    .style("opacity", 0);

  // Keep only the nodes wide enough to be visible
  // (0.005 radians = 0.29 degrees).
  var visibleNodes = partition.nodes(json).filter(function(node) {
    return (node.dx > 0.005);
  });

  // One path per visible node; the root (depth 0) is not displayed.
  var segments = vis.data([json]).selectAll("path")
    .data(visibleNodes)
    .enter().append("svg:path");
  segments
    .attr("display", function(node) { return node.depth ? null : "none"; })
    .attr("d", arc)
    .attr("fill-rule", "evenodd")
    .style("fill", function(node) { return colors[node.name]; })
    .style("opacity", 1)
    .on("mouseover", mouseover);

  // The mouseleave handler lives on the container group.
  d3.select("#container").on("mouseleave", mouseleave);

  // Size of the whole tree = value bound to the first path.
  totalSize = segments.node().__data__.value;
}
// Hover handler for a segment: show its share of the total, list it in the
// breadcrumb trail and dim every segment that is not on its path.
function mouseover(d) {
  var percentage = (100 * d.value / totalSize).toPrecision(3);
  var percentageString = (percentage < 0.1) ? "< 0.1%" : percentage + "%";

  d3.select("#percentage").text(percentageString);
  d3.select("#explanation").style("visibility", "");

  var sequenceArray = getAncestors(d);
  updateBreadcrumbs(sequenceArray, percentageString);

  // Dim everything first ...
  d3.selectAll("path").style("opacity", 0.3);

  // ... then restore the segments that belong to the hovered sequence.
  var onPath = vis.selectAll("path").filter(function(node) {
    return (sequenceArray.indexOf(node) >= 0);
  });
  onPath.style("opacity", 1);
}
// Handler for leaving the visualization: hide the trail and the caption and
// bring every segment back to full opacity.
function mouseleave(d) {
  // Hide the breadcrumb trail.
  d3.select("#trail").style("visibility", "hidden");

  // Segments ignore hovering while they are fading back in.
  var segments = d3.selectAll("path");
  segments.on("mouseover", null);

  // Fade each segment to full opacity, then re-attach its hover handler.
  segments.transition()
    .duration(1000)
    .style("opacity", 1)
    .each("end", function() {
      d3.select(this).on("mouseover", mouseover);
    });

  d3.select("#explanation").transition()
    .duration(1000)
    .style("visibility", "hidden");
}
// Return the chain of nodes leading to `node` in a partition layout, ordered
// from the highest ancestor down to `node` itself. The root (the node
// without a parent) is not part of the result.
function getAncestors(node) {
  var chain = [];
  // Walk upwards, collecting nodes from the bottom ...
  for (var cursor = node; cursor.parent; cursor = cursor.parent) {
    chain.push(cursor);
  }
  // ... and flip the list so the highest ancestor comes first.
  return chain.reverse();
}
// Create the SVG that holds the breadcrumb trail, plus the text element that
// shows the percentage at the end of the trail.
function initializeBreadcrumbTrail() {
  var container = d3.select("#sequence");

  // The svg area for the trail.
  var trail = container.append("svg:svg");
  trail.attr("width", width);
  trail.attr("height", 50);
  trail.attr("id", "trail");

  // The label at the end of the trail, used for the percentage.
  var endLabel = trail.append("svg:text");
  endLabel.attr("id", "endlabel");
  endLabel.style("fill", "#000");
}
// Build the "points" attribute of a breadcrumb polygon from the breadcrumb
// dimensions in `b`. `i` is the position of the breadcrumb in the trail.
function breadcrumbPoints(d, i) {
  var midY = b.h / 2;
  var vertices = [
    "0,0",
    b.w + ",0",
    (b.w + b.t) + "," + midY,
    b.w + "," + b.h,
    "0," + b.h
  ];
  // Every breadcrumb except the leftmost one (i == 0) gets a 6th vertex
  // that notches its left edge.
  if (i > 0) {
    vertices.push(b.t + "," + midY);
  }
  return vertices.join(" ");
}
// Update the breadcrumb trail to show the current sequence and percentage.
// nodeArray: nodes of the hovered sequence, highest ancestor first.
// percentageString: text shown in the label after the last breadcrumb.
function updateBreadcrumbs(nodeArray, percentageString) {
// Data join; key function combines name and depth (= position in sequence).
var g = d3.select("#trail")
.selectAll("g")
.data(nodeArray, function(d) { return d.name + d.depth; });
// Add breadcrumb and label for entering nodes.
var entering = g.enter().append("svg:g");
// Polygon outline comes from breadcrumbPoints; fill is the step's color.
entering.append("svg:polygon")
.attr("points", breadcrumbPoints)
.style("fill", function(d) { return colors[d.name]; });
// Step name, centred in the breadcrumb.
entering.append("svg:text")
.attr("x", (b.w + b.t) / 2)
.attr("y", b.h / 2)
.attr("dy", "0.35em")
.attr("text-anchor", "middle")
.text(function(d) { return d.name; });
// Set position for entering and updating nodes.
// Breadcrumbs are laid out left to right, one slot (width + spacing) each.
g.attr("transform", function(d, i) {
return "translate(" + i * (b.w + b.s) + ", 0)";
});
// Remove exiting nodes.
g.exit().remove();
// Now move and update the percentage at the end.
d3.select("#trail").select("#endlabel")
.attr("x", (nodeArray.length + 0.5) * (b.w + b.s))
.attr("y", b.h / 2)
.attr("dy", "0.35em")
.attr("text-anchor", "middle")
.text(percentageString);
// Make the breadcrumb trail visible, if it's hidden.
d3.select("#trail")
.style("visibility", "");
}
// Draw one rounded, labelled swatch per entry of `colors` inside #legend.
function drawLegend() {
  // Dimensions of legend item: width, height, spacing, radius of rounded rect.
  var li = {
    w: 75, h: 30, s: 3, r: 3
  };
  // Vertical distance between the tops of two consecutive items.
  var rowHeight = li.h + li.s;

  var legend = d3.select("#legend").append("svg:svg")
    .attr("width", li.w)
    .attr("height", d3.keys(colors).length * rowHeight);

  // One group per color entry, stacked top to bottom.
  var items = legend.selectAll("g")
    .data(d3.entries(colors))
    .enter().append("svg:g")
    .attr("transform", function(entry, index) {
      return "translate(0," + index * rowHeight + ")";
    });

  // Swatch filled with the entry's color.
  items.append("svg:rect")
    .attr("rx", li.r)
    .attr("ry", li.r)
    .attr("width", li.w)
    .attr("height", li.h)
    .style("fill", function(entry) { return entry.value; });

  // Step name centred on the swatch.
  items.append("svg:text")
    .attr("x", li.w / 2)
    .attr("y", li.h / 2)
    .attr("dy", "0.35em")
    .attr("text-anchor", "middle")
    .text(function(entry) { return entry.key; });
}
// Flip the visibility of the legend: show it when hidden, hide it otherwise.
function toggleLegend() {
  var legend = d3.select("#legend");
  var isHidden = legend.style("visibility") == "hidden";
  legend.style("visibility", isHidden ? "" : "hidden");
}
// Convert rows of a 2-column CSV into the nested structure expected by the
// partition layout. Column one holds a hyphen-separated sequence of step
// names (root to leaf); column two holds how often the sequence occurred.
// Rows whose count is not a number (such as a header row) are ignored.
function buildHierarchy(csv) {
  var root = {"name": "root", "children": []};

  csv.forEach(function(row) {
    var sequence = row[0];
    var size = +row[1];
    if (isNaN(size)) { // e.g. if this is a header row
      return;
    }

    var steps = sequence.split("-");
    var lastIndex = steps.length - 1;
    var cursor = root;

    steps.forEach(function(stepName, depth) {
      var siblings = cursor["children"];

      if (depth === lastIndex) {
        // Last step of the sequence: always add a new leaf with the count.
        siblings.push({"name": stepName, "size": size});
        return;
      }

      // Intermediate step: reuse the first existing branch of that name,
      // or create it when this is the first sequence passing through.
      var branch = siblings.filter(function(candidate) {
        return candidate["name"] == stepName;
      })[0];
      if (!branch) {
        branch = {"name": stepName, "children": []};
        siblings.push(branch);
      }
      cursor = branch;
    });
  });

  return root;
}
pnormal-bauto-bnormal-end 5748
pnormal-bauto-bagent-end 950
bnormal-end 799
pnormal-bauto-bnotice-end 753
pnormal-bauto-bnotice-bnormal-end 728
pnormal-bauto-bupdate-bnormal-end 644
pnormal-bauto-bagent-bnormal-end 582
pnormal-bauto-bnormal-pnormal-end 421
pnormal-bnormal-end 410
pnormal-bauto-bnormal-pnormal-bnormal-end 359
pnormal-bauto-bnormal-bnormal-end 220
pnormal-bauto-bagent-bnotice-end 193
pnormal-bauto-bupdate-bupdate-bnormal-end 179
pnormal-bauto-bagent-pnormal-bnormal-end 173
pnormal-bauto-end 161
bupdate-bnormal-end 125
pnormal-bauto-bupdate-bupdate-bupdate-bupdate 106
pnormal-bauto-pnormal-bnormal-end 102
pnormal-bauto-bnormal-pnormal-bnormal-pnormal 100
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment