|
from communication.models import Discussion, Message |
|
from collections import defaultdict, Counter |
|
import random |
|
from datetime import datetime, timedelta |
|
from lib.common.utils import get_start_time_for_rate_limit_counting |
|
|
|
# Count the most common per-day message-type sequences over a 30-day window.
#
# For each day in the window, every discussion that received at least one
# message that day contributes one sequence string (e.g. 'pnormal-bauto-end')
# built from that day's messages only.  The 75 most frequent sequences are
# printed as 'sequence,count' lines.
#
# NOTE(review): get_start_time_for_rate_limit_counting() is defined elsewhere;
# presumably it snaps the timestamp to a period boundary -- confirm.
start = get_start_time_for_rate_limit_counting(datetime.utcnow()) - timedelta(days=30)
today = get_start_time_for_rate_limit_counting(datetime.utcnow())
days = 30
data = []
discussion_sequence_list = []
# Sender entity type -> one-letter code used as the token prefix.
entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
# Raw message type -> normalised message type ('update' is folded into 'notice').
message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'notice', 'notice': 'notice'}
discussion_sequence = defaultdict(list)

# GO THROUGH EACH DAY
for day in range(days):
    # The window is advanced *before* it is used, so the first day examined
    # starts one day after the initial value of `start`.
    start = start + timedelta(days=1)
    stop = start + timedelta(days=1)
    print('===========\n%s - %s' % (start, stop))

    # Primary keys of discussions already counted for this day; a set keeps
    # the membership test O(1) instead of scanning a list of model instances.
    counted_discussion_pks = set()

    # GET MESSAGES THAT DAY
    messages = Message.objects.filter(created_at__gte=start).filter(created_at__lt=stop)
    messages_count = messages.count()
    print('msgs on %s: %s' % (day, messages_count))

    # GO THROUGH EACH DISCUSSION THAT HAD MSGS THAT DAY
    counter = 0
    for message in messages:
        counter += 1
        discussion = message.discussion
        if discussion.pk in counted_discussion_pks:
            continue
        if counter % 500 == 0:
            print("\tOn %s of %s" % (counter, messages_count))

        msgs_in_disc_today = discussion.message_set.filter(created_at__gte=start).filter(created_at__lt=stop).order_by('created_at')
        sequence = []
        for msg in msgs_in_disc_today:
            try:
                etype = entity_type_map[msg.sender.entity_type]
                mtype = message_type_map[msg.message_type]
            except Exception:
                # Best effort: skip messages whose sender or type cannot be
                # mapped.  `Exception` (not a bare except) so that Ctrl-C
                # still interrupts this long-running loop.
                continue
            # SEQUENCE
            sequence.append('%s%s' % (etype, mtype))

        # Keep at most six tokens; 'end' marks a discussion that had no more
        # than six messages that day (skipped messages count towards the six).
        if len(msgs_in_disc_today) <= 6:
            sequence.append('end')
        else:
            sequence = sequence[:6]

        # append sequence to data
        data.append('-'.join(sequence))
        counted_discussion_pks.add(discussion.pk)

print('counting')
cnt = Counter(data)
for seq, count in cnt.most_common(75):
    print('%s,%s' % (seq, count))
|
|
|
|
|
#### |
|
# 'duuid':['sequence'] |
|
|
|
class FindMatches(object):
    """
    Build message-type sequences for discussions and find the discussions
    whose sequence contains a given run of message types.

    Every message is reduced to a token: a one-letter sender code taken from
    ``entity_type_map`` followed by a normalised message type taken from
    ``message_type_map`` (for example 'pnormal' or 'bauto').

    USAGE
        discussions = Discussion.objects.filter(created_at__gte='2014-06-01').filter(created_at__lt='2014-06-27')
        fm = FindMatches(discussions=discussions)
        fm.process()
        typec = fm.dmatches_sequence(fm.dseq, ['person-normal','place-auto','place-notice'])
    """

    def __init__(self, *args, **kwargs):
        """
        Keyword arguments:
            discussions: queryset-like collection of discussions; it must
                support iteration and ``.count()``.
        """
        self.entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
        self.message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'notice', 'notice': 'notice'}
        self.discussions = kwargs.get('discussions')
        # Filled in by process(); defined here so the attribute always exists.
        self.dseq = defaultdict(list)

    def process(self):
        """Build and cache (as ``self.dseq``) the sequences of ``self.discussions``."""
        self.dseq = self.get_all_sequences(self.discussions)

    def get_all_sequences(self, discussions):
        """
        Get all sequences for all discussions provided.

        discussions: queryset-like collection to process; ``None`` falls
            back to ``self.discussions``.

        Returns a defaultdict(list) mapping each discussion uuid to the token
        list produced by build_sequence().
        """
        if discussions is None:
            discussions = self.discussions
        dseq = defaultdict(list)
        dcount = discussions.count()
        for counter, d in enumerate(discussions, 1):
            if counter % 500 == 0:
                print('On %s of %s' % (counter, dcount))
            dseq[d.uuid] = self.build_sequence(d)
        return dseq

    def build_sequence(self, discussion, start=None, stop=None, truncate=None):
        """
        Build a sequence from a particular discussion.

        start: if given, only messages with created_at >= start are used.
        stop: if given, only messages with created_at < stop are used.
        truncate: if given, keep at most this many tokens.

        Returns the list of tokens in created_at order.  A trailing 'end'
        token is appended unless the sequence was cut short by ``truncate``.
        Messages whose sender or message type cannot be mapped are skipped.
        """
        msgs = discussion.message_set.all()
        # Each bound is applied independently so passing only one of them
        # does not filter on None.
        if start is not None:
            msgs = msgs.filter(created_at__gte=start)
        if stop is not None:
            msgs = msgs.filter(created_at__lt=stop)
        # Always order explicitly: the sequence is only meaningful in
        # chronological order.
        msgs = msgs.order_by('created_at')

        sequence = []
        for msg in msgs:
            try:
                etype = self.entity_type_map[msg.sender.entity_type]
                mtype = self.message_type_map[msg.message_type]
            except Exception:
                # Best effort: unmapped or broken messages are left out.
                continue
            sequence.append('%s%s' % (etype, mtype))

        if truncate is not None and len(sequence) > truncate:
            return sequence[:truncate]
        sequence.append('end')
        return sequence

    def dmatches_sequence(self, dseq, sequence):
        """
        Find discussions that match the given sequence.

        dseq: mapping of {'duuid': [token, ...]} as returned by
            get_all_sequences().
        sequence: list of 'entity-messagetype' strings, e.g.
            ['person-normal', 'place-normal'].

        Returns a list of Discussion objects whose token sequence contains
        the given run.  Raises KeyError if an item uses an entity or message
        type missing from the maps, IndexError if an item has no '-'.
        """
        # transform passed in sequence into the same token form as dseq
        tokens = []
        for item in sequence:
            parts = item.split('-')
            tokens.append('%s%s' % (self.entity_type_map[parts[0]], self.message_type_map[parts[1]]))
        collapsed_seq = '-'.join(tokens)

        matches = []
        for duuid, seq in dseq.items():
            if collapsed_seq in '-'.join(seq):
                matches.append(Discussion.objects.get(uuid=duuid))
        return matches
|
|
|
# ######################################## |
|
# # Find discussions with particular sequence |
|
# # sequence = 'normal-auto-agent' |
|
# def discussions_with_sequence(discussions, sequence): |
|
# entity_type_map = {'person':'p','place':'b','admin':'b','assistant':'b'} |
|
# message_type_map = {'normal':'normal','agent':'agent','auto':'auto','update':'notice','notice':'notice'} |
|
|
|
# matches = [] |
|
|
|
# for discussion in discussions: |
|
# observed_sequence = '-'.join(["%s%s" % (entity_type_map[message.sender.entity_type], message_type_map[message.message_type]) for message in discussion.message_set.all()]) |
|
# if sequence in observed_sequence: |
|
# matches.append(discussion) |
|
# return matches |
|
|
|
|
|
|
|
|
|
|
|
|
|
# GENERATE SEQUENCE DATA (with Counts)
#
# Prints one 'sequence,count' line for every distinct sequence formed by the
# first six messages of each discussion.
# NOTE(review): `discussions` must already be defined when this runs; it is
# not assigned anywhere earlier in this file.
prep_data = []
# Sender entity type -> one-letter token prefix.  The raw message type is
# used as-is here (no normalisation map).
map_data = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
for discussion in discussions:
    sequence_list = [
        "%s%s" % (map_data[message.sender.entity_type], message.message_type)
        for message in discussion.message_set.all().order_by('created_at')
    ]
    # The list holds one entry per message, so its length replaces a second
    # COUNT query against the database.
    if len(sequence_list) <= 6:
        sequence_list.append("end")
    else:
        sequence_list = sequence_list[:6]
    prep_data.append('-'.join(sequence_list))

cnt = Counter(prep_data)
for seq, count in cnt.items():
    print('%s,%s' % (seq, count))
|
|
|
|
|
# GENERATE HIERARCHICAL DATA
#
# For every pair of consecutive messages in a discussion, record the second
# message's 'entity-messagetype' label under the first one's label.
# NOTE(review): `discussions` must already be defined when this runs.
#
# `data` has to be a mapping of label -> list of following labels; indexing
# the plain list used by the earlier sections with a string raises TypeError.
data = defaultdict(list)
for discussion in discussions:
    parent = None
    for message in discussion.message_set.all().order_by('created_at'):
        message_type = '%s-%s' % (message.sender.entity_type, message.message_type)
        if parent is not None:
            data[parent].append(message_type)
        parent = message_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Count the most common whole-discussion message-type sequences.
#
# Each discussion created inside the window contributes one sequence string
# built from its first six messages; the 75 most frequent sequences are
# printed as 'sequence,count' lines.
start = get_start_time_for_rate_limit_counting(datetime.utcnow()) - timedelta(days=30)
today = get_start_time_for_rate_limit_counting(datetime.utcnow())
days = 30
data = []
discussion_sequence_list = []
# Sender entity type -> one-letter code used as the token prefix.
entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
# Raw message type -> normalised message type ('update' is folded into 'notice').
message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'notice', 'notice': 'notice'}
discussion_sequence = defaultdict(list)

# GO THROUGH EACH DISCUSSION CREATED IN THE WINDOW
# NOTE(review): the lower bound is relative to "now" while the upper bound is
# a fixed date, so the range is empty once `start` passes 2014-06-27 --
# confirm whether `today` was meant to be the upper bound.
discussions = Discussion.objects.filter(created_at__gte=start).filter(created_at__lt='2014-06-27')
dcount = discussions.count()
counter = 0
for discussion in discussions:
    counter += 1
    if counter % 500 == 0:
        print('on %s of %s' % (counter, dcount))

    msgs_in_disc_today = discussion.message_set.all().order_by('created_at')
    sequence = []
    for msg in msgs_in_disc_today:
        try:
            etype = entity_type_map[msg.sender.entity_type]
            mtype = message_type_map[msg.message_type]
        except Exception:
            # Best effort: skip messages whose sender or type cannot be
            # mapped.  `Exception` (not a bare except) so that Ctrl-C still
            # interrupts this long-running loop.
            continue
        # SEQUENCE
        sequence.append('%s%s' % (etype, mtype))

    # Keep at most six tokens; 'end' marks a discussion with no more than six
    # messages (skipped messages count towards the six).
    if len(msgs_in_disc_today) <= 6:
        sequence.append('end')
    else:
        sequence = sequence[:6]

    # append sequence to data
    data.append('-'.join(sequence))

print('counting')
cnt = Counter(data)
for seq, count in cnt.most_common(75):
    print('%s,%s' % (seq, count))
|
|
|
|
|
|
|
def build_sequence(discussion):
    """
    Build a sequence from a particular discussion.

    Each message becomes a token made of a one-letter sender code ('p' for
    person, 'b' for place/admin/assistant) followed by its message type,
    e.g. 'pnormal'.  Messages are taken in created_at order; messages whose
    sender or message type cannot be mapped are skipped.

    Returns the list of tokens followed by a final 'end' marker.
    """
    entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
    message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'update', 'notice': 'notice'}

    sequence = []
    # Order explicitly: the sequence is only meaningful chronologically.
    for msg in discussion.message_set.all().order_by('created_at'):
        try:
            etype = entity_type_map[msg.sender.entity_type]
            mtype = message_type_map[msg.message_type]
        except Exception:
            # Best effort: leave out messages that cannot be mapped, without
            # swallowing KeyboardInterrupt/SystemExit like a bare except.
            continue
        sequence.append('%s%s' % (etype, mtype))
    sequence.append('end')
    return sequence
|
|
|
|
|
def dmatches_sequence(discussions, sequence):
    """
    Find discussions that match the given sequence.

    discussions: iterable of discussions; each one's messages are read
        through ``message_set`` in created_at order.
    sequence: list of 'entity-messagetype' strings, e.g.
        ['person-normal', 'place-normal'].

    Returns the list of discussions (in input order) whose message tokens
    contain the given run.  Messages whose sender or message type cannot be
    mapped are skipped.  Raises KeyError if an item of ``sequence`` uses an
    entity or message type missing from the maps, IndexError if it has no '-'.
    """
    entity_type_map = {'person': 'p', 'place': 'b', 'admin': 'b', 'assistant': 'b'}
    message_type_map = {'normal': 'normal', 'agent': 'agent', 'auto': 'auto', 'update': 'update', 'notice': 'notice'}

    # transform passed in sequence into the same token form as the messages
    wanted = []
    for item in sequence:
        parts = item.split('-')
        wanted.append('%s%s' % (entity_type_map[parts[0]], message_type_map[parts[1]]))
    collapsed_seq = '-'.join(wanted)

    matches = []
    for discussion in discussions:
        # BUILD SEQUENCE
        tokens = []
        for msg in discussion.message_set.all().order_by('created_at'):
            try:
                etype = entity_type_map[msg.sender.entity_type]
                mtype = message_type_map[msg.message_type]
            except Exception:
                # Best effort: leave out messages that cannot be mapped.
                continue
            tokens.append('%s%s' % (etype, mtype))
        # One containment test on the complete sequence gives the same answer
        # as re-joining and re-testing after every message.  A discussion
        # with no usable message never matches.
        if tokens and collapsed_seq in '-'.join(tokens):
            matches.append(discussion)
    return matches
|
|
|
|
|
def print_discussion(discussion):
    """
    Render a discussion as plain ASCII text.

    The output starts with the place name framed by two dashed rules,
    followed by one line per message in created_at order: 'user : ...' for
    messages sent by a person, 'place (<first four letters of the message
    type>): ...' for everything else.  A single '====== next day ======'
    marker is inserted before the first message dated after the day the
    discussion was created.  The text ends with one blank line.

    Non-ASCII characters are decomposed (NFKD) and whatever cannot be
    represented in ASCII is dropped.
    """
    import unicodedata

    def to_ascii(text):
        # Decode back to text so the pieces can be joined on Python 2 and 3.
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    name = to_ascii(discussion.place().name)
    rule = '-' * len(name)
    lines = [rule, name, rule]

    next_day = False
    for message in discussion.message_set.all().order_by('created_at'):
        content = to_ascii(message.content)
        if not next_day and message.created_at.date() > discussion.created_at.date():
            lines.append('\t====== next day ======')
            next_day = True
        if message.sender.entity_type == 'person':
            lines.append('user : %s' % content)
        else:
            lines.append('place (%s): %s' % (message.message_type[:4], content))

    # Trailing empty entry produces the blank line that separates discussions.
    lines.append('')
    return '\n'.join(lines) + '\n'
|
|
|
# ANALYSIS
#
# Write a plain-text rendering of every June 2014 discussion whose messages
# contain the run person-normal -> place-auto -> place-notice.
discussions = Discussion.objects.filter(created_at__gte='2014-06-01').filter(created_at__lt='2014-06-27')
# The result used to be stored as `type_b` but read back as `type_c`, which
# raised NameError; one name is now used throughout.
type_c = dmatches_sequence(discussions, ['person-normal', 'place-auto', 'place-notice'])
# `with` guarantees the file is flushed and closed even if rendering fails.
with open('/home/talkto/tmp/sample_discussions_type_c__agent.txt', 'w') as f:
    for d in type_c:
        f.write(print_discussion(d))
|
|