Skip to content

Instantly share code, notes, and snippets.

@JnBrymn
Last active August 29, 2015 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JnBrymn/9885157 to your computer and use it in GitHub Desktop.
Save JnBrymn/9885157 to your computer and use it in GitHub Desktop.
Simple Markov Model
from collections import defaultdict
import random
class MarkovModel(object):
    """
    Order-``n`` Markov model learned from an iterable of tokens.

    The model maps a context (a tuple of up to ``n`` preceding tokens) to the
    tokens observed immediately after that context, together with how often
    each was seen.  ``None`` is a special token that delimits phrases: it
    resets the context to ``(None,)`` while training and ends a sample when it
    is drawn during generation.

    Attributes:
        n: the order of the model (maximum context length).
        model_dict: ``{context: {"count": total, "tokens_and_counts": {token: count}}}``.
    """

    @classmethod
    def _tokenizer(cls, text, token_delim):
        """Yield the tokens of ``text``, followed by ``None`` after each line.

        Each ``"\\n"``-separated line is one phrase; a phrase is split into
        tokens on ``token_delim``.
        """
        for phrase in text.split("\n"):
            for token in phrase.split(token_delim):
                yield token
            yield None

    @classmethod
    def fromText(cls, text, token_delim=".", n=1):
        """Build a model from a string.

        Args:
            text: the training text; every line is treated as one phrase.
            token_delim: separator used to split a line into tokens.
            n: order of the model.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type back).
        """
        return cls(cls._tokenizer(text, token_delim), n)

    def __init__(self, token_iterator, n=1):
        """Train the model.

        Args:
            token_iterator: any iterable of hashable tokens; ``None`` entries
                mark phrase boundaries.
            n: order of the model.
        """
        self.n = n
        self.model_dict = defaultdict(
            lambda: {"count": 0, "tokens_and_counts": defaultdict(int)}
        )
        key = (None,)  # every phrase starts from the delimiter-only context
        for token in token_iterator:
            sub_dict = self.model_dict[key]
            sub_dict["count"] += 1
            sub_dict["tokens_and_counts"][token] += 1
            key = self._shift_key(key, token)
        # After training, unknown contexts read as None instead of creating a
        # fresh entry.  A plain ``model_dict[key]`` lookup on a missing key
        # still stores that None, so the methods below use ``.get`` and skip
        # None entries.
        self.model_dict.default_factory = lambda: None

    def __repr__(self):
        """Return a tab-indented listing of every context and its successors."""
        parts = []
        for key, counts in self.model_dict.items():
            if counts is None:
                # Placeholder left behind by a lookup of an unseen context.
                continue
            parts.append("{0}\tcount:{1}\n".format(key, counts["count"]))
            for token, count in counts["tokens_and_counts"].items():
                parts.append("\t{0}\tcount:{1}\n".format(token, count))
        return "".join(parts)

    def generateSample(self, max_tokens=100):
        """Generate one random phrase from the model.

        Starting from the ``(None,)`` context, successors are drawn with
        probability proportional to their observed counts.

        Args:
            max_tokens: upper bound on the number of tokens produced.

        Returns:
            A list of tokens.  Generation stops early when the delimiter
            ``None`` is drawn or when the current context was never seen
            during training.
        """
        key = (None,)
        tokens = []
        for _ in range(max_tokens):
            # .get() keeps sampling from inserting dead-end contexts into the model.
            sub_dict = self.model_dict.get(key)
            if sub_dict is None:
                return tokens  # dead end: nothing was ever seen after this context
            # randint is inclusive on both ends, so draw from 1..count to give
            # each successor exactly `count` of the `total` outcomes.
            until = random.randint(1, sub_dict["count"])
            for token, count in sub_dict["tokens_and_counts"].items():
                until -= count
                if until <= 0:
                    if token is None:
                        return tokens  # end of phrase
                    tokens.append(token)
                    key = self._shift_key(key, token)
                    break
        return tokens  # reached max_tokens

    def _shift_key(self, key, token):
        """Return the context that follows ``key`` once ``token`` is appended.

        ``None`` resets the context to ``(None,)``; otherwise ``token`` is
        appended and the oldest entry is dropped if the context would exceed
        ``self.n`` entries.
        """
        if token is None:
            return (token,)
        shifted = list(key)
        shifted.append(token)
        if len(shifted) > self.n:
            del shifted[0]
        return tuple(shifted)
@JnBrymn
Copy link
Author

JnBrymn commented Mar 31, 2014

Works like this:

>>> mm = MarkovModel("I am not a number I am a free man")
>>> print "".join(mm.generateSample())
I anot I am I anuma a I I am a not a a I fr a I fr not I a I I not fr a numa I not I numa a a I not

You can implement higher order models as well

>>> mm = MarkovModel("I am not a number I am a free man",n=2)
>>> print "".join(mm.generateSample())
I a not am not am number I am am a not a free man

And you can do words (or any token you wish)

>>> mm = MarkovModel("I am not a number I am a free man".split(" "))
>>> print " ".join(mm.generateSample())
I am a number I am not a number I am a number I am a free man

Numbers work too.

mm = MarkovModel([1,2,4,3,5,4,4,None,5,1,2,4,2,None,3,4,1,1,3,5,2,None,3,4,2,1,3,4,2,None,1,3,4,2,3,None,2,1,4,3],n=2)

None is considered a special "phrase" delimiter.

The pretty print is also not bad:

('a',)  count:2
    number  count:1
    free    count:1
('I',)  count:2
    am  count:2
('am',) count:2
    not count:1
    a   count:1
('number',) count:1
    I   count:1
('not',)    count:1
    a   count:1
('free',)   count:1
    man count:1
(None,) count:1
    I   count:1

@JnBrymn
Copy link
Author

JnBrymn commented Apr 26, 2014

If you want to make fun of people's tweets, here's a good way to do it!

import tweepy
import os
# Build the OAuth handler from the consumer key/secret read from environment variables.
auth = tweepy.OAuthHandler(os.getenv("TWITTER_CONSUMER_KEY"),os.getenv("TWITTER_CONSUMER_SECRET"))
# Attach the bot's access token/secret, also read from environment variables.
# NOTE(review): os.getenv returns None for unset variables -- confirm all four are set before running.
auth.set_access_token(os.getenv("TWITTER_BOT_TOKEN"), os.getenv("TWITTER_BOT_SECRET"))
# API client used by make_fun_of below.
t = tweepy.API(auth)

def make_fun_of(screen_name,n=1):
    """Build a MarkovModel from the words of a user's timeline.

    Calls ``t.user_timeline`` for ``screen_name`` with ``count=200``, splits
    the ``text`` of every returned item on single spaces, and appends ``None``
    after each item so that it forms its own phrase.

    Args:
        screen_name: passed through to ``t.user_timeline``.
        n: order of the resulting model.

    Returns:
        The trained ``MarkovModel``.
    """
    tokens = []
    for status in t.user_timeline(screen_name=screen_name, count=200):
        tokens += status.text.split(" ")
        tokens.append(None)  # phrase delimiter between timeline items
    return MarkovModel(tokens, n=n)

Use it this way:

# Train a model on the "bbombgardener" timeline, then print one generated sample joined by spaces.
bbombgardener = make_fun_of("bbombgardener")
print " ".join(bbombgardener.generateSample())

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment