/* MarkovModel creates a Markov model of text (or tokens) and allows you to generate new
 * text from the model. It takes three optional arguments:
 *
 * tokenizer - a function that takes a string and returns an array of tokens;
 *             defaults to a tokenizer that breaks on whitespace and lowercases everything
 * shingle_n - the number of tokens that make up a state in the Markov model;
 *             the higher the number, the more realistic the generated text, but the more
 *             training data required;
 *             defaults to 1
 * join_str  - string used to join generated tokens back into text;
 *             defaults to a single space
 */
function MarkovModel(tokenizer, shingle_n, join_str) {
    this.shingle_n = shingle_n || 1;
    this.tokenizer = tokenizer || function(text) {
        return text.toLowerCase().split(/\s/);
    };
    this.join_str = join_str || " ";
    //Consider trying this too
    //    this.shingle_n = shingle_n || 2;
    //    this.tokenizer = tokenizer || function(text) {
    //        return text.toLowerCase().split(/\b/);
    //    };
    this.delimiter = "\u0017"; //non-printing delimiter; the original "\u0037" is the printable digit "7", which could collide with token text
    this.start_token = "\u0002";
    this.end_token = "\u0003";
    //drop the oldest token(s) and append the newest so the key always holds the last shingle_n tokens
    this._shift_key = function(current_key, next_token) {
        var tokens = current_key.split(this.delimiter);
        tokens.push(next_token);
        while (tokens.length > this.shingle_n) {
            tokens.shift();
        }
        return tokens.join(this.delimiter);
    };
    this.model = {};
}
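//For illustration (a sketch, not part of the original gist): with shingle_n = 2 and the
//delimiter written as <D>, keys evolve like this as tokens are consumed:
//    _shift_key("\u0002", "the")        -> "\u0002<D>the"
//    _shift_key("\u0002<D>the", "cat")  -> "the<D>cat"
//i.e. a key always holds the last shingle_n tokens seen, joined by the delimiter.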
MarkovModel.prototype.addSample = function(text) {
    if (typeof text == "string") {
        text = this.tokenizer(text);
    }
    text.push(this.end_token);
    var new_key_struct = function() {
        return {count:0, tokens_and_counts:{}};
    };
    var key = this.start_token;
    for (var i = 0; i < text.length; i++) {
        this.model[key] = this.model[key] || new_key_struct(); //make sure it's been initialized
        this.model[key].count++;
        var token = text[i];
        this.model[key].tokens_and_counts[token] = this.model[key].tokens_and_counts[token] || 0; //make sure it's been initialized
        this.model[key].tokens_and_counts[token]++;
        key = this._shift_key(key, token);
    }
};
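//For illustration (a sketch, not part of the original gist): after
//    var mm = new MarkovModel();
//    mm.addSample("the cat sat");
//mm.model looks like this ("\u0002" and "\u0003" are the start and end tokens):
//    {
//        "\u0002": {count: 1, tokens_and_counts: {"the": 1}},
//        "the":    {count: 1, tokens_and_counts: {"cat": 1}},
//        "cat":    {count: 1, tokens_and_counts: {"sat": 1}},
//        "sat":    {count: 1, tokens_and_counts: {"\u0003": 1}}
//    }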
MarkovModel.prototype.generateText = function(max_len) {
    max_len = max_len || 100;
    var tokens = [];
    var key = this.start_token;
    for (var i = 0; i < max_len; i++) {
        var sub_model = this.model[key];
        if (!sub_model) {
            break; //nothing trained for this key (e.g. no samples added yet)
        }
        //here I pick a token with probability based upon how common the token was for the given key
        var until = Math.floor(Math.random()*sub_model.count);
        for (var token in sub_model.tokens_and_counts) {
            until -= sub_model.tokens_and_counts[token]; //subtract the count for this token
            if (until < 0) { //strictly less than zero, so every token (not just the earliest) can be selected
                if (token == this.end_token) {
                    //then we've reached the end of a sentence
                    return tokens.join(this.join_str);
                }
                tokens.push(token);
                key = this._shift_key(key, token);
                break;
            }
        }
    }
    return tokens.join(this.join_str);
};
//usage ("hits" and getSentenceFromSomewhere() are placeholders for whatever source of training text you have)
var mm = new MarkovModel();
for (var x in hits) {
    mm.addSample(getSentenceFromSomewhere());
}
console.log(mm.generateText());
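//A sketch (not in the original gist) of the alternative configuration suggested in the
//constructor comment above: a word-boundary tokenizer that keeps punctuation as tokens,
//plus shingle_n = 2 so each state is a pair of tokens. The sample sentences here are
//made up purely for illustration.
var mm2 = new MarkovModel(
    function(text) { return text.toLowerCase().split(/\b/); }, //tokenizer
    2                                                          //shingle_n
);
mm2.addSample("the cat sat on the mat.");
mm2.addSample("the dog sat on the rug.");
console.log(mm2.generateText());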