|
<!DOCTYPE html> |
|
<head> |
|
<meta charset="utf-8"> |
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script> |
|
<script src='https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.11.2/lodash.js'></script> |
|
<link href='https://fonts.googleapis.com/css?family=Lora' rel='stylesheet' type='text/css'> |
|
|
|
<style> |
|
body { |
|
font-family: 'Lora', serif; |
|
margin:0; |
|
color: #49438C; |
|
} |
|
#main { |
|
width: 1000px; |
|
} |
|
#left, #right { |
|
width: 45%; |
|
display: inline-block; |
|
padding: 15px; |
|
vertical-align: top; |
|
|
|
} |
|
#left div, #right div { |
|
display: inline-block; |
|
padding: 5px; |
|
} |
|
|
|
</style> |
|
</head> |
|
|
|
<body> |
|
<div id='main'> |
|
<div id='left'></div> |
|
<div id='right'></div> |
|
</div> |
|
|
|
<script> |
|
// Inclusive time window for the analysis: only tweets posted between these two
// instants (both given with an explicit -04:00 UTC offset) are kept.
var startDate = new Date("2016-04-25T00:00:00-04:00");
var endDate = new Date("2016-04-27T00:00:00-04:00");

// d3 (v3) time formatter using the pattern '%x %I:%M%p'.
// NOTE(review): not referenced anywhere else in the visible script.
var dateFormat = d3.time.format("%x %I:%M%p");
|
// Stop words, part 1: the 100 most common words, taken from
// https://gist.github.com/gravitymonkey/2406023
// thank you gravitymonkey you beautiful person.
var commonWords = [
  "the","of","and","a","to","in","is","you","that","it",
  "he","was","for","on","are","as","with","his","they","I",
  "at","be","this","have","from","or","one","had","by","word",
  "but","not","what","all","were","we","when","your","can","said",
  "there","use","an","each","which","she","do","how","their","if",
  "will","up","other","about","out","many","then","them","these","so",
  "some","her","would","make","like","him","into","time","has","look",
  "two","more","write","go","see","number","no","way","could","people",
  "my","than","first","water","been","call","who","oil","its","now",
  "find","long","down","day","did","get","come","made","may","part"
];

// Stop words, part 2: extra tokens ignored for this particular analysis.
var customWords = [
  "openvisconf", "talk", "me", "here", "im", "very", "just", "too", "really", "much",
  "our", "us", "most", "another", "off", "should", "cant", "via", "going", "dont",
  "also", "says", "always", "after", "such", "check", "need", "keep", "say", "any",
  "hey", "between", "–", "over", "강남풀싸롱", "available", "gt", "got", "still", "lots",
  "being", "seen", "looks", "free", "am", "users", "take", "tiny", "own", "before",
  "big", "england", "back", "ive", "everyone", "super", "maybe", "stuff", "even", "lot",
  "make", "last", "open", "through", "something", "httpstcozc0ps1kc8h"
];

// Rebind commonWords from the array above to a lookup object of the form
// { lowercasedWord: 1 } covering both lists, so that membership can be
// checked with commonWords[word].
commonWords = (function(wordLists) {
  var lookup = {};
  wordLists.forEach(function(list) {
    list.forEach(function(entry) {
      lookup[entry.toLowerCase()] = 1;
    });
  });
  return lookup;
})([commonWords, customWords]);
|
// Maps a word variant (as it appears after lowercasing and punctuation
// removal) to the form it should be counted under.
var translations = {
  // spelling variants
  datavis: 'dataviz',
  viz: 'vis',

  // plurals
  charts: 'chart',
  tools: 'tool',
  things: 'thing',
  visualizations: 'visualization',

  // mixed plurals and -ing forms, kept in their original order
  using: 'use',
  making: 'make',
  slides: 'slide',
  talks: 'talk',
  learning: 'learn',
  visualizing: 'visualize',
  showing: 'show',
  looking: 'look',
  talking: 'talk',
  thanks: 'thank',
  tweets: 'tweet',
  working: 'work',
  maps: 'map',
  thinking: 'think',
  speakers: 'speaker',
  friends: 'friend',
  d3js: 'd3',
  days: 'day',
  folks: 'folk'
};
|
|
|
// Loads tweets.json, keeps the non-retweet tweets posted between startDate and
// endDate, and derives from them:
//   - the 100 words that occur in the most tweets (rendered into #left),
//   - the pairs of those words that occur together in more than one tweet
//     (rendered into #right).
// Finally the word data is logged to the console as JSON.
//
// Fields read from each tweet: postedTime, body, link and
// actor.preferredUsername.
d3.json('tweets.json', function(error, tweets) {
  // d3 v3 invokes a two-argument callback as (error, data). Report a failed
  // load instead of silently rendering two empty lists.
  if (error) {
    console.error('Failed to load tweets.json', error);
    return;
  }

  // Punctuation removed from every token before it is counted.
  const punctuation = /[.,\/#!$%\^&\*;:{}=\-_`~()'|+]/g;

  // Tweets inside the time window that do not start with "RT", newest first,
  // keyed by link (a later tweet with the same link replaces an earlier one).
  const tweetsByLink = _.chain(tweets)
    .filter(tweet => {
      // The parsed date is stored on the tweet itself so it can be reused.
      tweet.date = new Date(tweet.postedTime);
      return !tweet.body.startsWith('RT') &&
        startDate <= tweet.date && tweet.date <= endDate;
    })
    .sortBy(tweet => -tweet.date)
    .reduce((byLink, tweet) => {
      byLink[tweet.link] = tweet;
      return byLink;
    }, {})
    .value();

  // word -> {text, count, tweets: {link: 1}, users: {username: 1}}
  // count is the number of distinct tweets containing the word.
  const wordStats = {};
  _.each(tweetsByLink, (tweet, link) => {
    const username = tweet.actor.preferredUsername;

    // Split on any run of whitespace so that words separated by a newline or
    // tab are not fused into a single token.
    _.each(tweet.body.split(/\s+/), token => {
      let word = token.toLowerCase().replace(punctuation, '');

      // Skip empty tokens, mentions and tokens starting with a digit
      // (this also catches "0", which a truthiness test on parseInt misses).
      if (!word || word.startsWith('@') || /^\d/.test(word)) return;

      // Translate to the canonical form first, so the stop-word test below
      // sees the form that would actually be counted ("talks" -> "talk").
      // _.has only looks at own keys, so words such as "constructor" do not
      // match properties inherited from Object.prototype.
      if (_.has(translations, word)) word = translations[word];
      if (_.has(commonWords, word)) return;

      if (!_.has(wordStats, word)) {
        wordStats[word] = {
          text: word,
          count: 0,
          tweets: {},
          users: {}
        };
      }

      // Count each word at most once per tweet.
      const stats = wordStats[word];
      if (!stats.tweets[link]) {
        stats.count += 1;
        stats.tweets[link] = 1;
        stats.users[username] = 1;
      }
    });
  });

  // The 100 words that appear in the most tweets, most frequent first.
  const topWords = _.chain(wordStats)
    .sortBy(stats => -stats.count)
    .take(100)
    .value();

  // now find the words closely correlated with each other
  const filteredTweets = {}; // link -> tweet, for tweets containing a top word
  const wordsByTweets = {};  // link -> {word: 1} for the top words it contains
  _.each(topWords, word => {
    _.each(word.tweets, (flag, link) => {
      if (!wordsByTweets[link]) {
        wordsByTweets[link] = {};
      }
      wordsByTweets[link][word.text] = 1;
      filteredTweets[link] = tweetsByLink[link];
    });
  });

  // "wordA,wordB" (alphabetically sorted) -> {count, source, target, tweets}
  const correlationsByPair = {};
  _.each(wordsByTweets, (wordSet, link) => {
    const tweetWords = _.keys(wordSet);
    // Visit every unordered pair exactly once (i < j); each tweet is seen
    // once, so every visit is a new co-occurrence of that pair.
    for (let i = 0; i < tweetWords.length; i += 1) {
      for (let j = i + 1; j < tweetWords.length; j += 1) {
        const first = tweetWords[i];
        const second = tweetWords[j];
        const pairKey = [first, second].sort().join(',');
        if (!correlationsByPair[pairKey]) {
          correlationsByPair[pairKey] = {
            count: 0,
            source: first,
            target: second,
            tweets: {}
          };
        }
        correlationsByPair[pairKey].count += 1;
        correlationsByPair[pairKey].tweets[link] = 1;
      }
    }
  });

  // Keep pairs seen in more than one tweet, most frequent first.
  const correlations = _.chain(correlationsByPair)
    .filter(pair => pair.count > 1)
    .sortBy(pair => -pair.count)
    .value();

  // Render both lists.
  const left = d3.select('#left');
  const right = d3.select('#right');
  left.append('h1')
    .text('top 100 words');
  right.append('h1')
    .text('commonly correlated words');
  left.selectAll('div')
    .data(topWords)
    .enter().append('div')
    .text(d => d.text);
  right.selectAll('div')
    .data(correlations)
    .enter().append('div')
    .text(d => d.source + ',' + d.target);

  // Replace the {key: 1} sets with plain arrays of keys before serialising.
  _.each(topWords, word => {
    word.tweets = _.keys(word.tweets);
    word.users = _.keys(word.users);
  });
  _.each(correlations, pair => {
    pair.tweets = _.keys(pair.tweets);
  });
  _.each(wordsByTweets, (wordSet, link) => {
    wordsByTweets[link] = _.keys(wordSet);
  });

  // Only the word list is logged; the other exports are left here as
  // switchable alternatives.
  // console.log(JSON.stringify(filteredTweets));
  console.log(JSON.stringify(topWords));
  // console.log(JSON.stringify(correlations));
  // console.log(JSON.stringify(wordsByTweets));
  // console.log(_.size(filteredTweets));
});
|
</script> |
|
</body> |