Skip to content

Instantly share code, notes, and snippets.

@melisabok
Last active October 6, 2015 23:51
Most informative n-grams by gender, based on Spanish tweets

This bubble plot shows the most informative features obtained by running a Bayesian classifier over 38,000 Spanish tweets.

  • The tweets are from 2013 and are located in Argentina.
  • Features are n-grams of length 1 to 6.
  • The plot shows the 100 most informative features of the NaiveBayes classifier:
classifier.show_most_informative_features(100)

More info about Naive Bayes

This is part of my Master's thesis, which tries to predict the gender of a tweet's author.

<!DOCTYPE html>
<meta charset="utf-8">
<style>
/* Font applied to every <text> element on the page (bubble labels and legend text). */
text {
font: 10px arial;
}
/* Fill for circles whose class is "Female"; the script assigns the class from each node's packageName. */
.Female {
fill: #fc8d59;
}
/* Fill for circles whose class is "Male"; same colour as the Male swatch drawn by addLegend(). */
.Male {
fill: #91bfdb;
}
/* Class given to the legend <g> created in addLegend().
   NOTE(review): padding, background and box-shadow presumably have no visible effect
   on an SVG group element - confirm whether these declarations are still needed. */
.legend {
padding: 5px;
font: 10px sans-serif;
background: yellow;
box-shadow: 2px 2px 1px #888;
}
</style>
<body>
<!-- Toolbar: "All" shows every feature in one pack, "By Gender" shows one pack per gender. -->
<div align='center'>
<button type="button" onclick="renderAll()" style="font-size:15px">All</button>
<button type="button" onclick="renderByGender()" style="font-size:15px">By Gender</button>
</div>
</body>
<!-- Loaded over HTTPS so the library is not blocked as mixed content when this page is served over HTTPS. -->
<script src="https://d3js.org/d3.v3.min.js"></script>
<script>
// Side length, in pixels, of the square area handed to the pack layouts.
var diameter = 485;

// Number formatter built from the ",d" specifier (not referenced elsewhere in this script).
var format = d3.format(",d");

// Laid-out node lists; they stay undefined until the JSON file has been loaded.
var all_nodes;
var female_nodes;
var male_nodes;

// Female nodes followed by male nodes; this is what the "By Gender" view renders.
var split_nodes = [];
// Pack layout used for the combined view (every feature in a single pack).
var bubble_all = d3.layout.pack()
    .sort(null)
    .size([diameter, diameter])
    .padding(1.5);

// Pack layout used once per gender for the split view.
var bubble = d3.layout.pack()
    .sort(null)
    .size([diameter, diameter])
    .padding(1.5);

// Drawing surface: twice the pack width so both gender packs fit side by side.
var svg = d3.select("body")
    .append("svg")
    .attr("width", diameter * 2)
    .attr("height", 787)
    .attr("class", "bubble");
// Loads the feature list, pre-computes the three layouts (all, female, male),
// then draws the combined view and the legend.
d3.json("most_informative_features.json", function(error, json) {
  if (error) throw error;

  // Everything that is not explicitly 'Female' is treated as male.
  var females = json.filter(function(feature) { return feature.gender === 'Female'; });
  var males = json.filter(function(feature) { return feature.gender !== 'Female'; });

  // Wraps `features` under a synthetic root, runs `pack` over the flattened
  // hierarchy and returns the resulting nodes without their first element
  // (the root, which must not be drawn).
  function layout(pack, features) {
    var nodes = pack.nodes(classes({text: "root", children: features}));
    nodes.splice(0, 1);
    return nodes;
  }

  all_nodes = layout(bubble_all, json);
  female_nodes = layout(bubble, females);
  male_nodes = layout(bubble, males);

  // The split view renders both gender layouts at once: females first, then males.
  split_nodes = female_nodes.concat(male_nodes);

  update(all_nodes, 'all');
  addLegend();
});
// Collects every leaf found under `root` (depth-first, in document order) and
// returns them as the children of a new single-level root. Each leaf is
// re-shaped to {packageName: gender, className: text, value: size}.
function classes(root) {
  var leaves = [];

  function visit(node) {
    // Anything without a truthy `children` property counts as a leaf.
    if (!node.children) {
      leaves.push({
        packageName: node.gender,
        className: node.text,
        value: node.size
      });
      return;
    }
    node.children.forEach(function(child) {
      visit(child);
    });
  }

  visit(root);
  return {children: leaves};
}
/**
 * Joins `nodes` to the ".node" groups (keyed by className) and moves them into
 * the layout selected by `mode`.
 *
 * @param {Array} nodes - laid-out nodes carrying x, y, r, value, className and packageName.
 * @param {string} mode - 'all' for the single combined pack; any other value
 *     places Female and Male nodes in two side-by-side packs.
 */
function update(nodes, mode) {
  // Horizontal offsets, in pixels, added to the x coordinate computed by the layout.
  var ALL_OFFSET = 243;
  var FEMALE_OFFSET = 1;
  var MALE_OFFSET = 485;

  // Shared by the update and enter selections so a node lands in the same
  // place whether it already existed or has just been created.
  function position(d) {
    var dx;
    if (mode === 'all') {
      dx = ALL_OFFSET;
    } else if (d.packageName === 'Female') {
      dx = FEMALE_OFFSET;
    } else {
      dx = MALE_OFFSET;
    }
    return "translate(" + (d.x + dx) + "," + d.y + ")";
  }

  // The label is cut to a length proportional to the radius.
  function label(d) {
    return d.className.substring(0, d.r / 3);
  }

  var node = svg.selectAll(".node")
      .data(nodes, function(d) { return d.className; });

  // UPDATE: animate existing groups and circles to their new geometry.
  node.transition()
      .duration(750)
      .attr("transform", position);

  node.select("circle")
      .transition()
      .duration(750)
      .attr("r", function(d) { return d.r; });

  // The radius differs between layouts, so the truncated label has to be
  // recomputed as well; otherwise it keeps the length from the previous view.
  node.select("text")
      .text(label);

  // INSERT: create a group (tooltip, circle, label) for every new node.
  var news = node.enter().append("g")
      .attr("class", "node")
      .attr("transform", position);

  news.append("title")
      .text(function(d) { return d.className + ": " + d.value; });

  news.append("circle")
      .attr("r", function(d) { return d.r; })
      .attr("class", function(d) { return d.packageName; });

  news.append("text")
      .attr("dy", ".3em")
      .style("text-anchor", "middle")
      .text(label);

  // REMOVE: drop groups whose node is no longer present.
  node.exit().remove();
}
// Click handler of the "By Gender" button: shows the two per-gender packs side by side.
function renderByGender() {
  var mode = 'split';
  update(split_nodes, mode);
}
// Click handler of the "All" button: shows every feature in one combined pack.
function renderAll() {
  var mode = 'all';
  update(all_nodes, mode);
}
// Appends the colour key to the svg: one swatch plus one caption per gender,
// stacked vertically inside a single "legend" group.
function addLegend() {
  // `top` is the y coordinate of the swatch; the caption sits 10px lower.
  var entries = [
    {label: 'Female', color: '#fc8d59', top: 25},
    {label: 'Male', color: '#91bfdb', top: 50}
  ];

  var legend = svg.append("g")
      .attr("class", "legend")
      .attr("x", 0)
      .attr("y", 25)
      .attr("height", 100)
      .attr("width", 100);

  entries.forEach(function(entry) {
    legend.append("rect")
        .attr("x", 2)
        .attr("y", entry.top)
        .attr("width", 20)
        .attr("height", 20)
        .style("fill", entry.color);

    legend.append("text")
        .attr("x", 27)
        .attr("y", entry.top + 10)
        .attr("height", 30)
        .attr("width", 100)
        .style("fill", entry.color)
        .text(entry.label);
  });
}
// Sets the CSS height of self.frameElement (presumably the frame embedding this page) to `diameter` pixels.
// NOTE(review): the <svg> above is created 787px tall, which is more than `diameter` - confirm the frame does not clip it.
d3.select(self.frameElement).style("height", diameter + "px");
</script>
[
{
"text":"cansada",
"size":19.48,
"gender":"Female"
},
{
"text":"passarella",
"size":17.23,
"gender":"Male"
},
{
"text":"conocerlo",
"size":15.81,
"gender":"Female"
},
{
"text":"que grande",
"size":14.21,
"gender":"Male"
},
{
"text":"nosotras",
"size":12.87,
"gender":"Female"
},
{
"text":"ramon",
"size":11.29,
"gender":"Male"
},
{
"text":"con las chicas",
"size":10.66,
"gender":"Female"
},
{
"text":"los jugadores",
"size":9.97,
"gender":"Male"
},
{
"text":"laburo",
"size":9.37,
"gender":"Male"
},
{
"text":"jugadores",
"size":8.77,
"gender":"Male"
},
{
"text":"fabbro",
"size":8.77,
"gender":"Male"
},
{
"text":"me da miedo",
"size":8.45,
"gender":"Female"
},
{
"text":"mi familia",
"size":8.45,
"gender":"Female"
},
{
"text":"muy feliz",
"size":8.45,
"gender":"Female"
},
{
"text":"mejor amiga",
"size":8.45,
"gender":"Female"
},
{
"text":"aw",
"size":8.45,
"gender":"Female"
},
{
"text":"es imposible",
"size":8.45,
"gender":"Female"
},
{
"text":"abrazarlo",
"size":8.35,
"gender":"Female"
},
{
"text":"informacion",
"size":8.16,
"gender":"Male"
},
{
"text":"kranevitter",
"size":8.16,
"gender":"Male"
},
{
"text":"el twitter",
"size":8.16,
"gender":"Male"
},
{
"text":"afip",
"size":8.16,
"gender":"Male"
},
{
"text":"lo amo",
"size":7.92,
"gender":"Female"
},
{
"text":"se nada",
"size":7.72,
"gender":"Female"
},
{
"text":"la foto que",
"size":7.72,
"gender":"Female"
},
{
"text":"mi cabeza",
"size":7.72,
"gender":"Female"
},
{
"text":"los pies",
"size":7.72,
"gender":"Female"
},
{
"text":"ansias",
"size":7.72,
"gender":"Female"
},
{
"text":"cumplir",
"size":7.72,
"gender":"Female"
},
{
"text":"tan lindo",
"size":7.72,
"gender":"Female"
},
{
"text":"a ti",
"size":7.72,
"gender":"Female"
},
{
"text":"tapa",
"size":7.56,
"gender":"Male"
},
{
"text":"gol de",
"size":7.56,
"gender":"Male"
},
{
"text":"bien el",
"size":7.56,
"gender":"Male"
},
{
"text":"vamos river",
"size":7.07,
"gender":"Male"
},
{
"text":"monumental",
"size":7.07,
"gender":"Male"
},
{
"text":"juve",
"size":7.07,
"gender":"Male"
},
{
"text":"concierto",
"size":6.98,
"gender":"Female"
},
{
"text":"aprendi",
"size":6.98,
"gender":"Female"
},
{
"text":"amamos",
"size":6.98,
"gender":"Female"
},
{
"text":"ntvg",
"size":6.98,
"gender":"Female"
},
{
"text":"vieja no",
"size":6.98,
"gender":"Female"
},
{
"text":"que haria",
"size":6.98,
"gender":"Female"
},
{
"text":"tanto que",
"size":6.98,
"gender":"Female"
},
{
"text":"hace mucho no",
"size":6.98,
"gender":"Female"
},
{
"text":"bronca me",
"size":6.98,
"gender":"Female"
},
{
"text":"el amigo",
"size":6.98,
"gender":"Female"
},
{
"text":"twittear",
"size":6.98,
"gender":"Female"
},
{
"text":"tu amiga",
"size":6.98,
"gender":"Female"
},
{
"text":"la afip",
"size":6.95,
"gender":"Male"
},
{
"text":"el monumental",
"size":6.95,
"gender":"Male"
},
{
"text":"dictadura",
"size":6.95,
"gender":"Male"
},
{
"text":"click",
"size":6.95,
"gender":"Male"
},
{
"text":"mi sueno",
"size":6.87,
"gender":"Female"
},
{
"text":"mi papa",
"size":6.74,
"gender":"Female"
},
{
"text":"presenta",
"size":6.71,
"gender":"Male"
},
{
"text":"lanus",
"size":6.63,
"gender":"Male"
},
{
"text":"toda mi",
"size":6.4,
"gender":"Female"
},
{
"text":"marcha",
"size":6.35,
"gender":"Male"
},
{
"text":"de esto",
"size":6.35,
"gender":"Male"
},
{
"text":"no vas a",
"size":6.35,
"gender":"Male"
},
{
"text":"votar",
"size":6.35,
"gender":"Male"
},
{
"text":"a ramon",
"size":6.35,
"gender":"Male"
},
{
"text":"y me la",
"size":6.35,
"gender":"Male"
},
{
"text":"la champions",
"size":6.35,
"gender":"Male"
},
{
"text":"a jugar al",
"size":6.35,
"gender":"Male"
},
{
"text":"se presenta",
"size":6.35,
"gender":"Male"
},
{
"text":"menseguez",
"size":6.35,
"gender":"Male"
},
{
"text":"iphone",
"size":6.35,
"gender":"Male"
},
{
"text":"las elecciones",
"size":6.35,
"gender":"Male"
},
{
"text":"llegado",
"size":6.35,
"gender":"Male"
},
{
"text":"odio el",
"size":6.25,
"gender":"Female"
},
{
"text":"ojala te",
"size":6.25,
"gender":"Female"
},
{
"text":"tendre",
"size":6.25,
"gender":"Female"
},
{
"text":"sol y",
"size":6.25,
"gender":"Female"
},
{
"text":"me quiero ir a",
"size":6.25,
"gender":"Female"
},
{
"text":"cosa que",
"size":6.25,
"gender":"Female"
},
{
"text":"manana va",
"size":6.25,
"gender":"Female"
},
{
"text":"canciones de",
"size":6.25,
"gender":"Female"
},
{
"text":"voy a quedar",
"size":6.25,
"gender":"Female"
},
{
"text":"q ser",
"size":6.25,
"gender":"Female"
},
{
"text":"el telefono",
"size":6.25,
"gender":"Female"
},
{
"text":"grax",
"size":6.25,
"gender":"Female"
},
{
"text":"say",
"size":6.25,
"gender":"Female"
},
{
"text":"la profe de",
"size":6.25,
"gender":"Female"
},
{
"text":"amo el",
"size":6.25,
"gender":"Female"
},
{
"text":"a mi idolo",
"size":6.25,
"gender":"Female"
},
{
"text":"cerebro",
"size":6.25,
"gender":"Female"
},
{
"text":"la integradora de",
"size":6.25,
"gender":"Female"
},
{
"text":"respuestas",
"size":6.25,
"gender":"Female"
},
{
"text":"me dejan",
"size":6.25,
"gender":"Female"
},
{
"text":"prohibido",
"size":6.25,
"gender":"Female"
},
{
"text":"la revista",
"size":6.25,
"gender":"Female"
},
{
"text":"que estudiar para",
"size":6.25,
"gender":"Female"
},
{
"text":"soy lo",
"size":6.25,
"gender":"Female"
},
{
"text":"no se por",
"size":6.25,
"gender":"Female"
},
{
"text":"madrid",
"size":6.09,
"gender":"Male"
},
{
"text":"niembro",
"size":5.98,
"gender":"Male"
},
{
"text":"barca",
"size":5.98,
"gender":"Male"
},
{
"text":"champions",
"size":5.98,
"gender":"Male"
}
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment