# ANALYSIS OF YELP DATA
# This script tokenizes each review, removes stopwords, identifies
# the most common words, and quantifies the positivity and negativity
# associated with each word.

# Clear the workspace
rm(list = ls())
gc()

# Load packages
library(NLP)
library(magrittr)
library(tm)
library(openNLP)
library(RWeka)
library(RTextTools)
library(SnowballC)
library(reshape)

# Load the Data
loc <- '/Users/josiahdavis/Documents/GitHub/earl/'
dr <- read.csv(paste(loc, 'yelp_review.csv', sep=""))
str(dr)
dim(dr)
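# (Optional) To draw the same 5,000 reviews on every run, the RNG seed could
# be fixed before sampling below; the value 42 is an arbitrary example:
# set.seed(42)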
# Create a sample because POS tagging is computationally expensive
# Create a random set of indexes for sampling
idx <- sample(1:nrow(dr), 5000, replace=FALSE)

# Convert to a list of strings
texts <- dr[idx,]$text
texts <- lapply(texts, as.String)

# =====================================
# Filter the reviews to include only nouns.
# This section is modified from an excellent tutorial:
# http://rstudio-pubs-static.s3.amazonaws.com/34069_9ab9f30646474af89ba7849174cab6e9.html
# =====================================

# Define a function for performing the annotations
annotate_entities <- function(doc, annotation_pipeline) {
  annotations <- annotate(doc, annotation_pipeline)
  AnnotatedPlainTextDocument(doc, annotations)
}

# Define the types of annotations to perform
tagging_pipeline <- list(
  Maxent_Sent_Token_Annotator(),
  Maxent_Word_Token_Annotator(),
  Maxent_POS_Tag_Annotator()
)

# Annotate the texts (THIS STEP CAN TAKE A WHILE TO RUN)
texts_annotated <- texts %>% lapply(annotate_entities, tagging_pipeline)
str(texts_annotated[[1]], max.level = 2)

# Define the POS getter function: return the substrings of an annotated
# document whose POS tags are in `parts`
POSGetter <- function(doc, parts) {
  s <- doc$content
  a <- annotations(doc)[[1]]
  k <- sapply(a$features, `[[`, "POS")
  if(sum(k %in% parts) == 0){
    ""
  }else{
    s[a[k %in% parts]]
  }
}

# Identify the nouns (NN, NNS, NNP, and NNPS are the Penn Treebank noun tags)
nouns <- texts_annotated %>% lapply(POSGetter, parts = c("NN", "NNS", "NNP", "NNPS"))

# Turn each character vector into a single string
nouns <- nouns %>% lapply(as.String)
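# (Optional) Quick sanity check: compare the first sampled review with the
# nouns extracted from it
texts[[1]]
nouns[[1]]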
# =====================================
# Perform text mining transformations
# =====================================

# Convert to a data frame
d <- data.frame(reviews = as.character(nouns))

# Replace newline characters with spaces
d$reviews <- gsub("\n", " ", d$reviews)

# Convert the relevant data into a corpus object with the tm package
d <- Corpus(VectorSource(d$reviews))

# Convert everything to lower case
d <- tm_map(d, content_transformer(tolower))

# Remove stopwords
stopwords <- stopwords("english")
d <- tm_map(d, removeWords, stopwords)

# Strip whitespace
d <- tm_map(d, stripWhitespace)
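# (Optional) SnowballC is loaded above but not used; stemming could be applied
# here to collapse word variants (e.g. "tables" and "table") before building
# the document-term matrix:
# d <- tm_map(d, stemDocument)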
# Convert to a document-term matrix (rows are documents, columns are words)
dtm <- DocumentTermMatrix(d)
dim(dtm)
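# (Optional) The matrix is typically large and sparse; terms appearing in very
# few reviews could be dropped before converting to a dense matrix below.
# The 0.99 sparsity threshold is an arbitrary example value:
# dtm <- removeSparseTerms(dtm, 0.99)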
# Look up the most frequent terms
dtmd <- as.matrix(dtm)
freq <- colSums(dtmd)
ord <- order(freq, decreasing = TRUE)
freq[head(ord, 100)]
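# A similar lookup can also be done on the sparse matrix directly with tm's
# findFreqTerms(), avoiding the dense conversion; the lowfreq of 50 is an
# arbitrary example value:
# findFreqTerms(dtm, lowfreq = 50)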
# Create a data frame of the 200 most frequent words and their counts
words <- data.frame(counts = freq[head(ord, 200)])
words$words <- row.names(words)
row.names(words) <- 1:nrow(words)
head(words)

# Shuffle the rows randomly
idx <- sample(1:nrow(words), nrow(words), replace=FALSE)
words <- words[idx,]
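# The positivity/negativity scoring described at the top of this script is not
# computed in this section. A minimal sketch of one approach would be to join
# the word counts against a sentiment lexicon; the file name and its columns
# ("word", "score") are hypothetical:
# lex <- read.csv(paste(loc, "sentiment_lexicon.csv", sep=""))
# words <- merge(words, lex, by.x = "words", by.y = "word", all.x = TRUE)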
# Write to a csv file
writeLoc <- "/Users/josiahdavis/Documents/d3/words/"
write.csv(words, paste(writeLoc, "data.csv", sep=""), row.names=FALSE)