# ANALYSIS OF YELP DATA
# This script tokenizes each review, removes stopwords, identifies
# the most common words, and quantifies the positivity and negativity
# associated with each word.

# Clear the workspace
rm(list = ls())
gc()

# Load packages
library(NLP)
library(magrittr)
library(tm)
library(openNLP)
library(RWeka)
library(RTextTools)
library(SnowballC)
library(reshape)

# Load the Data
loc <- '/Users/josiahdavis/Documents/GitHub/earl/'
dr <- read.csv(paste(loc, 'yelp_review.csv', sep=""))
str(dr)
dim(dr)
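# (Optional) To draw the same 5,000 reviews on every run, the RNG seed could
# be fixed before sampling below; the value 42 is an arbitrary example:
# set.seed(42)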
# Create a sample because POS tagging is computationally expensive
# Create a random set of indexes for sampling
idx <- sample(1:nrow(dr), 5000, replace=FALSE)

# Convert to a list of strings
texts <- dr[idx,]$text
texts <- lapply(texts, as.String)

# =====================================
# Filter the reviews to include only nouns.
# This section is modified from an excellent tutorial:
# http://rstudio-pubs-static.s3.amazonaws.com/34069_9ab9f30646474af89ba7849174cab6e9.html
# =====================================

# Define a function for performing the annotations
annotate_entities <- function(doc, annotation_pipeline) {
  annotations <- annotate(doc, annotation_pipeline)
  AnnotatedPlainTextDocument(doc, annotations)
}

# Define the types of annotations to perform
tagging_pipeline <- list(
  Maxent_Sent_Token_Annotator(),
  Maxent_Word_Token_Annotator(),
  Maxent_POS_Tag_Annotator()
)

# Annotate the texts (THIS STEP CAN TAKE A WHILE TO RUN)
texts_annotated <- texts %>% lapply(annotate_entities, tagging_pipeline)
str(texts_annotated[[1]], max.level = 2)

# Define the POS getter function: return the substrings of an annotated
# document whose POS tags are in `parts`
POSGetter <- function(doc, parts) {
  s <- doc$content
  a <- annotations(doc)[[1]]
  k <- sapply(a$features, `[[`, "POS")
  if(sum(k %in% parts) == 0){
    ""
  }else{
    s[a[k %in% parts]]
  }
}

# Identify the nouns (NN, NNS, NNP, and NNPS are the Penn Treebank noun tags)
nouns <- texts_annotated %>% lapply(POSGetter, parts = c("NN", "NNS", "NNP", "NNPS"))

# Turn each character vector into a single string
nouns <- nouns %>% lapply(as.String)
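# (Optional) Quick sanity check: compare the first sampled review with the
# nouns extracted from it
texts[[1]]
nouns[[1]]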
# =====================================
# Perform text mining transformations
# =====================================

# Convert to a data frame
d <- data.frame(reviews = as.character(nouns))

# Replace newline characters with spaces
d$reviews <- gsub("\n", " ", d$reviews)

# Convert the relevant data into a corpus object with the tm package
d <- Corpus(VectorSource(d$reviews))

# Convert everything to lower case
d <- tm_map(d, content_transformer(tolower))

# Remove stopwords
stopwords <- stopwords("english")
d <- tm_map(d, removeWords, stopwords)

# Strip whitespace
d <- tm_map(d, stripWhitespace)
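# (Optional) SnowballC is loaded above but not used; stemming could be applied
# here to collapse word variants (e.g. "tables" and "table") before building
# the document-term matrix:
# d <- tm_map(d, stemDocument)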
# Convert to a document-term matrix (rows are documents, columns are words)
dtm <- DocumentTermMatrix(d)
dim(dtm)
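# (Optional) The matrix is typically large and sparse; terms appearing in very
# few reviews could be dropped before converting to a dense matrix below.
# The 0.99 sparsity threshold is an arbitrary example value:
# dtm <- removeSparseTerms(dtm, 0.99)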
# Look up the most frequent terms
dtmd <- as.matrix(dtm)
freq <- colSums(dtmd)
ord <- order(freq, decreasing = TRUE)
freq[head(ord, 100)]
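# A similar lookup can also be done on the sparse matrix directly with tm's
# findFreqTerms(), avoiding the dense conversion; the lowfreq of 50 is an
# arbitrary example value:
# findFreqTerms(dtm, lowfreq = 50)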
# Create a data frame of the 200 most frequent words and their counts
words <- data.frame(counts = freq[head(ord, 200)])
words$words <- row.names(words)
row.names(words) <- 1:nrow(words)
head(words)

# Shuffle the rows randomly
idx <- sample(1:nrow(words), nrow(words), replace=FALSE)
words <- words[idx,]
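# The positivity/negativity scoring described at the top of this script is not
# computed in this section. A minimal sketch of one approach would be to join
# the word counts against a sentiment lexicon; the file name and its columns
# ("word", "score") are hypothetical:
# lex <- read.csv(paste(loc, "sentiment_lexicon.csv", sep=""))
# words <- merge(words, lex, by.x = "words", by.y = "word", all.x = TRUE)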
# Write to a csv file
writeLoc <- "/Users/josiahdavis/Documents/d3/words/"
write.csv(words, paste(writeLoc, "data.csv", sep=""), row.names=FALSE)