library(pdftools)
library(tidyverse)
library(tidytext)
library(tm)
# ============================================================
# Part 1: Recreating PDF Table
# ============================================================
pdf_url <- "https://www.bea.gov/system/files/2018-12/pi1118_hist.pdf"
# Read the PDF into R using pdf_text() from the pdftools package
# import <- pdf_text("personal_income_11.2018.pdf")
import <- pdf_text(pdf_url)
# Most PDF data, when read into R, is very messy and requires
# a significant amount of cleanup
head(import[1])
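# Optional inspection step (an addition, not in the original script):
# each element of `import` is one page stored as a single long string,
# so cat() prints it with the embedded line breaks and spacing intact,
# which makes the page layout easier to see than head() alone
cat(substr(import[1], 1, 1000))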
# For this example we will focus on recreating the table on
# page 3
pg3_raw <- import[3]
# New lines are designated by \r\n, so we'll begin the cleanup
# by splitting on those line breaks
pg3_linesep <- str_split(pg3_raw, "\r\n")
# str_split() creates a list. Removing the list structure
# makes cleanup easier
pg3 <- unlist(pg3_linesep)
# Let's see what we have now
head(pg3, 10)
tail(pg3, 10)
length(pg3)
# There is text above and below the table that is not needed
# to recreate the table, so let's remove it
pg3_rm_headerfooter <- pg3[5:63]
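# The hard-coded indices 5:63 are tied to this particular PDF release.
# As a hedged alternative (an addition, not in the original script),
# the table rows could be located by matching lines that begin with a
# four-digit year, which is more robust to small layout changes
year_rows <- grep("^\\s*(19|20)\\d{2}", pg3)
# pg3_rm_headerfooter <- pg3[year_rows]   # would replace the manual subset above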
# There is extra leading whitespace before the first column
# that needs to be removed for the table to format correctly
pg3_removespace1 <- sub("\\s+", "", pg3_rm_headerfooter)
# With the excess first-column spacing removed, we are
# going to use the remaining spaces to our advantage
# by replacing them with a delimiter to identify our column
# stops
pg3_create_cols <- gsub("\\s+", "|", pg3_removespace1)
# With our delimiter set, we will create a table in R
convert <- textConnection(pg3_create_cols)
pg3_cleantable <- read.csv(convert, sep = "|", header = FALSE)
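# Since the tidyverse is already loaded, an equivalent hedged sketch
# (an addition, not in the original script) reads the delimited lines
# directly with readr instead of a text connection plus read.csv()
pg3_tbl <- readr::read_delim(paste(pg3_create_cols, collapse = "\n"),
                             delim = "|", col_names = FALSE)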
# Finally, we'll add headers to the table
names(pg3_cleantable) <- c("year", "pi.bil", "pi.pct.chg", "dpi.bil",
                           "dpi.pct.chg", "dpi.real.chain.2012", "dpi.real.pct.chg",
                           "pce.bil", "pce.pct.chg", "pce.real.chain.2012",
                           "pce.real.pct.chg", "ps.bil", "ps.save.rate")
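# If the dollar columns come through as character or factor because the
# PDF values contain thousands separators (an assumption about this table,
# not something verified in the original script), readr::parse_number()
# can strip the commas and convert them to numeric:
# pg3_cleantable[-1] <- lapply(pg3_cleantable[-1],
#                              function(x) readr::parse_number(as.character(x)))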
# Final table
View(pg3_cleantable)
summary(pg3_cleantable)
ggplot(pg3_cleantable, aes(x = year, y = pce.pct.chg)) +
  geom_line()
# ============================================================
# Part 2: Extracting Textual Data from PDF
# ============================================================
coke_npa_url <- "http://investor.cokeconsolidated.com/static-files/0ca75582-1fc2-4f99-b4ef-a38dbba0f133"
# Read the PDF into R using pdf_text() from the pdftools package
# raw_npa <- pdf_text("coke_npa_03.2018.pdf")
raw_npa <- pdf_text(coke_npa_url)
# Each PDF is different, so it's recommended to inspect how R is
# interpreting it in order to make the cleanup easier
raw_npa[1]
# In this example we are not going to subset pages since we want to
# analyze the text of the entire document, and as we saw, lines
# are broken up using the same pattern as in the first PDF
npa_linesep <- str_split(raw_npa, "\r\n")
# Again, we are going to unlist the data to make it easier to work with
npa <- unlist(npa_linesep)
# ************************************************************
# Text mining the PDF using tidytext
# ************************************************************
# Create a data frame where each line of text is represented by a row
npa_df <- as.data.frame(npa)
# The data frame only has one column, which we'll name text
names(npa_df) <- c("text")
# Convert the text column to a character vector
npa_df$text <- as.character(npa_df$text)
# Tokenize the data frame into one word per row
npa_words <- unnest_tokens(npa_df, word, text)
# Remove stop words
npa_nostop <- anti_join(npa_words, stop_words)
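# Quick sanity check (an addition, not in the original script):
# the most frequent remaining words after stop-word removal
npa_nostop %>%
  count(word, sort = TRUE) %>%
  head(10)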
# Add word sentiment to the data frame and count the word sentiments
npa_sentiment <- npa_nostop %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = T)
# Display final data
npa_sentiment %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip()
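# A rough overall tone measure (an added sketch, not in the original
# script): total positive vs. negative word counts across the document
npa_sentiment %>%
  count(sentiment, wt = n)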
# ************************************************************
# Text mining the PDF using tm
# ************************************************************
# Create a corpus from the npa vector
npa_vec <- VectorSource(npa)
doc_corp <- Corpus(npa_vec)
# Make all letters lower case, delete excess spacing, and
# remove stop words
lowercase <- tm_map(doc_corp, content_transformer(tolower))
nospace <- tm_map(lowercase, stripWhitespace)
nostops <- tm_map(nospace, removeWords, c(stopwords("en")))
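# Depending on the document, punctuation and digits may also dominate the
# term counts; as an optional extra step (an addition, not in the original
# script), tm's removePunctuation and removeNumbers transformations strip them:
# nopunct <- tm_map(nostops, removePunctuation)
# nonum <- tm_map(nopunct, removeNumbers)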
# Create a term-document matrix
npa_tdm <- TermDocumentMatrix(nostops)
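# Optional check (an addition, not in the original script): tm's
# findFreqTerms() lists terms above a frequency threshold straight from
# the term-document matrix, before converting it to a dense matrix
# (the threshold of 20 is arbitrary)
findFreqTerms(npa_tdm, lowfreq = 20)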
npa_m <- as.matrix(npa_tdm)
# Find the highest-frequency words
term_freq <- rowSums(npa_m)
term_freq <- sort(term_freq, decreasing = T)
# Word frequency plot of the top 10 terms
barplot(term_freq[1:10])