library(pdftools)
library(tidyverse)
library(tidytext)
library(tm)
# ============================================================
# Part 1: Recreating PDF Table
# ============================================================
pdf_url <- "https://www.bea.gov/system/files/2018-12/pi1118_hist.pdf"
# Read the PDF into R using pdf_text() from pdftools package
# import <- pdf_text("personal_income_11.2018.pdf")
import <- pdf_text(pdf_url)
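# pdf_text() returns a character vector with one element per page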
# Most PDF data, when read into R, is very messy and requires a
# significant amount of cleanup
head(import[1])
# For this example we will focus on recreating the table on
# page 3
pg3_raw <- import[3]
# New lines are designated by \r\n, so we'll begin the cleanup
# by splitting on those line breaks
pg3_linesep <- str_split(pg3_raw, "\r\n")
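# Note: line endings can differ by platform and by PDF; some documents
# use "\n" only. A more defensive split (an assumption, not verified
# against this particular PDF) would be:
# pg3_linesep <- str_split(pg3_raw, "\r\n|\n")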
# str_split() returns a list. Removing the list structure
# makes cleanup easier
pg3 <- unlist(pg3_linesep)
# Let's see what we have now
head(pg3, 10)
tail(pg3, 10)
length(pg3)
# There is text above and below the table that is not needed
# to recreate the table, so let's remove it
pg3_rm_headerfooter <- pg3[5:63]
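# As a sketch of an alternative to hard-coding the row indices above
# (assuming every data row begins with a 4-digit year), the table rows
# could be located programmatically instead:
# pg3_rm_headerfooter <- pg3[grep("^\\s*\\d{4}", pg3)]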
# There is extra spacing before where our first column will be
# that needs to be removed in order for the table to
# format correctly
pg3_removespace1 <- sub("\\s+", "", pg3_rm_headerfooter)
# With the excess first-column spacing removed, we are
# going to use the remaining spaces to our advantage
# by replacing them with a delimiter to mark our column
# stops
pg3_create_cols <- gsub("\\s+", "|", pg3_removespace1)
# With our delimiter set we will create a table in R
convert <- textConnection(pg3_create_cols)
pg3_cleantable <- read.csv(convert, sep = "|", header = FALSE)
# Finally, we'll add headers to the table
names(pg3_cleantable) <- c("year", "pi.bil", "pi.pct.chg", "dpi.bil", "dpi.pct.chg", "dpi.real.chain.2012", "dpi.real.pct.chg", "pce.bil", "pce.pct.chg", "pce.real.chain.2012", "pce.real.pct.chg", "ps.bil", "ps.save.rate")
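# The published table formats dollar amounts with thousands separators
# (e.g. "17,576.0"), so read.csv() may import those columns as character
# or factor. A minimal sketch to strip the commas and convert every
# column except year to numeric (assumes all non-year columns should
# be numeric):
# pg3_cleantable[-1] <- lapply(pg3_cleantable[-1],
#                              function(x) as.numeric(gsub(",", "", x)))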
# Final table
View(pg3_cleantable)
summary(pg3_cleantable)
ggplot(pg3_cleantable, aes(x = year, y = pce.pct.chg)) +
  geom_line()
# ============================================================
# Part 2: Extracting Textual Data from PDF
# ============================================================
coke_npa_url <- "http://investor.cokeconsolidated.com/static-files/0ca75582-1fc2-4f99-b4ef-a38dbba0f133"
# Read the PDF into R using pdf_text() from pdftools package
# raw_npa <- pdf_text("coke_npa_03.2018.pdf")
raw_npa <- pdf_text(coke_npa_url)
# Each PDF is different, so it's recommended to inspect how R is
# interpreting it in order to make the cleanup easier
raw_npa[1]
# In this example we are not going to subset pages since we want to
# analyze the text of the entire document, and as we saw, lines
# are broken up using the same pattern as in the first PDF
npa_linesep <- str_split(raw_npa, "\r\n")
# Again, we are going to unlist the data to make it easier to work with
npa <- unlist(npa_linesep)
# ************************************************************
# Text mining the PDF using tidytext
# ************************************************************
# Create a data frame where each line of text is represented by a row
npa_df <- as.data.frame(npa)
# The data frame only has 1 column, which we'll name text
names(npa_df) <- c("text")
# Convert the text column to a character vector
npa_df$text <- as.character(npa_df$text)
# Tokenize the data frame so each row is a single word
npa_words <- unnest_tokens(npa_df, word, text)
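# unnest_tokens() lowercases the text and strips punctuation by default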
# Remove stop words
npa_nostop <- anti_join(npa_words, stop_words)
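# Financial filings contain many numeric tokens; a sketch (an optional
# filter, not part of the original analysis) to drop them before the
# sentiment join:
# npa_nostop <- filter(npa_nostop, !str_detect(word, "^[0-9.,]+$"))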
# Add word sentiments to the data frame and count words by sentiment
npa_sentiment <- npa_nostop %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
# Display final data
npa_sentiment %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip()
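# A quick overall tally of positive vs. negative word counts
# (a simple summary sketch using the data frame built above)
npa_sentiment %>%
  group_by(sentiment) %>%
  summarise(total = sum(n))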
# ************************************************************
# Text mining the PDF using tm
# ************************************************************
# Create corpus from npa vector
npa_vec <- VectorSource(npa)
doc_corp <- Corpus(npa_vec)
# Make all letters lower case, delete excess spacing, and
# remove stop words
lowercase <- tm_map(doc_corp, content_transformer(tolower))
nospace <- tm_map(lowercase, stripWhitespace)
nostops <- tm_map(nospace, removeWords, c(stopwords("en")))
# Create a term-document matrix
npa_tdm <- TermDocumentMatrix(nostops)
npa_m <- as.matrix(npa_tdm)
# Find highest frequency words
term_freq <- rowSums(npa_m)
term_freq <- sort(term_freq, decreasing = TRUE)
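# tm also provides findFreqTerms() to list terms that appear at least a
# given number of times (the threshold of 25 here is an arbitrary example)
findFreqTerms(npa_tdm, lowfreq = 25)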
# Word frequency plot
barplot(term_freq[1:10])