library(pdftools)
library(tidyverse)
library(tidytext)
library(tm)
# ============================================================
# Part 1: Recreating PDF Table
# ============================================================
pdf_url <- "https://www.bea.gov/system/files/2018-12/pi1118_hist.pdf"
# Read the PDF into R using pdf_text() from the pdftools package
# import <- pdf_text("personal_income_11.2018.pdf")
import <- pdf_text(pdf_url)
# Most PDF data, when read into R, is very messy and requires
# a significant amount of cleanup
head(import[1])
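# Optional inspection step (an addition, not in the original script):
# each element of `import` is one page stored as a single long string,
# so cat() prints it with the embedded line breaks and spacing intact,
# which makes the page layout easier to see than head() alone
cat(substr(import[1], 1, 1000))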
# For this example we will focus on recreating the table on
# page 3
pg3_raw <- import[3]
# New lines are designated by \r\n, so we'll begin the cleanup
# by splitting on those line breaks
pg3_linesep <- str_split(pg3_raw, "\r\n")
# str_split() creates a list. Removing the list structure
# makes cleanup easier
pg3 <- unlist(pg3_linesep)
# Let's see what we have now
head(pg3, 10)
tail(pg3, 10)
length(pg3)
# There is text above and below the table that is not needed
# to recreate the table, so let's remove it
pg3_rm_headerfooter <- pg3[5:63]
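# The hard-coded indices 5:63 are tied to this particular PDF release.
# As a hedged alternative (an addition, not in the original script),
# the table rows could be located by matching lines that begin with a
# four-digit year, which is more robust to small layout changes
year_rows <- grep("^\\s*(19|20)\\d{2}", pg3)
# pg3_rm_headerfooter <- pg3[year_rows]   # would replace the manual subset above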
# There is extra leading whitespace before the first column
# that needs to be removed for the table to format correctly
pg3_removespace1 <- sub("\\s+", "", pg3_rm_headerfooter)
# With the excess first-column spacing removed, we are
# going to use the remaining spaces to our advantage
# by replacing them with a delimiter to identify our column
# stops
pg3_create_cols <- gsub("\\s+", "|", pg3_removespace1)
# With our delimiter set, we will create a table in R
convert <- textConnection(pg3_create_cols)
pg3_cleantable <- read.csv(convert, sep = "|", header = FALSE)
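# Since the tidyverse is already loaded, an equivalent hedged sketch
# (an addition, not in the original script) reads the delimited lines
# directly with readr instead of a text connection plus read.csv()
pg3_tbl <- readr::read_delim(paste(pg3_create_cols, collapse = "\n"),
                             delim = "|", col_names = FALSE)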
# Finally, we'll add headers to the table
names(pg3_cleantable) <- c("year", "pi.bil", "pi.pct.chg", "dpi.bil",
                           "dpi.pct.chg", "dpi.real.chain.2012", "dpi.real.pct.chg",
                           "pce.bil", "pce.pct.chg", "pce.real.chain.2012",
                           "pce.real.pct.chg", "ps.bil", "ps.save.rate")
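# If the dollar columns come through as character or factor because the
# PDF values contain thousands separators (an assumption about this table,
# not something verified in the original script), readr::parse_number()
# can strip the commas and convert them to numeric:
# pg3_cleantable[-1] <- lapply(pg3_cleantable[-1],
#                              function(x) readr::parse_number(as.character(x)))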
# Final table
View(pg3_cleantable)
summary(pg3_cleantable)
ggplot(pg3_cleantable, aes(x = year, y = pce.pct.chg)) +
  geom_line()
# ============================================================
# Part 2: Extracting Textual Data from PDF
# ============================================================
coke_npa_url <- "http://investor.cokeconsolidated.com/static-files/0ca75582-1fc2-4f99-b4ef-a38dbba0f133"
# Read the PDF into R using pdf_text() from the pdftools package
# raw_npa <- pdf_text("coke_npa_03.2018.pdf")
raw_npa <- pdf_text(coke_npa_url)
# Each PDF is different, so it's recommended to inspect how R is
# interpreting it in order to make the cleanup easier
raw_npa[1]
# In this example we are not going to subset pages since we want to
# analyze the text of the entire document, and as we saw, lines
# are broken up using the same pattern as in the first PDF
npa_linesep <- str_split(raw_npa, "\r\n")
# Again, we are going to unlist the data to make it easier to work with
npa <- unlist(npa_linesep)
# ************************************************************
# Text mining the PDF using tidytext
# ************************************************************
# Create a data frame where each line of text is represented by a row
npa_df <- as.data.frame(npa)
# The data frame only has one column, which we'll name text
names(npa_df) <- c("text")
# Convert the text column to a character vector
npa_df$text <- as.character(npa_df$text)
# Tokenize the data frame into one word per row
npa_words <- unnest_tokens(npa_df, word, text)
# Remove stop words
npa_nostop <- anti_join(npa_words, stop_words)
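# Quick sanity check (an addition, not in the original script):
# the most frequent remaining words after stop-word removal
npa_nostop %>%
  count(word, sort = TRUE) %>%
  head(10)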
# Add word sentiment to the data frame and count the word sentiments
npa_sentiment <- npa_nostop %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = T)
# Display final data
npa_sentiment %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip()
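# A rough overall tone measure (an added sketch, not in the original
# script): total positive vs. negative word counts across the document
npa_sentiment %>%
  count(sentiment, wt = n)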
# ************************************************************
# Text mining the PDF using tm
# ************************************************************
# Create a corpus from the npa vector
npa_vec <- VectorSource(npa)
doc_corp <- Corpus(npa_vec)
# Make all letters lower case, delete excess spacing, and
# remove stop words
lowercase <- tm_map(doc_corp, content_transformer(tolower))
nospace <- tm_map(lowercase, stripWhitespace)
nostops <- tm_map(nospace, removeWords, c(stopwords("en")))
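# Depending on the document, punctuation and digits may also dominate the
# term counts; as an optional extra step (an addition, not in the original
# script), tm's removePunctuation and removeNumbers transformations strip them:
# nopunct <- tm_map(nostops, removePunctuation)
# nonum <- tm_map(nopunct, removeNumbers)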
# Create a term-document matrix
npa_tdm <- TermDocumentMatrix(nostops)
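# Optional check (an addition, not in the original script): tm's
# findFreqTerms() lists terms above a frequency threshold straight from
# the term-document matrix, before converting it to a dense matrix
# (the threshold of 20 is arbitrary)
findFreqTerms(npa_tdm, lowfreq = 20)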
npa_m <- as.matrix(npa_tdm)
# Find the highest-frequency words
term_freq <- rowSums(npa_m)
term_freq <- sort(term_freq, decreasing = T)
# Word frequency plot of the top 10 terms
barplot(term_freq[1:10])