Skip to content

Instantly share code, notes, and snippets.

@richshaw
Created August 11, 2016 06:09
Show Gist options
  • Save richshaw/6baad5b180540e7eb96057e98a425502 to your computer and use it in GitHub Desktop.
Save richshaw/6baad5b180540e7eb96057e98a425502 to your computer and use it in GitHub Desktop.
Get website metadata
library("rvest", lib.loc="/usr/local/lib/R/3.3/site-library")
#' Collect basic metadata for a list of websites
#'
#' Downloads the page for every entry in `sites` and extracts the document
#' language, site name, description, Open Graph type and Open Graph locale.
#' Each field is looked up in several places, in order of preference; the
#' first one that is present wins. Sites that cannot be read are reported on
#' the console and left out of the result.
#'
#' @param sites Character vector of hosts or URLs. Entries that do not
#'   already start with a scheme (`http://`, `https://`, ...) are fetched
#'   over `http://`.
#' @return A data frame with one row per successfully read site and the
#'   character columns `url`, `lang`, `name`, `description`, `type` and
#'   `locale`. Fields a page does not provide are `NA`. When no site could
#'   be read the data frame has zero rows but still has all six columns.
getMetaData <- function (sites) {
  # First non-NA value among the candidates, or NA when all are missing.
  first_present <- function(...) {
    candidates <- as.character(c(...))
    candidates <- candidates[!is.na(candidates)]
    if (length(candidates) == 0) {
      return(NA_character_)
    }
    candidates[[1]]
  }

  # `content` attribute of the first node matching a CSS selector (NA if the
  # node or the attribute is absent).
  meta_content <- function(dom, selector) {
    html_attr(html_node(dom, selector), "content")
  }

  # Fetch one site and return its metadata as a named character vector, or
  # NULL when the page could not be read.
  read_site <- function(site) {
    cat("Reading: ", site, "\n")

    # Only prepend a scheme when the entry does not already carry one.
    has_scheme <- grepl("^[A-Za-z][A-Za-z0-9+.-]*://", site)
    address <- if (has_scheme) site else paste0("http://", site)

    dom <- tryCatch(
      read_html(address),
      error = function(e) {
        cat("ERROR: Couldn't read ", site, "\n")
        NULL
      }
    )
    if (is.null(dom)) {
      return(NULL)
    }

    c(
      url = site,
      lang = first_present(
        html_attr(dom, "lang"),
        html_attr(dom, "xml:lang"),
        meta_content(dom, "meta[name='DC.Language']")
      ),
      # `og:site_name` is the Open Graph property for the site's name.
      name = first_present(
        meta_content(dom, "meta[property='og:site_name']"),
        meta_content(dom, "meta[name='DC.Publisher']"),
        html_text(html_node(dom, "title"))
      ),
      description = first_present(
        meta_content(dom, "meta[name=description]"),
        meta_content(dom, "meta[property='og:description']")
      ),
      type = first_present(meta_content(dom, "meta[property='og:type']")),
      locale = first_present(meta_content(dom, "meta[property='og:locale']"))
    )
  }

  # as.character() so factors iterate over their labels, not integer codes.
  records <- lapply(as.character(sites), read_site)
  records <- Filter(Negate(is.null), records)

  # Pull one field out of every record; yields character(0) for no records,
  # which keeps the column present in an empty result.
  column <- function(field) {
    vapply(records, function(record) record[[field]], character(1),
           USE.NAMES = FALSE)
  }

  data.frame(
    url = column("url"),
    lang = column("lang"),
    name = column("name"),
    description = column("description"),
    type = column("type"),
    locale = column("locale"),
    stringsAsFactors = FALSE
  )
}
# Read the site list (one entry per line) and collect metadata for each entry.
sites <- scan(
  file = "popular_news_sites.txt",
  what = character(),
  sep = "\n"
)
data <- getMetaData(sites)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment