Skip to content

Instantly share code, notes, and snippets.

@mmparker
Created August 31, 2020 02:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mmparker/b661c5c0ca958a57479ee3dc83dc4da5 to your computer and use it in GitHub Desktop.
Save mmparker/b661c5c0ca958a57479ee3dc83dc4da5 to your computer and use it in GitHub Desktop.
library(dplyr)
library(rvest)
# Get the list of HTML report files to work on.
# The pattern is anchored to the ".html" extension so that files which merely
# contain "html" somewhere in their name (e.g. "html_notes.txt") are skipped.
files_to_process <- list.files(
  "r:/shared documents/",
  pattern = "\\.html$",
  full.names = TRUE
)
# Iterate over each HTML report: extract its software inventory table, tag
# every row with the machine name, login name and report date/time, write the
# result to a CSV in the current working directory, and return the table.
extracted_tables <- lapply(files_to_process, FUN = function(this_file) {
  print(paste("Extracting data from", this_file))

  # Delete the "Software Inventory" multi-column header: drop every line that
  # mentions it, then re-join the remaining lines and parse them as HTML
  this_file_parsed <- readLines(this_file) %>%
    .[!grepl(x = ., pattern = "Software Inventory")] %>%
    paste(collapse = "") %>%
    read_html()

  # Extract data
  software_inventory <- this_file_parsed %>%
    html_node("#softwareInventory") %>%
    html_table(header = TRUE)

  machine_info <- this_file_parsed %>%
    html_node("#machineInfo") %>%
    html_table(header = TRUE)

  report_datetime <- this_file_parsed %>%
    html_node("#dateTime") %>%
    html_text()

  # Add the machine info and report time to every software inventory row
  # NOTE(review): assumes the #machineInfo table has `machine_name` and
  # `login_name` columns and a single data row -- confirm against a report
  software_inventory$machine_name <- machine_info$machine_name
  software_inventory$login_name <- machine_info$login_name
  software_inventory$report_datetime <- report_datetime

  # Write out with the name of the HTML file, swapping the extension for .csv.
  # file_path_sans_ext() strips only the trailing extension, unlike an
  # unanchored ".html" regex, which could match in the middle of the name.
  output_path <- paste0(
    tools::file_path_sans_ext(basename(this_file)),
    ".csv"
  )
  print(paste("Writing extracted data to", output_path))
  write.csv(software_inventory, file = output_path)

  # Return the augmented table so it is collected into extracted_tables
  software_inventory
})
# Sanity check: preview the leading rows of every extracted table
lapply(extracted_tables, function(tbl) head(tbl))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment