Skip to content

Instantly share code, notes, and snippets.

@beemyfriend
Last active September 28, 2018 06:52
Show Gist options
  • Save beemyfriend/47479890f82f0c80e3a247aebada51d1 to your computer and use it in GitHub Desktop.
Save beemyfriend/47479890f82f0c80e3a247aebada51d1 to your computer and use it in GitHub Desktop.
Mining CCS 2018 Presentation Data and Creating a Graph
library(tidyverse)
library(rvest)
library(igraph)
##########
## Pull Data from website
##########
# Download and parse the CCS 2018 schedule page. The track list and the talk
# cells extracted further down are both read from this parsed document.
html <- "http://ccs2018.web.auth.gr/schedule" %>%
  read_html()
#############
## Pull Data from talk rows and tracks selection widget
## Thanks to selector gadget for pulling the css identifier
#############
# Track names, read from the children of the '#edit-field-session-tid' element
# (the track-selection widget). The result is a character vector whose entries
# are later used as regex patterns, hence the parenthesis escaping.
tracks <- html_nodes(html, '#edit-field-session-tid') %>%
  html_children() %>%
  html_text() %>%
  # Drop the first option. Negative indexing is used instead of 2:length(.)
  # because 2:length(.) counts backwards (2, 1) when fewer than two options
  # exist, which would yield NA plus the very option we meant to drop.
  # NOTE(review): presumably the first option is a placeholder entry - confirm.
  .[-1] %>%
  # 'All Tracks' is appended as an extra candidate label at the end.
  c('All Tracks') %>%
  # Escape literal parentheses so each name is a valid regex for str_detect().
  str_replace_all('\\(', '\\\\(') %>%
  str_replace_all('\\)', '\\\\)') %>%
  # Correct a misspelled option text so it reads 'Physics'.
  str_replace('Complexity in Pyshics and Chemistry', 'Complexity in Physics and Chemistry')
# All nodes carrying the '.views-align-center' class; the title, room, track
# and speaker text are all extracted from these nodes below.
talks <- html %>%
  html_nodes(".views-align-center")
############
## escape '(' and ')' as '\\(' and '\\)' to avoid regex issues
############
# Talk titles: the text of the child nodes of each talk cell.
# NOTE(review): html_children() on a node set returns the children of all
# nodes together, so this lines up one-to-one with `talks` only if every cell
# has exactly one child - confirm against the page markup.
talk_title <- html_text(html_children(talks))
# Escape literal parentheses because titles are later used as regex patterns.
talk_title <- str_replace_all(talk_title, '\\(', '\\\\(')
talk_title <- str_replace_all(talk_title, '\\)', '\\\\)')
#############
## Rooms are consistently the last thing provided in text
#############
# Room label for each talk cell: everything from 'Room' to the end of the
# cell text, with trailing whitespace removed.
talk_rooms <- str_extract(html_text(talks), 'Room.+$')
talk_rooms <- str_replace(talk_rooms, '\\s*$', '')
# Cells without a room come back as NA from str_extract(); store '' instead so
# later comparisons against '' work.
talk_rooms <- ifelse(is.na(talk_rooms), '', talk_rooms)
##################
## Extract text that matches an option from the selection dropdown
##################
# Track for each talk cell: the first entry of `tracks` whose (regex) pattern
# is found in the cell text, or '' when none matches.
talk_track <- map_chr(html_text(talks), function(cell_text) {
  matched <- tracks[str_detect(cell_text, tracks)]
  if (length(matched) == 0) {
    ''
  } else {
    # Several tracks may match; only the first (in `tracks` order) is kept.
    matched[1]
  }
})
##################
## Remove all extracted text. Anything left over is a name
###################
# Speaker text for each talk cell: take the cell text, strip leading
# whitespace, then remove the first occurrence of the already-extracted track,
# title and room (in that order). Whatever remains, trimmed, is treated as the
# speaker list.
# NOTE(review): the track and title are used as regex patterns and only
# parentheses were escaped upstream; titles containing other metacharacters
# such as '?' or '.' may not be removed exactly - confirm against real data.
talks_speakers <- imap_chr(
  str_replace(html_text(talks), '^\\s+', ''),
  function(cell_text, idx) {
    remaining <- cell_text
    for (piece in c(talk_track[idx], talk_title[idx], talk_rooms[idx])) {
      # '' means nothing was extracted for this field, so there is nothing
      # to strip.
      if (piece != '') remaining <- str_replace(remaining, piece, '')
    }
    str_trim(remaining)
  }
)
##############
## Create a data frame with all extracted information
##############
# One row per (talk, speaker) pair, with columns title, track, rooms, speakers.
# Rows whose speaker text is empty are dropped before splitting.
ccsTalks <- tibble(
  title = talk_title,
  track = talk_track,
  rooms = talk_rooms,
  speakers = talks_speakers
) %>%
  filter(speakers != '') %>%
  # Split "A, B and C" style lists into a list-column of individual names.
  mutate(speakers = str_split(speakers, ',| and ')) %>%
  # Name the column explicitly: calling unnest() with no columns is deprecated
  # in current tidyr.
  unnest(speakers) %>%
  mutate(speakers = str_trim(speakers)) %>%
  # A list such as "A, B, and C" splits into an empty piece between ',' and
  # ' and '; drop those so no blank-named speaker ends up in the graph.
  filter(speakers != '')
##################
## Create edge list by joining data frame to itself by track
###################
# Speaker-to-speaker edge list: two speakers are linked when they have talks in
# the same track. Columns: from, to, n (number of joined rows for the pair) and
# track (the shared track, or 'multiple' when the pair shares several).
ccsEL <- ccsTalks %>%
  select(from = speakers, track) %>%
  # Join key spelled out so the join does not depend on column-name guessing.
  left_join(select(ccsTalks, to = speakers, track), by = 'track') %>%
  # Keep each unordered pair once and drop self-pairs.
  filter(from < to) %>%
  group_by(from, to) %>%
  summarize(
    n = n(),
    # Decide 'multiple' from the number of distinct tracks, not the row count:
    # a pair can have many rows in a single track when a speaker gives several
    # talks there.
    track = if (n_distinct(track) > 1) 'multiple' else first(track)
  ) %>%
  # summarize() only peels off the last grouping level; remove the rest.
  ungroup()
#################
## Node List of authors/presenters with attribute being the author's track
##################
# Speaker node list: one row per speaker with a `track` attribute holding the
# speaker's track, or 'multiple' when the speaker appears in more than one.
ccsNL <- ccsTalks %>%
  group_by(speakers) %>%
  summarize(
    # Count distinct tracks rather than rows, so a speaker with several talks
    # in the same track keeps that track instead of being labelled 'multiple'.
    track = if (n_distinct(track) > 1) 'multiple' else first(track)
  )
####################
## Create a color scale matching topic
####################
# Named integer vector mapping each distinct value of ccsNL$track to its
# position (1, 2, 3, ...) in order of first appearance; used below to look up
# a colour index by track name.
colorVertices <- ccsNL$track %>%
  unique() %>%
  {
    # setNames() builds the lookup directly and yields an empty integer vector
    # (rather than NULL) when there are no tracks.
    setNames(seq_along(.), .)
  }
#####################
## Create graph
####################
# Undirected speaker graph: vertices come from ccsNL, edges from ccsEL.
# The seed is set before layout_nicely() runs, presumably so the layout is
# reproducible between runs.
set.seed(4321)
ccsG <- graph_from_data_frame(ccsEL, directed = FALSE, vertices = ccsNL) %>%
  set_edge_attr('width', value = .1) %>%
  set_vertex_attr('size', value = 5) %>%
  # Blank labels: speaker names are not drawn.
  set_vertex_attr("label", value = '') %>%
  # Colour index looked up by the vertex's track, shifted by one.
  # NOTE(review): presumably the +1 skips the first palette colour - confirm.
  set_vertex_attr("color", value = colorVertices[V(.)$track] + 1) %>%
  # Store the computed layout on the graph as its 'layout' attribute.
  set_graph_attr('layout', value = layout_nicely(.))
plot(ccsG, main = "Speakers Connected by Tracks")
###################
#### Same as previous, but edges are people and nodes are tracks
###################
# Track-to-track edge list: two tracks are linked when at least one speaker
# appears in both. Columns: from, to, n (number of distinct shared speakers).
ccsTracksEL <- ccsTalks %>%
  select(from = track, speakers) %>%
  # Join key spelled out so the join does not depend on column-name guessing.
  left_join(select(ccsTalks, speakers, to = track), by = 'speakers') %>%
  # Keep each unordered pair of tracks once and drop self-pairs.
  filter(from < to) %>%
  group_by(from, to) %>%
  summarize(
    # Count people, not joined rows: a speaker with several talks in either
    # track would otherwise be counted once per talk combination.
    n = n_distinct(speakers)
  ) %>%
  # summarize() only peels off the last grouping level; remove the rest.
  ungroup()
##############
## Node size will be the number of authors in a track
##############
# Track node list: one row per track with n = number of distinct speakers in
# that track (used below to scale vertex size).
ccsTracksNL <- ccsTalks %>%
  group_by(track) %>%
  summarize(
    # Distinct speakers rather than rows, so a speaker with several talks in
    # the same track is counted once.
    n = n_distinct(speakers)
  )
# Undirected track graph: vertices come from ccsTracksNL, edges from
# ccsTracksEL. The seed is set before layout_nicely() runs, presumably so the
# layout is reproducible between runs.
set.seed(4321)
ccsTracksG <- graph_from_data_frame(
  ccsTracksEL,
  directed = FALSE,
  vertices = ccsTracksNL
) %>%
  # Edge width scaled so the heaviest edge has width 5.
  set_edge_attr('width', value = 5 * E(.)$n / max(E(.)$n)) %>%
  # Vertex size scaled so the largest track has size 30.
  set_vertex_attr('size', value = V(.)$n/max(V(.)$n)*30) %>%
  set_vertex_attr("label", value = V(.)$name) %>%
  # Colour index looked up by track name, shifted by one.
  # NOTE(review): tracks missing from colorVertices get an NA colour - confirm
  # that every track name appears in ccsNL$track.
  set_vertex_attr("color", value = colorVertices[V(.)$name] + 1) %>%
  set_vertex_attr("label.cex", value = .8) %>%
  # Store the computed layout on the graph as its 'layout' attribute.
  set_graph_attr('layout', value = layout_nicely(.))
plot(ccsTracksG, main = "Tracks Connected by Speakers")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment