Last active
April 17, 2018 10:42
-
-
Save sjockers/071499234f4ede6fa508efc18c246757 to your computer and use it in GitHub Desktop.
Scrape the 2013 Bundestag election results for individual constituencies from Wikipedia using artoo.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scrape the 2013 Bundestag election results for individual constituencies from Wikipedia
// - using artoo.js v0.3.3.1 (http://medialab.github.io/artoo/)
// - starting point: https://de.wikipedia.org/wiki/Liste_der_Bundestagswahlkreise_2013
// Collects the parsed rows of every crawled page; exported as CSV once the spider is done.
var result = []

// Origin prepended to the (presumably site-relative) hrefs read from the page markup.
var baseUrl = 'https://de.wikipedia.org'

// Absolute URLs of every link found in the 2nd and 6th <td> of rows inside
// `.wikitable:nth-of-type(2)` on the page the script is run on.
// The selector is assembled from two literals instead of a backslash line
// continuation, which silently breaks as soon as anything follows the backslash.
var pages = artoo.scrape(
  '.wikitable:nth-of-type(2) td:nth-of-type(2) a, ' +
  '.wikitable:nth-of-type(2) td:nth-of-type(6) a',
  function () {
    return baseUrl + $(this).attr('href')
  }
)
/**
 * Locates the table rows of the "Bundestagswahl 2013" results on a page.
 *
 * Starts from every h2/h3 whose text contains "Bundestagswahl 2013", looks at
 * the following siblings that carry the "sortable" class, keeps the first of
 * them and returns the rows it contains.
 *
 * @param {jQuery} doc - jQuery-wrapped document to search in.
 * @returns {jQuery} The matched `tr` elements.
 */
function findRows (doc) {
  var headings = doc.find('h2:contains("Bundestagswahl 2013"), h3:contains("Bundestagswahl 2013")')
  var resultTable = headings.nextAll('.sortable').first()
  return resultTable.find('tr')
}
/**
 * Converts the result rows of one constituency page into plain records.
 *
 * The constituency id is read from the cell following the infobox cell that
 * contains "Wahlkreisnummer"; the constituency name is the text of
 * `h1.firstHeading`. Both are trimmed, because `.text()` returns the raw text
 * content including any whitespace present in the markup.
 *
 * @param {jQuery} rows - `tr` elements to parse (e.g. the result of findRows).
 * @param {jQuery} doc - jQuery-wrapped document the rows belong to.
 * @returns {Array<Object>} One record per row that parseRow returns a value
 *   for; jQuery's `.map()` leaves out rows for which it returns undefined.
 */
function parseRows (rows, doc) {
  var idCell = doc.find('.infobox td:contains("Wahlkreisnummer")').next()
  var constituencyId = idCell.text().trim()
  var constituencyName = doc.find('h1.firstHeading').text().trim()

  var records = rows.map(function (i, tr) {
    return parseRow(tr, constituencyName, constituencyId)
  })
  return records.toArray()
}
/**
 * Builds one result record from a table row.
 *
 * Cells are read by position: 0 = candidate, 1 = party, 2 = vote_1, 3 = vote_2.
 * Link URLs are made absolute with `baseUrl`; a cell without a link yields
 * `undefined` for the corresponding `*_url` field.
 *
 * @param {Element} tr - Table row to read.
 * @param {string} constituencyName - Copied into every record.
 * @param {string} constituencyId - Copied into every record.
 * @returns {Object|undefined} The record, or undefined when the row does not
 *   have the four expected `<td>` cells (header rows, short/merged rows).
 */
function parseRow (tr, constituencyName, constituencyId) {
  var td = $(tr).find('td')

  // Rows without <td> (headers) were already skipped; rows with only 1-3 cells
  // used to throw on `td[n].innerText` and abort the whole page, so skip them too.
  if (td.length < 4) {
    return undefined
  }

  var candidatePath = $(td[0]).find('a').attr('href')
  var partyPath = $(td[1]).find('a').attr('href')

  return {
    constituency_id: constituencyId,
    constituency_name: constituencyName,
    candidate_name: td[0].innerText,
    candidate_url: candidatePath && baseUrl + candidatePath,
    party: td[1].innerText,
    party_url: partyPath && baseUrl + partyPath,
    vote_1: td[2].innerText,
    vote_2: td[3].innerText
  }
}
// Crawl every URL in `pages`, extract the result rows of each fetched page and
// export everything as a single CSV once the spider has finished.
artoo.ajaxSpider(pages, {
  // Hand each response to `process` as a jQuery-wrapped document.
  jquerify: true,
  // Delay between requests (milliseconds according to the artoo docs - confirm).
  throttle: 3000,
  process: function (doc) {
    var data = parseRows(findRows(doc), doc)
    // Append this page's records to the shared accumulator.
    result = result.concat(data)
    artoo.log.debug('parsing...', data)
  },
  done: function () {
    artoo.saveCsv(result)
  }
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment