@keithrozario
Last active July 22, 2019 20:21
Script to scrape the SPR's PRU (Malaysian general election) website for candidates
import requests
from bs4 import BeautifulSoup
import json
import pru_14_json  # https://calon.spr.gov.my/pru14_json.js
import time
import csv
import operator

# Full list of Parlimen seats: https://calon.spr.gov.my/pru14_json.js
parlimen_seats = []
for state in pru_14_json.json_data:
    for seat in pru_14_json.json_data[state]['parlimen']:
        parlimen_seats.append(seat)

# Full list of state (DUN) seats
state_seats = []
for state in pru_14_json.json_data:
    for seat in pru_14_json.json_data[state]['dun']:
        state_seats.append(seat)
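# The shape assumed for pru_14_json.json_data, inferred from how it is
# accessed in this script (the key names are the ones actually used below;
# the top-level keys and all sample values are illustrative only):
# json_data = {
#     '<state>': {
#         'parlimen': [{'id': 1, 'kerusi_id': 'P.001', 'name': '...'}, ...],
#         'dun': [{'id': 11, 'kerusi_id': 'N.01', 'name': '...', 'state_id': 1}, ...],
#     },
#     ...
# }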
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.5',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '_ga=GA1.3.1001500445.1524900877; _gid=GA1.3.593399681.1524900877; PHPSESSID=56433b80694baae3aad91e66563c3484',
    'DNT': '1',
    'Host': 'calon.spr.gov.my',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:59.0) Gecko/20100101 Firefox/59.0'
}
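# Heads-up: the PHPSESSID cookie above is tied to a single browsing session
# on calon.spr.gov.my, so it likely needs to be replaced with a fresh value
# from your browser before re-running this script.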
# Initial request to obtain the first token (the homepage response carries
# a hidden to_spr_ken input)
r = requests.post('https://calon.spr.gov.my/', headers=headers)
soup = BeautifulSoup(r.content, "html.parser")
token = soup.find('input', {'id': 'to_spr_ken'}).get('value')
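# A more defensive variant of the lookup above (a sketch; the original just
# assumes the hidden input exists):
# token_input = soup.find('input', {'id': 'to_spr_ken'})
# if token_input is None:
#     raise SystemExit('to_spr_ken token not found, session cookie may have expired')
# token = token_input.get('value')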
# OK, let's get those names. Parlimen seats first.
records = []
for seat in parlimen_seats:
    payload = {'kod': str(seat['id']),
               'token': token}
    r = requests.post(url='https://calon.spr.gov.my/ajax.php', data=payload, headers=headers, allow_redirects=True)
    response_json = json.loads(r.text)
    for calon in response_json['calon']:
        record = {'seat': seat, 'calon': calon, 'id': seat['kerusi_id']}
        records.append(record)
        print(record)
    token = response_json['token']  # refresh the token for the next request
    time.sleep(1)  # don't overload the site, hashtag #responsibleScraper
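# Each record comes out shaped roughly like this (the calon keys are the
# ones used in the CSV section below; all values are illustrative):
# {'seat': {'id': 1, 'kerusi_id': 'P.001', 'name': '...'},
#  'calon': {'nama': '...', 'nama_undi': '...', 'parti': '...'},
#  'id': 'P.001'}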
records.sort(key=operator.itemgetter('id'))  # sort by seat id, e.g. P.001
# Write out to file (newline='' keeps the csv module from inserting blank
# rows on Windows)
with open("election_results/parlimen.csv", "w", newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(["Seat ID",
                         "Seat Name",
                         "Candidate Name",
                         "Candidate Ballot Name",
                         "Candidate Party"])
    for record in records:
        csv_writer.writerow([record['seat']['kerusi_id'],
                             record['seat']['name'],
                             record['calon']['nama'],
                             record['calon']['nama_undi'],
                             record['calon']['parti']])
# And now the state (DUN) seats
records = []
for seat in state_seats:
    payload = {'kod': str(seat['id']),
               'token': token}
    r = requests.post(url='https://calon.spr.gov.my/ajax.php', data=payload, headers=headers, allow_redirects=True)
    response_json = json.loads(r.text)
    for calon in response_json['calon']:
        record = {'seat': seat, 'calon': calon, 'id': seat['state_id']}
        records.append(record)
        print(record)
    token = response_json['token']  # refresh token
    time.sleep(1)  # responsible scraper
records.sort(key=operator.itemgetter('id'))  # sort by state_id
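# A possible refactor (a sketch, not part of the original script): the
# Parlimen and DUN loops above are near-identical, so one helper could
# serve both, with id_key set to 'kerusi_id' or 'state_id' respectively.
# def fetch_candidates(seats, token, id_key):
#     records = []
#     for seat in seats:
#         payload = {'kod': str(seat['id']), 'token': token}
#         r = requests.post(url='https://calon.spr.gov.my/ajax.php',
#                           data=payload, headers=headers, allow_redirects=True)
#         response_json = json.loads(r.text)
#         for calon in response_json['calon']:
#             records.append({'seat': seat, 'calon': calon, 'id': seat[id_key]})
#         token = response_json['token']
#         time.sleep(1)
#     return records, token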
with open("election_results/state.csv", "w") as csvfile:
csv_writer = csv.writer(csvfile, delimiter=',',
quotechar='"', quoting=csv.QUOTE_MINIMAL)
csv_writer.writerow(["State",
"Seat ID",
"Seat Name",
"Candidate Name",
"Candidate Ballot Name",
"Candidate Party"])
for record in records:
csv_writer.writerow([pru_14_json.state_mapping[record['seat']['state_id']],
record['seat']['kerusi_id'],
record['seat']['name'],
record['calon']['nama'],
record['calon']['nama_undi'],
record['calon']['parti'] ])
# end like a boss
print("Keith is awesome!")
@keithrozario

Updated to a much better version that uses the csv module instead of crafting the CSV 'by hand' :)

Also included the full js from the PRU website with all the mappings and ids.
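For reference, a minimal sketch of one way to turn that downloaded pru14_json.js into the pru_14_json module the script imports. The layout assumed here (a single var ... = {...}; declaration whose object literal is valid JSON, i.e. double-quoted keys) is an assumption; adjust the regex, or use a js-aware parser, to match the real file:

import json
import re

# Read the raw js, strip the "var xyz = " prefix and trailing semicolon,
# then parse the remaining object literal as JSON.
with open('pru14_json.js') as f:
    js = f.read()
match = re.search(r'=\s*(\{.*\})\s*;?\s*$', js, re.DOTALL)
json_data = json.loads(match.group(1))

The same trick would apply to whatever variable in the file holds the state_mapping dictionary.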
