Last active
August 29, 2015 14:21
-
-
Save RinkeHoekstra/c08b84fa7916a5a13100 to your computer and use it in GitHub Desktop.
Frankwatching Wordpress export to RDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2015, Rinke Hoekstra, VU University Amsterdam
# Licensed under the MIT License
"""Convert a Frankwatching WordPress XML export into an RDF graph.

Reads 'frankwatching.wordpress.2015-05-27.xml', describes every <item>
(post) with its creator, categories, tags and comments using the DCT, FOAF,
SIOC and SKOS vocabularies, and writes the result to 'frankwatching.ttl'
in Turtle syntax.
"""
from bs4 import BeautifulSoup
from rdflib import Graph, Namespace, RDF, RDFS, XSD, Literal, URIRef

# Prepare the RDF graph
g = Graph()

DCT = Namespace('http://purl.org/dc/terms/')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
SIOC = Namespace('http://rdfs.org/sioc/ns#')
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
FW = Namespace('http://frankwatching.com/resource/')

g.bind('dct', DCT)
g.bind('foaf', FOAF)
g.bind('sioc', SIOC)
g.bind('skos', SKOS)
g.bind('fw', FW)

# Open in binary mode so the XML parser can pick the encoding from the XML
# declaration itself, and close the handle as soon as parsing is done.
with open('frankwatching.wordpress.2015-05-27.xml', 'rb') as xml_file:
    soup = BeautifulSoup(xml_file, 'xml')

items = soup.find_all('item')

# Loop over all 'items' (i.e. the Wordpress posts)
# TODO: One problem is that comment authors and post authors are represented
# in different ways in the XML file. Comment authors have email addresses and
# long names, while post authors only have their login name. This means that
# the same person who writes posts *and* comments is represented as two distinct
# persons in the resulting RDF file.
for item in items:
    # Progress output; a single-argument print() behaves the same on
    # Python 2 and Python 3.
    print(item.title.text)

    title = Literal(item.title.text)
    url = URIRef(item.link.text)
    # The XML Schema datatype is spelled 'dateTime' (capital T).
    # NOTE(review): the date strings are passed through verbatim; if the
    # export does not already use the xsd:dateTime lexical form
    # (YYYY-MM-DDThh:mm:ss) they should be normalised first -- confirm
    # against the actual export file.
    created = Literal(item.pubDate.text, datatype=XSD.dateTime)
    # Spaces are replaced so the creator name can be used as a URI local name.
    creator = FW[item.creator.text.replace(' ', '_')]

    # (concept URI, label) pairs for the post's categories and tags.
    categories = [(FW[c['nicename']], Literal(c.text))
                  for c in item.find_all('category', domain='category')]
    tags = [(FW[c['nicename']], Literal(c.text))
            for c in item.find_all('category', domain='post_tag')]

    # Build the RDF graph
    g.add((url, RDF.type, SIOC['Post']))
    g.add((url, DCT['title'], title))
    g.add((url, DCT['created'], created))
    g.add((url, SIOC['has_creator'], creator))

    g.add((creator, RDF.type, SIOC['UserAccount']))
    g.add((creator, RDF.type, FOAF['Person']))
    g.add((creator, FOAF['name'], Literal(item.creator.text)))

    # Link the posts to the categories
    for (concept, label) in categories + tags:
        g.add((url, SIOC['has_topic'], concept))
        g.add((concept, RDF.type, SKOS['Concept']))
        g.add((concept, SKOS['prefLabel'], label))

    # Link the posts to the comments
    for comment in item.find_all('comment'):
        comment_uri = FW[comment.comment_id.text]
        author = Literal(comment.comment_author.text)
        author_email = Literal(comment.comment_author_email.text)
        # The commenter's e-mail address doubles as the local name of
        # their URI.
        # NOTE(review): an empty e-mail address would collapse to the bare
        # FW namespace URI -- confirm whether the export contains such
        # comments.
        author_url = FW[comment.comment_author_email.text]
        comment_date = Literal(comment.comment_date.text,
                               datatype=XSD.dateTime)

        g.add((url, SIOC['has_reply'], comment_uri))
        g.add((comment_uri, RDF.type, SIOC['Post']))
        g.add((comment_uri, DCT['created'], comment_date))
        g.add((comment_uri, SIOC['has_creator'], author_url))

        g.add((author_url, RDF.type, SIOC['UserAccount']))
        g.add((author_url, RDF.type, FOAF['Person']))
        g.add((author_url, FOAF['name'], author))
        # NOTE(review): the FOAF vocabulary appears to define foaf:mbox
        # rather than foaf:email; the property is kept unchanged so existing
        # consumers of the output keep working -- confirm before changing.
        g.add((author_url, FOAF['email'], author_email))

# The serializer writes encoded bytes, so the output file must be opened in
# binary mode (text mode fails on Python 3).
with open('frankwatching.ttl', 'wb') as f:
    g.serialize(f, format='turtle')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment