Skip to content

Instantly share code, notes, and snippets.

@RinkeHoekstra
Last active August 29, 2015 14:21
Show Gist options
  • Save RinkeHoekstra/c08b84fa7916a5a13100 to your computer and use it in GitHub Desktop.
Save RinkeHoekstra/c08b84fa7916a5a13100 to your computer and use it in GitHub Desktop.
Frankwatching Wordpress export to RDF
# Copyright (c) 2015, Rinke Hoekstra, VU University Amsterdam
# Licensed under the MIT License
from bs4 import BeautifulSoup
from rdflib import Graph, Namespace, RDF, RDFS, XSD, Literal, URIRef
# Prepare the RDF graph and the namespaces used for the generated triples.
g = Graph()
DCT = Namespace('http://purl.org/dc/terms/')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
SIOC = Namespace('http://rdfs.org/sioc/ns#')
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
FW = Namespace('http://frankwatching.com/resource/')

# Register the prefixes on the graph so they are available when serializing.
g.bind('dct', DCT)
g.bind('foaf', FOAF)
g.bind('sioc', SIOC)
g.bind('skos', SKOS)
g.bind('fw', FW)

# Parse the WordPress export. The file is opened in binary mode so that the
# XML parser, not the platform's default text encoding, decodes the bytes,
# and the handle is closed as soon as the document has been parsed.
with open('frankwatching.wordpress.2015-05-27.xml', 'rb') as export_file:
    soup = BeautifulSoup(export_file, 'xml')

# Every <item> element of the export is processed as one post.
items = soup.find_all('item')
# Loop over all 'items' (i.e. the WordPress posts) and describe each post,
# its categories/tags and its comments in the RDF graph.
# TODO: One problem is that comment authors and post authors are represented
# in different ways in the XML file. Comment authors have email addresses and
# long names, while post authors only have their login name. This means that
# the same person who writes posts *and* comments is represented as two
# distinct persons in the resulting RDF file.
for item in items:
    # Progress output; the parenthesised form runs under Python 2 and 3.
    print(item.title.text)

    title = Literal(item.title.text)
    url = URIRef(item.link.text)
    # The XSD datatype is spelled 'dateTime'; 'datetime' names no XSD type.
    # NOTE(review): the pubDate text is stored as-is -- confirm it is in the
    # xsd:dateTime lexical form, or convert it before typing the literal.
    date = Literal(item.pubDate.text, datatype=XSD.dateTime)
    creator_name = item.creator.text
    # Spaces are replaced so the creator name can be used as a URI local name.
    creator = FW[creator_name.replace(' ', '_')]
    categories = [
        (FW[c['nicename']], Literal(c.text))
        for c in item.find_all('category', domain='category')
    ]
    tags = [
        (FW[c['nicename']], Literal(c.text))
        for c in item.find_all('category', domain='post_tag')
    ]

    # Build the RDF graph
    g.add((url, RDF.type, SIOC['Post']))
    g.add((url, DCT['title'], title))
    g.add((url, DCT['created'], date))
    g.add((url, SIOC['has_creator'], creator))
    g.add((creator, RDF.type, SIOC['UserAccount']))
    g.add((creator, RDF.type, FOAF['Person']))
    g.add((creator, FOAF['name'], Literal(creator_name)))

    # Link the posts to the categories (categories and tags are both
    # modelled as SKOS concepts)
    for (c, label) in categories + tags:
        g.add((url, SIOC['has_topic'], c))
        g.add((c, RDF.type, SKOS['Concept']))
        g.add((c, SKOS['prefLabel'], label))

    # Link the posts to the comments
    for comment in item.find_all('comment'):
        # Each comment is itself a sioc:Post, identified by its comment id.
        comment_uri = FW[comment.comment_id.text]
        author = Literal(comment.comment_author.text)
        author_email = Literal(comment.comment_author_email.text)
        # NOTE(review): the author URI is derived from the e-mail address, so
        # if that field is empty every such commenter maps to the bare FW
        # namespace URI -- confirm whether the export contains such comments.
        author_url = FW[comment.comment_author_email.text]
        # Same datatype fix as for the post date above.
        # NOTE(review): confirm comment_date is in xsd:dateTime lexical form.
        comment_date = Literal(comment.comment_date.text,
                               datatype=XSD.dateTime)

        g.add((url, SIOC['has_reply'], comment_uri))
        g.add((comment_uri, RDF.type, SIOC['Post']))
        g.add((comment_uri, DCT['created'], comment_date))
        g.add((comment_uri, SIOC['has_creator'], author_url))
        g.add((author_url, RDF.type, SIOC['UserAccount']))
        g.add((author_url, RDF.type, FOAF['Person']))
        g.add((author_url, FOAF['name'], author))
        # NOTE(review): FOAF appears to define foaf:mbox rather than
        # foaf:email; left unchanged so existing consumers keep working.
        g.add((author_url, FOAF['email'], author_email))
# Write the graph to disk as Turtle. The file is opened in binary mode
# because rdflib writes encoded bytes to file-like destinations; text mode
# would fail under Python 3.
with open('frankwatching.ttl', 'wb') as f:
    g.serialize(f, format='turtle')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment