Skip to content

Instantly share code, notes, and snippets.

Last active January 7, 2021 20:09
Show Gist options
  • Save jbencina/171a6438eb28e6c58bb855a3cafd25e4 to your computer and use it in GitHub Desktop.
Save jbencina/171a6438eb28e6c58bb855a3cafd25e4 to your computer and use it in GitHub Desktop.
Quick (and dirty) Python script that takes the transcript from a C-SPAN video page and parses it into a JSON file containing the timestamp, speaker name, and speech text for each entry. There are probably 100 more elegant ways to write this, but it works for what I needed.
from html.parser import HTMLParser
import re
import json
import datetime as dt
class Entry():
    """One transcript record: a time offset, a speaker and the spoken text."""

    def __init__(self, time):
        """Create an empty record anchored to the event start.

        Args:
            time: Start of the event as a ``datetime``. It is stored as
                ``time_base`` and later added to the ``HH:MM:SS`` offset held
                in ``self.time`` to produce the absolute timestamp.
        """
        self.time = ''
        self.time_base = time
        self.speaker = ''
        self.text = ''

    def value(self):
        """Return the record as a JSON-serializable dict.

        Returns:
            A dict with the keys ``time`` (offset string), ``time_utc``
            (ISO-8601 string with a ``Z`` suffix), ``speaker`` and ``text``.

        Raises:
            ValueError: If ``self.time`` is not three ``:``-separated numbers.
        """
        return {
            'time': self.time,
            'time_utc': self.get_utc_time(self.time_base, self.time),
            'speaker': self.speaker,
            # Collapse a trailing run of two or more dots into one period.
            'text': re.sub(r'(\.){2,}$', '.', self.text),
        }

    @staticmethod
    def get_utc_time(base, delta):
        """Add an ``HH:MM:SS`` offset to ``base`` and format the result.

        Declared static because it is called as
        ``self.get_utc_time(base, delta)`` and takes no instance argument.

        Args:
            base: ``datetime`` marking the start of the event.
            delta: Offset from ``base`` formatted as ``HH:MM:SS``.

        Returns:
            ``base + delta`` in ISO-8601 form with ``Z`` appended.
        """
        # "minute" rather than "min" so the builtin is not shadowed.
        hr, minute, sec = map(float, delta.split(':'))
        td = dt.timedelta(hours=hr, minutes=minute, seconds=sec)
        return (base + td).isoformat() + 'Z'
class HTMLParseTable(HTMLParser):
    """Extract transcript records from the transcript container of a page.

    Capturing starts at the first ``<div>`` whose ``class`` attribute equals
    ``table-wrap load-transcript`` and stops when that ``<div>`` is closed.
    While capturing, text inside ``<th>`` is searched for an ``HH:MM:SS``
    offset (which starts a new record), text inside ``<strong>`` becomes the
    speaker, and text inside ``<p>`` is appended to the record's text.
    """

    __outer_class = 'table-wrap load-transcript'
    __outer_tag = 'div'
    __time_pattern = re.compile(r'(\d{2}:\d{2}:\d{2})')

    def __init__(self, *args, **kwargs):
        """Initialise per-instance parser state.

        State is kept on the instance, not the class, so that two parsers
        never share the same output list or tag counters.
        """
        super().__init__(*args, **kwargs)
        # Open-tag depth for each inner tag of interest.
        self.__inner_tags = {'th': 0, 'strong': 0, 'p': 0}
        self.__outer_capture = False
        # Nesting depth of <div> elements, counted from the container itself.
        self.__outer_instances = 0
        self.__entry = None
        # True once the current entry already has a slot in the output list.
        self.__entry_emitted = False
        self.__output = []
        self.__time = ''

    @property
    def output(self):
        """List of record dicts collected so far, in document order."""
        return self.__output

    @property
    def time_start(self):
        """Event start time; a ``datetime`` once set, ``''`` before that."""
        return self.__time

    @time_start.setter
    def time_start(self, value):
        """Parse ``value`` (``YYYY-MM-DDTHH:MM:SSZ``) into a ``datetime``.

        Raises:
            ValueError: If ``value`` does not match the expected format.
        """
        self.__time = dt.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")

    def handle_starttag(self, tag, attrs):
        """Start capturing at the container, then track tag nesting."""
        if not self.__outer_capture:
            if (tag == self.__outer_tag
                    and self.has_attribute(attrs, 'class', self.__outer_class)):
                self.__outer_capture = True
                self.__outer_instances = 1
                print('Found outer tag!')
            return

        if tag == self.__outer_tag:
            # Count nested <div>s so only the matching </div> ends capture.
            self.__outer_instances += 1
        elif tag in self.__inner_tags:
            self.__inner_tags[tag] += 1

    def handle_endtag(self, tag):
        """Track closing tags, emit records and detect the container end."""
        if not self.__outer_capture:
            return

        if tag == self.__outer_tag:
            self.__outer_instances -= 1
            if self.__outer_instances == 0:
                self.__outer_capture = False
                print('Found end tag!')
        elif tag in self.__inner_tags:
            # Clamp at zero so a stray closing tag cannot push the depth
            # negative and mask a later opening tag.
            if self.__inner_tags[tag] > 0:
                self.__inner_tags[tag] -= 1
            if tag == 'p' and self.__entry is not None:
                record = self.__entry.value()
                if self.__entry_emitted:
                    # A further paragraph of the same record: refresh the
                    # stored record instead of appending a duplicate.
                    self.__output[-1] = record
                else:
                    self.__output.append(record)
                    self.__entry_emitted = True

    def handle_data(self, data):
        """Route text to the current record based on the enclosing tag."""
        if not self.__outer_capture:
            return

        if self.__inner_tags['th'] > 0:
            m = self.__time_pattern.search(data)
            if m:
                self.__entry = Entry(self.time_start)
                self.__entry.time = m.group(1)
                self.__entry_emitted = False
        elif self.__entry is None:
            # Text seen before the first timestamp has no record to join.
            return
        elif self.__inner_tags['strong'] > 0:
            self.__entry.speaker = data
        elif self.__inner_tags['p'] > 0:
            self.__entry.text += data

    @staticmethod
    def has_attribute(attrs, key, value):
        """Return True if ``attrs`` contains the exact pair ``(key, value)``.

        Args:
            attrs: Iterable of ``(name, value)`` pairs as passed to
                ``handle_starttag``.
            key: Attribute name to look for.
            value: Attribute value that must match exactly.
        """
        return any(k == key and v == value for k, v in attrs)
if __name__ == '__main__':
    # Set the start time of the event as YYYY-MM-DDTHH:MM:SSZ
    parser = HTMLParseTable()
    ts = input('Event UTC Start Time (YYYY-MM-DDTHH:MM:SSZ):')
    parser.time_start = ts

    # Read the saved video page; decode explicitly so the result does not
    # depend on the platform's default encoding.
    with open('C:/yourdir/input.html', 'r', encoding='utf-8') as f:
        s = f.read()

    # Parse the page; close() flushes any text the parser is still buffering.
    parser.feed(s)
    parser.close()

    # Write the collected records as pretty-printed JSON.
    with open('C:/yourdir/output.json', 'w') as f:
        json.dump(parser.output, f, sort_keys=True, indent=4, separators=(',', ': '))
Copy link

Just noticed that some parts of the CSPAN transcript are repeated word-for-word. These are clearly mistakes, as they don't even sync to the video. When parsing, you should keep only the first instance of each repeated passage and treat its timestamp as the correct one.

Copy link

Any chance of the cleaned up data?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment