Skip to content

Instantly share code, notes, and snippets.

@glenrobertson
Created June 11, 2017 01:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save glenrobertson/0ad29536ac050d86d739214541c4701b to your computer and use it in GitHub Desktop.
Save glenrobertson/0ad29536ac050d86d739214541c4701b to your computer and use it in GitHub Desktop.
Convert transcript to list of digits
import sys
import re
# Number words for 0-19, listed in ascending order so that each word's
# position in the list is the value it maps to.
words_to_digits = {
    word: value
    for value, word in enumerate([
        'zero', 'one', 'two', 'three', 'four',
        'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
    ])
}
# Sound-alike words that commonly show up in transcripts in place of a digit.
words_to_digits.update([
    ('for', 4),
    ('oh', 0),
    ('too', 2),
    ('to', 2),
])

# Tens words: held back and added to the number word that follows them.
prefixes = {
    word: tens * 10
    for tens, word in enumerate([
        'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety',
    ], 2)
}

# Magnitude words: multiplied into the number that precedes them.
suffixes = dict(hundred=100, thousand=1000)

# Repetition words: how many times the following number is emitted.
repeaters = dict(double=2, triple=3)
def extract_digits(transcript):
    """
    Extract the series of numbers spoken in a transcript.

    :param transcript str: a text transcript. Words are looked up exactly as
        written, so lower-case the text and strip punctuation beforehand.
    :return: list of ints, in the order they were spoken.

    One example usage is a transcript of someone speaking a phone number:
    They may say:
    1) "four one five eight double two five thousand"
    Result: [4, 1, 5, 8, 2, 2, 5000]
    or
    2) "triple five eight for too two two thirty"
    Result: [5, 5, 5, 8, 4, 2, 2, 2, 30]

    Rules:
    - Words in ``words_to_digits`` map straight to a number.
    - Prefixes (tens words) are held and added to the next number when it is
      a unit 1-9 ("twenty two" -> 22). Any other number leaves the prefix as
      its own entry ("twenty oh" -> 20, 0; "twenty fifteen" -> 20, 15).
    - Suffixes multiply the most recent number; a held prefix counts as that
      number ("twenty thousand" -> 20000). With no previous number they are
      ignored.
    - Repeaters repeat the next occurring number ("double two" -> 2, 2;
      "double twenty two" -> 22, 22).
    - Any other word (e.g. "um") is skipped.
    """
    digits = []
    pending_repeat = None  # repeat count waiting for the next number
    pending_tens = None    # tens value waiting for a following unit

    # split() with no argument splits on any run of whitespace, so tabs,
    # newlines and doubled spaces all separate words.
    for word in transcript.split():
        if word in repeaters:
            # Emit a held prefix first so the output stays in spoken order.
            if pending_tens is not None:
                digits.append(pending_tens)
                pending_tens = None
            pending_repeat = repeaters[word]
        elif word in suffixes:
            # The held prefix is the number being scaled, so emit it before
            # multiplying the last entry.
            if pending_tens is not None:
                digits.append(pending_tens)
                pending_tens = None
            if digits:
                digits[-1] *= suffixes[word]
        elif word in prefixes:
            # Two tens words in a row: the first one stands on its own.
            if pending_tens is not None:
                digits.append(pending_tens)
            pending_tens = prefixes[word]
        elif word in words_to_digits:
            value = words_to_digits[word]
            if pending_tens is not None:
                if 1 <= value <= 9:
                    value += pending_tens
                else:
                    # Zero and 10-19 cannot complete a tens word.
                    digits.append(pending_tens)
                pending_tens = None
            if pending_repeat is not None:
                digits.extend([value] * pending_repeat)
                pending_repeat = None
            else:
                digits.append(value)

    # A tens word at the very end of the transcript stands on its own.
    if pending_tens is not None:
        digits.append(pending_tens)
    return digits
"""
Tests
"""
# (transcript, expected extract_digits(transcript) result) pairs.
# A repeater emits the following number once per repetition, so
# "double two" is expected as 2, 2 and "triple five" as 5, 5, 5.
transcript_digits = [
    (
        'nine oh double eight three four four twenty eight twenty two five thousand two',
        [9, 0, 8, 8, 3, 4, 4, 28, 22, 5000, 2]
    ),
    (
        'four one five eight double two five thousand',
        [4, 1, 5, 8, 2, 2, 5000]
    ),
    (
        'triple five eight for too two two thirty',
        [5, 5, 5, 8, 4, 2, 2, 2, 30]
    ),
    (
        'four one five um eight one uh too for six hundred',
        [4, 1, 5, 8, 1, 2, 4, 600]
    ),
]
if __name__ == '__main__':
    # Command-line entry point: the transcript is the first argument.
    if len(sys.argv) < 2:
        # sys.exit with a string writes it to stderr and exits non-zero.
        sys.exit('usage: %s "<transcript>"' % sys.argv[0])
    transcript = sys.argv[1].lower()
    # Turn every non-letter character into a space so that only bare words
    # reach extract_digits.
    transcript = ''.join(c if c.isalpha() else ' ' for c in transcript)
    digits = extract_digits(transcript)
    # The call form with a single argument is valid on Python 2 and 3;
    # the original print statement is a SyntaxError on Python 3.
    print(' '.join(map(str, digits)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment