Skip to content

Instantly share code, notes, and snippets.

@christabor
Last active April 8, 2021 17:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save christabor/58495427c17209dc9ab23a652a33cc45 to your computer and use it in GitHub Desktop.
Save christabor/58495427c17209dc9ab23a652a33cc45 to your computer and use it in GitHub Desktop.
Turn your Chrome bookmarks into a searchable, tokenized inverted index
"""Tokenize bookmark titles and create an inverted index."""
import io
import os
import re
import sys
from collections import Counter, defaultdict
from pprint import pprint as ppr

from pyquery import PyQuery as pq
def fmt_token(regex, t):
    """Normalize a single raw token.

    The token is lowercased first, then every substring matching ``regex``
    is deleted, and the result is returned as a ``str``.
    """
    lowered = t.lower()
    cleaned = re.sub(regex, '', lowered)
    return str(cleaned)
def make_iidex(path=None):
    """Tokenize bookmark titles and build an inverted index from them.

    Args:
        path: Location of the bookmarks HTML export to read. When omitted,
            ``CHROME_bookmarks_6_14_17.html`` in the current working
            directory is used.

    Returns:
        A ``(distribution, iidex)`` tuple. ``distribution`` is a ``Counter``
        of how many times each token occurred; ``iidex`` is a
        ``defaultdict(list)`` mapping each token to the titles it came from.

    Raises:
        IOError/OSError: if the bookmarks file cannot be opened.
    """
    if path is None:
        path = '{}/CHROME_bookmarks_6_14_17.html'.format(os.getcwd())
    replace_re = re.compile(r'[^a-zA-Z]+')
    # TODO: nltk stopwords
    # A frozenset gives constant-time membership tests in the token loop.
    stopwords = frozenset([
        'the', 'and', 'is', 'for', 'in', 'of', 'to', 'with',
    ])
    iidex = defaultdict(list)
    distribution = Counter()
    # Decode explicitly instead of relying on the platform's locale
    # encoding; io.open accepts ``encoding`` on both Python 2 and 3.
    # Undecodable bytes are replaced rather than aborting the whole run.
    with io.open(path, 'r', encoding='utf-8', errors='replace') as bmarks:
        dom = pq(bmarks.read())
    for title in dom.find('DT > a'):
        text = title.text
        if text is None:
            continue
        # split() with no argument breaks on any whitespace run, so tabs or
        # newlines inside a title cannot glue two words into one token once
        # the non-letter characters are stripped.
        for raw in text.split():
            token = fmt_token(replace_re, raw)
            if len(token) > 1 and token not in stopwords:
                iidex[token].append(text)
                distribution[token] += 1
    return distribution, iidex
def interact(distr, iidex):
    """Open a pdb prompt with a ``find`` helper for exploring the index.

    Args:
        distr: Token frequency counter. Not used by this function itself,
            but it is a local name when the debugger prompt opens.
        iidex: Mapping of token -> list of bookmark titles.
    """
    def find(term):
        """Print every title indexed under a token that contains ``term``."""
        # Index tokens are stored lowercased, so fold the query the same way;
        # otherwise a search such as "Python" could never match anything.
        needle = term.lower()
        matches = set()
        for token, titles in iidex.items():
            if needle in token:
                matches.update(titles)
        # Sort so repeated searches list the results in a stable order.
        matches = sorted(matches)
        print('\n--- Searching for "{term}", found {amt} results. --\n'.format(
            term=term, amt=len(matches)
        ))
        for i, m in enumerate(matches):
            print('{}{}. {}\n'.format(' ' * 4, i, m))

    print('=== Use `find("xxx")` to explore something.')
    # Deliberate breakpoint: the debugger prompt is the interactive shell.
    import pdb
    pdb.set_trace()
if __name__ == '__main__':
    distribution, iidex = make_iidex()
    if '-i' in sys.argv:
        # Interactive mode: explore the index from a debugger prompt.
        interact(distribution, iidex)
    else:
        # Batch mode: dump the whole index, the 20 most common tokens,
        # and a sample of 20 index keys.
        ppr(iidex)
        ppr(distribution.most_common(20))
        # dict views cannot be sliced on Python 3, so materialize the keys
        # into a list before taking the first 20.
        ppr(list(iidex)[:20])
@christabor
Copy link
Author

TODO: handle unicode issues

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment