Skip to content

Instantly share code, notes, and snippets.

@tingletech
Created July 25, 2019 00:37
Show Gist options
  • Save tingletech/03db322757764313784a3b19f4911ff1 to your computer and use it in GitHub Desktop.
Save tingletech/03db322757764313784a3b19f4911ff1 to your computer and use it in GitHub Desktop.
# -*- coding: UTF-8 -*-
import unicodedata
import re
RE_ALPHANUMSPACE = re.compile(r'[^0-9A-Za-z\s]*') # \W include "_" as does A-z
def normalize_sort_field(sort_field,
default_missing='~title unknown',
missing_equivalents=['title unknown']):
#fold case
sort_field = sort_field.lower()
# fold diacritics https://stackoverflow.com/a/518232/1763984
sort_field = ''.join(c for c in unicodedata.normalize('NFD', sort_field)
if unicodedata.category(c) != 'Mn')
# remove punctuation
sort_field = RE_ALPHANUMSPACE.sub('', sort_field)
# remove English initial articles
words = sort_field.split()
if words:
if words[0] in ('the', 'a', 'an'):
sort_field = ' '.join(words[1:])
if not sort_field or sort_field in missing_equivalents:
sort_field = default_missing
# normalize whitespace https://stackoverflow.com/a/46501496/1763984
sort_field = u' '.join(sort_field.split())
return sort_field
# Demo: print the normalized sort key for a few sample titles that mix CJK
# text, punctuation and a bracketed romanization.
for sample_title in (
    u'嶋巡月弓張 [Shimameguri tsuki no yumihari]',
    u'$ 容競出入湊 [Sugatakurabe deiri no minato]',
    u'菅原伝授手習鑑 [Sugawara denju tenarai kagami]',
    u'$ 崇禅寺馬場 [Sōzenji bama]',
):
    print(normalize_sort_field(sample_title))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment