rjurney · December 14, 2023 17:42
diff --git a/AREADME.md b/AREADME.md
diff --git a/cosine_sentence_encoding.py b/cosine_sentence_encoding.py
 import json
 from typing import List

 import numpy as np
 from scipy.spatial.distance import cosine
 from sentence_transformers import SentenceTransformer


 def compare_records_to_csv(record_pairs: List[List[str]], models):
    """Print tab-separated cosine similarities for each record pair under each model.

    For every pair, both records are encoded by every model in ``models``
    (in the mapping's iteration order) and one line is written to stdout:
    the two records followed by one similarity column per model, each
    formatted to three decimal places.

    Parameters
    ----------
    record_pairs : List[List[str]]
        Pairs of records to compare
    models : Dict[str, SentenceTransformer]
        Named sentence encoders. Each value is called as ``model.encode(record)``
        and the two results are passed to ``scipy.spatial.distance.cosine``.
        Any number of models is accepted; one score column is printed per model.
    """
    # Only the encoder objects are needed here; the names are not printed.
    encoders = list(models.values())

    for name_one, name_two in record_pairs:
        scores = []
        for model in encoders:
            embedding_one = model.encode(name_one)
            embedding_two = model.encode(name_two)
            # scipy's ``cosine`` is a distance; 1 - distance is the similarity.
            scores.append(1.0 - cosine(embedding_one, embedding_two))

        # One column per model instead of hard-coded scores[0]/scores[1]:
        # a single model no longer raises IndexError and extra models are
        # no longer silently dropped. Output for two models is unchanged.
        columns = [f"{name_one}", f"{name_two}"]
        columns.extend(f"{score:,.3f}" for score in scores)
        print("\t".join(columns))


 # Sentence encoders keyed by the same identifier that is passed to
 # SentenceTransformer. Insertion order is kept: paraphrase-multilingual
 # first, all-MiniLM second.
 models = {
    model_name: SentenceTransformer(model_name)
    for model_name in (
        "paraphrase-multilingual-MiniLM-L12-v2",
        "sentence-transformers/all-MiniLM-L12-v2",
    )
 }

 # Each row is one (first record, second record) pair of name strings that
 # the script passes to compare_records_to_csv.
 name_pairs = np.array(
    [
        ("Russell H Jurney", "Russell Jurney"),
        ("Russ H. Jurney", "Russell Jurney"),
        ("Russ H Jurney", "Russell Jurney"),
        ("Russ Howard Jurney", "Russell H Jurney"),
        ("Russell H. Jurney", "Russell Howard Jurney"),
        ("Russell H Jurney", "Russell Howard Jurney"),
        ("Alex Ratner", "Alexander Ratner"),
        ("ʿAlī ibn Abī Ṭālib", "عَلِيّ بْن أَبِي طَالِب"),
        ("Igor Berezovsky", "Игорь Березовский"),
        ("Oleg Konovalov", "Олег Коновалов"),
        ("Ben Lorica", "罗瑞卡"),
        ("Sam Smith", "Tom Jones"),
        ("Sam Smith", "Ron Smith"),
        ("Sam Smith", "Samuel Smith"),
    ]
 )

 # Pairs of JSON-serialized {"name", "birthday"} records to compare.
 # ensure_ascii=False keeps non-Latin names as real characters; the json.dumps
 # default would hand the encoders ``\uXXXX`` escape text instead of the name.
 json_pairs = np.array(
    [
        [
            json.dumps({"name": name, "birthday": birthday}, ensure_ascii=False)
            for name, birthday in record_pair
        ]
        for record_pair in (
            (("Russell H Jurney", "02/01/1980"), ("Russell Jurney", "02/01/1990")),
            (("Russ H. Jurney", "02/01/1980"), ("Russell Jurney", "02/01/1991")),
            (("Russ H Jurney", "02/01/1980"), ("Russell Jurney", "02/02/1990")),
            (("Russ Howard Jurney", "02/01/1980"), ("Russell H Jurney", "02/01/1990")),
            (("Russell H. Jurney", "02/01/1980"), ("Russell Howard Jurney", "02/01/1990")),
            (("Russell H Jurney", "02/01/1980"), ("Russell Howard Jurney", "02/01/1990")),
            (("Alex Ratner", "02/01/1901"), ("Alexander Ratner", "02/01/1976")),
            (("ʿAlī ibn Abī Ṭālib", "02/01/1980"), ("عَلِيّ بْن أَبِي طَالِب", "02/01/1980")),
            (("Igor Berezovsky", "01/01/1980"), ("Игорь Березовский", "02/03/1908")),
            (("Oleg Konovalov", "02/01/1980"), ("Олег Коновалов", "05/04/1980")),
            (("Ben Lorica", "02/01/1980"), ("罗瑞卡", "02/01/1980")),
            (("Sam Smith", "02/01/1980"), ("Tom Jones", "02/01/1976")),
            (("Sam Smith", "02/01/1980"), ("Ron Smith", "02/01/2001")),
            (("Sam Smith", "02/01/1980"), ("Samuel Smith", "02/01/1801")),
            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/1980")),
            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/1991")),
            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/2011")),
        )
    ]
 )

 # Score columns follow the insertion order of `models`: the
 # paraphrase-multilingual model is scored first and the all-MiniLM model
 # second, so the header labels must appear in that order.
 print("Name One\tName Two\tParaphrase Cosine\tAll Cosine")
 compare_records_to_csv(name_pairs, models)

 print()

 print("JSON One\tJSON Two\tParaphrase Cosine\tAll Cosine")
 compare_records_to_csv(json_pairs, models)
diff --git a/json_distances.tsv b/json_distances.tsv
 JSON One	JSON Two	Paraphrase Cosine	All Cosine
 {"name": "Russell H Jurney", "birthday": "02/01/1980"}	{"name": "Russell Jurney", "birthday": "02/01/1990"}	0.946	0.982
 {"name": "Russ H. Jurney", "birthday": "02/01/1980"}	{"name": "Russell Jurney", "birthday": "02/01/1991"}	0.904	0.929
 {"name": "Russ H Jurney", "birthday": "02/01/1980"}	{"name": "Russell Jurney", "birthday": "02/02/1990"}	0.896	0.932
 {"name": "Russ Howard Jurney", "birthday": "02/01/1980"}	{"name": "Russell H Jurney", "birthday": "02/01/1990"}	0.929	0.926
 {"name": "Russell H. Jurney", "birthday": "02/01/1980"}	{"name": "Russell Howard Jurney", "birthday": "02/01/1990"}	0.931	0.963
 {"name": "Russell H Jurney", "birthday": "02/01/1980"}	{"name": "Russell Howard Jurney", "birthday": "02/01/1990"}	0.926	0.967
 {"name": "Alex Ratner", "birthday": "02/01/1901"}	{"name": "Alexander Ratner", "birthday": "02/01/1976"}	0.916	0.932
 {"name": "\u02bfAl\u012b ibn Ab\u012b \u1e6c\u0101lib", "birthday": "02/01/1980"}	{"name": "\u0639\u064e\u0644\u0650\u064a\u0651 \u0628\u0652\u0646 \u0623\u064e\u0628\u0650\u064a \u0637\u064e\u0627\u0644\u0650\u0628", "birthday": "02/01/1980"}	0.756	0.812
 {"name": "Igor Berezovsky", "birthday": "01/01/1980"}	{"name": "\u0418\u0433\u043e\u0440\u044c \u0411\u0435\u0440\u0435\u0437\u043e\u0432\u0441\u043a\u0438\u0439", "birthday": "02/03/1908"}	0.608	0.604
 {"name": "Oleg Konovalov", "birthday": "02/01/1980"}	{"name": "\u041e\u043b\u0435\u0433 \u041a\u043e\u043d\u043e\u0432\u0430\u043b\u043e\u0432", "birthday": "05/04/1980"}	0.573	0.687
 {"name": "Ben Lorica", "birthday": "02/01/1980"}	{"name": "\u7f57\u745e\u5361", "birthday": "02/01/1980"}	0.794	0.786
 {"name": "Sam Smith", "birthday": "02/01/1980"}	{"name": "Tom Jones", "birthday": "02/01/1976"}	0.808	0.767
 {"name": "Sam Smith", "birthday": "02/01/1980"}	{"name": "Ron Smith", "birthday": "02/01/2001"}	0.886	0.878
 {"name": "Sam Smith", "birthday": "02/01/1980"}	{"name": "Samuel Smith", "birthday": "02/01/1801"}	0.911	0.859
diff --git a/name_distances.tsv b/name_distances.tsv
	import json
	from typing import List

	import numpy as np
	from scipy.spatial.distance import cosine
	from sentence_transformers import SentenceTransformer


	def compare_records_to_csv(record_pairs: List[List[str]], models):
	    """Print tab-separated cosine similarities for each record pair under each model.

	    For every pair, both records are encoded by every model in ``models``
	    (in the mapping's iteration order) and one line is written to stdout:
	    the two records followed by one similarity column per model, each
	    formatted to three decimal places.

	    Parameters
	    ----------
	    record_pairs : List[List[str]]
	        Pairs of records to compare
	    models : Dict[str, SentenceTransformer]
	        Named sentence encoders. Each value is called as ``model.encode(record)``
	        and the two results are passed to ``scipy.spatial.distance.cosine``.
	        Any number of models is accepted; one score column is printed per model.
	    """
	    # Only the encoder objects are needed here; the names are not printed.
	    encoders = list(models.values())

	    for name_one, name_two in record_pairs:
	        scores = []
	        for model in encoders:
	            embedding_one = model.encode(name_one)
	            embedding_two = model.encode(name_two)
	            # scipy's ``cosine`` is a distance; 1 - distance is the similarity.
	            scores.append(1.0 - cosine(embedding_one, embedding_two))

	        # One column per model instead of hard-coded scores[0]/scores[1]:
	        # a single model no longer raises IndexError and extra models are
	        # no longer silently dropped. Output for two models is unchanged.
	        columns = [f"{name_one}", f"{name_two}"]
	        columns.extend(f"{score:,.3f}" for score in scores)
	        print("\t".join(columns))


	# Sentence encoders keyed by the same identifier that is passed to
	# SentenceTransformer. Insertion order is kept: paraphrase-multilingual
	# first, all-MiniLM second.
	models = {
	    model_name: SentenceTransformer(model_name)
	    for model_name in (
	        "paraphrase-multilingual-MiniLM-L12-v2",
	        "sentence-transformers/all-MiniLM-L12-v2",
	    )
	}

	# Each row is one (first record, second record) pair of name strings that
	# the script passes to compare_records_to_csv.
	name_pairs = np.array(
	    [
	        ("Russell H Jurney", "Russell Jurney"),
	        ("Russ H. Jurney", "Russell Jurney"),
	        ("Russ H Jurney", "Russell Jurney"),
	        ("Russ Howard Jurney", "Russell H Jurney"),
	        ("Russell H. Jurney", "Russell Howard Jurney"),
	        ("Russell H Jurney", "Russell Howard Jurney"),
	        ("Alex Ratner", "Alexander Ratner"),
	        ("ʿAlī ibn Abī Ṭālib", "عَلِيّ بْن أَبِي طَالِب"),
	        ("Igor Berezovsky", "Игорь Березовский"),
	        ("Oleg Konovalov", "Олег Коновалов"),
	        ("Ben Lorica", "罗瑞卡"),
	        ("Sam Smith", "Tom Jones"),
	        ("Sam Smith", "Ron Smith"),
	        ("Sam Smith", "Samuel Smith"),
	    ]
	)

	# Pairs of JSON-serialized {"name", "birthday"} records to compare.
	# ensure_ascii=False keeps non-Latin names as real characters; the json.dumps
	# default would hand the encoders ``\uXXXX`` escape text instead of the name.
	json_pairs = np.array(
	    [
	        [
	            json.dumps({"name": name, "birthday": birthday}, ensure_ascii=False)
	            for name, birthday in record_pair
	        ]
	        for record_pair in (
	            (("Russell H Jurney", "02/01/1980"), ("Russell Jurney", "02/01/1990")),
	            (("Russ H. Jurney", "02/01/1980"), ("Russell Jurney", "02/01/1991")),
	            (("Russ H Jurney", "02/01/1980"), ("Russell Jurney", "02/02/1990")),
	            (("Russ Howard Jurney", "02/01/1980"), ("Russell H Jurney", "02/01/1990")),
	            (("Russell H. Jurney", "02/01/1980"), ("Russell Howard Jurney", "02/01/1990")),
	            (("Russell H Jurney", "02/01/1980"), ("Russell Howard Jurney", "02/01/1990")),
	            (("Alex Ratner", "02/01/1901"), ("Alexander Ratner", "02/01/1976")),
	            (("ʿAlī ibn Abī Ṭālib", "02/01/1980"), ("عَلِيّ بْن أَبِي طَالِب", "02/01/1980")),
	            (("Igor Berezovsky", "01/01/1980"), ("Игорь Березовский", "02/03/1908")),
	            (("Oleg Konovalov", "02/01/1980"), ("Олег Коновалов", "05/04/1980")),
	            (("Ben Lorica", "02/01/1980"), ("罗瑞卡", "02/01/1980")),
	            (("Sam Smith", "02/01/1980"), ("Tom Jones", "02/01/1976")),
	            (("Sam Smith", "02/01/1980"), ("Ron Smith", "02/01/2001")),
	            (("Sam Smith", "02/01/1980"), ("Samuel Smith", "02/01/1801")),
	            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/1980")),
	            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/1991")),
	            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/2011")),
	        )
	    ]
	)

	# Score columns follow the insertion order of `models`: the
	# paraphrase-multilingual model is scored first and the all-MiniLM model
	# second, so the header labels must appear in that order.
	print("Name One\tName Two\tParaphrase Cosine\tAll Cosine")
	compare_records_to_csv(name_pairs, models)

	print()

	print("JSON One\tJSON Two\tParaphrase Cosine\tAll Cosine")
	compare_records_to_csv(json_pairs, models)
	JSON One JSON Two Paraphrase Cosine All Cosine
	{"name": "Russell H Jurney", "birthday": "02/01/1980"} {"name": "Russell Jurney", "birthday": "02/01/1990"} 0.946 0.982
	{"name": "Russ H. Jurney", "birthday": "02/01/1980"} {"name": "Russell Jurney", "birthday": "02/01/1991"} 0.904 0.929
	{"name": "Russ H Jurney", "birthday": "02/01/1980"} {"name": "Russell Jurney", "birthday": "02/02/1990"} 0.896 0.932
	{"name": "Russ Howard Jurney", "birthday": "02/01/1980"} {"name": "Russell H Jurney", "birthday": "02/01/1990"} 0.929 0.926
	{"name": "Russell H. Jurney", "birthday": "02/01/1980"} {"name": "Russell Howard Jurney", "birthday": "02/01/1990"} 0.931 0.963
	{"name": "Russell H Jurney", "birthday": "02/01/1980"} {"name": "Russell Howard Jurney", "birthday": "02/01/1990"} 0.926 0.967
	{"name": "Alex Ratner", "birthday": "02/01/1901"} {"name": "Alexander Ratner", "birthday": "02/01/1976"} 0.916 0.932
	{"name": "\u02bfAl\u012b ibn Ab\u012b \u1e6c\u0101lib", "birthday": "02/01/1980"} {"name": "\u0639\u064e\u0644\u0650\u064a\u0651 \u0628\u0652\u0646 \u0623\u064e\u0628\u0650\u064a \u0637\u064e\u0627\u0644\u0650\u0628", "birthday": "02/01/1980"} 0.756 0.812
	{"name": "Igor Berezovsky", "birthday": "01/01/1980"} {"name": "\u0418\u0433\u043e\u0440\u044c \u0411\u0435\u0440\u0435\u0437\u043e\u0432\u0441\u043a\u0438\u0439", "birthday": "02/03/1908"} 0.608 0.604
	{"name": "Oleg Konovalov", "birthday": "02/01/1980"} {"name": "\u041e\u043b\u0435\u0433 \u041a\u043e\u043d\u043e\u0432\u0430\u043b\u043e\u0432", "birthday": "05/04/1980"} 0.573 0.687
	{"name": "Ben Lorica", "birthday": "02/01/1980"} {"name": "\u7f57\u745e\u5361", "birthday": "02/01/1980"} 0.794 0.786
	{"name": "Sam Smith", "birthday": "02/01/1980"} {"name": "Tom Jones", "birthday": "02/01/1976"} 0.808 0.767
	{"name": "Sam Smith", "birthday": "02/01/1980"} {"name": "Ron Smith", "birthday": "02/01/2001"} 0.886 0.878
	{"name": "Sam Smith", "birthday": "02/01/1980"} {"name": "Samuel Smith", "birthday": "02/01/1801"} 0.911 0.859
Name One	Name Two	Paraphrase Cosine	All Cosine
Russell H Jurney	Russell Jurney	0.962	0.952
Russ H. Jurney	Russell Jurney	0.830	0.854
Russ H Jurney	Russell Jurney	0.844	0.872
Russ Howard Jurney	Russell H Jurney	0.865	0.849
Russell H. Jurney	Russell Howard Jurney	0.915	0.875
Russell H Jurney	Russell Howard Jurney	0.923	0.902
Alex Ratner	Alexander Ratner	0.893	0.801
ʿAlī ibn Abī Ṭālib	عَلِيّ بْن أَبِي طَالِب	0.555	0.444
Igor Berezovsky	Игорь Березовский	0.922	0.323
Oleg Konovalov	Олег Коновалов	0.965	0.339
Ben Lorica	罗瑞卡	0.764	0.105
Sam Smith	Tom Jones	0.622	0.456
Sam Smith	Ron Smith	0.895	0.688
Sam Smith	Samuel Smith	0.882	0.807