Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active December 14, 2023 17:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjurney/35e18a749bd3064ad1c57ea4420fb01c to your computer and use it in GitHub Desktop.
Save rjurney/35e18a749bd3064ad1c57ea4420fb01c to your computer and use it in GitHub Desktop.
Excellent name similarity results between sentence encoders 'sentence-transformers/all-MiniLM-L12-v2' and 'paraphrase-multilingual-MiniLM-L12-v2'

All vs Paraphrase Mini-LM Model Comparisons

This experiment compares two sentence encoders on people's names - including names written in different character sets - using the following models: `sentence-transformers/all-MiniLM-L12-v2` and `paraphrase-multilingual-MiniLM-L12-v2`.

Notes

Compared to the bare names, the JSON records tend to compress scores together, owing to the text they share in formatting: field names, quotes and brackets. In the name pairs, you can see that name length is a source of error. The dates behave well in the JSON records.

import json
from typing import List
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
def compare_records_to_csv(record_pairs: List[List[str]], models) -> None:
    """Print tab-separated cosine similarities for record pairs under each model.

    Despite the name, the output is tab-separated text written to stdout:
    one line per pair holding both records followed by one similarity
    column per model, formatted to three decimal places.

    Parameters
    ----------
    record_pairs : List[List[str]]
        Pairs of records to compare; each item must unpack into two values
    models : Dict[str, SentenceTransformer]
        Mapping of model name to an encoder exposing ``encode(text)``.
        Score columns follow the mapping's iteration order, so any header
        printed by the caller must list the models in that same order.
    """
    # Hoisted out of the loop: the set of encoders does not change per pair.
    encoders = list(models.values())

    for name_one, name_two in record_pairs:
        # scipy's `cosine` is a distance, so 1 - distance is the similarity.
        scores = [
            1.0 - cosine(encoder.encode(name_one), encoder.encode(name_two))
            for encoder in encoders
        ]

        # Emit every score instead of indexing scores[0] / scores[1], so the
        # function works for any number of models, not exactly two.
        fields = [f"{name_one}", f"{name_two}"]
        fields.extend(f"{score:,.3f}" for score in scores)
        print("\t".join(fields))
# Encoders under comparison, keyed by their model identifier.
#
# Insertion order is significant: compare_records_to_csv prints the scores in
# this mapping's iteration order, and the header printed before each table
# labels the first score column "All Cosine" and the second "Paraphrase
# Cosine". The all-MiniLM model therefore has to come first; with the
# paraphrase model first the two columns are reported under each other's name.
models = {
    "sentence-transformers/all-MiniLM-L12-v2": SentenceTransformer(
        "sentence-transformers/all-MiniLM-L12-v2"
    ),
    "paraphrase-multilingual-MiniLM-L12-v2": SentenceTransformer(
        "paraphrase-multilingual-MiniLM-L12-v2"
    ),
}
# Bare person-name pairs to score, one (first, second) row per comparison.
# Stored as a 2-D string array so each row unpacks into the two names.
name_pairs = np.array(
    [
        # Variants of one name: nickname, middle initial, full middle name
        ("Russell H Jurney", "Russell Jurney"),
        ("Russ H. Jurney", "Russell Jurney"),
        ("Russ H Jurney", "Russell Jurney"),
        ("Russ Howard Jurney", "Russell H Jurney"),
        ("Russell H. Jurney", "Russell Howard Jurney"),
        ("Russell H Jurney", "Russell Howard Jurney"),
        ("Alex Ratner", "Alexander Ratner"),
        # Latin-script name paired with a non-Latin-script rendering
        ("ʿAlī ibn Abī Ṭālib", "عَلِيّ بْن أَبِي طَالِب"),
        ("Igor Berezovsky", "Игорь Березовский"),
        ("Oleg Konovalov", "Олег Коновалов"),
        ("Ben Lorica", "罗瑞卡"),
        # Short names sharing none, one, or a stem of their parts
        ("Sam Smith", "Tom Jones"),
        ("Sam Smith", "Ron Smith"),
        ("Sam Smith", "Samuel Smith"),
    ]
)
def _json_record(name, birthday):
    """Serialize one person record to the JSON text that gets embedded.

    ``ensure_ascii=False`` keeps non-ASCII names as their real characters.
    With the default (``True``) they are written as ``\\uXXXX`` escapes, so
    the encoder would be scoring escape sequences instead of the names.
    """
    return json.dumps({"name": name, "birthday": birthday}, ensure_ascii=False)


# JSON record pairs to score: the same kind of name pairs, each wrapped in a
# record with a birthday. Each entry below is
# ((name, birthday), (name, birthday)) and becomes one row of two JSON strings.
json_pairs = np.array(
    [
        [_json_record(*first), _json_record(*second)]
        for first, second in (
            # Variants of one name, birthdays differing by year or day
            (("Russell H Jurney", "02/01/1980"), ("Russell Jurney", "02/01/1990")),
            (("Russ H. Jurney", "02/01/1980"), ("Russell Jurney", "02/01/1991")),
            (("Russ H Jurney", "02/01/1980"), ("Russell Jurney", "02/02/1990")),
            (("Russ Howard Jurney", "02/01/1980"), ("Russell H Jurney", "02/01/1990")),
            (("Russell H. Jurney", "02/01/1980"), ("Russell Howard Jurney", "02/01/1990")),
            (("Russell H Jurney", "02/01/1980"), ("Russell Howard Jurney", "02/01/1990")),
            (("Alex Ratner", "02/01/1901"), ("Alexander Ratner", "02/01/1976")),
            # Latin-script name paired with a non-Latin-script rendering
            (("ʿAlī ibn Abī Ṭālib", "02/01/1980"), ("عَلِيّ بْن أَبِي طَالِب", "02/01/1980")),
            (("Igor Berezovsky", "01/01/1980"), ("Игорь Березовский", "02/03/1908")),
            (("Oleg Konovalov", "02/01/1980"), ("Олег Коновалов", "05/04/1980")),
            (("Ben Lorica", "02/01/1980"), ("罗瑞卡", "02/01/1980")),
            # Short names sharing none, one, or a stem of their parts
            (("Sam Smith", "02/01/1980"), ("Tom Jones", "02/01/1976")),
            (("Sam Smith", "02/01/1980"), ("Ron Smith", "02/01/2001")),
            (("Sam Smith", "02/01/1980"), ("Samuel Smith", "02/01/1801")),
            # Identical name; only the birthday varies (same, +11y, +31y)
            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/1980")),
            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/1991")),
            (("Samuel Smith", "02/01/1980"), ("Samuel Smith", "02/01/2011")),
        )
    ]
)
# Emit both comparison tables on stdout: names first, then JSON records,
# separated by a single blank line. The header text is fixed, while the score
# columns come out in the iteration order of `models`.
_REPORTS = (
    ("Name One\tName Two\tAll Cosine\tParaphrase Cosine", name_pairs),
    ("JSON One\tJSON Two\tAll Cosine\tParaphrase Cosine", json_pairs),
)
for _position, (_header, _pairs) in enumerate(_REPORTS):
    if _position > 0:
        # Blank line between tables, none before the first
        print()
    print(_header)
    compare_records_to_csv(_pairs, models)
We can make this file beautiful and searchable if this error is corrected: Illegal quoting in line 2.
JSON One JSON Two All Cosine Paraphrase Cosine
{"name": "Russell H Jurney", "birthday": "02/01/1980"} {"name": "Russell Jurney", "birthday": "02/01/1990"} 0.946 0.982
{"name": "Russ H. Jurney", "birthday": "02/01/1980"} {"name": "Russell Jurney", "birthday": "02/01/1991"} 0.904 0.929
{"name": "Russ H Jurney", "birthday": "02/01/1980"} {"name": "Russell Jurney", "birthday": "02/02/1990"} 0.896 0.932
{"name": "Russ Howard Jurney", "birthday": "02/01/1980"} {"name": "Russell H Jurney", "birthday": "02/01/1990"} 0.929 0.926
{"name": "Russell H. Jurney", "birthday": "02/01/1980"} {"name": "Russell Howard Jurney", "birthday": "02/01/1990"} 0.931 0.963
{"name": "Russell H Jurney", "birthday": "02/01/1980"} {"name": "Russell Howard Jurney", "birthday": "02/01/1990"} 0.926 0.967
{"name": "Alex Ratner", "birthday": "02/01/1901"} {"name": "Alexander Ratner", "birthday": "02/01/1976"} 0.916 0.932
{"name": "\u02bfAl\u012b ibn Ab\u012b \u1e6c\u0101lib", "birthday": "02/01/1980"} {"name": "\u0639\u064e\u0644\u0650\u064a\u0651 \u0628\u0652\u0646 \u0623\u064e\u0628\u0650\u064a \u0637\u064e\u0627\u0644\u0650\u0628", "birthday": "02/01/1980"} 0.756 0.812
{"name": "Igor Berezovsky", "birthday": "01/01/1980"} {"name": "\u0418\u0433\u043e\u0440\u044c \u0411\u0435\u0440\u0435\u0437\u043e\u0432\u0441\u043a\u0438\u0439", "birthday": "02/03/1908"} 0.608 0.604
{"name": "Oleg Konovalov", "birthday": "02/01/1980"} {"name": "\u041e\u043b\u0435\u0433 \u041a\u043e\u043d\u043e\u0432\u0430\u043b\u043e\u0432", "birthday": "05/04/1980"} 0.573 0.687
{"name": "Ben Lorica", "birthday": "02/01/1980"} {"name": "\u7f57\u745e\u5361", "birthday": "02/01/1980"} 0.794 0.786
{"name": "Sam Smith", "birthday": "02/01/1980"} {"name": "Tom Jones", "birthday": "02/01/1976"} 0.808 0.767
{"name": "Sam Smith", "birthday": "02/01/1980"} {"name": "Ron Smith", "birthday": "02/01/2001"} 0.886 0.878
{"name": "Sam Smith", "birthday": "02/01/1980"} {"name": "Samuel Smith", "birthday": "02/01/1801"} 0.911 0.859
Name One Name Two All Cosine Paraphrase Cosine
Russell H Jurney Russell Jurney 0.962 0.952
Russ H. Jurney Russell Jurney 0.830 0.854
Russ H Jurney Russell Jurney 0.844 0.872
Russ Howard Jurney Russell H Jurney 0.865 0.849
Russell H. Jurney Russell Howard Jurney 0.915 0.875
Russell H Jurney Russell Howard Jurney 0.923 0.902
Alex Ratner Alexander Ratner 0.893 0.801
ʿAlī ibn Abī Ṭālib عَلِيّ بْن أَبِي طَالِب 0.555 0.444
Igor Berezovsky Игорь Березовский 0.922 0.323
Oleg Konovalov Олег Коновалов 0.965 0.339
Ben Lorica 罗瑞卡 0.764 0.105
Sam Smith Tom Jones 0.622 0.456
Sam Smith Ron Smith 0.895 0.688
Sam Smith Samuel Smith 0.882 0.807
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment