Skip to content

Instantly share code, notes, and snippets.

@gwpl
Last active April 28, 2024 23:39
Show Gist options
  • Save gwpl/4f6c2aa1401febbf0873859495496315 to your computer and use it in GitHub Desktop.
Save gwpl/4f6c2aa1401febbf0873859495496315 to your computer and use it in GitHub Desktop.
Vector Embeddings Algebra Tests

Make python venv

python -m venv venv_foo

Now you can activate it:

. venv_foo/bin/activate

Once finished, deactivate with:

deactivate

Install

Inside activated virtual environment:

pip3 install -U jupyter ipykernel ollama openai ipython grep-ast qdrant-client scipy numpy voyageai
# If you want to use jupyter notebook:
python3 -m ipykernel install --user --name=myenv_ipython

In Jupyter Lab, you can change a notebook's kernel via Kernel -> Change Kernel -> myenv_ipython

Setup API Keys

  • to use OpenAI: export OPENAI_API_KEY='...'
  • to use Voyage AI embeddings (the provider recommended by Anthropic): export VOYAGE_API_KEY='...'
  • to use ollama...

You need to install and run ollama locally, e.g. with Docker: https://github.com/ollama/ollama/tree/main/docs , and pull models (e.g. ollama pull mxbai-embed-large)

References

# https://docs.anthropic.com/claude/docs/embeddings
import os
import voyageai
import numpy as np
from scipy.spatial import distance
from typing import List
# Words whose vectors we will later combine with "+" and "-".
descriptions = ["Queen", "Woman", "Man", "King", "Knight", "Carpenter", "Baker", "girl", "boy"]
texts = descriptions
# The client reads VOYAGE_API_KEY from the environment.
vo = voyageai.Client()
# Embed the whole vocabulary in a single batched API call.
computed_embeddings = vo.embed(texts, model="voyage-2", input_type="document")
# word -> embedding vector, for O(1) lookup by `embedding()` below.
cached_embeddings = dict(zip(texts, computed_embeddings.embeddings))
def embedding(key):
    """Return the cached embedding for *key* as a NumPy array.

    Converting to np.ndarray lets callers use `+` / `-` vector arithmetic.
    """
    return np.asarray(cached_embeddings[key])
def cosine_similarity(embedding1, embedding2):
    """Cosine similarity in [-1, 1]; scipy's `cosine` is the *distance* (1 - sim)."""
    return 1.0 - distance.cosine(embedding1, embedding2)
def sim(e1, e2):
    """Short alias for cosine_similarity, for interactive use."""
    return cosine_similarity(e1, e2)
def similarities_to_embeddings(query_embedding: List[float], embeddings: List[List[float]], distance_metric="cosine") -> List[float]:
    """Return the similarity of *query_embedding* to each vector in *embeddings*.

    Fix: the original silently ignored *distance_metric* and always computed
    cosine; now any value other than "cosine" raises ValueError instead of
    silently returning the wrong metric. The default is unchanged.
    """
    if distance_metric != "cosine":
        raise ValueError(f"unsupported distance_metric: {distance_metric!r}")
    return [cosine_similarity(query_embedding, e) for e in embeddings]
# Fetch one vector per word; np arrays support the +/- arithmetic below.
queen = embedding("Queen")
woman = embedding("Woman")
man = embedding("Man")
king = embedding("King")
knight = embedding("Knight")
carpenter = embedding("Carpenter")
baker = embedding("Baker")
girl = embedding("girl")
boy = embedding("boy")
# Lower-case labels used when reporting the similarity tables.
descriptions = ["queen", "woman", "man", "king", "knight", "carpenter", "baker", "girl", "boy"]
# All candidate vectors, in the same order as `descriptions`.
all_vectors = [queen, woman, man, king, knight, carpenter, baker, girl, boy]
# Experiment 1: does "queen - woman + man" land near "king"?
result = queen - woman + man
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-woman+man\" and ... = {}".format(dict(zip(descriptions, similarities))))
# Experiment 2: shift along the age axis instead of the gender axis.
result = queen - girl + boy
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-girl+boy\" and ... = {}".format(dict(zip(descriptions, similarities))))
# Experiment 3: shift both gender and age at once.
result = queen - woman - girl + man + boy
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-woman-girl+man+boy\" and ... = {}".format(dict(zip(descriptions, similarities))))
# https://ollama.com/blog/embedding-models
import os
import ollama
import numpy as np
from scipy.spatial import distance
from typing import List
def embedding(text: str) -> list[float]:
    """Embed *text* with a locally running Ollama model.

    Returned as a NumPy array so callers can use `+` / `-` arithmetic.
    Other models tried: 'nomic-embed-text', 'snowflake-arctic-embed:335m',
    'all-minilm'.
    """
    response = ollama.embeddings(model='mxbai-embed-large', prompt=text)
    return np.asarray(response['embedding'])
def cosine_similarity(embedding1, embedding2):
    """Cosine similarity in [-1, 1]; scipy's `cosine` is the *distance* (1 - sim)."""
    return 1.0 - distance.cosine(embedding1, embedding2)
def sim(e1, e2):
    """Short alias for cosine_similarity, for interactive use."""
    return cosine_similarity(e1, e2)
def similarities_to_embeddings(query_embedding: List[float], embeddings: List[List[float]], distance_metric="cosine") -> List[float]:
    """Return the similarity of *query_embedding* to each vector in *embeddings*.

    Fix: the original silently ignored *distance_metric* and always computed
    cosine; now any value other than "cosine" raises ValueError instead of
    silently returning the wrong metric. The default is unchanged.
    """
    if distance_metric != "cosine":
        raise ValueError(f"unsupported distance_metric: {distance_metric!r}")
    return [cosine_similarity(query_embedding, e) for e in embeddings]
# Fetch one vector per word; np arrays support the +/- arithmetic below.
queen = embedding("Queen")
woman = embedding("Woman")
man = embedding("Man")
king = embedding("King")
knight = embedding("Knight")
carpenter = embedding("Carpenter")
baker = embedding("Baker")
girl = embedding("girl")
boy = embedding("boy")
# Lower-case labels used when reporting the similarity tables.
descriptions = ["queen", "woman", "man", "king", "knight", "carpenter", "baker", "girl", "boy"]
# All candidate vectors, in the same order as `descriptions`.
all_vectors = [queen, woman, man, king, knight, carpenter, baker, girl, boy]
# Experiment 1: does "queen - woman + man" land near "king"?
result = queen - woman + man
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-woman+man\" and ... = {}".format(dict(zip(descriptions, similarities))))
# Experiment 2: shift along the age axis instead of the gender axis.
result = queen - girl + boy
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-girl+boy\" and ... = {}".format(dict(zip(descriptions, similarities))))
# Experiment 3: shift both gender and age at once.
result = queen - woman - girl + man + boy
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-woman-girl+man+boy\" and ... = {}".format(dict(zip(descriptions, similarities))))
# https://www.phind.com/search?cache=tyogewfb0jzrit41eq6dufgr
# https://cookbook.openai.com/examples/question_answering_using_embeddings
import os
import numpy as np
from openai import OpenAI
from scipy.spatial import distance
from typing import List
# OpenAI client. The SDK would read OPENAI_API_KEY from the environment by
# itself; passing it explicitly just makes the dependency visible.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
def embedding(text: str) -> list[float]:
    """Embed *text* with OpenAI's text-embedding-ada-002 model.

    Returned as a NumPy array so callers can use `+` / `-` arithmetic.
    """
    response = client.embeddings.create(model='text-embedding-ada-002', input=text)
    return np.asarray(response.data[0].embedding)
def cosine_similarity(embedding1, embedding2):
    """Cosine similarity in [-1, 1]; scipy's `cosine` is the *distance* (1 - sim)."""
    return 1.0 - distance.cosine(embedding1, embedding2)
def sim(e1, e2):
    """Short alias for cosine_similarity, for interactive use."""
    return cosine_similarity(e1, e2)
def similarities_to_embeddings(query_embedding: List[float], embeddings: List[List[float]], distance_metric="cosine") -> List[float]:
    """Return the similarity of *query_embedding* to each vector in *embeddings*.

    Fix: the original silently ignored *distance_metric* and always computed
    cosine; now any value other than "cosine" raises ValueError instead of
    silently returning the wrong metric. The default is unchanged.
    """
    if distance_metric != "cosine":
        raise ValueError(f"unsupported distance_metric: {distance_metric!r}")
    return [cosine_similarity(query_embedding, e) for e in embeddings]
# Fetch one vector per word; np arrays support the +/- arithmetic below.
queen = embedding("Queen")
woman = embedding("Woman")
man = embedding("Man")
king = embedding("King")
knight = embedding("Knight")
carpenter = embedding("Carpenter")
baker = embedding("Baker")
girl = embedding("girl")
boy = embedding("boy")
# Lower-case labels used when reporting the similarity tables.
descriptions = ["queen", "woman", "man", "king", "knight", "carpenter", "baker", "girl", "boy"]
# All candidate vectors, in the same order as `descriptions`.
all_vectors = [queen, woman, man, king, knight, carpenter, baker, girl, boy]
# Experiment 1: does "queen - woman + man" land near "king"?
result = queen - woman + man
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-woman+man\" and ... = {}".format(dict(zip(descriptions, similarities))))
# Experiment 2: shift along the age axis instead of the gender axis.
result = queen - girl + boy
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-girl+boy\" and ... = {}".format(dict(zip(descriptions, similarities))))
# Experiment 3: shift both gender and age at once.
result = queen - woman - girl + man + boy
similarities = similarities_to_embeddings(result, all_vectors)
print("similarities between \"queen-woman-girl+man+boy\" and ... = {}".format(dict(zip(descriptions, similarities))))

Example outputs

Ollama with mxbai-embed-large

similarities between "queen-woman+man" and ... = {'queen': 0.7664688552859836, 'woman': 0.16067344014964902, 'man': 0.643216410618745, 'king': 0.5296803458540953, 'knight': 0.36225247833752694, 'carpenter': 0.7134908064255635, 'baker': 0.723708193109565, 'girl': 0.734322206946502, 'boy': 0.7525008550380998}
similarities between "queen-girl+boy" and ... = {'queen': 0.9750610709453384, 'woman': 0.5392273263146471, 'man': 0.5579780738003502, 'king': 0.5312961528030734, 'knight': 0.5244125308085369, 'carpenter': 0.8835927517707559, 'baker': 0.9011729720423287, 'girl': 0.9227513575082695, 'boy': 0.9679081680297743}
similarities between "queen-woman-girl+man+boy" and ... = {'queen': 0.7059391012944155, 'woman': 0.11622645526937725, 'man': 0.6346237227600416, 'king': 0.5228061118283761, 'knight': 0.3225914898607214, 'carpenter': 0.6156882192910788, 'baker': 0.6321456800653157, 'girl': 0.6540596727195828, 'boy': 0.7065316024823556}

OpenAI with text-embedding-ada-002

similarities between "queen-woman+man" and ... = {'queen': 0.9090356922999644, 'woman': 0.7383125634285356, 'man': 0.8727999593079602, 'king': 0.874981918416208, 'knight': 0.8353931464007507, 'carpenter': 0.7383758305625431, 'baker': 0.7273738367212959, 'girl': 0.789644753413831, 'boy': 0.778626306263984}
similarities between "queen-girl+boy" and ... = {'queen': 0.9204091994580554, 'woman': 0.7734784225169276, 'man': 0.7604205043362579, 'king': 0.8756058372578365, 'knight': 0.8520403395363502, 'carpenter': 0.760029945608706, 'baker': 0.7739899710344146, 'girl': 0.7449389440765709, 'boy': 0.857317983566755}
similarities between "queen-woman-girl+man+boy" and ... = {'queen': 0.8103023927350899, 'woman': 0.6334801323290904, 'man': 0.7941144294821149, 'king': 0.8171057447852228, 'knight': 0.78420593875044, 'carpenter': 0.6934188951197728, 'baker': 0.6921505787604254, 'girl': 0.6679992193937505, 'boy': 0.8067563497811716}

Voyage AI with voyage-2 (the embedding provider recommended by Anthropic)

similarities between "queen-woman+man" and ... = {'queen': 0.9620250632469288, 'woman': 0.8676377281008143, 'man': 0.935170276655766, 'king': 0.943568904457732, 'knight': 0.9066659504355087, 'carpenter': 0.8706774494101556, 'baker': 0.8980991483750144, 'girl': 0.8739373822391289, 'boy': 0.893270024278836}
similarities between "queen-girl+boy" and ... = {'queen': 0.9685843012348166, 'woman': 0.8957173046404923, 'man': 0.9241750945328031, 'king': 0.9538051100225776, 'knight': 0.9144481997105646, 'carpenter': 0.8832440810460184, 'baker': 0.9075603571749681, 'girl': 0.8896548358203663, 'boy': 0.9344322624296575}
similarities between "queen-woman-girl+man+boy" and ... = {'queen': 0.9010516700461723, 'woman': 0.8047944044077067, 'man': 0.90482593307681, 'king': 0.9113561126876575, 'knight': 0.8663787097895922, 'carpenter': 0.8346619494356733, 'baker': 0.8580583073300947, 'girl': 0.8045095605128337, 'boy': 0.8821508337744733}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment