import numpy as np
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine similarity between two 1-D numeric vectors.

    Accepts anything ``np.dot``/``np.linalg.norm`` handle (arrays or
    plain lists of floats). The result lies in [-1, 1]; note that a
    zero-norm input leads to a division by zero (NaN with a warning).
    """
    # Coerce np.float64 to a plain float to match the annotated return type.
    return float(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))


# Exercise: Embedding similarity
Task: Use the OpenAI embeddings API to compute the similarity between two given words or phrases.
Instructions:
- Choose two words or phrases with similar or related meanings.
- Use the OpenAI embeddings API to obtain embeddings for both words or phrases.
- Calculate the cosine similarity between the embeddings to measure their similarity.
- Print the similarity score and interpret the results.
Show solution
import os
from llm_utils.client import get_openai_client, OpenAIModels
# Embedding model used for every API call below.
MODEL = OpenAIModels.EMBED.value

# Build the OpenAI client; the config file location comes from the
# CONFIG_PATH environment variable (None if unset).
client = get_openai_client(
    config_path=os.environ.get("CONFIG_PATH"),
    model=MODEL,
)
# Create embeddings for two related words via the OpenAI API.
word_1 = "king"
word_2 = "queen"

response_1 = client.embeddings.create(input=word_1, model=MODEL)
embedding_1 = response_1.data[0].embedding
response_2 = client.embeddings.create(input=word_2, model=MODEL)
embedding_2 = response_2.data[0].embedding

# Calculate the cosine similarity between the two embeddings.
dist_12 = cosine_similarity(embedding_1, embedding_2)
print(f"Cosine similarity between {word_1} and {word_2}: {round(dist_12, 3)}.")
# Output: Cosine similarity between king and queen: 0.915.

# A semantically more distant word gives a lower similarity score.
word_3 = "pawn"
embedding_3 = client.embeddings.create(input=word_3, model=MODEL).data[0].embedding
dist_13 = cosine_similarity(embedding_1, embedding_3)
print(f"Cosine similarity between {word_1} and {word_3}: {round(dist_13, 3)}.")
# Output: Cosine similarity between king and pawn: 0.829.
Task: Use the OpenAI embeddings API and simple embedding arithmetics to introduce more context to word similarities.
Instructions:
- Create embeddings for the following three words:
`python`, `snake`, `javascript` using the OpenAI API.
- Calculate the cosine similarity between each pair.
- Create another embedding for the word `reptile` and add it to `python`. You can use `numpy` for this.
- Calculate the cosine similarity between `python` and this sum. What do you notice?
Show solution
# Embed all four words in a single batched API call; the response
# preserves the input order.
words = ["python", "snake", "javascript", "reptile"]
response = client.embeddings.create(input=words, model=MODEL)
embeddings = [emb.embedding for emb in response.data]

print(f"Similarity between '{words[0]}' and '{words[1]}': {round(cosine_similarity(embeddings[0], embeddings[1]), 3)}.")
print(f"Similarity between '{words[0]}' and '{words[2]}': {round(cosine_similarity(embeddings[0], embeddings[2]), 3)}.")
# Adding the 'reptile' embedding to 'python' shifts it towards 'snake',
# raising the similarity above that of plain 'python' vs 'snake'.
print(f"Similarity between '{words[0]} + {words[3]}' and '{words[1]}': {round(cosine_similarity(np.array(embeddings[0]) + np.array(embeddings[3]), embeddings[1]), 3)}.")
# Output:
# Similarity between 'python' and 'snake': 0.841.
# Similarity between 'python' and 'javascript': 0.85.
# Similarity between 'python + reptile' and 'snake': 0.894.