# prerequisites
import os
from llm_utils.client import get_openai_client, OpenAIModels

MODEL = OpenAIModels.EMBED.value

# get the OpenAI client
client = get_openai_client(
    model=MODEL,
    config_path=os.environ.get("CONFIG_PATH")
)
Visualization & clustering of embeddings
Visualization of embeddings
Let’s go a little further down the road of similarities between words or texts in general. Wouldn’t it be great if we could visualize a text as a point in space and see how other texts relate to it? Embeddings allow us to do exactly that, with one caveat: they have far too many dimensions to plot directly (usually a few thousand). Luckily, tools such as principal component analysis (PCA) or t-SNE can reduce the dimension of the vectors (for example, to two dimensions) while preserving most of the relations between them. Let’s see how this works.
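To make the dimensionality problem concrete, here is a minimal sketch that embeds a single word and checks the length of the returned vector. It reuses the client and MODEL from the prerequisites; the exact vector length depends on the embedding model you configured.

# check how many dimensions a single embedding has
probe = client.embeddings.create(input=["king"], model=MODEL)
vector = probe.data[0].embedding
print(len(vector))  # typically on the order of a few thousand dimensions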
# Define a list of words to visualize
words = ["king", "queen", "man", "woman", "apple", "banana", "grapes", "cat", "dog", "happy", "sad"]

# Get embeddings for the words
response = client.embeddings.create(
    input=words,
    model=MODEL
)
embeddings = [emb.embedding for emb in response.data]
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Apply t-SNE dimensionality reduction
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=5  # see documentation to set this correctly
)
embeddings_2d = tsne.fit_transform(np.array(embeddings))

# Plot the embeddings in a two-dimensional scatter plot
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = embeddings_2d[i]
    plt.scatter(x, y, marker='o', color='red')
    plt.text(x, y, word, fontsize=9)

plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.grid(True)
plt.xticks([])
plt.yticks([])
plt.show()
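The paragraph above also mentions principal component analysis. For comparison, here is a minimal sketch of the same projection done with scikit-learn’s PCA instead of t-SNE; it assumes the words and embeddings lists from above and is only meant to illustrate the alternative.

# Alternative: project the embeddings with PCA instead of t-SNE
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(np.array(embeddings))

plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = embeddings_pca[i]
    plt.scatter(x, y, marker='o', color='red')
    plt.text(x, y, word, fontsize=9)
plt.xlabel("PCA component 1")
plt.ylabel("PCA component 2")
plt.show()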
Clustering of embeddings
The great thing is that we can already see some clusters forming. We can use an algorithm such as KMeans to find them explicitly. In this case we obviously have five clusters, so let’s try to identify them.
# do the clustering
import numpy as np
from sklearn.cluster import KMeans

n_clusters = 5

# define the model
kmeans = KMeans(
    n_clusters=n_clusters,
    n_init="auto",
    random_state=2  # do this to get the same output
)

# fit the model to the data
kmeans.fit(np.array(embeddings))

# get the cluster labels
cluster_labels = kmeans.labels_
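Before visualizing, a quick way to sanity-check the result is to print which words ended up in which cluster. This short sketch only uses the words, n_clusters, and cluster_labels defined above.

# print the members of each cluster
for cluster in range(n_clusters):
    members = [word for word, label in zip(words, cluster_labels) if label == cluster]
    print(f"Cluster {cluster}: {members}")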
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Apply t-SNE dimensionality reduction
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=5  # see documentation to set this correctly
)
embeddings_2d = tsne.fit_transform(np.array(embeddings))

# Define a color map for clusters
colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

# Plot the embeddings in a two-dimensional scatter plot
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = embeddings_2d[i]
    cluster_label = cluster_labels[i]
    color = colors[cluster_label]
    plt.scatter(x, y, marker='o', color=color)
    plt.text(x, y, word, fontsize=9)

plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.grid(True)
plt.xticks([])
plt.yticks([])
plt.show()
That, again, is great news. Embeddings allow us to find clusters in texts based on their semantics. This helps in many applications where documents need to be analyzed without a human reading every one of them. Maybe you can use it in your project?
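As a small sketch of what that could look like, the same pipeline works unchanged on longer texts: embed a handful of sentences, cluster the vectors, and read off the groups. The example sentences below are made up purely for illustration.

# the same idea applied to short texts instead of single words
texts = [
    "The stock market rallied after the earnings report.",
    "Investors reacted positively to the quarterly results.",
    "The recipe calls for two cups of flour and one egg.",
    "Knead the dough and let it rest for an hour.",
]
text_embeddings = [
    emb.embedding
    for emb in client.embeddings.create(input=texts, model=MODEL).data
]

# cluster the text embeddings into two groups and print the assignments
text_clusters = KMeans(n_clusters=2, n_init="auto", random_state=2).fit_predict(
    np.array(text_embeddings)
)
for text, label in zip(texts, text_clusters):
    print(label, text)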