Embeddings …
Train a model to:
Throw away the parts after the embedding layer!
import numpy as np
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    The result is the dot product of the inputs divided by the product of
    their Euclidean norms.

    Args:
        vec1: First vector (expected to be 1-D and the same length as vec2).
        vec2: Second vector.

    Returns:
        The similarity as a built-in ``float``. If either vector has zero
        norm the division is 0/0 and the result is ``nan`` (NumPy emits a
        RuntimeWarning).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # float() turns the NumPy scalar into the plain float the signature promises.
    return float(np.dot(vec1, vec2) / norm_product)
# Compare each text's embedding with the prompt embedding and print the
# similarity rounded to two decimals.
text_pairs = zip(texts, text_embeddings)
for text, candidate_embedding in text_pairs:
    similarity = cosine_similarity(candidate_embedding, prompt_embedding)
    print(f"{text}: {round(similarity, 2)}")
This is the first document.: 0.95
This document is the second document.: 0.88
And this is the third one.: 0.8
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# Reduce the embeddings to two dimensions with t-SNE.
# perplexity=5 is a value to revisit; see the scikit-learn documentation.
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
embeddings_2d = tsne.fit_transform(np.array(embeddings))

# Draw one red marker per word and write the word next to it.
plt.figure(figsize=(9, 7))
for idx, label in enumerate(words):
    px, py = embeddings_2d[idx]
    plt.scatter(px, py, marker='o', color='red')
    plt.text(px, py, label, fontsize=9)

# Label the axes, switch the grid on, and remove the tick marks.
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.grid(True)
plt.xticks([])
plt.yticks([])
plt.show()
# do the clustering
import numpy as np
from sklearn.cluster import KMeans
# Number of clusters to partition the embeddings into.
n_clusters = 5

# Stack the embeddings into a single array for scikit-learn.
embedding_matrix = np.array(embeddings)

# Build the k-means estimator; the fixed random_state gives the same output
# on every run.
kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=2)
kmeans.fit(embedding_matrix)

# Cluster assignment produced by the fitted model.
cluster_labels = kmeans.labels_
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# Reduce the embeddings to two dimensions with t-SNE.
# perplexity=5 is a value to revisit; see the scikit-learn documentation.
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
embeddings_2d = tsne.fit_transform(np.array(embeddings))

# Sample one viridis color per cluster, evenly spaced over the colormap.
colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

# Draw each word as a marker colored by its cluster and write the word next to it.
plt.figure(figsize=(9, 7))
for idx, label in enumerate(words):
    px, py = embeddings_2d[idx]
    point_color = colors[cluster_labels[idx]]
    plt.scatter(px, py, marker='o', color=point_color)
    plt.text(px, py, label, fontsize=9)

# Label the axes, switch the grid on, and remove the tick marks.
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.grid(True)
plt.xticks([])
plt.yticks([])
plt.show()
Seminar: LLM, WiSe 2024/25