from nltk.tokenize import wordpunct_tokenize
from string import punctuation
from typing import List
from nltk.corpus import stopwords
# python -m nltk.downloader stopwords -> run this in your console once to get the stopwords
def preprocess_text(text: str) -> List[str]:
    """Lowercase, tokenize, and clean a raw text string.

    Returns the tokens of ``text`` with punctuation tokens and English
    stopwords removed.
    """
    # Tokenize on alphabetic/non-alphabetic boundaries.
    tokens = wordpunct_tokenize(text.lower())
    # Remove punctuation. NOTE(review): `punctuation` is a string, so only
    # single-character tokens match; multi-char tokens like "!!" survive.
    tokens = [t for t in tokens if t not in punctuation]
    # Remove stopwords; a set gives O(1) membership tests instead of
    # scanning the stopword list once per token.
    stop_words = set(stopwords.words("english"))
    return [t for t in tokens if t not in stop_words]
Exercise: TF-IDF
Task: Extend the code for the bag of words to TF-IDF (Term Frequency-Inverse Document Frequency) vectors for a given set of documents. TF-IDF is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents. This measure helps in identifying words that are unique and informative to a particular document while downweighting common words that appear across many documents.
TF-IDF consists of two main components:
Term Frequency (TF): This component measures how frequently a term occurs in a document. It is calculated as the ratio of the count of a term in a document to the total number of terms in the document. TF is higher for words that occur more frequently within a document.
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
Inverse Document Frequency (IDF): This component measures the rarity of a term across the entire corpus of documents. It is calculated as the logarithm of the ratio of one plus the total number of documents to one plus the number of documents containing the term (the added ones smooth the ratio and avoid division by zero). IDF is higher for terms that appear in only a few documents and lower for terms that are common across the corpus.
IDF(t) = log((1 + Total number of documents) / (1 + Number of documents containing term t))
The TF-IDF score for a term in a document is obtained by multiplying its TF and IDF scores. This score reflects the importance of the term in the context of the document and the entire corpus.
Instructions:
- Implement functions `calculate_tf` and `calculate_idf` to calculate Term Frequency (TF) and Inverse Document Frequency (IDF) respectively.
- Write a `create_tf_idf` function to create TF-IDF vectors for a given set of documents. This function should count the frequency of each word in the corpus, calculate TF and IDF, and compute TF-IDF vectors for each document.
Show solution
from collections import Counter
import math
def calculate_tf(word_counts, total_words):
    """Calculate Term Frequency (TF) for one document.

    Args:
        word_counts: mapping of word -> number of occurrences in the document.
        total_words: total number of tokens in the document (must be > 0).

    Returns:
        dict mapping each word to its frequency ratio,
        TF(t) = count(t) / total_words.
    """
    return {word: count / total_words for word, count in word_counts.items()}
def calculate_idf(word_counts, num_documents):
    """Calculate smoothed Inverse Document Frequency (IDF).

    Args:
        word_counts: mapping of word -> number of documents containing it.
        num_documents: total number of documents in the corpus.

    Returns:
        dict mapping each word to IDF(t) = log((1 + N) / (1 + df(t))).
        The +1 smoothing keeps the ratio finite and non-negative even for
        terms present in every document.
    """
    return {
        word: math.log((1 + num_documents) / (1 + count))
        for word, count in word_counts.items()
    }
def create_tf_idf(texts):
    """Build TF-IDF vectors for a collection of raw text documents.

    Args:
        texts: list of raw document strings.

    Returns:
        (vocabulary, tf_idf_vectors): the sorted corpus vocabulary, and one
        list per document of TF-IDF scores (rounded to 2 decimals) aligned
        with that vocabulary order.
    """
    # Preprocess every document exactly once and reuse the token lists,
    # instead of re-tokenizing in each pass.
    tokenized_docs = [preprocess_text(text) for text in texts]

    # Corpus statistics: total term counts (for the vocabulary) and
    # document frequencies (for IDF).
    word_counts = Counter()
    doc_freqs = Counter()  # number of documents containing each word
    for words in tokenized_docs:
        word_counts.update(words)
        # set(words): count each word at most once per document, as the
        # IDF definition requires (documents containing the term), not
        # total occurrences across the corpus.
        doc_freqs.update(set(words))

    # Create sorted vocabulary
    vocabulary = sorted(word_counts.keys())

    # IDF depends only on the corpus, so compute it once rather than
    # inside the per-document loop.
    num_documents = len(texts)
    idf = calculate_idf(doc_freqs, num_documents)

    # Calculate TF-IDF for each document
    tf_idf_vectors = []
    for words in tokenized_docs:
        tf = calculate_tf(Counter(words), len(words))
        # One TF-IDF score per vocabulary word, in vocabulary order;
        # words absent from this document contribute 0.
        vector = [round(tf.get(word, 0) * idf[word], 2) for word in vocabulary]
        tf_idf_vectors.append(vector)
    return vocabulary, tf_idf_vectors
# Example texts
texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create TF-IDF vectors
vocabulary, tf_idf_vectors = create_tf_idf(texts)

# Print vocabulary
print("Vocabulary:")
print(vocabulary)

# Print TF-IDF vectors, one row per document, aligned with the vocabulary.
print("\nTF-IDF Vectors:")
for i, tf_idf_vector in enumerate(tf_idf_vectors):
    print(f"Document {i + 1}: {tf_idf_vector}")
Vocabulary:
['document', 'first', 'one', 'second', 'third']
TF-IDF Vectors:
Document 1: [0.0, 0.26, 0.0, 0.0, 0.0]
Document 2: [0.0, 0.0, 0.0, 0.31, 0.0]
Document 3: [0.0, 0.0, 0.46, 0.0, 0.46]
Document 4: [0.0, 0.26, 0.0, 0.0, 0.0]