Simple statistical text analysis

Term frequency: Token counting

from nltk.tokenize import wordpunct_tokenize
from string import punctuation
from collections import Counter
from typing import List

from nltk.corpus import stopwords
# run `python -m nltk.downloader stopwords` once in your console to download the stopword list


# load a text from file
text = ""
with open("../../assets/chapter1.txt", "r") as file:
    for line in file:
        # add a space so words at line breaks are not glued together
        text += line.strip() + " "


def preprocess_text(text: str) -> List[str]:
    # tokenize text
    tokens = wordpunct_tokenize(text.lower())

    # remove punctuation
    tokens = [t for t in tokens if t not in punctuation]

    # remove stopwords
    stop_words = set(stopwords.words("english"))  # set for fast membership checks
    tokens = [t for t in tokens if t not in stop_words]

    return tokens

# count the most frequent words
tokens = preprocess_text(text=text)

for word, count in Counter(tokens).most_common(15):
    print(f"{word}: {count}")


one: 35
winston: 32
face: 28
even: 24
--: 24
big: 22
could: 19
party: 18
would: 18
moment: 18
like: 17
brother: 15
goldstein: 15
telescreen: 14
seemed: 14
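
Note that "--" survives preprocessing: the check t not in punctuation only reliably removes single punctuation characters, so multi-character punctuation tokens slip through. A minimal sketch of a stricter variant that keeps only alphabetic tokens (the function name preprocess_text_alpha is illustrative, not part of the original example):

def preprocess_text_alpha(text: str) -> List[str]:
    # same pipeline as preprocess_text, but keep only purely alphabetic
    # tokens, which also drops multi-character punctuation such as "--"
    return [t for t in preprocess_text(text) if t.isalpha()]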

Bag of Words: Creating a vocabulary

from collections import Counter


def create_bag_of_words(texts):
    # Count the frequency of each word in the corpus
    word_counts = Counter()
    
    for text in texts:
        # Preprocess the text
        words = preprocess_text(text)
        
        # Update word counts
        word_counts.update(words)
    
    # Create the vocabulary by sorting the words alphabetically
    vocabulary = [word for word, _ in sorted(word_counts.items())]
    
    # Create BoW vectors for each document
    bow_vectors = []
    for text in texts:
        # Preprocess the text
        words = preprocess_text(text)
        
        # Create a Counter object to count word frequencies
        bow_vector = Counter(words)
        
        # Fill in missing words with zero counts
        # (a Counter already returns 0 for missing keys; this loop just makes it explicit)
        for word in vocabulary:
            if word not in bow_vector:
                bow_vector[word] = 0

        # Order the counts according to the vocabulary
        sorted_bow_vector = [bow_vector[word] for word in vocabulary]
        
        # Append the BoW vector to the list
        bow_vectors.append(sorted_bow_vector)
    
    return vocabulary, bow_vectors

# Example texts
texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create Bag of Words
vocabulary, bow_vectors = create_bag_of_words(texts)

# Print vocabulary
print("Vocabulary:")
print(vocabulary)

# Print BoW vectors
print("\nBag of Words Vectors:")
for i, bow_vector in enumerate(bow_vectors):
    print(f"Document {i + 1}: {bow_vector}")


Vocabulary:
['document', 'first', 'one', 'second', 'third']

Bag of Words Vectors:
Document 1: [1, 1, 0, 0, 0]
Document 2: [2, 0, 0, 1, 0]
Document 3: [0, 0, 1, 0, 1]
Document 4: [1, 1, 0, 0, 0]

Statistical similarity of texts

import numpy as np

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    # cosine similarity: dot product of the vectors divided by the product of their norms
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# use the last document ("Is this the first document?") as the query
query = bow_vectors[3]

similarities = []
for i, bv in enumerate(bow_vectors):
    similarity = cosine_similarity(
            vec1=query, 
            vec2=bv
        )
    similarities.append(
        (texts[i], round(similarity, 2))
    )

similarities
[('This is the first document.', 1.0),
 ('This document is the second document.', 0.63),
 ('And this is the third one.', 0.0),
 ('Is this the first document?', 1.0)]

Limitations of Term Frequency and Bag of Words

Vocabulary

  • Grows with corpus size
  • Leads to computational inefficiencies and increased memory requirements
  • Fixed vocabulary after training makes handling out-of-vocabulary words difficult (see the sketch below)
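
A minimal sketch of the out-of-vocabulary problem, reusing preprocess_text and the vocabulary built from the four example sentences above (the helper vectorize and the query sentence are illustrative, not part of the original example):

# vectorize is an illustrative helper: it projects new text onto the fixed vocabulary
def vectorize(text: str) -> List[int]:
    counts = Counter(preprocess_text(text))
    # words outside the vocabulary ("fourth", "report") are silently dropped
    return [counts[word] for word in vocabulary]

print(vectorize("The fourth report is a document."))
# -> [1, 0, 0, 0, 0]  (only "document" is counted)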

Context & structure

  • Treats each word independently
  • Fails to capture sequential and syntactic relationships (see the sketch after this list)
  • Limits understanding of sarcasm, irony, and metaphors
  • Lack of structural awareness (sentences with similar word distributions but differing meanings or intents)
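
A minimal sketch of the word-order problem: the two sentences below mean very different things but receive identical bag-of-words representations (the sentences are toy examples, not from the corpus above):

from collections import Counter

# identical word counts, opposite meaning
a = Counter("the dog bit the man".split())
b = Counter("the man bit the dog".split())

print(a == b)  # True: a pure BoW model cannot tell the two sentences apart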

Idea: Clustering of Bag of Word vectors

General idea

  • Clustering is a common technique in text analysis
  • Groups similar documents based on word distributions
  • Each document represented as high-dimensional vector (e.g., BoW)
  • Dimensions correspond to unique words
  • Values reflect frequency of words in document

Approach

  • Clustering algorithms applied to partition documents
  • Similarity measured using distance metrics (cosine similarity, Euclidean distance; see the sketch below)
  • Advantages:
    • Simplicity and scalability
    • Manageable computational complexity
  • Limitations:
    • Reliance on word frequency alone
    • Curse of dimensionality with large vocabularies
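
A minimal sketch of the two distance metrics mentioned above, applied to a pair of toy BoW vectors (the vectors are made up for illustration):

import numpy as np

v1 = np.array([2, 1, 0, 0])  # toy BoW vector of a short document
v2 = np.array([4, 2, 0, 0])  # same word distribution, document twice as long

# cosine similarity ignores document length, Euclidean distance does not
cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
euclidean = np.linalg.norm(v1 - v2)

print(f"cosine similarity: {cosine:.2f}")      # 1.00
print(f"Euclidean distance: {euclidean:.2f}")  # 2.24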

Code example

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Example texts representing different topics
texts = [
    "apple orange banana",
    "apple orange mango",
    "banana apple kiwi",
    "strawberry raspberry blueberry",
    "strawberry raspberry blackberry"
]

# create Bag of Words using CountVectorizer
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(texts)

print("Bag of Word vectors:")
print(bow_matrix.toarray())

# perform K-means clustering
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, n_init="auto")
cluster_labels = kmeans.fit_predict(bow_matrix)

print("\nCluster labels:")
for i, label in enumerate(cluster_labels):
    print(f"Document {i + 1} belongs to Cluster {label + 1}")


Bag of Word vectors:
[[1 1 0 0 0 0 1 0 0]
 [1 0 0 0 0 1 1 0 0]
 [1 1 0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0 1 1]
 [0 0 1 0 0 0 0 1 1]]

Cluster labels:
Document 1 belongs to Cluster 2
Document 2 belongs to Cluster 2
Document 3 belongs to Cluster 2
Document 4 belongs to Cluster 1
Document 5 belongs to Cluster 1

# Get the vocabulary
vocabulary = vectorizer.get_feature_names_out()

# print the vocabulary
print("Vocabulary:")
print(vocabulary)

# print the Bag of Words matrix with corresponding words
print("\nBag of Words matrix with corresponding words:")
bow_matrix_array = bow_matrix.toarray()
for i, document_vector in enumerate(bow_matrix_array):
    words_in_document = [(word, frequency) for word, frequency in zip(vocabulary, document_vector) if frequency > 0]
    print(f"Document {i + 1}: {words_in_document}")
Vocabulary:
['apple' 'banana' 'blackberry' 'blueberry' 'kiwi' 'mango' 'orange'
 'raspberry' 'strawberry']

Bag of Words matrix with corresponding words:
Document 1: [('apple', 1), ('banana', 1), ('orange', 1)]
Document 2: [('apple', 1), ('mango', 1), ('orange', 1)]
Document 3: [('apple', 1), ('banana', 1), ('kiwi', 1)]
Document 4: [('blueberry', 1), ('raspberry', 1), ('strawberry', 1)]
Document 5: [('blackberry', 1), ('raspberry', 1), ('strawberry', 1)]
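
To make the clusters easier to interpret, one can inspect the largest coordinates of each cluster centroid. A minimal sketch, assuming the fitted kmeans and the vectorizer from the example above are still in scope:

# each centroid lives in the same word space as the BoW vectors, so its
# largest coordinates point to the most characteristic words of the cluster
vocabulary = vectorizer.get_feature_names_out()

for cluster_id, centroid in enumerate(kmeans.cluster_centers_):
    top_indices = np.argsort(centroid)[::-1][:3]
    top_words = [vocabulary[i] for i in top_indices]
    print(f"Cluster {cluster_id + 1}: {top_words}")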