from nltk.tokenize import wordpunct_tokenize
from string import punctuation
from collections import Counter
from typing import List
from nltk.corpus import stopwords
# python -m nltk.downloader stopwords -> run this in your console once to get the stopwords
# Load the chapter text from file.
# Join stripped lines with a space: the original `text += line.strip()`
# dropped the line separator entirely, fusing the last word of one line
# with the first word of the next and corrupting the token counts.
with open("../../assets/chapter1.txt", "r", encoding="utf-8") as file:
    text = " ".join(line.strip() for line in file)
def preprocess_text(text: str) -> List[str]:
    """Lowercase, tokenize, and strip punctuation and English stopwords.

    Args:
        text: Raw input text.

    Returns:
        Lowercased word tokens with punctuation-only tokens and English
        stopwords removed.
    """
    # Tokenize the lowercased text into word/punctuation chunks.
    tokens = wordpunct_tokenize(text.lower())
    # Drop tokens made up entirely of punctuation. The original
    # `t not in punctuation` check only caught single-character tokens;
    # wordpunct_tokenize also emits multi-char runs like "..." or "--",
    # which a substring test against `string.punctuation` never matches.
    tokens = [t for t in tokens if not all(c in punctuation for c in t)]
    # Build the stopword set once: `stopwords.words` returns a list, and
    # `t not in <list>` inside the comprehension is O(len(stopwords))
    # per token; set membership is O(1).
    stop_words = set(stopwords.words("english"))
    return [t for t in tokens if t not in stop_words]
# Report the 15 most frequent words in the preprocessed text.
word_counts = Counter(preprocess_text(text=text))
for word, count in word_counts.most_common(15):
    print(f"{word}: {count}")