Definition
Tokenization is the process of breaking down a text into smaller units called tokens.
The example below tokenizes a sentence with NLTK's wordpunct_tokenize, removes punctuation tokens, and counts token frequencies:

from collections import Counter
from string import punctuation
from nltk.tokenize import wordpunct_tokenize

sentence = "I love reading science fiction books or books about science."

# Tokenize, then drop punctuation tokens (string membership test)
tokenized_sentence = wordpunct_tokenize(sentence)
tokenized_sentence = [t for t in tokenized_sentence if t not in punctuation]
print(tokenized_sentence)
['I', 'love', 'reading', 'science', 'fiction', 'books', 'or', 'books', 'about', 'science']

# Count how often each token occurs
token_counter = Counter(tokenized_sentence)
print(token_counter.most_common(3))
[('science', 2), ('books', 2), ('I', 1)]
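One detail of the tokenizer worth noting: wordpunct_tokenize matches the pattern \w+|[^\w\s]+, so it also splits contractions at the apostrophe. A minimal illustration (not from the original example):

from nltk.tokenize import wordpunct_tokenize

# Contractions are broken apart at the apostrophe
print(wordpunct_tokenize("I don't!"))
['I', 'don', "'", 't', '!']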
Lemmatization

The correct lemma depends on the word's part of speech:

meeting -> meet (verb)
meeting -> meeting (noun)

Lemmatizing every token of "The three brothers went over three big bridges" with one fixed POS tag goes wrong in different places:

as nouns: ['The', 'three', 'brother', 'went', 'over', 'three', 'big', 'bridge']
as verbs: ['The', 'three', 'brothers', 'go', 'over', 'three', 'big', 'bridge']
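This POS sensitivity can be reproduced directly with NLTK's WordNetLemmatizer (a minimal sketch, assuming the WordNet data has been downloaded):

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# The same surface form yields different lemmas depending on the POS tag
print(wnl.lemmatize("meeting", pos="v"))  # meet
print(wnl.lemmatize("meeting", pos="n"))  # meeting
print(wnl.lemmatize("went", pos="v"))     # go
print(wnl.lemmatize("went", pos="n"))     # went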
To lemmatize the whole sentence correctly, each token needs its own POS tag:

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
sentence = "The three brothers went over three big bridges"

# Manually assigned WordNet POS tags (n = noun, v = verb, a = adjective)
pos_dict = {
    "brothers": "n",
    "went": "v",
    "big": "a",
    "bridges": "n"
}

lemmatized_sentence_token = []
for token in sentence.split(" "):
    if token in pos_dict:
        # Lemmatize using the token's POS tag
        lemma = wnl.lemmatize(token, pos=pos_dict[token])
    else:
        lemma = token  # leave the token as is
    lemmatized_sentence_token.append(lemma)
print(lemmatized_sentence_token)
['The', 'three', 'brother', 'go', 'over', 'three', 'big', 'bridge']
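In practice the POS tags would come from a tagger rather than a hand-built dictionary. A sketch of the same loop driven by nltk.pos_tag (the mapping helper penn_to_wordnet is an illustrative assumption, not from the slides; requires NLTK's averaged_perceptron_tagger model):

import nltk
from nltk.stem import WordNetLemmatizer

def penn_to_wordnet(penn_tag):
    # Map Penn Treebank tag prefixes to WordNet POS codes;
    # tags outside noun/verb/adjective/adverb have no WordNet counterpart
    return {"N": "n", "V": "v", "J": "a", "R": "r"}.get(penn_tag[0])

wnl = WordNetLemmatizer()
tokens = "The three brothers went over three big bridges".split(" ")

lemmas = []
for token, penn_tag in nltk.pos_tag(tokens):
    wn_pos = penn_to_wordnet(penn_tag)
    # Fall back to the raw token when WordNet has no matching POS
    lemmas.append(wnl.lemmatize(token, pos=wn_pos) if wn_pos else token)
print(lemmas)
# expected: ['The', 'three', 'brother', 'go', 'over', 'three', 'big', 'bridge']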