Exercise: Word matching

Task: For each element of the following list of keywords, determine whether it is contained in the text.

Instructions: work with the following text and keyword list.

text = "The company's latest quarterly earnings reports exceeded analysts' expectations, driving up the stock price. However, concerns about future growth prospects weighed on investor sentiment. The CEO announced plans to diversify the company's product portfolio and expand into new markets, aiming to sustain long-term profitability. The marketing team launched a new advertising campaign to promote the company's flagship product, targeting key demographics. Despite challenges in the competitive landscape, the company remains committed to innovation and customer satisfaction."
keywords = [
    "Announce", 
    "Aim",
    "Earnings",
    "Quarter",
    "Report",
    "Investor",
    "Analysis",
    "Market",
    "Diversity",
    "Product portfolio",
    "Advertisment",
    "Stock",
    "Landscpe" # yes, this is here on purpose
]
Solution:
from pprint import pprint
from nltk.tokenize import wordpunct_tokenize

# Lowercase the text and split it into word and punctuation tokens.
text_token = wordpunct_tokenize(text.lower())

# A keyword counts as detected if its lowercased form appears as a token.
detected_words = [
    (keyword, keyword.lower() in text_token) for keyword in keywords
]
pprint(detected_words)
print(f"\nDetected {sum(x[1] for x in detected_words)}/{len(keywords)} words.")
[('Announce', False),
 ('Aim', False),
 ('Earnings', True),
 ('Quarter', False),
 ('Report', False),
 ('Investor', True),
 ('Analysis', False),
 ('Market', False),
 ('Diversity', False),
 ('Product portfolio', False),
 ('Advertisment', False),
 ('Stock', True),
 ('Landscpe', False)]

Detected 3/13 words.
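Only three keywords match because the exact-token lookup requires identical word forms: the text contains "reports" and "markets", but never the base forms "report" and "market". A minimal check, reusing text_token from the solution above, makes this concrete:

print("report" in text_token)   # False: the text only contains "reports"
print("reports" in text_token)  # True

Lemmatization addresses exactly this gap by reducing inflected forms to their dictionary base form, which the next step applies.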
from nltk.stem import WordNetLemmatizer

# Requires the WordNet corpus: nltk.download("wordnet")
wnl = WordNetLemmatizer()

# Reduce each token to its lemma; without a pos argument,
# WordNetLemmatizer treats every token as a noun.
lemmatized_text_token = [wnl.lemmatize(w) for w in text_token]

detected_words = [
    (keyword, keyword.lower() in lemmatized_text_token) for keyword in keywords
]
pprint(detected_words)
print(f"\nDetected {sum(x[1] for x in detected_words)}/{len(keywords)} words.")
[('Announce', False),
 ('Aim', False),
 ('Earnings', True),
 ('Quarter', False),
 ('Report', True),
 ('Investor', True),
 ('Analysis', False),
 ('Market', True),
 ('Diversity', False),
 ('Product portfolio', False),
 ('Advertisment', False),
 ('Stock', True),
 ('Landscpe', False)]

Detected 5/13 words.
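Lemmatizing now catches "reports" and "markets", but "announced" and "aiming" still slip through: WordNetLemmatizer defaults to pos="n" and leaves verb forms unchanged. A small illustration, assuming the wnl lemmatizer from above:

print(wnl.lemmatize("announced"))           # 'announced' (treated as a noun)
print(wnl.lemmatize("announced", pos="v"))  # 'announce'
print(wnl.lemmatize("aiming", pos="v"))     # 'aim'

Lemmatizing each token under several parts of speech, as in the final step below, works around this limitation.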
fully_lemmatized_text_token = []

for token in text_token:
    # Lemmatize every token as a noun, verb, and adjective, and keep all
    # three results, so e.g. "announced" also yields its verb lemma "announce".
    for pos in ["n", "v", "a"]:
        fully_lemmatized_text_token.append(wnl.lemmatize(token, pos=pos))

detected_words = [
    (keyword, keyword.lower() in fully_lemmatized_text_token) for keyword in keywords
]
pprint(detected_words)
print(f"\nDetected {sum(x[1] for x in detected_words)}/{len(keywords)} words.")
[('Announce', True),
 ('Aim', True),
 ('Earnings', True),
 ('Quarter', False),
 ('Report', True),
 ('Investor', True),
 ('Analysis', False),
 ('Market', True),
 ('Diversity', False),
 ('Product portfolio', False),
 ('Advertisment', False),
 ('Stock', True),
 ('Landscpe', False)]

Detected 7/13 words.
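Seven detections is as far as lemmatization goes here: "quarterly", "analysts", and "diversify" are derivationally related to "Quarter", "Analysis", and "Diversity" rather than inflected forms of them; "Product portfolio" spans two tokens; and "Advertisment" and "Landscpe" are misspelled. Two possible extensions, sketched with the standard library only (the cutoff value is an illustrative assumption, not a tuned choice):

from difflib import get_close_matches

# Multi-word keywords can never equal a single token; a substring check on
# the lowered raw text catches them instead.
print("product portfolio" in text.lower())  # True

# Fuzzy matching can recover misspelled keywords such as "Landscpe".
print(get_close_matches("landscpe", text_token, n=1, cutoff=0.8))  # ['landscape']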