To count the literal occurrences of a phrase in a small file:
# Read the whole file into memory (acceptable for a small file), then count
# the exact phrase. str.count is case-sensitive and counts non-overlapping
# occurrences only.
with open("input_text.txt") as file:
    text = file.read()
n = text.count("high inflation rate")
There is an nltk.collocations module that provides tools for identifying words that often appear sequentially:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
# Flatten the text into one list of case-folded tokens: split into sentences
# first, then into words. Case-folding makes the counts case-insensitive.
words = [word.casefold() for sentence in sent_tokenize(text)
         for word in word_tokenize(sentence)]

# Build the unigram and bigram frequency distributions by hand and pass them
# to the finder. Because `words` is a flat list, bigrams also span sentence
# boundaries.
words_fd = nltk.FreqDist(words)
bigram_fd = nltk.FreqDist(nltk.bigrams(words))
finder = BigramCollocationFinder(words_fd, bigram_fd)

bigram_measures = nltk.collocations.BigramAssocMeasures()
# Top 5 bigrams ranked by pointwise mutual information.
print(finder.nbest(bigram_measures.pmi, 5))
# Every bigram paired with its raw-frequency score.
print(finder.score_ngrams(bigram_measures.raw_freq))
# Words of interest. They must be lowercase because `words` was case-folded.
# NOTE(review): this set is an example taken from the phrase counted earlier;
# replace it with the words you actually care about.
wanted = {"high", "inflation", "rate"}

finder = TrigramCollocationFinder.from_words(words)
# apply_word_filter drops any trigram for which the predicate is true for at
# least one of its words, so only trigrams made entirely of wanted words
# survive.
finder.apply_word_filter(lambda w: w not in wanted)
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# The two most frequent remaining trigrams, sorted for stable output.
print(sorted(finder.nbest(trigram_measures.raw_freq, 2)))
source
share