WordPunctTokenizer and apply_word_filter()
The example below tokenizes a string with WordPunctTokenizer, builds the bigram frequency distributions by hand, and uses apply_word_filter() to drop every bigram that contains a punctuation token.
from nltk import bigrams
from nltk import FreqDist
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import WordPunctTokenizer

content_part = """test. foo 0 test. foo 1 test.
foo 2 test. foo 3 test. foo 4 test, foo 4 test."""

# WordPunctTokenizer splits on whitespace and keeps punctuation
# as separate tokens, so '.' and ',' appear in the token list.
tokens = WordPunctTokenizer().tokenize(content_part)

bigram_measures = BigramAssocMeasures()
word_fd = FreqDist(tokens)
bigram_fd = FreqDist(bigrams(tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)

# Remove every bigram in which either word is punctuation.
finder.apply_word_filter(lambda w: w in ('.', ','))

scored = finder.score_ngrams(bigram_measures.raw_freq)
print(tokens)
print(sorted(finder.nbest(bigram_measures.raw_freq, 2), reverse=True))
Output:
['test', '.', 'foo', '0', 'test', '.', 'foo', '1', 'test', '.', 'foo', '2', 'test', '.', 'foo', '3', 'test', '.', 'foo', '4', 'test', ',', 'foo', '4', 'test', '.']
[('4', 'test'), ('foo', '4')]
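For reference, the finder does not have to be constructed from hand-built frequency distributions: NLTK's BigramCollocationFinder.from_words() builds both FreqDists internally from the token stream. A minimal sketch of that shorter route, using the same text and filter as above:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import WordPunctTokenizer

content_part = """test. foo 0 test. foo 1 test.
foo 2 test. foo 3 test. foo 4 test, foo 4 test."""

tokens = WordPunctTokenizer().tokenize(content_part)

# from_words() computes the word and bigram frequency
# distributions internally, so no manual FreqDist is needed.
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_word_filter(lambda w: w in ('.', ','))

print(finder.nbest(BigramAssocMeasures().raw_freq, 2))
# ('foo', '4') and ('4', 'test') each occur twice in the text,
# so they come out on top under raw_freq.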