Counting word frequency in documents using python regex

Created a Python module that reads in a file, deletes stop words, and displays a Python dictionary mapping each word to its frequency (how many times it occurs in the document).

def run():
    """Tally word frequencies across every .html file in `path`.

    For each file, extracts the text inside <div class="body">, strips
    <a>/<p> tags, &quot; entities and punctuation, drops stop words, and
    counts the remaining words.

    Side effects: prints the running `totalfreq` after each file.
    Relies on module-level globals `path` and `stopwordfile`.
    """
    body_re = re.compile(r'.*<div class="body">(.*?)</div>.*',
                         re.DOTALL | re.IGNORECASE)
    tag_re = re.compile(r'</?[ap][^>]*>', re.IGNORECASE)
    quote_re = re.compile(r'&quot;')
    punc_re = re.compile(r'[^\w]+')

    # A set gives O(1) membership tests; the original scanned the word
    # list once per stop word (`while w in words: words.remove(w)`).
    with open(stopwordfile, 'r') as f:
        stopwords = set(f.read().lower().split())

    totalfreq = {}   # word -> count across all files
    filewords = {}   # filename -> per-file frequency dict

    for filename in os.listdir(path):
        if not filename.endswith('.html'):
            continue
        with open(path + filename, 'r') as f:  # FIX: files were never closed
            text = f.read().lower()

        matches = body_re.findall(text)
        if not matches:
            # FIX: the original indexed [0] unconditionally and crashed
            # on files without a body div.
            continue

        text = quote_re.sub(' ', matches[0])
        text = tag_re.sub(' ', text)
        text = punc_re.sub(' ', text)
        words = [w for w in text.strip().split() if w not in stopwords]

        freq = {}
        for w in words:
            # BUG FIX: the original only incremented totalfreq when the word
            # was already in the *per-file* dict, so a word seen in an earlier
            # file was reset to 1 when it first appeared in a later file.
            totalfreq[w] = totalfreq.get(w, 0) + 1
            freq[w] = freq.get(w, 0) + 1

        # BUG FIX: originally assigned only inside the else branch of the
        # counting loop; it belongs after the loop, once per file.
        filewords[filename] = freq

        print(totalfreq)

This prints all the non-stop-words inside the file and the frequency at which they occur in it. The result looks like this:

{{'saturday': 1, 'irish': 1, 'family': 1, 'give': 1, 'year': 2, 'weekend': 1, 'steve': 1, '': 1, 'questions': 1, 'in': 2, 'effort': 1, 'partner': 1, 'extinction': 1, 'dress': 1, 'children': 4, 'utans': 1, '27': 1, 'raise': 1, 'closet': 1, 'haired': 2, 'make': 1, 'humphreys': 1, '': 1, 'zoo': 5, ' 1: "1" , "1" , "1" , "1" , "1" , "1" , "1" , "",: 1, 'orangutans': 4, 'plans': 1, 'leonie': 1, 'orang': 1, '': 2, 'free': 2, 'hand': 1, 'wild': 1, 'independent': 1, 'part': 1, 'prepare': 1, 'detected': 1, 'day': 1, 'man': 1, 'picture': 1, 'keane': 1, ' "1" , "1" , "1" , "1" ,: 1, 'face': 1, 'mujur': 1, 'red': 2, 'orangutan': 1, 'species': 1, 'entry': 1, 'effort': 1, '': 1, '11am': 1, 'infux': 1, '3pm': 1}

{'newest': 1, 'birth': 2, 'orang': 1, 'month': 1, 'steve': 1, 'questions': 1, 'utans': 1, 'children': 1, 4, 'staff': 1, 'limelight': 1, '27': 1, 'based': 1, 'concerned': 1, 'sunday': 1, '3pm': 1, 'finally': 1, "4": 1, "maeve": 1, "": 1, "": 1, "": 1, "": 1, "facebook": 1, "": 1, " ': 1,' nurturing ': 1,' day ': 1,' debut ': 1,' rothschild ': 1,' keepers ': 1,' email ': 1,' steps ': 1,' 11am ': 1, 1, 'page': 1, 'picture': 1, 'born': 1, 'result': 1, 'year': 2, 'saturday': 1, 'special': 1, 'closet': 1, 'haired': 2, 'section': 1, 'bennet': 2, 'mum': 3, 'mujur': 1, 'conditions': 1, 'public': 1, 'red': 2, ' ': 1,' orangutans ': 4,' free ': 2,' keeper ': 1,' november ': 1,' care ': 1,' send ': 1,' great ': 1,' originins ': 1, 1, '32': 1, 'invite': 1, 'dublin': 2, 'expected': 1, 'orangutan': 1, 'effort': 1, 'infux': 1, 'named': 1, 'family': 1, 'delighted': 1, 'weather': 1, 'guests': 1, 'extinction': 1, 'post': 1, 'impressed': 1, 'raise': 1, 'detected ': 1,' rema ined ': 1,' humphreys ': 1,' ': 1,' ': 3,' ': 1,' shane ': 1,' part ': 1,' helen ': 1,' attentive ': 1, 'effort': 1, 'case': 1, 'made': 2, 'animals': 1, '14': 1, '16': 1, 'ms': 1, 'wild': 1, 'savanna': 1, 'irish': 1, 'give': 1, 'resident': 1, '': 1, 'slip': 1, 'in': 2, 'partner': 1, ' "1" , "1" , "1" , "1" , " ": 3," ": 1," ": 1," ",: 1, 'poor': 1, 'independent': 1, 'plans': 1, 'leonie': 1, 'time': 1, '': 1, 'hand': 1, 'hickey': 1, 'weekend': 1, 'man': 1, 'sibu': 1, 'age': 1, 'stable': 2, 'face': 1, 'confinement': 1, 'african': 2, ' entry ': 1,' keane ': 1,' clarke ': 2,' left ': 1}

How do I combine the per-file dictionaries into one overall total? For example, if file 1 has zoo = 5 and file 2 has zoo = 3, the combined result should be zoo = 8.

, , .

?!

+5
4

In '<\/?[ap][^>]*>' the '/' does not need to be escaped.

'[^\w]' '\W' , '[^\w]+' , '[^\w]'

re.DOTALL is unnecessary for r'<\/?[ap][^>]*>', since that RE contains no '.' metacharacter.

.

Since you already lower-case the input with words = f.read().lower(), the re.IGNORECASE flag is redundant.

REs RE: reg123 = re.compile(r'(</?[ap][^>]*>|&quot;|\W+)')

,

, htmfiles

, '[0]' words = regex.findall(words)[0]

- RE, ' ':

stopwords = '|'.join(f.read().lower().split())

RE

filewords[file] = freq

.

; , . , , . .

def run():
    """Improved version: a single combined regex removes tags, &quot;
    entities, stop words and punctuation in one pass per file, and
    defaultdict(int) replaces the manual if/else counting.

    Side effects: prints the overall `totalfreq` at the end.
    Relies on module-level globals `path` and `stopwordfile`.
    """
    # FIX: was 'from collection import difaultdict' — two typos that made
    # the function raise ImportError immediately.
    from collections import defaultdict

    with open(stopwordfile, 'r') as f:
        # FIX: anchor the alternation with \b so a stop word such as 'in'
        # cannot be deleted from the middle of a word like 'winning'.
        stopwords = r'\b(?:' + '|'.join(f.read().lower().split()) + r')\b'

    regex = re.compile(r'.*<div class="body">(.*?)</div>.*', re.DOTALL)
    reg123 = re.compile(r'(</?[ap][^>]*>|&quot;|' + stopwords + r'|\W+)')

    totalfreq = defaultdict(int)
    filewords = {}

    for filename in (fn for fn in os.listdir(path) if fn[-5:] == '.html'):
        with open(path + filename, 'r') as f:
            matches = regex.findall(f.read().lower())
        if not matches:
            # FIX: the original indexed [0] unconditionally and crashed
            # on files without a body div.
            continue
        ch = reg123.sub(' ', matches[0])
        words = ch.strip().split()

        freq = defaultdict(int)
        for w in words:
            totalfreq[w] += 1
            freq[w] += 1
        filewords[filename] = freq

    print(totalfreq)

. ,

+2
+2

:

# Merge the per-file frequency dicts into one combined word count,
# then report the total for each word.
result = {}
for freq_map in dictionaries:
    for word, count in freq_map.iteritems():
        if word in result:
            result[word] += count
        else:
            result[word] = count

for word, count in result.iteritems():
    print('total occurences of {0}: {1}'.format(word, count))

... where dictionaries is just a list of word-to-frequency dictionaries, one per input file.

0
source

Assuming files is a list of per-file frequency dictionaries, try something like:

from itertools import groupby, chain

# Flatten every (word, count) pair from every per-file dict, sort so that
# equal words are adjacent, then sum each word's counts with groupby.
merged_pairs = sorted(chain.from_iterable(f.items() for f in files))
total = {}
for word, grouped in groupby(merged_pairs, lambda pair: pair[0]):
    total[word] = sum(pair[1] for pair in grouped)
0
source

All Articles