I am counting word frequencies using Python. Here is the single-process version:
#coding=utf-8
"""Print the ten most frequent words of document.txt (single process).

Written so that it runs unchanged on Python 2.7 and Python 3.
"""
import time
from collections import Counter

# Characters treated as word separators in addition to ordinary whitespace.
for_split = [',', '\n', '\t', '\'', '.', '\"', '!', '?', '-', '~']
# The words below are ignored when reporting the most frequent words.
ignored = ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'was', 'that', 'had',
           'he', 'you', 'his', 'my', 'it', 'as', 'with', 'her', 'for', 'on']


def tokenize(text):
    """Lower-case *text* and return its words as a list.

    Every character in ``for_split`` is turned into a space before splitting,
    so punctuation acts as a delimiter ("it's" yields "it" and "s").
    """
    text = text.lower()
    for ch in for_split:
        # str methods replace the string.replace()/string.split() module
        # functions, which no longer exist in Python 3.
        text = text.replace(ch, ' ')
    return text.split()


def top_words(counts, limit=10, candidates=40):
    """Return up to *limit* ``(word, frequency)`` pairs, most frequent first.

    Only the *candidates* most common entries of *counts* (a ``Counter``) are
    inspected, and words listed in ``ignored`` are skipped.
    """
    selected = []
    for word, frequency in counts.most_common(candidates):
        if len(selected) >= limit:
            break  # enough words collected; no need to scan the rest
        if word not in ignored:
            selected.append((word, frequency))
    return selected


def main():
    """Read document.txt, count its words and print the ten most frequent."""
    # The context manager closes the file even if reading fails.
    with open("document.txt", 'r') as source:
        words = tokenize(source.read())
    for word, frequency in top_words(Counter(words)):
        print("%s : %d" % (word, frequency))


if __name__ == "__main__":
    # time.time() is wall-clock time. time.clock() was removed in Python 3.8
    # and reported CPU time of this process on Unix.
    starttime = time.time()
    main()
    print(time.time() - starttime)
The multiprocessing version is as follows:
#coding=utf-8
"""Print the ten most frequent words of document.txt using a process pool.

Written so that it runs unchanged on Python 2.7 and Python 3.
"""
import time
import multiprocessing
from collections import Counter

# Characters treated as word separators in addition to ordinary whitespace.
for_split = [',', '\n', '\t', '\'', '.', '\"', '!', '?', '-', '~']
# Words that are skipped when reporting the most frequent words.
ignored = ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'was', 'that', 'had',
           'he', 'you', 'his', 'my', 'it', 'as', 'with', 'her', 'for', 'on']
# Partial counts delivered by the pool callback; main() resets it on entry.
result_list = []


def worker(substr):
    """Count the words in *substr* (a list of words) and return a Counter."""
    return Counter(substr)


def log_result(result):
    """Pool callback: store one partial Counter in ``result_list``."""
    result_list.append(result)


def tokenize(text):
    """Lower-case *text* and return its words as a list.

    Every character in ``for_split`` is turned into a space before splitting,
    so punctuation acts as a delimiter ("it's" yields "it" and "s").
    """
    text = text.lower()
    for ch in for_split:
        text = text.replace(ch, ' ')
    return text.split()


def chunk_words(words, parts=4):
    """Split *words* into consecutive slices of ``len(words) // parts`` items.

    A shorter trailing slice holds the remainder, so up to ``parts + 1``
    slices can be returned. An empty list yields no slices.

    Raises:
        ValueError: if *parts* is smaller than 1.
    """
    if parts < 1:
        raise ValueError("parts must be at least 1")
    # Floor division keeps the step an int on Python 3. max() keeps it
    # non-zero for inputs shorter than *parts*, where range() would raise.
    step = max(1, len(words) // parts)
    return [words[pos:pos + step] for pos in range(0, len(words), step)]


def top_words(counts, limit=10, candidates=40):
    """Return up to *limit* ``(word, frequency)`` pairs, most frequent first.

    Only the *candidates* most common entries of *counts* (a ``Counter``) are
    inspected, and words listed in ``ignored`` are skipped.
    """
    selected = []
    for word, frequency in counts.most_common(candidates):
        if len(selected) >= limit:
            break  # enough words collected; no need to scan the rest
        if word not in ignored:
            selected.append((word, frequency))
    return selected


def main():
    """Count the words of document.txt in a process pool and print the top ten.

    Raises:
        Exception: whatever a worker raised is re-raised here, so partial
            counts are never reported silently.
    """
    # Drop results left over from an earlier call so counts are not doubled.
    del result_list[:]

    # The context manager closes the file even if reading fails.
    with open("document.txt", 'r') as source:
        words = tokenize(source.read())

    # NOTE(review): every chunk is pickled and sent to a worker, and each
    # Counter is pickled back. That transfer, plus the serial tokenising
    # above, likely costs more than Counter() itself, which would explain
    # why this is not faster than the single-process script. Confirm by
    # profiling.
    pool = multiprocessing.Pool(processes=5)
    pending = [
        pool.apply_async(worker, args=(chunk,), callback=log_result)
        for chunk in chunk_words(words)
    ]
    pool.close()
    pool.join()
    for task in pending:
        # The callback is skipped when a worker fails; get() re-raises the
        # worker's exception here.
        task.get()

    # update() merges in place; ``total + item`` would copy the running
    # total for every partial result.
    total = Counter()
    for partial in result_list:
        total.update(partial)

    for word, frequency in top_words(total):
        print("%s : %d" % (word, frequency))


if __name__ == "__main__":
    # time.time() is wall-clock time. time.clock() was removed in Python 3.8
    # and on Unix reported CPU time of the parent process only.
    starttime = time.time()
    main()
    print(time.time() - starttime)
"document.txt" is about 22 M, my laptop has cores, 2G memory, the result of the first version is 3.27, and the second is 8.15 s, I changed several processes ( pool = multiprocessing.Pool (processes = 5) ) , from 2 to 10, the results remain almost the same, why is this the way I can get this program to run faser than one version of the process?