Python - find all subwords that can be found inside a word

Question

Python - find all subwords that can be found inside a word

Ultimately, I want to find out which word in the English dictionary contains the most subwords that are at least three letters long. I wrote this algorithm, but it is too slow to be useful. Are there any interesting ways I can optimize it?

def subWords(word):
    """Return the set of prefixes of ``word`` that are two or more characters long.

    Only slices anchored at index 0 are produced, so this yields prefixes
    rather than every substring. A word shorter than two characters gives
    an empty set.
    """
    return {word[:stop] for stop in range(2, len(word) + 1)}

def checkDict(wordList, dictList):
    """Return the set of entries from ``wordList`` that also occur in ``dictList``.

    ``dictList`` may be any container supporting ``in``; the lookup cost is
    whatever that container's membership test costs.
    """
    found = set()
    for candidate in wordList:
        if candidate in dictList:
            found.add(candidate)
    return found

def main():
    """Print the word from 'wordlist.txt' that contains the most dictionary subwords.

    Each line of 'wordlist.txt' is stripped and treated as one dictionary
    word. For every word, each suffix is passed to ``subWords`` and the
    results found in the dictionary are collected with ``checkDict``. Hits
    are accumulated per suffix, so a subword that occurs at several
    positions is counted once per position.

    The running maximum, a ``(count, subwords)`` tuple, is printed after
    every word and once more at the end.
    """
    # Close the file as soon as it has been read.
    with open('wordlist.txt') as wordFile:
        dictList = [line.strip() for line in wordFile]

    # Membership tests against a list are O(n) each, which made the original
    # unusably slow. The set gives O(1) lookups; the list is kept so the
    # words are still visited in file order.
    dictSet = set(dictList)
    maximum = (0, list())

    for dictWords in dictList:
        allwords = list()
        for i in range(len(dictWords)):
            allwords.extend(checkDict(subWords(dictWords[i:]), dictSet))

        if len(allwords) > maximum[0]:
            maximum = (len(allwords), allwords)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(maximum)

    print(maximum)


if __name__ == '__main__':
    main()

+5

python algorithm

Parseltongue Aug 3 '11 at 21:50

source share

5 answers

, . , - "a", , , "b". "c", , "d". : " ?"

, . , node.

, , , , ( ). , , node, , . , node. , , . , , , , .

, ! :

class Node:
    """One node of a letter trie.

    Attributes:
        parent: the node this one hangs off, or None for the root.
        valid_subword: True when the path from the root to this node
            spells a complete word.
        children: dict mapping a transition (one letter) to the child Node.
    """

    def __init__( self, parent, valid_subword ):
        self.parent = parent
        self.valid_subword = valid_subword
        self.children = {}

    #Extend the tree with a new node
    def extend( self, transition, makes_valid_word ):
        """Return the child reached via ``transition``, creating it if missing.

        If the child already exists and ``makes_valid_word`` is true, the
        child is marked as ending a word. An existing mark is never cleared,
        so inserting a longer word cannot un-mark a shorter one.
        """
        child = self.children.get( transition )
        if child is None:
            child = Node( self, makes_valid_word )
            self.children[transition] = child
        elif makes_valid_word:
            # The flag read by lookups is `valid_subword`; setting any other
            # attribute name here would leave the word undiscoverable.
            child.valid_subword = True
        return child

def generateTree( allwords ):
    """Build a letter trie containing every word in ``allwords``.

    Each word is inserted letter by letter starting from a fresh root; only
    the node reached by a word's last letter is flagged as ending a word.
    An empty word adds nothing.

    Returns the root Node.
    """
    tree = Node( None, False )
    for word in allwords:
        current_node = tree
        last_index = len(word) - 1
        for i, letter in enumerate(word):
            current_node = current_node.extend( letter, i == last_index )
    return tree

def checkDict( word, tree ):
    """Return whether ``word`` is stored as a complete word in the trie ``tree``.

    Follows one child link per letter from ``tree``. Returns False as soon
    as a letter has no matching child; otherwise returns the
    ``valid_subword`` flag of the node reached.
    """
    node = tree
    for letter in word:
        if letter not in node.children:
            return False
        node = node.children[letter]

    return node.valid_subword

, :

# Build the trie once, up front; every lookup below reuses it.
tree = generateTree(allWords)
for word in allWords:
  for subword in subWords(word):
    # checkDict needs the trie as its second argument.
    checkDict(subword, tree)
    #Code to keep track of the number of words found, like you already have

, O (m), m - . , , . O (n) , n - .

+7

Slubb 03 . '11 22:49

Python, ( , , , PEP8 - , JBernardo ):

def check_dict(word, dictionary):
    """Return all subwords of `word` that are in `dictionary`.

    Every substring of `word` with at least three characters is generated
    and intersected with `dictionary`, which must be a set or frozenset
    because the `&` operator is used. Words shorter than three characters
    give an empty set.
    """
    # `range` instead of `xrange`: the latter does not exist on Python 3.
    fragments = set(word[i:j]
                    for i in range(len(word) - 2)
                    for j in range(i + 3, len(word) + 1))
    return fragments & dictionary

# Only words of three or more letters can be subwords or candidates.
dictionary = frozenset(word for word in word_list if len(word) >= 3)
# max = the most subwords. The key indexes the (word, subwords) pair because
# tuple-unpacking lambda parameters are not valid syntax on Python 3.
print(max(((word, check_dict(word, dictionary)) for word in dictionary),
          key=lambda pair: len(pair[1])))

- :

('greatgrandmothers',
set(['and', 'rand', 'great', 'her', 'mothers', 'moth', 'mother', 'others', 'grandmothers', 'grandmother', 'ran', 'other', 'greatgrandmothers', 'greatgrandmother', 'grand', 'hers', 'the', 'eat']))

http://www.mieliestronk.com/wordlist.html.

, ( 1 lt 58k ).

, - :)

check_dict , .
, ( ).
, , :
- trie , PATRICIA
- , ,
- , ,
- ( , ), !!
Python, , Python .

+3

Radim 04 . '11 0:32

For comparison: "sowpods.txt" contains 267627 words of 3 or more letters. On Python 2.5 or 2.6, which have no set comprehensions, use at_least_3 = set(w for w in words if len(w)>=3)

# Read the whitespace-separated word list, closing the file afterwards.
with open("sowpods.txt") as word_file:
    words = word_file.read().split()

# Lookup set restricted to words of three or more letters.
at_least_3 = {w for w in words if len(w)>=3}

def count_subwords(word):
    """Count the substrings of ``word`` (three or more characters) in ``at_least_3``.

    Each (start, stop) slice is tested separately, so a subword that occurs
    at several positions is counted once per occurrence.
    """
    length = len(word)
    return sum(
        1
        for start in range(length - 2)
        for stop in range(start + 3, length + 1)
        if word[start:stop] in at_least_3
    )

# Print every (count, word) pair in ascending order, so the words with the
# most subwords come last. Single-argument print(...) gives the same output
# on Python 2 and Python 3.
for row in sorted((count_subwords(w),w) for w in at_least_3):
    print(row)

26

(26, 'CORESEARCHERS')
(26, 'FOREGONENESSES')
(26, 'METAGENETICALLY')
(26, 'PREPOSSESSIONS')
(26, 'SACRAMENTALISTS')
(26, 'WHOLESOMENESSES')

+1

John La Rooy 03 . '11 22:49

- ?

>>> words = ['a', 'asd', 'asdf', 'bla']
>>> [sum(1 for i in (a for a in words if a in b)) for b in words]
[1, 2, 3, 2]

, . , ...

, , O (n²)

Edit:

The question asks for all the subwords, but the code only asks for the one word with the largest number of subwords... If you really want the first behavior, just remove the sum(...) part and turn the genexp into a list comprehension...

0

Jbernardo Aug 3 '11 at 21:59

source share

Karl Knechtel · Accepted Answer · 2011-08-03T22:44:42+0000

1) Style and organization: it makes sense to have a single function that generates all the subwords of a word.

2) Style: the double parentheses are not needed when calling set.

3) Performance (hopefully): make a set from the words you are looking up as well; then you can use the built-in set intersection check.

4) ( ): , ; max. (, ) ; Python , .

5) (): , 1 2 , .

6) ( , ): .

7) : - with , , , , .readlines().

( , "" ):

def countedSubWords(word, dictionary):
    """Return ``(count, subwords)`` for the substrings of ``word`` in ``dictionary``.

    Only substrings of three or more characters are considered. ``subwords``
    is the set of distinct matches and ``count`` is its size, so repeated
    occurrences of the same subword count once.
    """
    length = len(word)
    candidates = set()
    for start in range(length):
        for stop in range(start + 3, length + 1):
            candidates.add(word[start:stop])
    matches = candidates.intersection(dictionary)
    return (len(matches), matches)


def main():
  """Print the best ``(count, subwords)`` result over all words in 'wordlist.txt'.

  Each line of the file is stripped, and words of three or more characters
  form the dictionary. Every dictionary word is scored with
  ``countedSubWords`` and the maximum tuple is printed. ``max`` raises
  ValueError if the dictionary is empty.
  """
  with open('wordlist.txt') as words:
    # Strip each line once, then keep only words longer than two characters.
    stripped = (line.strip() for line in words)
    dictionary = set(word for word in stripped if len(word) > 2)
  # The file is closed before the long computation starts. Single-argument
  # print(...) behaves the same on Python 2 and Python 3.
  print(max(countedSubWords(word, dictionary) for word in dictionary))

Python - find all subwords that can be found inside a word

More articles: