Is there a faster way to get subtrees from tree structures in python than the standard "recursive" one?

Question

Is there a faster way to get subtrees from tree structures in python than the standard "recursive" one?

Suppose the following data structures with three arrays are numpy (id, parent_id) (parent_id of the root element is -1):

import numpy as np
class MyStructure(object):
  def __init__(self):
    """
    Default structure for now:

          1
         / \
        2   3
           / \
          4   5
    """
    self.ids = np.array([1,2,3,4,5])
    self.parent_ids = np.array([-1, 1, 1, 3, 3])

  def id_successors(self, idOfInterest):
    """
    Return logical index.
    """
    return self.parent_ids == idOfInterest

  def subtree(self, newRootElement):
    """
    Return logical index pointing to elements of the subtree.
    """
    init_vector = np.zeros(len(self.ids), bool)
    init_vector[np.where(self.ids==newRootElement)[0]] = 1
    if sum(self.id_successors(newRootElement))==0:
      return init_vector
    else:
      subtree_vec = init_vector
      for sucs in self.ids[self.id_successors(newRootElement)==1]:
        subtree_vec += self.subtree(sucs)
      return subtree_vec

This is very slow for many identifiers (> 1000). Is there a faster way to implement this?

+1

python tree

Philipp der Rautenberg Jul 28 '10 at 6:31

source share

4 answers

Tony Veijalainen · Answer 1 · 2010-07-28T07:12:14+0000

Have you tried using the psyco module if you are using Python 2.6? Sometimes this can lead to a sharp acceleration of the code.

Have you considered a recursive data structure: list?

Your example is also a standard list:

[1, 2 , [3, [4], [5]]]

or

[1, [2, None, None], [3, [4, None, None], [5, None, None]]]

:

[1, 
  [2, None, None], 
  [3, 
    [4, None, None], 
    [5, None, None]]]

, . , heapq.

http://python.org/doc/essays/graphs.html, , .

, Python , . Blist

Yann Vernier · Answer 2 · 2010-07-29T12:31:04+0000

, , , ( ) . :

init_vector[np.where(self.ids==newRootElement)[0]] = 1

, , . , - . , init_vector - ids == newRootElement.

if sum(self.id_successors(newRootElement))==0:

, , , . any , - " newRootElement self.parent_ids" , , for .

, :

for sucs in self.ids[self.id_successors(newRootElement)==1]:

id_successors , 1 . , ( newRootElement) .

- . . , numpy, - , , , , - . :

import collections
children=collections.defaultdict(list)
for i,p in zip(ids,parent_ids):
  children[p].append(i)

def subtree(i):
  return i, map(subtree, children[i])

, , , , , , , , , , . , + . , .

delnan · Answer 3 · 2010-07-28T07:02:57+0000

, . (, Turing-). . , ( , _ec ). x , , . , , , 5%. - /memoization, . , - O (log n) , , ( , , ).

zbyszek · Answer 4 · 2010-07-29T09:41:47+0000

( , , , , ):
======================= graph_array.py ======================== ===


import collections
import numpy

def find_subtree(pids, subtree_id):
    N = len(pids)
    assert 1 <= subtree_id <= N

    subtreeids = numpy.zeros(pids.shape, dtype=bool)
    todo = collections.deque([subtree_id])

    iter = 0
    while todo:
        id = todo.popleft()
        assert 1 <= id <= N
        subtreeids[id - 1] = True

        sons = (pids == id).nonzero()[0] + 1
        #print 'id={0} sons={1} todo={2}'.format(id, sons, todo)
        todo.extend(sons)

        iter = iter+1
        if iter>N:
            raise ValueError()

    return subtreeids

======================= file graph_array_test.py ================================================================== ======


import numpy
from graph_array import find_subtree

def _random_graph(n, maxsons):
    import random
    pids = numpy.zeros(n, dtype=int)
    sons = numpy.zeros(n, dtype=int)
    available = []
    for id in xrange(1, n+1):
        if available:
            pid = random.choice(available)

            sons[pid - 1] += 1
            if sons[pid - 1] == maxsons:
                available.remove(pid)
        else:
            pid = -1
        pids[id - 1] = pid
        available.append(id)
    assert sons.max() <= maxsons
    return pids

def verify_subtree(pids, subtree_id, subtree):
    ids = set(subtree.nonzero()[0] + 1)
    sons = set(ids) - set([subtree_id])
    fathers = set(pids[id - 1] for id in sons)
    leafs = set(id for id in ids if not (pids == id).any())
    rest = set(xrange(1, pids.size+1)) - fathers - leafs
    assert fathers & leafs == set()
    assert fathers | leafs == ids
    assert ids & rest == set()

def test_linear_graph_gen(n, genfunc, maxsons):
    assert maxsons == 1
    pids = genfunc(n, maxsons)

    last = -1
    seen = set()
    for _ in xrange(pids.size):
        id = int((pids == last).nonzero()[0]) + 1
        assert id not in seen
        seen.add(id)
        last = id
    assert seen == set(xrange(1, pids.size + 1))

def test_case1():
    """
            1
           / \
          2   4
         /
        3
    """
    pids = numpy.array([-1, 1, 2, 1])

    subtrees = {1: [True, True, True, True],
                2: [False, True, True, False],
                3: [False, False, True, False],
                4: [False, False, False, True]}

    for id in xrange(1, 5):
        sub = find_subtree(pids, id)
        assert (sub == numpy.array(subtrees[id])).all()
        verify_subtree(pids, id, sub)

def test_random(n, genfunc, maxsons):
    pids = genfunc(n, maxsons)
    for subtree_id in numpy.arange(1, n+1):
        subtree = find_subtree(pids, subtree_id)
        verify_subtree(pids, subtree_id, subtree)

def test_timing(n, genfunc, maxsons):
    import time
    pids = genfunc(n, maxsons)
    t = time.time()
    for subtree_id in numpy.arange(1, n+1):
        subtree = find_subtree(pids, subtree_id)
    t = time.time() - t
    print 't={0}s = {1:.2}ms/subtree = {2:.5}ms/subtree/node '.format(
        t, t / n * 1000, t / n**2 * 1000),

def pytest_generate_tests(metafunc):
    if 'case' in metafunc.function.__name__:
        return
    ns = [1, 2, 3, 4, 5, 10, 20, 50, 100, 1000]
    if 'timing' in metafunc.function.__name__:
        ns += [10000, 100000, 1000000]
        pass
    for n in ns:
        func = _random_graph
        for maxsons in sorted(set([1, 2, 3, 4, 5, 10, (n+1)//2, n])):
            metafunc.addcall(
                funcargs=dict(n=n, genfunc=func, maxsons=maxsons),
                id='n={0} {1.__name__}/{2}'.format(n, func, maxsons))
            if 'linear' in metafunc.function.__name__:
                break

=================== py.test --tb = short -v -s test_graph_array.py =============

...
test_graph_array.py:72: test_timing[n=1000 _random_graph/1] t=13.4850590229s = 13.0ms/subtree = 0.013485ms/subtree/node PASS
test_graph_array.py:72: test_timing[n=1000 _random_graph/2] t=0.318281888962s = 0.32ms/subtree = 0.00031828ms/subtree/node PASS
test_graph_array.py:72: test_timing[n=1000 _random_graph/3] t=0.265519142151s = 0.27ms/subtree = 0.00026552ms/subtree/node PASS
test_graph_array.py:72: test_timing[n=1000 _random_graph/4] t=0.24147105217s = 0.24ms/subtree = 0.00024147ms/subtree/node PASS
test_graph_array.py:72: test_timing[n=1000 _random_graph/5] t=0.211434841156s = 0.21ms/subtree = 0.00021143ms/subtree/node PASS
test_graph_array.py:72: test_timing[n=1000 _random_graph/10] t=0.178458213806s = 0.18ms/subtree = 0.00017846ms/subtree/node PASS
test_graph_array.py:72: test_timing[n=1000 _random_graph/500] t=0.209936141968s = 0.21ms/subtree = 0.00020994ms/subtree/node PASS
test_graph_array.py:72: test_timing[n=1000 _random_graph/1000] t=0.245707988739s = 0.25ms/subtree = 0.00024571ms/subtree/node PASS
...

Here, each subtree of each tree is executed, and an interesting value is the average time to extract a tree: ~ 0.2ms per subtree, except for strictly linear trees. I'm not sure what is going on here.

Is there a faster way to get subtrees from tree structures in python than the standard "recursive" one?

More articles: