How can I parse selected text for further processing?

See updated input and output in Edit-1.

I'm trying to execute

+ 1
 + 1.1
  + 1.1.1
   - 1.1.1.1
   - 1.1.1.2
 + 1.2
  - 1.2.1
  - 1.2.2
 - 1.3
+ 2
- 3

into a python data structure such as

[{'1': [{'1.1': {'1.1.1': ['1.1.1.1', '1.1.1.2']}, '1.2': ['1.2.1', '1.2.2']}, '1.3'], '2': {}}, ['3',]]

I looked through many different wiki markup languages, markdowns, restructured text, etc., but they are all very difficult for me to understand how this works, since they need to cover a large number of tags and syntax (I only need to “list” the parts most of them, but of course convert them to python instead of html.)

I also looked at tokenizers, lexers and parsers, but again they are much more complicated than I need, and I can understand.

, , .

-1. , , , * root node , + , - (root ) , node. * + ( root .)

* node , - . , *, , , .

* 1
 * 1.1
 * 1.2
  - Note for 1.2
* 2
* 3
- Note for root

[{'title': '1', 'children': [{'title': '1.1', 'children': []}, {'title': '1.2', 'children': []}]}, {'title': '2', 'children': [], 'notes': ['Note for 1.2', ]}, {'title': '3', 'children': []}, 'Note for root']

, python, .

+5
4

: , Node - , , ( ), ( "-", , , script - , , , !), , Python . , , , , OP , :

import sys

class Node(object):
  def __init__(self, title, indent):
    self.title = title
    self.indent = indent
    self.children = []
    self.notes = []
    self.parent = None
  def __repr__(self):
    return 'Node(%s, %s, %r, %s)' % (
        self.indent, self.parent, self.title, self.notes)
  def aspython(self):
    result = dict(title=self.title, children=topython(self.children))
    if self.notes:
      result['notes'] = self.notes
    return result

def print_tree(node):
  print ' ' * node.indent, node.title
  for subnode in node.children:
    print_tree(subnode)
  for note in node.notes:
    print ' ' * node.indent, 'Note:', note

def topython(nodelist):
  return [node.aspython() for node in nodelist]

def lines_to_tree(lines):
  nodes = []
  for line in lines:
    indent = len(line) - len(line.lstrip())
    marker, body = line.strip().split(None, 1)
    if marker == '*':
      nodes.append(Node(body, indent))
    elif marker == '-':
      nodes[-1].notes.append(body)
    else:
      print>>sys.stderr, "Invalid marker %r" % marker

  tree = Node('', -1)
  curr = tree
  for node in nodes:
    while node.indent <= curr.indent:
      curr = curr.parent
    node.parent = curr
    curr.children.append(node)
    curr = node

  return tree


data = """\
* 1
 * 1.1
 * 1.2
  - Note for 1.2
* 2
* 3
- Note for root
""".splitlines()

def main():
  tree = lines_to_tree(data)
  print_tree(tree)
  print
  alist = topython(tree.children)
  print alist

if __name__ == '__main__':
  main()

:

 1
  1.1
  1.2
  Note: 1.2
 2
 3
 Note: 3

[{'children': [{'children': [], 'title': '1.1'}, {'notes': ['Note for 1.2'], 'children': [], 'title': '1.2'}], 'title': '1'}, {'children': [], 'title': '2'}, {'notes': ['Note for root'], 'children': [], 'title': '3'}]

( dict, ), , , , dict notes , ( , , ).

, , ; , - , ( , ). , , dict , , , ; ( ) , , , node ? , , ( node 0 , 0 1, ).

( ), () 99% , , , / , , !

+6

, , . , dict, . , dict , dict . , , . , dict .

+1

- . node , . - :

import re
line_tokens = re.compile('( *)(\\*|-) (.*)')

def parse_tree(data):
    stack = [{'title': 'Root node', 'children': []}]
    for line in data.split("\n"):
        indent, symbol, content = line_tokens.match(line).groups()        
        while len(indent) + 1 < len(stack):
            stack.pop() # Remove everything up to current parent
        if symbol == '-':
            stack[-1].setdefault('notes', []).append(content)
        elif symbol == '*':
            node = {'title': content, 'children': []}
            stack[-1]['children'].append(node)
            stack.append(node) # Add as the current deepest node
    return stack[0]
+1

The syntax you use is very similar to Yaml. This has some differences, but it is fairly easy to recognize - its focus is on being human readable (and writable).

Take a look at the Yaml website. There are some python bindings, documentation, and other materials there.

0
source

All Articles