I ended up implementing a somewhat complex series of comprehensive regular expressions that covered all possible use cases using text “filters” that were replaced with the corresponding regular expressions when loading the parser.
If anyone is interested in the code, I will edit it into this answer.
Here is basically what I used. To build regular expressions from my "language", I had to make replacement classes:
class Replacer(object):
    """Callable that turns one matched placeholder into a regex group.

    An instance is meant to be passed as the replacement function to
    ``re.sub``: it receives the match object for a placeholder (the matched
    text including its one-character prefix) and returns the regular
    expression fragment that replaces it.
    """

    def __call__(self, match):
        """Return the regex group for the placeholder found by *match*.

        Placeholders whose name ends in ``_nm`` (case-insensitive) become a
        non-capturing group; every other placeholder becomes a named group
        whose name is the placeholder text without its first character.
        """
        placeholder = match.group(0)
        name = placeholder[1:]

        # Matcher regexes begin with "(", which is dropped here so that the
        # group opener chosen below can take its place.
        body = Matcher(placeholder).regex[1:]

        if name.lower().endswith('_nm'):
            opener = '(?:'
        else:
            opener = '(?P<' + name + '>'
        return opener + body
Then I created a generic Matcher class that built a regular expression for a specific pattern based on the pattern name:
class Matcher(object):
    """Choose the regular expression that a placeholder name stands for.

    The class attributes hold one regex per kind of value, each with an
    ``*_upper`` variant used when the placeholder is written in upper case.
    Every regex starts with an opening parenthesis; ``Replacer`` strips that
    first character and substitutes its own group opener.

    ``months`` and ``months_short`` are read from the enclosing module scope
    when the class body runs; they must be concatenable sequences of strings
    defined before this class.
    """

    # One capitalised word; apostrophes and hyphens are allowed inside it.
    # The ranges are spelled "A-Z" / "a-z" (a bare "[AZ]" matches only the
    # letters A and Z), and no "|" appears inside the classes because a pipe
    # there is a literal character, not alternation.
    name_component = r"([A-Z][A-Za-z'\-]+|[A-Z][a-z]{2,})"
    name_component_upper = r"([A-Z][A-Z'\-]+|[A-Z]{2,})"

    # NOTE(review): the branch handling one-letter placeholders had no body
    # in the original; a single capital letter (an initial such as "$I") is
    # assumed here -- confirm against the intended grammar.
    initial = r'([A-Z])'

    # Four-digit years 1800-2099.
    year = r'(1[89][0-9]{2}|20[0-9]{2})'
    year_upper = year

    # Two-digit numbers 10-99, or 100-119.
    age = r'([1-9][0-9]|1[01][0-9])'
    age_upper = age

    # Same number range as ``age`` followed by an ordinal suffix.
    ordinal = r'([1-9][0-9]|1[01][0-9])\s*(?:th|rd|nd|st|TH|RD|ND|ST)'
    ordinal_upper = ordinal

    # Month names as given plus their upper-cased forms, joined once.
    _month_alternatives = '|'.join(months + months_short)

    # "<month> <day>, <year>", "<day> <month> <year>", or a numeric date
    # with "-", "/" or "." as separator.
    date = (
        r'((?:{0})\.? [0-9]{{1,2}}(?:th|rd|nd|st|TH|RD|ND|ST)?,? \d{{2,4}}'
        r'|[0-9]{{1,2}} (?:{0}),? \d{{2,4}}'
        r'|[0-9]{{1,2}}[\-/\.][0-9]{{1,2}}[\-/\.][0-9]{{2,4}})'
    ).format(_month_alternatives + '|' + _month_alternatives.upper())
    date_upper = date

    # Placeholder names that map directly onto a class attribute.
    matchers = [
        'name_component',
        'year',
        'age',
        'ordinal',
        'date',
    ]

    def __init__(self, match=''):
        """Set ``self.regex`` for the placeholder text *match*.

        *match* is the placeholder as written, including its one-character
        prefix. The lookup is case-insensitive, but an all-upper-case
        placeholder selects the ``*_upper`` variant of the regex. A trailing
        ``_instant`` on the name is ignored.
        """
        suffix = '_upper' if match.isupper() else ''
        name = match.lower()[1:]  # drop the prefix character
        if name.endswith('_instant'):
            name = name[:-len('_instant')]

        if name in self.matchers:
            self.regex = getattr(self, name + suffix)
        elif len(name) == 1:
            # One-letter placeholder; see the review note on ``initial``.
            self.regex = self.initial
        elif 'year' in name:
            self.regex = self.year
        else:
            # Anything unrecognised is treated as a name component.
            self.regex = getattr(self, 'name_component' + suffix)
Finally, there is a generic Pattern object:
class Pattern(object):
    """A text pattern with ``$name`` placeholders, expanded into a regex.

    Attributes:
        text: the pattern text exactly as given.
        regex: the resulting regular expression, as a string.
        size: number of ``$name`` placeholders found in the pattern.
        matchers: an empty list (not used by this class itself).
    """

    # A placeholder is "$" followed by letters, digits, "-" or "_".
    _PLACEHOLDER = r'(\$[A-Za-z0-9\-_]+)'
    # Characters that are backslash-escaped when the text is literal.
    _SPECIAL_CHARS = r'([\[\].?+\-()\^\\])'

    def __init__(self, text='', escape=None):
        """Build ``self.regex`` from *text*.

        Args:
            text: pattern text. A leading ``!`` marks it as a raw regular
                expression; the ``!`` itself is dropped in raw mode.
            escape: ``None`` (default) escapes special characters unless
                *text* starts with ``!``. ``True`` or ``False`` forces
                escaping on or off. Previously any explicit value was
                treated as ``False`` and the first character of *text* was
                dropped unconditionally.
        """
        self.text = text
        self.matchers = []

        has_raw_marker = self.text.startswith('!')
        if escape is None:
            escape = not has_raw_marker

        if escape:
            regex = re.sub(self._SPECIAL_CHARS, r'\\\1', self.text)
        elif has_raw_marker:
            regex = self.text[1:]
        else:
            # Raw mode requested explicitly: nothing to strip.
            regex = self.text

        self.size = len(re.findall(self._PLACEHOLDER, regex))
        if self.size:
            # Only build a Replacer when there is something to replace.
            regex = re.sub(self._PLACEHOLDER, Replacer(), regex)

        # Any run of whitespace in the pattern matches any run in the input.
        self.regex = re.sub(r'\s+', r'\\s+', regex)

    def search(self, text):
        """Return the first match of the pattern in *text*, or ``None``."""
        return re.search(self.regex, text)

    def findall(self, text, max_depth=1.0):
        """Return the groups of every match that starts early enough.

        A match is kept when ``start / len(text)`` is strictly less than
        *max_depth*. The groups of all kept matches are concatenated into
        one flat list.
        """
        length = float(len(text))
        results = []
        for found in re.finditer(self.regex, text):
            # Empty text can still yield an empty match at position 0;
            # treat it as depth 0 instead of dividing by zero.
            depth = found.start() / length if length else 0.0
            if depth < max_depth:
                results.extend(found.groups())
        return results

    def match(self, text):
        """Return ``(groupdict, start)`` for every match in *text*.

        Always returns a list (empty when nothing matches), rather than a
        lazy ``map`` object on Python 3.
        """
        return [(found.groupdict(), found.start())
                for found in re.finditer(self.regex, text)]
It was pretty complicated, but it worked. I will not publish all of the source code, but this should be enough to get someone started. As a result, it converted a pattern such as the following:
$LASTNAME, $FirstName $I. said on $date
into a compiled regex with named capture groups.