How to extract chunks from BIO-tagged sentences? - python

Give an input sentence with BIO tags :

[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]

I would need to extract relevant phrases, for example. if I want to extract 'NP' , I will need to extract fragments of tuples containing B-NP and I-NP .

Expected output:

 [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')] 

(Note: the numbers in the extracted tuples are the token indices.)

I tried to extract it using the following code:

 def extract_chunks(tagged_sent, chunk_type): current_chunk = [] current_chunk_position = [] for idx, word_pos in enumerate(tagged_sent): word, pos = word_pos if '-'+chunk_type in pos: # Append the word to the current_chunk. current_chunk.append((word)) current_chunk_position.append((idx)) else: if current_chunk: # Flush the full chunk when out of an NP. _chunk_str = ' '.join(current_chunk) _chunk_pos_str = '-'.join(map(str, current_chunk_position)) yield _chunk_str, _chunk_pos_str current_chunk = [] current_chunk_position = [] if current_chunk: # Flush the last chunk. yield ' '.join(current_chunk), '-'.join(current_chunk_position) tagged_sent = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')] print (list(extract_chunks(tagged_sent, chunk_type='NP'))) 

But when I have adjacent chunks of the same type:

 tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')] print (list(extract_chunks(tagged_sent, chunk_type='NP'))) 

It outputs this:

 [('The Mitsubishi Electric Company Managing Director', '0-1-2-3-4-5'), ('ramen', '7')] 

Instead of the desired:

 [('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')] 

How can this be solved from the above code?

Besides how this is done from the above code, is there a better solution for extracting the desired fragments of a particular chunk_type ?

+8
python list text-parsing nlp
source share
3 answers
 def extract_chunks(tagged_sent, chunk_type): grp1, grp2, chunk_type = [], [], "-" + chunk_type for ind, (s, tp) in enumerate(tagged_sent): if tp.endswith(chunk_type): if not tp.startswith("B"): grp2.append(str(ind)) grp1.append(s) else: if grp1: yield " ".join(grp1), "-".join(grp2) grp1, grp2 = [s], [str(ind)] yield " ".join(grp1), "-".join(grp2) 

Output:

 In [2]: l = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ...: ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')] In [3]: list(extract_chunks(l, "NP")) Out[3]: [('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')] In [4]: l = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')] In [5]: list(extract_chunks(l, "NP")) Out[5]: [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')] 
+1
source share

Try this, it extracts all chunks of the requested type together with the indices of their corresponding words.

 def extract_chunks(tagged_sent, chunk_type='NP'): out_sen = [] for idx, word_pos in enumerate(tagged_sent): word,bio = word_pos boundary,tag = bio.split("-") if "-" in bio else ('','O') if tag != chunk_type:continue if boundary == "B": out_sen.append([word, str(idx)]) elif boundary == "I": out_sen[-1][0] += " "+ word out_sen[-1][-1] += "-"+ str(idx) else: out_sen.append([word, str(idx)]) return out_sen 

Demo:

 >>> tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')] >>> output_sent = extract_chunks(tagged_sent) >>> print map(tuple, output_sent) [('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')] 
+2
source share

I would do it like this:

 import re def extract_chunks(tagged_sent, chunk_type): # compiles the expression we want to match regex = re.compile(chunk_type) # filters matched items in a dictionary whose keys are the matched indexes first_step = {index_:tag[0] for index_, tag in enumerate(tagged_sent) if regex.findall(tag[1])} # builds list of lists following output format second_step = [] for key_ in sorted(first_step.keys()): if second_step and int(second_step [len(second_step )-1][1].split('-')[-1]) == key_ -1: second_step[len(second_step)-1][0] += ' {0}'.format(first_step[key_]) second_step[len(second_step)-1][1] += '-{0}'.format(str(key_)) else: second_step.append([first_step[key_], str(key_)]) # builds output in final format return [tuple(item) for item in second_step] 

You can adapt it to use generators instead of building all the output in memory, as I do, and restore it for better performance (I'm in a hurry, so the code is far from optimal).

Hope this helps!

0
source share

All Articles