What if <tr> has rowspan
If a row contains a cell with a rowspan attribute, how can I make each parsed row line up with the table as it is displayed on the Wikipedia page?
from bs4 import BeautifulSoup
import urllib2
from lxml.html import fromstring
import re
import csv
import pandas as pd
# Fetch the Wikipedia page and load the 7th <table> on it into a DataFrame.
wiki = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(req)
# Name the parser explicitly; lxml is what bs4 picks by default when it is
# installed, so the table order on the page stays the same.
soup = BeautifulSoup(page, 'lxml')

# Indexing a list raises IndexError (never AttributeError), and there is no
# point carrying on without a table, so stop with a message instead of
# falling through to a NameError further down.
try:
    table = soup.find_all('table')[6]
except IndexError:
    raise SystemExit('No tables found, exiting')

rows = table.find_all('tr')
if not rows:
    raise SystemExit('No table row found, exiting')

first = rows[0]        # header row
allRows = rows[1:-1]   # body rows; the last <tr> is deliberately left out

# Use a loop name other than `header`: on Python 2 a list-comprehension
# variable leaks and would overwrite the request-header dict above.
headers = [cell.get_text() for cell in first.find_all(['th', 'td'])]
# Each row only collects its own cells, so a row that is covered by a rowspan
# cell from an earlier row ends up with fewer entries than `headers`.
results = [[cell.get_text() for cell in row.find_all(['th', 'td'])] for row in allRows]
df = pd.DataFrame(data=results, columns=headers)
df  # bare expression: only displays the frame in an interactive session
This gives me the table as expected, but for tables where a row contains a cell with a rowspan attribute, the result comes out as follows:

+4
3 answers
The problem is due to the following case, as you know,
html content:
<tr>
<td rowspan="2">2=</td>
<td>West Indies</td>
<td>4</td>
<td>Lord's</td>
<td>2009</td>
</tr>
<tr>
<td style="text-align:left;">India</td>
<td>4</td>
<td>Mumbai</td>
<td>2012</td>
</tr>
Therefore, when a td has a rowspan attribute, the same td value is repeated for the following tr rows at the same position, and the rowspan value is the number of consecutive tr rows the cell covers (including the row it appears in).
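Algorithm: find every td that has a rowspan attribute and repeat its text at the same column position in each of the following tr rows that it spans.
Code: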
from bs4 import BeautifulSoup
import urllib2
from lxml.html import fromstring
import re
import csv
import pandas as pd
# Fetch the Wikipedia page and load the 7th <table> into a DataFrame,
# repeating the text of every rowspan cell in the rows it covers.
wiki = "http://en.wikipedia.org/wiki/List_of_England_Test_cricket_records"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(req)
# Name the parser explicitly; lxml is what bs4 picks by default when it is
# installed, so the table order on the page stays the same.
soup = BeautifulSoup(page, 'lxml')

table = soup.find_all('table')[6]
tmp = table.find_all('tr')
first = tmp[0]
allRows = tmp[1:-1]

# Use a loop name other than `header`: on Python 2 a list-comprehension
# variable leaks and would overwrite the request-header dict above.
headers = [cell.get_text() for cell in first.find_all('th')]

# <td rowspan="2">2=</td> must also show up in the row below it.
# carried maps a column index to (rows still to fill, text) for every rowspan
# cell that is still "open" from an earlier row.
# e.g. after the row holding the cell above: {0: (1, u'2=')}
carried = {}
results = []
for tr in allRows:
    cells = tr.find_all('td')
    row = []
    next_cell = 0
    col = 0
    # Walk the columns left to right until this row's own cells are used up
    # and no carried cell remains to the right of the current column.
    while next_cell < len(cells) or any(c >= col for c in carried):
        if col in carried:
            # This column is occupied by a rowspan cell from a row above.
            rows_left, text = carried.pop(col)
            if rows_left > 1:
                carried[col] = (rows_left - 1, text)
        elif next_cell < len(cells):
            cell = cells[next_cell]
            next_cell += 1
            text = cell.get_text()
            span = int(cell['rowspan']) if cell.has_attr('rowspan') else 1
            if span > 1:
                # The cell's own row counts as one of the spanned rows.
                carried[col] = (span - 1, text)
        else:
            # The row ran out of cells before reaching a carried column.
            text = ''
        row.append(text)
        col += 1
    results.append(row)

df = pd.DataFrame(data=results, columns=headers)
print(df)
:
Rank Opponent No. wins Most recent venue Season
0 1 South Africa 6 Lord 1951
1 2= West Indies 4 Lord 2009
2 2= India 4 Mumbai 2012
3 4 Australia 3 Sydney 1932
4 5 Pakistan 2 Trent Bridge 1967
5 6 Sri Lanka 1 Old Trafford 2002
10
Rank Hundreds Player Matches Innings Average
0 1 25 Alastair Cook 107 191 45.61
1 2 23 Kevin Pietersen 104 181 47.28
2 3 22 Colin Cowdrey 114 188 44.07
3 3 22 Wally Hammond 85 140 58.46
4 3 22 Geoffrey Boycott 108 193 47.72
5 6 21 Andrew Strauss 100 178 40.91
6 6 21 Ian Bell 103 178 45.30
7 8= 20 Ken Barrington 82 131 58.67
8 8= 20 Graham Gooch 118 215 42.58
9 10 19 Len Hutton 79 138 56.67
+2
, stackoverflow , - . , , , . .
:
def pre_process_table(table):
    """
    Collect the rows of a table and work out its dimensions.

    INPUT:
        1. table - a bs4 element that contains the desired table: ie <table> ... </table>
    OUTPUT:
        a tuple of:
            1. rows - a list of table rows ie: list of <tr>...</tr> elements
            2. num_rows - number of rows in the table
            3. num_cols - number of columns in the table, counting a cell
               with colspan=N as N columns; 0 when the table has no cells
    """
    rows = list(table.find_all('tr'))
    num_rows = len(rows)
    # Look up the cells of every row once; both steps below need them.
    cells_per_row = [row.find_all(['th', 'td']) for row in rows]
    if not cells_per_row:
        return (rows, 0, 0)

    # Initial estimate: the largest number of cell elements in any row.
    most_cells = max(len(cells) for cells in cells_per_row)
    # Cells may carry a colspan, so re-count the width of every row that has
    # more than half of that number of cells, this time adding up colspans.
    widths = [
        sum(get_spans(cell)[1] for cell in cells)
        for cells in cells_per_row
        if len(cells) > most_cells / 2
    ]
    # No row qualifies when none of the rows contains a cell.
    num_cols = max(widths) if widths else 0
    return (rows, num_rows, num_cols)
def get_spans(cell):
    """
    Read the row and column span of a table cell.

    INPUT:
        1. cell - a <td>...</td> or <th>...</th> element that contains a table cell entry
    OUTPUT:
        1. a tuple (rowspan, colspan); a missing attribute counts as 1
    """
    spans = tuple(
        int(cell.attrs[name]) if cell.has_attr(name) else 1
        for name in ('rowspan', 'colspan')
    )
    return spans
def process_rows(rows, num_rows, num_cols):
    """
    Lay the cells of the rows out on a num_rows x num_cols grid.

    A cell with rowspan/colspan has its text repeated in every grid position
    it covers.

    INPUT:
        1. rows - a list of table rows ie <tr>...</tr> elements
        2. num_rows - number of rows of the grid
        3. num_cols - number of columns of the grid
    OUTPUT:
        1. data - a Pandas dataframe with the html data in it; positions no
           cell covers stay NaN
    """
    # Object dtype: the grid starts out as NaN but is filled with text.
    data = pd.DataFrame(index=range(num_rows), columns=range(num_cols), dtype=object)
    for i, row in enumerate(rows):
        free_cols = data.columns[data.iloc[i].isnull()]
        if len(free_cols) == 0:
            # Every column of this row is already filled by spans from
            # earlier rows, so there is nowhere left to put its cells.
            continue
        col_stat = int(free_cols[0])
        for cell in row.find_all(['td', 'th']):
            rep_row, rep_col = get_spans(cell)
            # Move right past columns already filled by a span from an
            # earlier row until the cell's whole width is free.
            while data.iloc[i, col_stat:col_stat + rep_col].notnull().any():
                col_stat += 1
            data.iloc[i:i + rep_row, col_stat:col_stat + rep_col] = cell.getText()
            if col_stat < data.shape[1] - 1:
                col_stat += rep_col
    return data
def main(table):
    """
    Convert a bs4 <table> element into a Pandas dataframe.

    Runs pre_process_table on the table and hands its three results
    (rows, num_rows, num_cols) straight to process_rows.
    """
    return process_rows(*pre_process_table(table))
, Wisconsin. , bs4, ...
## Find tables on the page and locate the desired one:
## (find_all is the current bs4 spelling; findAll is the deprecated alias)
tables = soup.find_all("table", class_='wikitable')
## I want table 3 or the one that contains years 2000-2018
## (index 3, i.e. the fourth table with class "wikitable")
table = tables[3]
## run the above functions to extract the data
rows, num_rows, num_cols = pre_process_table(table)
df = process_rows(rows, num_rows, num_cols)
-
, rowspan. Pandas read_html, html , , rowspan ( Wisconsin). fillna(method='ffill') . , . , .
html-:
s = """<table width="100%" border="1">
<tr>
<td rowspan="1">one</td>
<td rowspan="2">two</td>
<td rowspan="3">three</td>
</tr>
<tr><td>"4"</td></tr>
<tr>
<td>"55"</td>
<td>"99"</td>
</tr>
</table>
"""
, :
In [16]: df = pd.read_html(s)[0]
In [29]: df
Out[29]:
0 1 2
0 one two three
1 "4" NaN NaN
2 "55" "99" NaN
, NA,
In [30]: df.fillna(method='ffill')
Out[30]:
0 1 2
0 one two three
1 "4" two three
2 "55" "99" three
+2
:
<html>
<body>
<table width="100%" border="1">
<tr>
<td rowspan="2">one</td>
<td>two</td>
<td>three</td>
</tr>
<tr>
<td colspan="2">February</td>
</tr>
</table>
</body>
</html>
:
one two three
one February February
python:
# !/bin/python3
# coding: utf-8
from bs4 import BeautifulSoup
class Element(object):
    """One table cell: its position, its text and how far it spans."""

    def __init__(self, row, col, text, rowspan=1, colspan=1):
        # Position of the cell and its text content.
        self.row, self.col, self.text = row, col, text
        # Number of rows / columns the cell covers (1 means no spanning).
        self.rowspan, self.colspan = rowspan, colspan

    def __repr__(self):
        # Dict-like dump of all five attributes; values are written as-is,
        # so the text is not quoted.
        names = ('row', 'col', 'text', 'rowspan', 'colspan')
        body = ', '.join(f'"{name}": {getattr(self, name)}' for name in names)
        return '{' + body + '}'

    def isRowspan(self):
        """Return True when the cell covers more than one row."""
        return self.rowspan > 1

    def isColspan(self):
        """Return True when the cell covers more than one column."""
        return self.colspan > 1
def parse(h) -> [[]]:
    """
    Parse HTML and expand rowspan/colspan cells into one Element per position.

    h is the HTML text. Every <tr> found in the document becomes one list of
    Element objects; a cell with colspan/rowspan is copied into the
    neighbouring columns / following rows it covers. Returns the list of rows.
    """
    doc = BeautifulSoup(h, 'html.parser')
    trs = doc.select('tr')
    m = []
    for row, tr in enumerate(trs):  # collect Node, rowspan node, colspan node
        it = []
        ts = tr.find_all(['th', 'td'])
        for col, tx in enumerate(ts):
            element = Element(row, col, tx.text.strip())
            if tx.has_attr('rowspan'):
                element.rowspan = int(tx['rowspan'])
            if tx.has_attr('colspan'):
                element.colspan = int(tx['colspan'])
            it.append(element)
        m.append(it)

    def solveColspan(ele):
        # Put a copy with one column less to span right after the cell; the
        # main loop reaches that copy next and keeps expanding it.
        row, col, text, rowspan, colspan = ele.row, ele.col, ele.text, ele.rowspan, ele.colspan
        m[row].insert(col + 1, Element(row, col, text, rowspan, colspan - 1))
        # Everything from the inserted copy onwards moved one column right.
        for column in range(col + 1, len(m[row])):
            m[row][column].col += 1

    def solveRowspan(ele):
        # Put a copy with one row less to span into the next row; the main
        # loop expands it further when it gets to that row.
        row, col, text, rowspan, colspan = ele.row, ele.col, ele.text, ele.rowspan, ele.colspan
        offset = row + 1
        if offset >= len(m):
            # The rowspan reaches past the last <tr>: nothing left to fill.
            return
        m[offset].insert(col, Element(offset, col, text, rowspan - 1, 1))
        # Cells after the inserted copy moved one column right.
        for column in range(col + 1, len(m[offset])):
            m[offset][column].col += 1

    # The rows are deliberately modified while being iterated: copies
    # inserted by the helpers are visited later by these same loops.
    for row in m:
        for ele in row:
            if ele.isColspan():
                solveColspan(ele)
            if ele.isRowspan():
                solveRowspan(ele)
    return m
def prettyPrint(m):
    """
    Print one summary line per row of the matrix.

    Each line starts with the number of cells in the row, followed by the
    index and the first four characters of every cell whose text is not
    empty, all joined by ' --- '.
    """
    for cells in m:
        parts = [str(len(cells))]
        parts.extend(
            f'{pos:2} {cell.text[:4]:4}'
            for pos, cell in enumerate(cells)
            if cell.text != ''
        )
        print(' --- '.join(parts))
# Read the saved page as raw bytes, then decode it as UTF-8 text.
with open('./index.html', 'rb') as f:
    index = f.read()
html = index.decode('utf-8')
# Expand the table cells into a matrix of rows and print a one-line summary of each row.
matrix = parse(html)
prettyPrint(matrix)
+1