Skip to content
Snippets Groups Projects
Commit 628858e6 authored by Timmy Chan's avatar Timmy Chan
Browse files

Add html table parser

parent e666f238
No related branches found
No related tags found
1 merge request!5Feat/cve linux kernel
.vscode .vscode
/.vs/ /.vs/
vms/* vms/*
lib/* lib/buildroot/
.idea/* .idea/*
!lib/kconfiglib.py
!vms/.keep !vms/.keep
buildroot_setup_config.txt* buildroot_setup_config.txt*
\ No newline at end of file
import re
from abc import ABC
from html.parser import HTMLParser
from datetime import datetime
class TableHTMLParser(HTMLParser, ABC):
    """
    Parse all table data of a HTML document string.

    Feed markup through ``feed()``. Every time a ``</table>`` tag is seen,
    the rows collected so far are appended to ``self.tables`` as a list of
    rows, where each row is a list of cell strings (one per ``<td>``/``<th>``).

    Subclasses customise the result through two hooks:

    * ``handle_current_row`` - transform (or drop, by returning a falsy
      value) a row when its ``</tr>`` is reached.
    * ``handle_starttag_sub`` - observe every start tag before the parser
      processes it.

    Note: rows are collected in a single buffer, so nested tables are not
    tracked separately.
    """

    # Tags whose text content forms a table cell.
    _CELL_TAGS = ('th', 'td')

    def __init__(self):
        super().__init__()
        self.tables = []           # finished tables, in document order
        self._current_table = []   # rows of the table being parsed
        self._current_row = []     # cells of the row being parsed
        self._current_cell = []    # text fragments of the cell being parsed
        self._hierarchy = []       # stack of currently open cell tags

    def handle_starttag(self, tag, attrs):
        """Run the subclass hook, then record an opened cell tag."""
        self.handle_starttag_sub(tag, attrs)
        if tag in self._CELL_TAGS:
            self._hierarchy.append(tag)

    def handle_data(self, data):
        """Collect text that appears inside an open cell."""
        # The stack only ever holds cell tags, so non-empty means "in a cell".
        if not self._hierarchy:
            return
        text = data.strip()
        # Skip whitespace-only fragments (e.g. newlines between inline tags);
        # keeping them would produce doubled spaces in the merged cell text.
        if text:
            self._current_cell.append(text)

    def handle_endtag(self, tag):
        """Close the current cell, row or table depending on ``tag``."""
        if tag in self._CELL_TAGS:
            if not self._hierarchy:
                # Stray closing tag with no matching opener: ignore it
                # instead of failing on an empty stack.
                return
            self._hierarchy.pop()
            # Fragments are already stripped and non-empty, so the joined
            # text has no leading/trailing whitespace.
            self._current_row.append(' '.join(self._current_cell))
            self._current_cell = []
        elif tag == 'tr':
            self._current_row = self.handle_current_row(self._current_row)
            # A falsy result from the hook means "drop this row".
            if self._current_row:
                self._current_table.append(self._current_row)
            self._current_row = []
        elif tag == 'table':
            self.tables.append(self._current_table)
            self._current_table = []

    def handle_current_row(self, current_row):
        """ Override this method for customization """
        return current_row

    def handle_starttag_sub(self, tag, attrs):
        """ Override this method for customization """
        pass
class GccTableHTMLParser(TableHTMLParser, ABC):
    """
    Table parser that reduces each row to ``[version, release_datetime]``.

    The first cell is stripped down to digits and dots to form the version
    string; the second cell must be a date written like ``July 27, 2023``
    (``'%B %d, %Y'``). Rows that do not fit this shape (e.g. header rows)
    are dropped.
    """

    def __init__(self):
        super().__init__()
        # Not read by this class; kept so the attribute stays available to
        # any code that expects it on the instance.
        self._current_cell_datetime_string = None

    def handle_current_row(self, current_row):
        """
        Convert a raw row into ``[version, datetime]``.

        :param current_row: list of cell strings for the finished row
        :return: ``[version, datetime]``, or ``[]`` when the row has fewer
                 than two cells or its second cell is not a valid date
        """
        if len(current_row) < 2:
            return []
        # Keep only digits and dots, e.g. "GCC 10.1" -> "10.1".
        version = re.sub(r'[^0-9.]', "", current_row[0])
        try:
            released = datetime.strptime(current_row[1], '%B %d, %Y')
        except ValueError:
            # Not a date cell (typically a header row): skip the row.
            return []
        return [version, released]
class KernelTableHTMLParser(TableHTMLParser, ABC):
    """
    Table parser that reduces each row to ``[version, release_datetime]``.

    The version comes from the first cell (dashes become dots, then every
    character other than digits, dots and dashes is removed). The date is
    taken from the ``title`` attribute of a ``<span>`` seen while the row
    was being parsed; only the part before the first space is used and it
    must look like ``YYYY-MM-DD``.
    """

    def __init__(self):
        super().__init__()
        # Most recent <span title="..."> value seen since the last row ended.
        self._current_cell_datetime_string = None

    def handle_starttag_sub(self, tag, attrs):
        """Remember the ``title`` attribute of every ``<span>`` start tag."""
        if tag == 'span':
            for name, value in attrs:
                if name == 'title':
                    self._current_cell_datetime_string = value

    def handle_current_row(self, current_row):
        """
        Convert a raw row into ``[version, datetime]``.

        :param current_row: list of cell strings for the finished row
        :return: ``[version, datetime]``, or ``[]`` when the row is empty,
                 no span title was seen for it, or the title is not a date
        """
        # Consume the remembered title so it cannot leak into a later row
        # that has no <span title="..."> of its own.
        raw_date = self._current_cell_datetime_string
        self._current_cell_datetime_string = None

        if not current_row or not raw_date:
            return []

        version = current_row[0].replace("-", ".")
        version = re.sub(r'[^0-9.-]', "", version)
        try:
            # Drop anything after the date part (e.g. a time and UTC offset).
            released = datetime.strptime(raw_date.split(' ')[0], '%Y-%m-%d')
        except ValueError:
            return []
        return [version, released]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment