# Source code for trafilatura.baseline
"""
Module regrouping baseline and basic extraction functions.
"""
# pylint:disable-msg=E0611
import json
from typing import Any, Tuple
from lxml.etree import _Element, Element, SubElement
from lxml.html import HtmlElement
from .settings import BASIC_CLEAN_XPATH
from .utils import load_html, trim
from .xml import delete_element
def basic_cleaning(tree: HtmlElement) -> HtmlElement:
    """Strip a predefined set of unwanted sections from the document tree.

    The sections to remove are selected by the precompiled
    ``BASIC_CLEAN_XPATH`` expression; the (mutated) tree is returned.
    """
    for unwanted in BASIC_CLEAN_XPATH(tree):
        delete_element(unwanted)
    return tree
def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
    """Use baseline extraction function targeting text paragraphs and/or JSON metadata.

    Args:
        filecontent: HTML code as binary string or string.

    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.
    """
    tree = load_html(filecontent)
    postbody = Element('body')
    if tree is None:
        return postbody, '', 0

    # Strategy 1: JSON-LD metadata — the "articleBody" field often holds the full text.
    temp_text = ""
    for elem in tree.iterfind('.//script[@type="application/ld+json"]'):
        if elem.text and 'articleBody' in elem.text:
            try:
                json_body = json.loads(elem.text).get("articleBody", "")
            except Exception:  # JSONDecodeError or 'list' object has no attribute 'get'
                json_body = ""
            if json_body:
                if "<p>" in json_body:
                    # articleBody may itself contain HTML markup
                    parsed = load_html(json_body)
                    text = trim(parsed.text_content()) if parsed is not None else ""
                else:
                    text = trim(json_body)
                SubElement(postbody, 'p').text = text
                temp_text += " " + text if temp_text else text
    if len(temp_text) > 100:
        return postbody, temp_text, len(temp_text)

    tree = basic_cleaning(tree)

    # Strategy 2: <article> tags
    temp_text = ""
    for article_elem in tree.iterfind('.//article'):
        text = trim(article_elem.text_content())
        if len(text) > 100:
            SubElement(postbody, 'p').text = text
            temp_text += " " + text if temp_text else text
    if len(postbody) > 0:
        return postbody, temp_text, len(temp_text)

    # Strategy 3: text paragraphs and quotes, deduplicated
    results = set()
    temp_text = ""
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        entry = trim(element.text_content())
        # Fix: skip empty segments — previously an empty entry created an
        # empty <p> placeholder and later empty entries appended stray spaces.
        if entry and entry not in results:
            SubElement(postbody, 'p').text = entry
            temp_text += " " + entry if temp_text else entry
            results.add(entry)
    if len(temp_text) > 100:
        return postbody, temp_text, len(temp_text)

    # Default strategy: clean the tree and take all the text in <body>
    postbody = Element('body')
    body_elem = tree.find('.//body')
    if body_elem is not None:
        p_elem = SubElement(postbody, 'p')
        # todo: sanitize?
        text_elems = [trim(e) for e in body_elem.itertext()]
        p_elem.text = '\n'.join([e for e in text_elems if e])
        return postbody, p_elem.text, len(p_elem.text)

    # Last-resort fallback: raw text of the whole document
    text = html2txt(tree, clean=False)
    SubElement(postbody, 'p').text = text
    return postbody, text, len(text)
def html2txt(content: Any, clean: bool = True) -> str:
    """Run basic html2txt on a document.

    Args:
        content: HTML document as string or LXML element.
        clean: remove potentially undesirable elements.

    Returns:
        The extracted text in the form of a string or an empty string.
    """
    parsed = load_html(content)
    # bail out early when parsing fails or the document has no <body>
    body = parsed.find(".//body") if parsed is not None else None
    if body is None:
        return ""
    target = basic_cleaning(body) if clean else body
    # collapse any whitespace runs into single spaces
    return " ".join(target.text_content().split()).strip()