Source code for trafilatura.baseline

"""
Module grouping the baseline and basic extraction functions.
"""
# pylint:disable-msg=E0611

import json

from typing import Any, Tuple

from lxml.etree import _Element, Element, SubElement
from lxml.html import HtmlElement

from .settings import BASIC_CLEAN_XPATH
from .utils import load_html, trim
from .xml import delete_element


def basic_cleaning(tree: HtmlElement) -> HtmlElement:
    """Delete every element selected by ``BASIC_CLEAN_XPATH`` from the tree.

    Args:
        tree: parsed HTML tree (or subtree) to clean.

    Returns:
        The same tree object that was passed in.

    """
    unwanted = BASIC_CLEAN_XPATH(tree)
    for node in unwanted:
        delete_element(node)
    return tree


def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
    """Use baseline extraction function targeting text paragraphs and/or JSON metadata.

    Strategies are tried in this order and the first one that succeeds wins:

    1. ``articleBody`` values found in JSON-LD ``<script>`` elements
       (kept if more than 100 characters were collected),
    2. ``<article>`` elements whose text is longer than 100 characters,
    3. de-duplicated text of paragraph-like elements
       (kept if more than 100 characters were collected),
    4. all text nodes found below ``<body>``,
    5. ``html2txt`` on the uncleaned tree as a last resort.

    Each strategy builds its own ``<body>`` element so that a strategy which
    fails to reach its threshold cannot leak paragraphs into the next one.

    Args:
        filecontent: HTML code as binary string or string.

    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.

    """
    tree = load_html(filecontent)
    if tree is None:
        return Element('body'), '', 0

    # scrape from json text
    postbody = Element('body')
    temp_text = ""
    for elem in tree.iterfind('.//script[@type="application/ld+json"]'):
        if not elem.text or 'articleBody' not in elem.text:
            continue
        try:
            json_body = json.loads(elem.text).get("articleBody", "")
        except Exception:  # JSONDecodeError or 'list' object has no attribute 'get'
            # deliberate best-effort: malformed metadata is simply ignored
            continue
        # only textual bodies are usable: a non-string value (number, list,
        # dict...) would otherwise break the substring test below
        if not json_body or not isinstance(json_body, str):
            continue
        if "<p>" in json_body:
            # the metadata embeds HTML markup: parse it and keep the text only
            parsed = load_html(json_body)
            text = trim(parsed.text_content()) if parsed is not None else ""
        else:
            text = trim(json_body)
        SubElement(postbody, 'p').text = text
        temp_text += (" " + text) if temp_text else text
    if len(temp_text) > 100:
        return postbody, temp_text, len(temp_text)

    tree = basic_cleaning(tree)

    # scrape from article tag
    # start from a fresh body: paragraphs left over from a too-short JSON
    # result must not satisfy the emptiness check below
    postbody = Element('body')
    temp_text = ""
    for article_elem in tree.iterfind('.//article'):
        text = trim(article_elem.text_content())
        if len(text) > 100:
            SubElement(postbody, 'p').text = text
            temp_text += (" " + text) if temp_text else text
    if len(postbody) > 0:
        return postbody, temp_text, len(temp_text)

    # scrape from text paragraphs
    # postbody is necessarily empty here, otherwise the function had returned
    results = set()
    temp_text = ""
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        entry = trim(element.text_content())
        if entry not in results:
            SubElement(postbody, 'p').text = entry
            temp_text += (" " + entry) if temp_text else entry
            results.add(entry)
    if len(temp_text) > 100:
        return postbody, temp_text, len(temp_text)

    # default strategy: clean the tree and take everything
    postbody = Element('body')
    body_elem = tree.find('.//body')
    if body_elem is not None:
        p_elem = SubElement(postbody, 'p')
        # todo: sanitize?
        text_elems = [trim(e) for e in body_elem.itertext()]
        p_elem.text = '\n'.join([e for e in text_elems if e])
        return postbody, p_elem.text, len(p_elem.text)

    # new fallback
    text = html2txt(tree, clean=False)
    SubElement(postbody, 'p').text = text
    return postbody, text, len(text)


def html2txt(content: Any, clean: bool = True) -> str:
    """Run basic html2txt on a document.

    The text of the ``<body>`` element is returned with every run of
    whitespace collapsed to a single space.

    Args:
        content: HTML document as string or LXML element.
        clean: remove potentially undesirable elements.

    Returns:
        The extracted text in the form of a string or an empty string.

    """
    document = load_html(content)
    body_node = None if document is None else document.find(".//body")
    if body_node is None:
        # unparsable input or no <body> below the root: nothing to extract
        return ""
    target = basic_cleaning(body_node) if clean else body_node
    # split() without arguments drops leading/trailing whitespace, so the
    # joined result needs no further stripping
    words = target.text_content().split()
    return " ".join(words)