Source code for trafilatura.core

# pylint:disable-msg=E0611,I1101
"""
Extraction configuration and processing functions.
"""

import logging
import warnings
from configparser import ConfigParser
from copy import copy, deepcopy
from typing import Any

from lxml.etree import Element, XPath, _Element, strip_tags
from lxml.html import HtmlElement

# own
from .baseline import baseline
from .deduplication import content_fingerprint, duplicate_test
from .external import compare_extraction
from .htmlprocessing import (
    build_html_output,
    convert_tags,
    prune_unwanted_nodes,
    tree_cleaning,
)
from .main_extractor import extract_comments, extract_content
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, Extractor, use_config
from .utils import (
    LANGID_FLAG,
    check_html_lang,
    language_filter,
    load_html,
    normalize_unicode,
)
from .xml import build_json_output, control_xml_output, xmltocsv, xmltotxt
from .xpaths import REMOVE_COMMENTS_XPATH

LOGGER = logging.getLogger(__name__)

TXT_FORMATS = {"markdown", "txt"}


def determine_returnstring(document: Document, options: Extractor) -> str:
    """Serialize the extracted document into the requested output format.

    Dispatches on ``options.format``: XML/TEI, CSV, JSON and HTML are handed to
    their dedicated builders; anything else is rendered as plain text or
    Markdown, optionally preceded by a metadata header. The resulting string
    is passed through ``normalize_unicode`` before being returned.
    """
    output_format = options.format

    if "xml" in output_format:
        # final pass: drop leaf elements carrying neither text nor tail,
        # sparing <graphic> nodes and direct children of <code>
        for node in document.body.iter("*"):
            if node.tag == "graphic" or len(node) > 0 or node.text or node.tail:
                continue
            container = node.getparent()
            if container is None or container.tag == "code":
                continue
            container.remove(node)
        rendered = control_xml_output(document, options)
    elif output_format == "csv":
        rendered = xmltocsv(document, options.formatting)
    elif output_format == "json":
        rendered = build_json_output(document, options.with_metadata)
    elif output_format == "html":
        rendered = build_html_output(document, options.with_metadata)
    else:
        # Markdown and TXT: optional metadata header followed by the text
        header = ""
        if options.with_metadata:
            header_fields = (
                "title",
                "author",
                "url",
                "hostname",
                "description",
                "sitename",
                "date",
                "categories",
                "tags",
                "fingerprint",
                "id",
                "license",
            )
            # only fields holding a truthy value make it into the header
            entries = [
                f"{field}: {value!s}\n"
                for field in header_fields
                if (value := getattr(document, field))
            ]
            header = "---\n" + "".join(entries) + "---\n"
        main_text = xmltotxt(document.body, options.formatting)
        rendered = f"{header}{main_text}"
        if document.commentsbody is not None:
            comments_text = xmltotxt(document.commentsbody, options.formatting)
            rendered = f"{rendered}\n{comments_text}".strip()

    # normalize Unicode format (defaults to NFC)
    return normalize_unicode(rendered)


def trafilatura_sequence(
    cleaned_tree: HtmlElement,
    cleaned_tree_backup: HtmlElement,
    tree_backup: HtmlElement,
    options: Extractor,
) -> tuple[_Element, str, int]:
    "Run the extractor cascade: main extractor, optional comparison, baseline rescue."
    # step 1: Trafilatura's own extractor on the cleaned tree
    body, text, size = extract_content(cleaned_tree, options)

    # step 2: unless in fast mode, compare against the external extractors
    use_comparison = not options.fast
    if use_comparison:
        body, text, size = compare_extraction(
            cleaned_tree_backup,
            deepcopy(tree_backup),
            body,
            text,
            size,
            options,
        )

    # step 3: result too short and precision not favored,
    # so fall back to the baseline on a copy of the original tree
    too_short = size < options.min_extracted_size
    if too_short and options.focus != "precision":
        body, text, size = baseline(deepcopy(tree_backup))
        LOGGER.debug("non-clean extracted length: %s (extraction)", size)

    return body, text, size


def bare_extraction(
    filecontent: Any,
    url: str | None = None,
    fast: bool = False,
    no_fallback: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "python",
    target_language: str | None = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: dict[str, Any] | None = None,
    with_metadata: bool = False,
    only_with_metadata: bool = False,
    max_tree_size: int | None = None,
    url_blacklist: set[str] | None = None,
    author_blacklist: set[str] | None = None,
    as_dict: bool = False,
    prune_xpath: str | list[str] | None = None,
    config: ConfigParser = DEFAULT_CONFIG,
    options: Extractor | None = None,
) -> Document | dict[str, Any] | None:
    """Internal function for text extraction returning bare Python variables.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        fast: Use faster heuristics and skip backup extraction.
        no_fallback: Deprecated, use "fast" instead.
        favor_precision: prefer less text but correct extraction.
        favor_recall: prefer more text even when unsure.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format, Python being the default
            and the interest of this internal function.
            Other values: "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (present in XML format, converted to markdown otherwise).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        with_metadata: Extract metadata fields and add them to the output.
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        max_tree_size: Removed; any truthy value raises ValueError
            (use settings.cfg instead).
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        as_dict: Deprecated, use the .as_dict() method instead.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            Can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration. When a valid
            Extractor is given, the individual option arguments above are not
            used to build one.

    Returns:
        A Document holding the extracted information (or its dict form when
        "as_dict" is set), or None if the input was discarded.

    Raises:
        ValueError: If the removed "max_tree_size" argument is set. Extraction
            problems are logged and signalled by returning None instead.

    """
    # deprecations: stacklevel=3 → user → bare_extraction → _check_deprecation
    fast = _check_deprecation(
        fast,
        no_fallback=no_fallback,
        as_dict=as_dict,
        max_tree_size=max_tree_size,
        stacklevel=3,
    )

    # regroup extraction options
    if not options or not isinstance(options, Extractor):
        options = Extractor(
            config=config,
            output_format=output_format,
            fast=fast,
            precision=favor_precision,
            recall=favor_recall,
            comments=include_comments,
            formatting=include_formatting,
            links=include_links,
            images=include_images,
            tables=include_tables,
            dedup=deduplicate,
            lang=target_language,
            url=url,
            with_metadata=with_metadata,
            only_with_metadata=only_with_metadata,
            author_blacklist=author_blacklist,
            url_blacklist=url_blacklist,
            date_params=date_extraction_params,
        )

    # every "raise ValueError" below is a deliberate early exit which is
    # caught at the end of the try block and turned into a None result
    try:
        # load the HTML tree
        tree = load_html(filecontent)
        if tree is None:
            LOGGER.error("empty HTML tree: %s", url)
            raise ValueError

        # quick and dirty HTML lang check
        if options.lang and (options.fast or not LANGID_FLAG):
            if check_html_lang(tree, options.lang) is False:
                LOGGER.error("wrong HTML meta language: %s", options.source)
                raise ValueError

        # extract metadata if necessary
        if options.with_metadata:
            document = extract_metadata(
                tree,
                options.url,
                options.date_params,
                options.fast,
                options.author_blacklist,
            )

            # cut short if extracted URL in blacklist
            if document.url in options.url_blacklist:
                LOGGER.warning("blacklisted URL: %s", document.url)
                raise ValueError

            # cut short if core elements are missing
            if options.only_with_metadata and not (document.date and document.title and document.url):
                LOGGER.error("no metadata: %s", options.source)
                raise ValueError

        else:
            document = Document()

        # prune all xpath expressions that user specified
        # no backup as this is under full control of the user
        if prune_xpath is not None:
            if isinstance(prune_xpath, str):
                prune_xpath = [prune_xpath]
            tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])

        # clean and backup for further processing
        cleaned_tree = tree_cleaning(copy(tree), options)
        cleaned_tree_backup = copy(cleaned_tree)

        # convert tags, the rest does not work without conversion
        cleaned_tree = convert_tags(cleaned_tree, options, options.url or document.url)

        # comments first, then remove
        if options.comments:
            commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, options)
        else:
            commentsbody, temp_comments, len_comments = Element("body"), "", 0
        if options.focus == "precision":
            cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

        postbody, temp_text, len_text = trafilatura_sequence(cleaned_tree, cleaned_tree_backup, tree, options)

        # tree size sanity check
        if options.max_tree_size:
            # strip tags
            if len(postbody) > options.max_tree_size:
                LOGGER.debug("output tree too long: %s", len(postbody))
                strip_tags(postbody, "hi")
            # still too long, raise an error
            if len(postbody) > options.max_tree_size:
                LOGGER.debug(
                    "output tree too long: %s, discarding %s",
                    len(postbody),
                    options.source,
                )
                raise ValueError

        # size checks: too few comments is only logged, the document is
        # discarded when both text and comments are below their output minimum
        if options.comments and len_comments < options.min_extracted_comm_size:
            LOGGER.debug("not enough comments: %s", options.source)
        if len_text < options.min_output_size and len_comments < options.min_output_comm_size:
            LOGGER.debug(
                "text and comments not long enough: %s %s %s",
                len_text,
                len_comments,
                options.source,
            )
            raise ValueError

        # check duplicates at body level
        if options.dedup and duplicate_test(postbody, options) is True:
            LOGGER.debug("discarding duplicate document: %s", options.source)
            raise ValueError

        # sanity check on language
        if options.lang:
            is_not_target_lang, document = language_filter(temp_text, temp_comments, options.lang, document)
            if is_not_target_lang is True:
                LOGGER.debug("wrong language: %s", options.source)
                raise ValueError

    except (TypeError, ValueError):
        LOGGER.warning("discarding data: %s", options.source)
        return None

    # special case: python variables
    if options.format == "python":
        document.text = xmltotxt(postbody, options.formatting)
        if options.comments:
            document.comments = xmltotxt(commentsbody, options.formatting)
            document.commentsbody = commentsbody
        document.raw_text = document.text
    else:
        document.raw_text, document.commentsbody = temp_text, commentsbody
    document.body = postbody

    return document if not as_dict else document.as_dict()
def extract(
    filecontent: Any,
    url: str | None = None,
    record_id: str | None = None,
    fast: bool = False,
    no_fallback: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "txt",
    tei_validation: bool = False,
    target_language: str | None = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: dict[str, Any] | None = None,
    with_metadata: bool = False,
    only_with_metadata: bool = False,
    max_tree_size: int | None = None,
    url_blacklist: set[str] | None = None,
    author_blacklist: set[str] | None = None,
    settingsfile: str | None = None,
    prune_xpath: str | list[str] | None = None,
    config: ConfigParser = DEFAULT_CONFIG,
    options: Extractor | None = None,
) -> str | None:
    """Main function exposed by the package:
       Wrapper for text extraction and conversion to chosen output format.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        no_fallback: Deprecated, use "fast" instead.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        with_metadata: Extract metadata fields and add them to the output.
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        max_tree_size: Removed; any truthy value raises ValueError
            (use settings.cfg instead).
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            Can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A string in the desired format or None.

    Raises:
        ValueError: If "max_tree_size" is set, or if the "python" output
            format is requested (only usable in bare_extraction()).

    """
    document = _internal_extraction(
        filecontent=filecontent,
        url=url,
        record_id=record_id,
        fast=fast,
        no_fallback=no_fallback,
        favor_precision=favor_precision,
        favor_recall=favor_recall,
        include_comments=include_comments,
        output_format=output_format,
        tei_validation=tei_validation,
        target_language=target_language,
        include_tables=include_tables,
        include_images=include_images,
        include_formatting=include_formatting,
        include_links=include_links,
        deduplicate=deduplicate,
        date_extraction_params=date_extraction_params,
        with_metadata=with_metadata,
        only_with_metadata=only_with_metadata,
        max_tree_size=max_tree_size,
        url_blacklist=url_blacklist,
        author_blacklist=author_blacklist,
        settingsfile=settingsfile,
        prune_xpath=prune_xpath,
        config=config,
        options=options,
    )
    # only the serialized text is exposed here, the Document itself is dropped
    return document.text if document is not None else None
def extract_with_metadata(
    filecontent: Any,
    url: str | None = None,
    record_id: str | None = None,
    fast: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "txt",
    tei_validation: bool = False,
    target_language: str | None = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: dict[str, Any] | None = None,
    url_blacklist: set[str] | None = None,
    author_blacklist: set[str] | None = None,
    settingsfile: str | None = None,
    prune_xpath: str | list[str] | None = None,
    config: ConfigParser = DEFAULT_CONFIG,
    options: Extractor | None = None,
) -> Document | None:
    """Main function exposed by the package:
       Wrapper for text extraction and conversion to chosen output format.
       This method also returns document metadata.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            Can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        Document metadata with content string in the desired format or None.

    """
    # metadata extraction is always requested here, but documents lacking
    # essential metadata are not filtered out
    return _internal_extraction(
        filecontent=filecontent,
        url=url,
        record_id=record_id,
        fast=fast,
        favor_precision=favor_precision,
        favor_recall=favor_recall,
        include_comments=include_comments,
        output_format=output_format,
        tei_validation=tei_validation,
        target_language=target_language,
        include_tables=include_tables,
        include_images=include_images,
        include_formatting=include_formatting,
        include_links=include_links,
        deduplicate=deduplicate,
        date_extraction_params=date_extraction_params,
        with_metadata=True,
        only_with_metadata=False,
        url_blacklist=url_blacklist,
        author_blacklist=author_blacklist,
        settingsfile=settingsfile,
        prune_xpath=prune_xpath,
        config=config,
        options=options,
    )


def _check_deprecation(
    fast: bool = False,
    *,
    no_fallback: bool = False,
    as_dict: bool = False,
    max_tree_size: int | None = None,
    stacklevel: int = 2,
) -> bool:
    """Check deprecated params and return the effective "fast" flag.

    Args:
        fast: Value of the current "fast" argument.
        no_fallback: Deprecated alias of "fast"; emits a DeprecationWarning when set.
        as_dict: Deprecated flag; emits a DeprecationWarning when set.
        max_tree_size: Removed argument; any truthy value raises ValueError.
        stacklevel: Passed to warnings.warn() so that the warning is attributed
            to the caller of the public function instead of this helper.

    Returns:
        True if either "fast" or "no_fallback" is set.

    Raises:
        ValueError: If "max_tree_size" is set.

    """
    if no_fallback:
        warnings.warn(
            '"no_fallback" will be removed, use "fast" instead',
            DeprecationWarning,
            stacklevel=stacklevel,
        )
    if as_dict:
        warnings.warn(
            '"as_dict" will be removed, use the .as_dict() method instead',
            DeprecationWarning,
            stacklevel=stacklevel,
        )
    if max_tree_size:
        raise ValueError('"max_tree_size" will be removed, use settings.cfg instead')
    return fast or no_fallback


def _internal_extraction(
    filecontent: Any,
    url: str | None = None,
    record_id: str | None = None,
    fast: bool = False,
    no_fallback: bool = False,
    favor_precision: bool = False,
    favor_recall: bool = False,
    include_comments: bool = True,
    output_format: str = "txt",
    tei_validation: bool = False,
    target_language: str | None = None,
    include_tables: bool = True,
    include_images: bool = False,
    include_formatting: bool = False,
    include_links: bool = False,
    deduplicate: bool = False,
    date_extraction_params: dict[str, Any] | None = None,
    with_metadata: bool = False,
    only_with_metadata: bool = False,
    max_tree_size: int | None = None,
    url_blacklist: set[str] | None = None,
    author_blacklist: set[str] | None = None,
    settingsfile: str | None = None,
    prune_xpath: str | list[str] | None = None,
    config: ConfigParser = DEFAULT_CONFIG,
    options: Extractor | None = None,
) -> Document | None:
    """Internal method to do the extraction.

    Builds the extractor options unless a valid Extractor is supplied, runs
    bare_extraction() and stores the serialized output in ``document.text``.

    Returns:
        The Document with its ``text`` attribute set to the output string,
        or None if nothing could be extracted.

    Raises:
        ValueError: If "max_tree_size" is set, or if extraction succeeded
            while the "python" output format was requested.

    """
    # stacklevel=4 → user → extract → _internal_extraction → _check_deprecation
    fast = _check_deprecation(
        fast,
        no_fallback=no_fallback,
        as_dict=False,
        max_tree_size=max_tree_size,
        stacklevel=4,
    )

    # regroup extraction options
    if not options or not isinstance(options, Extractor):
        options = Extractor(
            # a settings file, when given, takes precedence over "config"
            config=use_config(settingsfile) if settingsfile else config,
            output_format=output_format,
            fast=fast,
            precision=favor_precision,
            recall=favor_recall,
            comments=include_comments,
            formatting=include_formatting,
            links=include_links,
            images=include_images,
            tables=include_tables,
            dedup=deduplicate,
            lang=target_language,
            url=url,
            with_metadata=with_metadata,
            only_with_metadata=only_with_metadata,
            tei_validation=tei_validation,
            author_blacklist=author_blacklist,
            url_blacklist=url_blacklist,
            date_params=date_extraction_params,
        )

    # extraction
    document = bare_extraction(
        filecontent,
        options=options,
        as_dict=False,
        prune_xpath=prune_xpath,
    )

    # post-processing
    if not document or not isinstance(document, Document):
        return None

    # NOTE(review): record ID and fingerprint are only set for formats outside
    # TXT_FORMATS, although the markdown/txt metadata header lists both fields
    # — confirm this is intended
    if options.format not in TXT_FORMATS:
        # control output
        if options.format == "python":
            raise ValueError("'python' format only usable in bare_extraction() function")
        # add record ID to metadata
        document.id = record_id
        # calculate fingerprint
        if document.raw_text is not None:
            document.fingerprint = content_fingerprint(str(document.title) + " " + str(document.raw_text))

    # return
    document.text = determine_returnstring(document, options)
    return document