# pylint:disable-msg=E0611,I1101
"""
All functions related to XML generation, processing and validation.
"""
import csv
import logging
import re
from copy import deepcopy
from html import unescape
from importlib.metadata import version
from io import StringIO
from json import dumps as json_dumps
from pathlib import Path
from lxml.etree import DTD, Element, SubElement, XMLParser, _Element, fromstring, tostring
from .settings import INLINE_CONSUMING, INLINE_FORMATTABLE, Document, Extractor
from .utils import (
is_element_in_item,
is_in_table_cell,
is_last_element_in_cell,
is_last_element_in_item,
item_if_first_element,
sanitize,
sanitize_tree,
text_chars_test,
)
# module-level logger shared by all functions in this file
LOGGER = logging.getLogger(__name__)
# installed package version, written into the TEI header by write_fullheader()
PKG_VERSION = version("trafilatura")
# validation
# path of the DTD file expected in the "data" directory next to this module
TEI_SCHEMA = str(Path(__file__).parent / "data" / "tei_corpus.dtd")
# tags kept by check_tei() below text/body; any other element is merged into its parent
TEI_VALID_TAGS = {
    "ab",
    "body",
    "cell",
    "code",
    "del",
    "div",
    "graphic",
    "head",
    "hi",
    "item",
    "lb",
    "list",
    "p",
    "quote",
    "ref",
    "row",
    "table",
}
# attributes kept by check_tei(); any other attribute is removed
TEI_VALID_ATTRS = {"rend", "rendition", "role", "target", "type"}
# DTD object, built from TEI_SCHEMA by validate_tei() on first use
TEI_DTD = None
# tags whose tail text is relocated by _handle_unwanted_tails()
TEI_REMOVE_TAIL = {"ab", "p"}
# sibling tags that _wrap_unwanted_siblings_of_div() moves into a new <div>
TEI_DIV_SIBLINGS = {"p", "list", "table", "quote", "ab"}
# parser used by control_xml_output() to re-parse the serialized tree
CONTROL_PARSER = XMLParser(remove_blank_text=True)
# tags that process_element() surrounds with line breaks
NEWLINE_ELEMS = {"graphic", "head", "lb", "list", "p", "quote", "row", "table"}
# tags after which process_element() does not append a separating space
SPECIAL_FORMATTING = {"code", "del", "head", "hi", "ref", "item", "cell"}
# tags whose attributes survive clean_attributes()
WITH_ATTRIBUTES = {"cell", "row", "del", "graphic", "head", "hi", "item", "list", "ref"}
# parent tags under which strip_double_tags() tolerates same-tag nesting
NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}
# Document attributes copied onto the XML root element by add_xml_meta()
META_ATTRIBUTES = [
    "sitename",
    "title",
    "author",
    "date",
    "url",
    "hostname",
    "description",
    "categories",
    "tags",
    "license",
    "id",
    "fingerprint",
    "language",
]
# maps the "rend" value of a <hi> element to its markdown marker
HI_FORMATTING = {"#b": "**", "#i": "*", "#u": "__", "#t": "`"}
# digits accepted as a heading level in the second character of a head's "rend"
HEADING_LEVELS = frozenset("123456")
# preceding characters that already separate content, so no extra space/newline is needed
SEPARATORS = frozenset((" ", "\n", "|", ""))
# upper bound on the number of columns drawn for a table row
MAX_TABLE_WIDTH = 1000
# block \[...\] and inline \(...\) math; only matched pairs are converted
_MATH_BLOCK_RE = re.compile(r"(?<!\S)\\\[(.+?)\\\]", re.DOTALL)
_MATH_INLINE_RE = re.compile(r"\\\((.+?)\\\)")
# https://github.com/lxml/lxml/blob/master/src/lxml/html/__init__.py
def delete_element(element: _Element, keep_tail: bool = True) -> None:
    """
    Detach the element, together with its children and text, from its parent.

    When ``keep_tail`` is true and the element has tail text, that text is
    first appended to the tail of the preceding sibling, or to the parent's
    text if there is no preceding sibling. An element without a parent is
    left untouched.
    """
    container = element.getparent()
    if container is None:
        return
    trailing = element.tail
    if keep_tail and trailing:
        sibling = element.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or "") + trailing
        else:
            container.text = (container.text or "") + trailing
    container.remove(element)
def merge_with_parent(element: _Element, include_formatting: bool = False) -> None:
    """
    Replace the element by its textual rendering inside the parent.

    The text returned by replace_element_text() plus the element's tail is
    appended to the previous sibling's tail, or to the parent's text when
    there is no previous sibling; a single space joins it to existing text.
    The element is then removed. Elements without a parent are ignored.
    """
    container = element.getparent()
    if container is None:
        return
    merged = replace_element_text(element, include_formatting) + (element.tail or "")
    sibling = element.getprevious()
    if sibling is not None:
        # a previous node exists: the text continues its tail
        sibling.tail = f"{sibling.tail} {merged}" if sibling.tail else merged
    elif container.text is None:
        container.text = merged
    else:
        container.text = f"{container.text} {merged}"
    container.remove(element)
def remove_empty_elements(tree: _Element) -> _Element:
    """
    Drop childless elements whose text and tail both fail text_chars_test().

    The root, <graphic> elements and direct children of <code> are always
    kept. The tree is modified in place and returned.
    """
    for node in tree.iter("*"):
        if len(node) > 0:
            continue
        if text_chars_test(node.text) is not False or text_chars_test(node.tail) is not False:
            continue
        holder = node.getparent()
        # keep the root, naturally empty graphics, and anything inside <code>
        # (removing the latter would alter code formatting)
        if holder is None or node.tag == "graphic" or holder.tag == "code":
            continue
        holder.remove(node)
    return tree
def strip_double_tags(tree: _Element) -> _Element:
    """
    Flatten <head>, <code> and <p> elements nested inside an element of the same tag.

    A nested element is merged into its parent unless that parent's tag is
    listed in NESTING_WHITELIST. The tree is modified in place and returned.
    """
    candidates = tree.xpath(".//head | .//code | .//p")
    # walk the matches in reverse document order
    for outer in candidates[::-1]:
        for inner in outer.iterdescendants("code", "head", "p"):
            if inner.tag != outer.tag:
                continue
            if inner.getparent().tag in NESTING_WHITELIST:
                continue
            merge_with_parent(inner)
    return tree
def build_json_output(docmeta: Document, with_metadata: bool = True) -> str:
    """
    Serialize the extracted document as a JSON string.

    With ``with_metadata`` every slot of ``docmeta`` is exported, with some
    keys renamed (url -> source, sitename -> source-hostname, description ->
    excerpt), categories and tags joined by ";" and the body converted to
    plain text under "text". Without it only "text" is exported. In both
    cases the comments are added as plain text under "comments".
    """
    if not with_metadata:
        payload = {"text": xmltotxt(docmeta.body, include_formatting=False)}
        comments_tree = docmeta.commentsbody
    else:
        payload = {}
        for slot in docmeta.__slots__:
            payload[slot] = getattr(docmeta, slot, None)
        # renamed / converted fields are re-inserted at the end, in this order
        payload["source"] = payload.pop("url")
        payload["source-hostname"] = payload.pop("sitename")
        payload["excerpt"] = payload.pop("description")
        payload["categories"] = ";".join(payload.pop("categories") or [])
        payload["tags"] = ";".join(payload.pop("tags") or [])
        payload["text"] = xmltotxt(payload.pop("body"), include_formatting=False)
        comments_tree = payload.pop("commentsbody")
    payload["comments"] = xmltotxt(comments_tree, include_formatting=False)
    return json_dumps(payload, ensure_ascii=False)
def clean_attributes(tree: _Element) -> _Element:
    """Clear all attributes of elements whose tag is not in WITH_ATTRIBUTES; return the same tree."""
    for node in tree.iter("*"):
        if node.tag in WITH_ATTRIBUTES:
            continue
        node.attrib.clear()
    return tree
def build_xml_output(docmeta: Document) -> _Element:
    """
    Assemble the <doc> output tree.

    Metadata is set as attributes on the root; the body and the comments are
    renamed to <main> and <comments>, stripped of unnecessary attributes and
    appended in that order. Both trees of ``docmeta`` are modified in place.
    """
    root = Element("doc")
    add_xml_meta(root, docmeta)
    for section, tag in ((docmeta.body, "main"), (docmeta.commentsbody, "comments")):
        section.tag = tag
        # clean XML tree
        root.append(clean_attributes(section))
    return root
def control_xml_output(document: Document, options: Extractor) -> str:
    """
    Build, clean and serialize the XML or TEI-XML output.

    The body is first freed of doubled tags and empty elements, then the
    output tree is built (plain XML when ``options.format`` is "xml", TEI
    otherwise), sanitized and round-tripped through CONTROL_PARSER. For the
    "xmltei" format with ``options.tei_validation`` set, the validation
    result is logged at debug level. Returns the pretty-printed string.
    """
    strip_double_tags(document.body)
    remove_empty_elements(document.body)
    if options.format == "xml":
        tree = build_xml_output(document)
    else:
        tree = build_tei_output(document)
    tree = sanitize_tree(tree)
    # serialize and re-parse: necessary for cleaning
    serialized = tostring(tree, encoding="unicode")
    tree = fromstring(serialized, CONTROL_PARSER)
    # validate
    if options.format == "xmltei" and options.tei_validation:
        LOGGER.debug("TEI validation result: %s %s", validate_tei(tree), options.source)
    return tostring(tree, pretty_print=True, encoding="unicode").strip()
def add_xml_meta(output: _Element, docmeta: Document) -> None:
    """
    Copy the non-empty META_ATTRIBUTES of ``docmeta`` onto ``output`` as XML attributes.

    String values are used as they are; any other value is joined with ";".
    """
    for name in META_ATTRIBUTES:
        value = getattr(docmeta, name, None)
        if not value:
            continue
        if not isinstance(value, str):
            value = ";".join(value)
        output.set(name, value)
def build_tei_output(docmeta: Document) -> _Element:
    """Build the TEI-XML tree for the document, then check and repair it with check_tei()."""
    # build the tree first, then strip/repair whatever is not conform
    return check_tei(write_teitree(docmeta), docmeta.url)
def check_tei(xmldoc: _Element, url: str | None) -> _Element:
    """
    Check if the resulting XML file is conform and scrub remaining tags.

    Works in three passes over ``xmldoc``, all of them in place:
    1. every <head> becomes <ab type="header">; heads with children are
       rebuilt by _tei_handle_complex_head() and heads inside a <p> are
       moved one level up;
    2. an <lb/> that is a direct child of a <div> and has non-blank tail
       text becomes a <p> holding that text;
    3. below text/body, elements whose tag is not in TEI_VALID_TAGS are
       merged into their parent, tails and loose <div> content are repaired,
       and attributes not in TEI_VALID_ATTRS are removed.

    ``url`` is only used in log messages. Returns the same tree object.
    """
    # convert head tags
    for elem in xmldoc.iter("head"):
        elem.tag = "ab"
        elem.set("type", "header")
        parent = elem.getparent()
        if parent is None:
            continue
        if len(elem) > 0:
            # rebuild the element and swap it in place of the old one
            new_elem = _tei_handle_complex_head(elem)
            parent.replace(elem, new_elem)
            elem = new_elem
        if parent.tag == "p":
            _move_element_one_level_up(elem)
    # convert <lb/> when child of <div> to <p>
    for elem in xmldoc.findall(".//text/body//div/lb"):
        if elem.tail and elem.tail.strip():
            # the tail becomes the text of the new paragraph
            elem.tag, elem.text, elem.tail = "p", elem.tail, None
    # look for elements that are not valid
    for elem in xmldoc.findall(".//text/body//*"):
        # check elements
        if elem.tag not in TEI_VALID_TAGS:
            # disable warnings for chosen categories
            # if element.tag not in ('div', 'span'):
            LOGGER.warning("not a TEI element, removing: %s %s", elem.tag, url)
            merge_with_parent(elem)
            continue
        if elem.tag in TEI_REMOVE_TAIL:
            _handle_unwanted_tails(elem)
        elif elem.tag == "div":
            _handle_text_content_of_div_nodes(elem)
            _wrap_unwanted_siblings_of_div(elem)
            # if len(elem) == 0:
            # elem.getparent().remove(elem)
        # check attributes
        # the names are collected into a list first because attributes are popped in the loop
        for attribute in [a for a in elem.attrib if a not in TEI_VALID_ATTRS]:
            LOGGER.warning("not a valid TEI attribute, removing: %s in %s %s", attribute, elem.tag, url)
            elem.attrib.pop(attribute)
    return xmldoc
[docs]
def validate_tei(xmldoc: _Element) -> bool:
    """
    Validate an XML document against the TEI DTD stored at TEI_SCHEMA.

    The DTD is loaded once, on first use, and cached in the module-level
    TEI_DTD. A failed validation is logged as a warning with the last error
    of the DTD's error log. Returns the validation result.
    """
    global TEI_DTD
    if TEI_DTD is None:
        # https://tei-c.org/release/xml/tei/custom/schema/dtd/tei_corpus.dtd
        TEI_DTD = DTD(TEI_SCHEMA)
    is_valid = TEI_DTD.validate(xmldoc)
    if is_valid is False:
        LOGGER.warning("not a valid TEI document: %s", TEI_DTD.error_log.last_error)
    return is_valid
def _code_fence(text: str, min_len: int = 1) -> str:
"Return the shortest backtick string of at least min_len that does not appear as a run in text."
fence_len = min_len
run = 0
for ch in text:
if ch == "`":
run += 1
if run >= fence_len:
fence_len = run + 1
else:
run = 0
return "`" * fence_len
def _code_span(text: str) -> str:
    """
    Wrap ``text`` in an inline code span.

    The fence is computed by _code_fence() on the unpadded text. If the text
    starts or ends with a backtick it is padded with one space on each side
    (CommonMark: a space stops the edge backtick from merging with the fence).
    """
    touches_backtick = text[:1] == "`" or text[-1:] == "`"
    body = f" {text} " if touches_backtick else text
    fence = _code_fence(text)
    return fence + body + fence
def _md_wrap(text: str, marker: str) -> str:
"Wrap text in a markdown marker, leaving any flanking whitespace outside it (valid CommonMark)."
stripped = text.strip()
return text.replace(stripped, f"{marker}{stripped}{marker}", 1) if stripped else text
def _md_code(text: str) -> str:
    """
    Turn the non-whitespace core of ``text`` into an inline code span.

    Flanking whitespace stays outside the span; whitespace-only text is
    returned unchanged.
    """
    core = text.strip()
    if not core:
        return text
    lead, _, trail = text.partition(core)
    return f"{lead}{_code_span(core)}{trail}"
def _convert_math(text: str) -> str:
    """
    Rewrite LaTeX math delimiters to $ notation.

    Matches of _MATH_BLOCK_RE become a ``$$`` block on its own lines with the
    content stripped; matches of _MATH_INLINE_RE become ``$...$``. Block math
    is converted first.
    """

    def as_block(match: re.Match) -> str:
        return f"\n$$\n{match.group(1).strip()}\n$$\n"

    def as_inline(match: re.Match) -> str:
        return f"${match.group(1)}$"

    with_blocks = _MATH_BLOCK_RE.sub(as_block, text)
    return _MATH_INLINE_RE.sub(as_inline, with_blocks)
def _collapse_emphasis(element: _Element, active: frozenset[str] = frozenset()) -> None:
    """
    Splice out redundant emphasis levels in linear hi chains
    (nested <font> junk → ***X*** not *******X*******).

    ``active`` holds the markdown markers already opened by enclosing <hi>
    elements. A <hi> whose only content is a single <hi> child with an
    already-active marker absorbs that child's text and children. The tree
    is modified in place; the function recurses over all children.
    """
    if element.tag == "hi":
        here = HI_FORMATTING.get(element.get("rend") or "")
        if here:
            # new frozenset: sibling branches must not see this marker
            active = active | {here}
        # repeat while the element wraps exactly one redundant <hi> and nothing else
        while (
            not (element.text or "").strip()
            and len(element) == 1
            and element[0].tag == "hi"
            and not (element[0].tail or "").strip()
            and HI_FORMATTING.get(element[0].get("rend") or "") in active
        ):
            child = element[0]
            # hoist the child's text and children, then drop the child itself
            element.text = (element.text or "") + (child.text or "")
            element.extend(list(child))
            element.remove(child)
    for child in element:
        _collapse_emphasis(child, active)
def _convert_math_tree(element: _Element) -> None:
    """
    Apply _convert_math() in place to the text and tails of a subtree.

    <code> elements and <hi> elements rendered as inline code are skipped
    together with their whole subtree, since their content is verbatim.
    """
    if element.tag == "code":
        return
    if element.tag == "hi" and HI_FORMATTING.get(element.get("rend") or "") == "`":
        return
    if element.text:
        element.text = _convert_math(element.text)
    for child in element:
        _convert_math_tree(child)
        # a code element's tail is prose, so it is still converted
        tail = child.tail
        if tail:
            child.tail = _convert_math(tail)
def _last_char(returnlist: list[str]) -> str:
"Last character emitted so far, or '' if nothing yet."
return returnlist[-1][-1:] if returnlist else ""
def _list_marker(element: _Element, in_item: bool | None = None, include_formatting: bool = True) -> str:
    """
    Return the markdown list marker to put in front of ``element``.

    The marker is '' unless the element is the first element of a list item
    and is not inside a table cell. Otherwise it is '- ', or 'N. ' when
    formatting is requested and the parent list has rend="ol", preceded by
    one indent unit per enclosing list beyond the first. ``in_item`` may be
    passed by the caller; when None it is computed with is_element_in_item().
    """
    if in_item is None:
        in_item = is_element_in_item(element)
    # outside any list item there is no marker and no need to walk ancestors
    if not in_item:
        return ""
    item = item_if_first_element(element)
    if item is None or is_in_table_cell(element):
        return ""
    depth = len(list(item.iterancestors("list")))
    indent = " " * (depth - 1)
    parent = item.getparent()
    # numbering is markdown-only: in plain text it injects digit tokens, so fall back to '-'
    ordered = include_formatting and parent is not None and parent.get("rend") == "ol"
    if not ordered:
        return f"{indent}- "
    position = len(list(item.itersiblings("item", preceding=True))) + 1
    return f"{indent}{position}. "
def _md_link(text: str, url: str | None, image: bool = False) -> str:
"Markdown link/image with escaped text and a CommonMark-safe target."
esc = text.replace("[", "\\[").replace("]", "\\]")
prefix = "!" if image else ""
if url is None:
return f"{prefix}[{esc}]"
safe = f"<{url}>" if any(c in url for c in " <>()") else url
return f"{prefix}[{esc}]({safe})"
def _consumes_inline_children(element: _Element) -> bool:
    """
    Tell whether replace_element_text() renders this element's children itself.

    True for elements with at least one child whose tag is in
    INLINE_CONSUMING; otherwise the children are left to the recursion in
    process_element().
    """
    if len(element) == 0:
        return False
    return element.tag in INLINE_CONSUMING
def _heading_prefix(element: _Element) -> str:
    """
    Return the markdown '#' marker for a head element.

    The level is the second character of the "rend" attribute when it is one
    of HEADING_LEVELS (e.g. "h3" gives "###"); otherwise level 2 is used.
    """
    rend = element.get("rend") or ""
    digit = rend[1:2]
    depth = int(digit) if digit in HEADING_LEVELS else 2
    return depth * "#"
def _image_markup(element: _Element) -> str:
    """
    Return the markdown image for a graphic element: ``![title alt](src)``.

    The label is the "title" and "alt" attributes joined by a space and
    stripped; a missing "src" yields an empty target.
    """
    title = element.get("title", "")
    alt = element.get("alt", "")
    label = f"{title} {alt}".strip()
    return _md_link(label, element.get("src", ""), image=True)
def _collect_inline_text(element: _Element, include_formatting: bool) -> str:
    """
    Concatenate the element's own text with the rendering of its direct children.

    A <graphic> child becomes a markdown image, <lb> a newline, a child whose
    tag is in INLINE_FORMATTABLE is rendered by replace_element_text(), and
    any other child contributes only its own text. Each child's tail follows
    its rendering.
    """
    pieces: list[str] = []
    if element.text:
        pieces.append(element.text)
    for node in element:
        tag = node.tag
        if tag == "graphic":
            rendered = _image_markup(node)
        elif tag == "lb":
            rendered = "\n"
        elif tag in INLINE_FORMATTABLE:
            rendered = replace_element_text(node, include_formatting)
        else:
            # fallback: e.g. structural tag nested inside inline (malformed input)
            rendered = node.text or ""
        pieces.append(rendered)
        pieces.append(node.tail or "")
    return "".join(pieces)
def _escape_cell(text: str) -> str:
"Escape characters that would break a GFM table row: pipes split columns, newlines split rows."
return text.replace("|", "\\|").replace("\n", " ")
def replace_element_text(
    element: _Element, include_formatting: bool, in_item: bool | None = None, in_cell: bool = False
) -> str:
    """
    Determine element text based on just the text of the element. One must deal with the tail separately.

    Args:
        element: the element to render.
        include_formatting: convert headings, deletions, emphasis and code to markdown.
        in_item: whether the element is inside a list item; passed on to
            _list_marker(), which computes it itself when None.
        in_cell: whether the element is inside a table cell; heading markers
            are then omitted and the result is escaped for GFM tables.

    Returns:
        The rendered text, possibly empty.

    Side effect: when a <code> element is rendered as a fenced block, its
    <lb> descendants are removed from the tree.
    """
    if _consumes_inline_children(element):
        elem_text = _collect_inline_text(element, include_formatting)
    else:
        elem_text = element.text or ""
    # handle formatting: convert to markdown
    if include_formatting and elem_text:
        if element.tag in ("article", "list", "table"):
            elem_text = elem_text.strip()
        elif element.tag == "head" and not in_cell:
            elem_text = f"{_heading_prefix(element)} {elem_text}"
        elif element.tag == "del":
            # break up literal "~~" so it cannot close the strikethrough early
            elem_text = _md_wrap(elem_text.replace("~~", "~\\~"), "~~")
        elif element.tag == "hi":
            marker = HI_FORMATTING.get(element.get("rend") or "")
            if marker == "`":
                elem_text = _md_code(elem_text)  # inline code, flanking whitespace stays outside
            elif marker:
                elem_text = _md_wrap(elem_text, marker)
        elif element.tag == "code":
            lbs = element.xpath(".//lb")
            if "\n" in elem_text or lbs:  # Handle <br> inside <code>
                # Convert <br> to \n within code blocks
                for lb in lbs:
                    elem_text = f"{elem_text}\n{lb.tail or ''}"
                    lb.getparent().remove(lb)
                # multi-line code becomes a fenced block of at least three backticks
                fence = _code_fence(elem_text, min_len=3)
                elem_text = f"{fence}\n{elem_text}\n{fence}\n"
            else:
                elem_text = _md_code(elem_text)
    # handle links
    if element.tag == "ref":
        stripped = elem_text.strip()
        if stripped:
            target = element.get("target")
            if not target:
                LOGGER.warning("missing link attribute: %s %s", elem_text, element.attrib)
            link_text = _md_link(stripped, target or None)
            # only the core is replaced, so flanking whitespace stays outside the link
            elem_text = elem_text.replace(stripped, link_text, 1)
        else:
            LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
    # cells
    if element.tag == "cell":
        elem_text = elem_text.strip()
        # separate the cell's text from its children
        if elem_text and len(element):
            elem_text = f"{elem_text} "
    # within lists
    elem_text = f"{_list_marker(element, in_item, include_formatting)}{elem_text}"
    # escape chars that would break GFM table cell boundaries
    if in_cell:
        elem_text = _escape_cell(elem_text)
    return elem_text
def process_element(
    element: _Element, returnlist: list[str], include_formatting: bool, in_cell: bool = False, in_item: bool = False
) -> None:
    """
    Recursively convert a LXML element and its children to a flattened string representation.

    Args:
        element: the element to render.
        returnlist: output accumulator; text chunks are appended to it in document order.
        include_formatting: emit markdown markers instead of plain text.
        in_cell: True when an ancestor (or the element itself) is a table cell.
        in_item: True when an ancestor (or the element itself) is a list item.

    Nothing is returned; the result is the content appended to ``returnlist``.
    """
    # in_cell/in_item are inherited down the recursion instead of re-walking ancestors at every node
    in_cell = in_cell or element.tag == "cell"
    in_item = in_item or element.tag == "item"
    # the first cell of a row opens the row with a leading bar
    if element.tag == "cell" and element.getprevious() is None:
        returnlist.append("| ")
    # a block element starts on its own line, not mashed onto preceding loose text (#661)
    if element.tag in NEWLINE_ELEMS and not in_cell and not in_item and _last_char(returnlist) not in SEPARATORS:
        returnlist.append("\n")
    _consumes_children = _consumes_inline_children(element)
    # True when replace_element_text() has something to emit for this element
    _renders_inline = bool(element.text) or _consumes_children
    if _renders_inline:
        returnlist.append(replace_element_text(element, include_formatting, in_item, in_cell))
    elif include_formatting and element.tag == "head" and not in_cell and len(element):
        # heading starting with an inline child still needs its # prefix (children render below)
        returnlist.append(f"{_heading_prefix(element)} ")
    # inside a cell the tail is emitted here, before the children are processed
    if element.tail and element.tag != "graphic" and in_cell:
        # textless elements like lb should be processed here too
        tail = element.tail.strip()
        # separate the tail from preceding cell content unless a space/delimiter is already there
        if tail and _last_char(returnlist) not in (" ", "|", ""):
            tail = f" {tail}"
        returnlist.append(_escape_cell(tail))
    # a sublist starts on its own line, not mashed onto the parent item
    if element.tag == "list" and in_item and _last_char(returnlist) not in ("\n", ""):
        returnlist.append("\n")
    # children already rendered inline by replace_element_text() are not visited again
    if not _consumes_children:
        for child in element:
            process_element(child, returnlist, include_formatting, in_cell, in_item)
    if not _renders_inline:
        if element.tag == "graphic":
            image = f"{_list_marker(element, in_item, include_formatting)}{_image_markup(element)}"
            if in_cell:
                image = _escape_cell(image)
            returnlist.append(image)
            if element.tail:
                tail_text = f" {element.tail.strip()}"
                returnlist.append(_escape_cell(tail_text) if in_cell else tail_text)
        # newlines for textless elements
        elif element.tag in NEWLINE_ELEMS:
            # add line after table head
            if element.tag == "row":
                cells = element.findall("cell")
                cell_count = len(cells)
                # a row spans at least its own cells; an explicit span may add colspan padding
                span_info = element.get("colspan") or element.get("span")
                # isdecimal rejects the superscripts isdigit() admits
                span = int(span_info) if span_info and span_info.isdecimal() else 0
                # restrict columns to a maximum of 1000
                max_span = min(max(span, cell_count), MAX_TABLE_WIDTH)
                # row ended so draw extra empty cells to match max_span
                if cell_count < max_span:
                    returnlist.append(f"{'|' * (max_span - cell_count)}\n")
                # if this is a head row, draw the separator below
                if any(cell.get("role") == "head" for cell in cells):
                    returnlist.append(f"\n|{'---|' * max_span}\n")
            elif not in_cell:
                # block elements inside a cell must not inject a row-breaking newline
                returnlist.append("\n")
        elif element.tag not in ("cell", "item"):
            # cells still need to append vertical bars
            # any other textless element is finished here: no separator, no tail
            return
    last_in_item = in_item and is_last_element_in_item(element)
    # trailing separator after the element; the first matching rule wins
    if element.tag in NEWLINE_ELEMS and not in_cell and not in_item:
        returnlist.append("\n\u2424\n" if include_formatting and element.tag != "row" else "\n")
    elif element.tag == "cell":
        returnlist.append(" | ")
    elif element.tag in ("head", "item") and in_cell and not is_last_element_in_cell(element):
        # separate flattened block elements inside a cell (e.g. list items) instead of mashing them
        returnlist.append(" ")
    elif element.tag not in SPECIAL_FORMATTING and not last_in_item and not is_last_element_in_cell(element):
        returnlist.append(" ")
    # text that comes after the closing tag
    if element.tail and not in_cell and element.tag != "graphic":  # graphic tail already handled above
        tail = element.tail.strip() if in_item or element.tag == "list" else element.tail
        # restore a separator lost during extraction so inline content isn't mashed (e.g. **bold**y)
        if tail and in_item and _last_char(returnlist) not in SEPARATORS:
            tail = f" {tail}"
        returnlist.append(tail)
    # deal with list items alone
    if last_in_item and not in_cell:
        returnlist.append("\n")
[docs]
def xmltotxt(xmloutput: _Element | None, include_formatting: bool) -> str:
    """
    Convert an XML tree to plain text, or to markdown when ``include_formatting`` is set.

    Returns '' for a missing tree. With formatting, the conversion works on
    a deep copy so the caller's tree is not modified. The joined output is
    passed through sanitize() and HTML entities are unescaped.
    """
    if xmloutput is None:
        return ""
    tree = xmloutput
    if include_formatting:
        # math rewrite, emphasis collapse, lb removal mutate the tree; protect caller's copy
        tree = deepcopy(xmloutput)
        _convert_math_tree(tree)
        _collapse_emphasis(tree)
    chunks: list[str] = []
    process_element(tree, chunks, include_formatting)
    flattened = sanitize("".join(chunks), True)
    return unescape(flattened or "")
def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str:
    """
    Render the document as a single CSV row.

    The columns are url, id, fingerprint, hostname, title, image, date, body
    text, comments text, license and pagetype, in that order. Empty values
    are replaced by ``null``; ``delim`` is the field delimiter.
    """
    # preprocessing
    body_text = xmltotxt(document.body, include_formatting) or null
    comments_text = xmltotxt(document.commentsbody, include_formatting) or null
    # organize fields
    fields = (
        document.url,
        document.id,
        document.fingerprint,
        document.hostname,
        document.title,
        document.image,
        document.date,
        body_text,
        comments_text,
        document.license,
        document.pagetype,
    )
    row = [field or null for field in fields]
    # output config
    buffer = StringIO()
    csv.writer(buffer, delimiter=delim, quoting=csv.QUOTE_MINIMAL).writerow(row)
    return buffer.getvalue()
def write_teitree(docmeta: Document) -> _Element:
    """
    Bundle the header, the extracted post and the comments into a TEI tree.

    The body and the comments of ``docmeta`` are cleaned of unnecessary
    attributes, renamed to <div> with type "entry" and "comments"
    respectively, and appended below TEI/text/body. Both trees are moved
    into the result, not copied.
    """
    teidoc = Element("TEI", xmlns="http://www.tei-c.org/ns/1.0")
    write_fullheader(teidoc, docmeta)
    textbody = SubElement(SubElement(teidoc, "text"), "body")
    # post first, then comments
    for source, kind in ((docmeta.body, "entry"), (docmeta.commentsbody, "comments")):
        section = clean_attributes(source)
        section.tag = "div"
        section.set("type", kind)
        textbody.append(section)
    return teidoc
def _define_publisher_string(docmeta: Document) -> str:
    """
    Build the publisher string for the TEI header.

    "sitename (hostname)" when both are known, otherwise whichever one is
    available, otherwise "N/A" (logged as a warning).
    """
    host, site = docmeta.hostname, docmeta.sitename
    if host and site:
        return f"{site.strip()} ({host})"
    publisher = host or site or "N/A"
    if publisher == "N/A" and LOGGER.isEnabledFor(logging.WARNING):
        LOGGER.warning("no publisher for URL %s", docmeta.url)
    return publisher
def write_fullheader(teidoc: _Element, docmeta: Document) -> _Element:
    """
    Write TEI header based on gathered metadata.

    Appends a <teiHeader> to ``teidoc`` containing fileDesc (titleStmt,
    publicationStmt, notesStmt, sourceDesc), profileDesc and encodingDesc,
    filled from the attributes of ``docmeta``. Returns the created
    <teiHeader> element.
    """
    # todo: add language info
    header = SubElement(teidoc, "teiHeader")
    filedesc = SubElement(header, "fileDesc")
    bib_titlestmt = SubElement(filedesc, "titleStmt")
    SubElement(bib_titlestmt, "title", type="main").text = docmeta.title
    if docmeta.author:
        SubElement(bib_titlestmt, "author").text = docmeta.author
    publicationstmt_a = SubElement(filedesc, "publicationStmt")
    publisher_string = _define_publisher_string(docmeta)
    # license, if applicable
    if docmeta.license:
        SubElement(publicationstmt_a, "publisher").text = publisher_string
        availability = SubElement(publicationstmt_a, "availability")
        SubElement(availability, "p").text = docmeta.license
    # insert an empty paragraph for conformity
    else:
        SubElement(publicationstmt_a, "p")
    notesstmt = SubElement(filedesc, "notesStmt")
    if docmeta.id:
        SubElement(notesstmt, "note", type="id").text = docmeta.id
    # the fingerprint note is always written, even if the value is empty
    SubElement(notesstmt, "note", type="fingerprint").text = docmeta.fingerprint
    sourcedesc = SubElement(filedesc, "sourceDesc")
    source_bibl = SubElement(sourcedesc, "bibl")
    # sigle: "sitename, date", skipping whichever part is missing
    sigle = ", ".join(filter(None, [docmeta.sitename, docmeta.date]))
    if not sigle:
        LOGGER.warning("no sigle for URL %s", docmeta.url)
    source_bibl.text = ", ".join(filter(None, [docmeta.title, sigle]))
    SubElement(sourcedesc, "bibl", type="sigle").text = sigle
    # biblFull repeats title, author and publisher and adds the source URL and date
    biblfull = SubElement(sourcedesc, "biblFull")
    bib_titlestmt = SubElement(biblfull, "titleStmt")
    SubElement(bib_titlestmt, "title", type="main").text = docmeta.title
    if docmeta.author:
        SubElement(bib_titlestmt, "author").text = docmeta.author
    publicationstmt = SubElement(biblfull, "publicationStmt")
    SubElement(publicationstmt, "publisher").text = publisher_string
    if docmeta.url:
        SubElement(publicationstmt, "ptr", type="URL", target=docmeta.url)
    SubElement(publicationstmt, "date").text = docmeta.date
    profiledesc = SubElement(header, "profileDesc")
    abstract = SubElement(profiledesc, "abstract")
    SubElement(abstract, "p").text = docmeta.description
    # textClass is only written when there is at least one category or tag
    if docmeta.categories or docmeta.tags:
        textclass = SubElement(profiledesc, "textClass")
        keywords = SubElement(textclass, "keywords")
        if docmeta.categories:
            SubElement(keywords, "term", type="categories").text = ",".join(docmeta.categories)
        if docmeta.tags:
            SubElement(keywords, "term", type="tags").text = ",".join(docmeta.tags)
    creation = SubElement(profiledesc, "creation")
    SubElement(creation, "date", type="download").text = docmeta.filedate
    # record the generating application and its version
    encodingdesc = SubElement(header, "encodingDesc")
    appinfo = SubElement(encodingdesc, "appInfo")
    application = SubElement(appinfo, "application", version=PKG_VERSION, ident="Trafilatura")
    SubElement(application, "label").text = "Trafilatura"
    SubElement(application, "ptr", target="https://github.com/adbar/trafilatura")
    return header
def _handle_text_content_of_div_nodes(element: _Element) -> None:
    """
    Move loose text of a <div> into <p> elements for TEI conformity.

    Non-blank leading text is prepended to the first child if that is a <p>,
    otherwise wrapped in a new first <p>. Non-blank tail text is appended to
    the text of the last child if that is a <p>, otherwise wrapped in a new
    last <p>. The moved text and tail are then cleared on the <div>.
    """
    lead = element.text
    if lead and lead.strip():
        if len(element) > 0 and element[0].tag == "p":
            first = element[0]
            first.text = f"{lead} {first.text or ''}".strip()
        else:
            wrapper = Element("p")
            wrapper.text = lead
            element.insert(0, wrapper)
        element.text = None
    trail = element.tail
    if trail and trail.strip():
        if len(element) > 0 and element[-1].tag == "p":
            last = element[-1]
            last.text = f"{last.text or ''} {trail}".strip()
        else:
            wrapper = Element("p")
            wrapper.text = trail
            element.append(wrapper)
        element.tail = None
def _handle_unwanted_tails(element: _Element) -> None:
    """
    Relocate the tail text of a <p> or <ab> element.

    The tail is stripped first. For a <p> it is joined to the element's own
    text with a space; for any other tag it is placed in a new <p> inserted
    right after the element (when the element has a parent). The tail is
    cleared afterwards.
    """
    element.tail = element.tail.strip() if element.tail else None
    if not element.tail:
        return
    if element.tag == "p":
        element.text = " ".join(part for part in (element.text, element.tail) if part)
    else:
        parent = element.getparent()
        if parent is not None:
            follower = Element("p")
            follower.text = element.tail
            parent.insert(parent.index(element) + 1, follower)
    element.tail = None
def _tei_handle_complex_head(element: _Element) -> _Element:
    """
    Convert certain child elements to <ab> and <lb>.

    Builds and returns a new <ab> carrying the attributes, stripped text and
    stripped tail of ``element``. The text of each <p> child is flattened
    into the new element, separated from earlier content by <lb/>; all other
    children are moved over as they are. The caller is responsible for
    putting the new element into the tree.
    """
    new_element = Element("ab", attrib=element.attrib)
    new_element.text = element.text.strip() if element.text else None
    for child in element.iterchildren():
        if child.tag == "p":
            if len(new_element) > 0 or new_element.text:
                # add <lb> if <ab> has no children or last tail contains text
                if len(new_element) == 0 or new_element[-1].tail:
                    SubElement(new_element, "lb")
                # the paragraph text follows the last child as its tail
                new_element[-1].tail = child.text
            else:
                # nothing emitted yet: the paragraph text becomes the leading text
                new_element.text = child.text
        else:
            # append() moves the child out of the old element
            new_element.append(child)
    tail = element.tail.strip() if element.tail else None
    if tail:
        new_element.tail = tail
    return new_element
def _wrap_unwanted_siblings_of_div(div_element: _Element) -> None:
    """
    Wrap unwanted siblings of a div element in a new div element.

    Following siblings are scanned up to the next <div>. Consecutive siblings
    whose tag is in TEI_DIV_SIBLINGS are moved into a fresh <div>, which is
    inserted at the position of the first sibling it collected. Does nothing
    when ``div_element`` has no parent.
    """
    new_sibling = Element("div")
    # index in the parent where the wrapper goes; None while nothing is collected
    new_sibling_index = None
    parent = div_element.getparent()
    if parent is None:
        return
    # check siblings after target element
    for sibling in div_element.itersiblings():
        if sibling.tag == "div":
            break
        if sibling.tag in TEI_DIV_SIBLINGS:
            # remember the position of the first collected sibling only
            new_sibling_index = new_sibling_index or parent.index(sibling)
            new_sibling.append(sibling)
        # some elements (e.g. <lb/>) can appear next to div, but
        # order of elements should be kept, thus add and reset new_sibling
        else:
            if new_sibling_index and len(new_sibling) > 0:
                parent.insert(new_sibling_index, new_sibling)
                new_sibling = Element("div")
                new_sibling_index = None
    # flush whatever was collected before the loop ended
    if new_sibling_index and len(new_sibling) != 0:
        parent.insert(new_sibling_index, new_sibling)
def _move_element_one_level_up(element: _Element) -> None:
    """
    Fix TEI compatibility issues by moving certain p-elems up in the XML tree.
    There is always a n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p

    The element is re-inserted right after its parent. Its following
    siblings, its own tail and the parent's tail are collected in a new <p>
    placed right after the moved element. A parent left without children and
    text is removed. Does nothing without a parent and grandparent.
    """
    parent = element.getparent()
    grand_parent = parent.getparent() if parent is not None else None
    if parent is None or grand_parent is None:
        return
    new_elem = Element("p")
    # everything that followed the element inside the parent moves to the new paragraph
    new_elem.extend(list(element.itersiblings()))
    # inserting an attached element moves it: it now follows its former parent
    grand_parent.insert(grand_parent.index(parent) + 1, element)
    tail = element.tail.strip() if element.tail else None
    if tail:
        new_elem.text = tail
        element.tail = None
    tail = parent.tail.strip() if parent.tail else None
    if tail:
        new_elem.tail = tail
        parent.tail = None
    # only add the new paragraph if it received any content
    if len(new_elem) > 0 or new_elem.text or new_elem.tail:
        grand_parent.insert(grand_parent.index(element) + 1, new_elem)
    if len(parent) == 0 and not parent.text:
        grand_parent.remove(parent)