# pylint:disable-msg=E0611,I1101
"""
Module bundling functions related to HTML and text processing.
"""
## This file is available from https://github.com/adbar/trafilatura
## under GNU GPL v3 license
# import csv
import logging
import re
# if brotli is installed
try:
    import brotli
except ImportError:
    brotli = None
from difflib import SequenceMatcher
from functools import lru_cache
from gzip import decompress
from html import unescape
from itertools import islice
from unicodedata import normalize
# CChardet is faster and can be more accurate
try:
    from cchardet import detect as cchardet_detect
except ImportError:
    cchardet_detect = None
from charset_normalizer import from_bytes
from lxml.html import HtmlElement, HTMLParser, fromstring
# response types
from urllib3.response import HTTPResponse
LOGGER = logging.getLogger(__name__)
UNICODE_ALIASES = {'utf-8', 'utf_8'}
DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
# note: htmldate could use HTML comments
# huge_tree=True, remove_blank_text=True
HTML_PARSER = HTMLParser(collect_ids=False, default_doctype=False, encoding='utf-8', remove_comments=True, remove_pis=True)
LINES_TRIMMING = re.compile(r'(?<![p{P}>])\n', flags=re.UNICODE|re.MULTILINE)
URL_BLACKLIST_REGEX = re.compile(r'^https?://|/+$')
# Regex to check image file extensions
IMAGE_EXTENSION = re.compile(r'[^\s]+\.(avif|bmp|gif|hei[cf]|jpe?g|png|webp)(\b|$)')
AUTHOR_PREFIX = re.compile(r'^([a-zäöüß]+(ed|t))? ?(written by|words by|words|by|von|from) ', flags=re.IGNORECASE)
AUTHOR_REMOVE_NUMBERS = re.compile(r'\d.+?$')
AUTHOR_TWITTER = re.compile(r'@[\w]+')
AUTHOR_REPLACE_JOIN = re.compile(r'[._+]')
AUTHOR_REMOVE_NICKNAME = re.compile(r'["‘({\[’\'][^"]+?[‘’"\')\]}]')
AUTHOR_REMOVE_SPECIAL = re.compile(r'[^\w]+$|[:()?*$#!%/<>{}~¿]')
AUTHOR_REMOVE_PREPOSITION = re.compile(r'\b\s+(am|on|for|at|in|to|from|of|via|with|—|-|–)\s+(.*)', flags=re.IGNORECASE)
AUTHOR_EMAIL = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
AUTHOR_SPLIT = re.compile(r'/|;|,|\||&|(?:^|\W)[u|a]nd(?:$|\W)', flags=re.IGNORECASE)
AUTHOR_EMOJI_REMOVE = re.compile(
    "["
    u"\U00002700-\U000027BF"  # Dingbats
    u"\U0001F600-\U0001F64F"  # Emoticons
    u"\U00002600-\U000026FF"  # Miscellaneous Symbols
    u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
    u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "]+", flags=re.UNICODE)
AUTHOR_REMOVE_HTML = re.compile(r'<[^>]+>')
CLEAN_META_TAGS = re.compile(r'["\']')
STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$")

def handle_compressed_file(filecontent):
    """Tell if a file's magic number corresponds to the GZip format
    and try to decompress it. Alternatively, try Brotli if the package
    is installed."""
    if isinstance(filecontent, bytes):
        # source: https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed
        if filecontent[:2] == b'\x1f\x8b':
            # decode GZipped data
            try:
                filecontent = decompress(filecontent)
            except (EOFError, OSError):
                logging.warning('invalid GZ file')
        # try brotli
        elif brotli is not None:
            try:
                filecontent = brotli.decompress(filecontent)
            except brotli.error:
                pass  # logging.debug('invalid Brotli file')
    return filecontent
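
# Illustrative usage sketch (added comment, not part of the original module):
# compressed payloads are unpacked transparently, anything else passes through, e.g.
#   from gzip import compress
#   handle_compressed_file(compress(b'<html/>'))  # -> b'<html/>'
#   handle_compressed_file(b'plain bytes')        # -> b'plain bytes'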

def isutf8(data):
    """Simple heuristic to determine if a bytestring uses standard unicode encoding"""
    try:
        data.decode('UTF-8')
    except UnicodeDecodeError:
        return False
    return True

def detect_encoding(bytesobject):
    """Read all input or first chunk and return a list of encodings"""
    # alternatives: https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py
    # unicode-test
    if isutf8(bytesobject):
        return ['utf-8']
    guesses = []
    # additional module
    if cchardet_detect is not None:
        cchardet_guess = cchardet_detect(bytesobject)['encoding']
        if cchardet_guess is not None:
            guesses.append(cchardet_guess.lower())
    # try charset_normalizer on first part, fallback on full document
    detection_results = from_bytes(bytesobject[:15000]) or from_bytes(bytesobject)
    # return alternatives
    if len(detection_results) > 0:
        guesses.extend([r.encoding for r in detection_results])
    # it cannot be utf-8 (tested above)
    return [g for g in guesses if g not in UNICODE_ALIASES]
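
# Illustrative usage sketch (added comment): UTF-8 input short-circuits, other inputs
# yield lower-cased guesses whose exact values depend on the installed detectors, e.g.
#   detect_encoding('straße'.encode('utf-8'))    # -> ['utf-8']
#   detect_encoding('straße'.encode('latin-1'))  # -> e.g. ['windows-1252', 'cp1252']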

def decode_response(response):
    """Read the urllib3 object corresponding to the server response,
    check if it could be GZip and decompress it if necessary, then
    try to guess its encoding and decode it to return a Unicode string"""
    # urllib3 response object / bytes switch
    resp_content = response if isinstance(response, bytes) else response.data
    return decode_file(resp_content)

def decode_file(filecontent):
    """Guess bytestring encoding and try to decode to Unicode string.
    Resort to destructive conversion otherwise."""
    # init
    if isinstance(filecontent, str):
        return filecontent
    htmltext = None
    # GZip and Brotli test
    filecontent = handle_compressed_file(filecontent)
    # encoding
    for guessed_encoding in detect_encoding(filecontent):
        try:
            htmltext = filecontent.decode(guessed_encoding)
        except (LookupError, UnicodeDecodeError):  # VISCII: lookup
            LOGGER.warning('wrong encoding detected: %s', guessed_encoding)
            htmltext = None
        else:
            break
    # return original content if nothing else succeeded
    return htmltext or str(filecontent, encoding='utf-8', errors='replace')
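
# Illustrative usage sketch (added comment): bytes in, str out, with a lossy UTF-8
# fallback when no guessed encoding applies, e.g.
#   decode_file('<p>touché</p>'.encode('latin-1'))  # -> '<p>touché</p>' (if detected)
#   decode_file('already a string')                 # -> 'already a string'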

def is_dubious_html(beginning: str) -> bool:
    "Assess if the object is proper HTML (with a corresponding tag or declaration)."
    return "html" not in beginning

def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str:
    "Repair faulty doctype strings to make them palatable for libxml2."
    # libxml2/LXML issue: https://bugs.launchpad.net/lxml/+bug/1955915
    if "doctype" in beginning:
        firstline, _, rest = htmlstring.partition("\n")
        return DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
    return htmlstring

def fromstring_bytes(htmlobject):
    "Try to pass bytes to LXML parser."
    tree = None
    try:
        tree = fromstring(htmlobject.encode('utf8', 'surrogatepass'), parser=HTML_PARSER)
    except Exception as err:
        LOGGER.error('lxml parser bytestring %s', err)
    return tree

def load_html(htmlobject):
    """Load object given as input and validate its type
    (accepted: lxml.html tree, trafilatura/urllib3 response, bytestring and string)
    """
    # use tree directly
    if isinstance(htmlobject, HtmlElement):
        return htmlobject
    # use trafilatura or urllib3 responses directly
    if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, 'data'):
        htmlobject = htmlobject.data
    # do not accept any other type after this point
    if not isinstance(htmlobject, (bytes, str)):
        raise TypeError('incompatible input type', type(htmlobject))
    # start processing
    tree = None
    # try to guess encoding and decode file: if None then keep original
    htmlobject = decode_file(htmlobject)
    # sanity checks
    beginning = htmlobject[:50].lower()
    check_flag = is_dubious_html(beginning)
    # repair first
    htmlobject = strip_faulty_doctypes(htmlobject, beginning)
    # first pass: use Unicode string
    fallback_parse = False
    try:
        tree = fromstring(htmlobject, parser=HTML_PARSER)
    except ValueError:
        # "Unicode strings with encoding declaration are not supported."
        tree = fromstring_bytes(htmlobject)
        fallback_parse = True
    except Exception as err:
        LOGGER.error('lxml parsing failed: %s', err)
    # second pass: try passing bytes to LXML
    if (tree is None or len(tree) < 1) and not fallback_parse:
        tree = fromstring_bytes(htmlobject)
    # rejection test: is it (well-formed) HTML at all?
    # log parsing errors
    if tree is not None and check_flag is True and len(tree) < 2:
        LOGGER.error('parsed tree length: %s, wrong data type or not valid HTML', len(tree))
        tree = None
    return tree
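
# Illustrative usage sketch (added comment): strings, bytes and urllib3-style responses
# are accepted, other types raise TypeError; the result is an lxml tree or None, e.g.
#   tree = load_html('<html><body><p>text</p></body></html>')
#   tree.findtext('.//p')  # -> 'text'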

def txttocsv(text, comments, docmeta):
    '''Output the result in CSV format (tab-separated values)'''
    # outputwriter = csv.writer(sys.stdout, delimiter='\t', quoting=csv.QUOTE_NONE)
    # outputwriter.writerow()
    # with newlines: '\\n'.join()
    text = trim(' '.join(text.splitlines()))
    if comments is not None:
        comments = trim(' '.join(comments.splitlines()))
    tsv_output = \
        f'{docmeta.url}\t{docmeta.fingerprint}\t{docmeta.hostname}\t{docmeta.title}\t{docmeta.image}\t{docmeta.date}\t{text}\t{comments}\t{docmeta.license}\t{docmeta.pagetype}\n'
    # add id up front if provided
    if docmeta.id is not None:
        tsv_output = docmeta.id + '\t' + tsv_output
    return tsv_output

@lru_cache(maxsize=2**14)  # sys.maxunicode = 1114111
def return_printables_and_spaces(char):
    'Return a character if it belongs to certain classes'
    return char if char.isprintable() or char.isspace() else ''

def remove_control_characters(string):
    '''Prevent non-printable and XML invalid character errors'''
    return ''.join(map(return_printables_and_spaces, string))

def normalize_unicode(string, unicodeform='NFC'):
    'Normalize the given string to the specified unicode format.'
    return normalize(unicodeform, string)

@lru_cache(maxsize=1024)
def line_processing(line):
    '''Remove HTML space entities, then discard incompatible unicode
    and invalid XML characters on line level'''
    # spacing HTML entities: https://www.w3.org/MarkUp/html-spec/html-spec_13.html
    # unique code spaces
    line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0')
    # remove newlines that are not related to punctuation or markup
    # remove non-printable chars and normalize space characters (including Unicode spaces)
    line = trim(remove_control_characters(LINES_TRIMMING.sub(r' ', line)))
    # prune empty lines
    if all(map(str.isspace, line)):
        line = None
    return line

def sanitize(text):
    '''Convert text and discard incompatible and invalid characters'''
    try:
        return '\n'.join(filter(None, (line_processing(l) for l in text.splitlines())))
    except AttributeError:
        return None
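
# Illustrative usage sketch (added comment): control characters are dropped, each line
# is normalized separately, and empty lines are pruned, e.g.
#   sanitize('  Line one \x07\n\n  Line two  ')  # -> 'Line one\nLine two'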

@lru_cache(maxsize=1024)
def trim(string):
    '''Remove unnecessary spaces within a text string'''
    try:
        # remove newlines that are not related to punctuation or markup + proper trimming
        # return LINES_TRIMMING.sub(r' ', string).strip(' \t\n\r\v')
        # faster:
        return ' '.join(string.split()).strip()
    except (AttributeError, TypeError):
        return None
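
# Illustrative usage sketch (added comment): runs of whitespace collapse to single
# spaces, non-string input yields None, e.g.
#   trim('  Hello \t world \n')  # -> 'Hello world'
#   trim(None)                   # -> None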

def normalize_tags(tags):
    '''Remove special characters from tags'''
    tags = CLEAN_META_TAGS.sub(r'', trim(unescape(tags)))
    return ", ".join(filter(None, tags.split(", ")))

def is_image_file(imagesrc):
    '''Check if the observed string corresponds to a valid image extension,
    return False otherwise'''
    return bool(imagesrc is not None and IMAGE_EXTENSION.search(imagesrc))
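
# Illustrative usage sketch (added comment):
#   is_image_file('https://example.org/img/logo.png')  # -> True
#   is_image_file('https://example.org/index.html')    # -> False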

def normalize_authors(current_authors, author_string):
    '''Normalize author info to focus on author names only'''
    new_authors = []
    if author_string.lower().startswith('http') or AUTHOR_EMAIL.match(author_string):
        return current_authors
    if current_authors is not None:
        new_authors = current_authors.split('; ')
    # fix to code with unicode
    if '\\u' in author_string:
        author_string = author_string.encode().decode('unicode_escape')
    # fix html entities
    if '&#' in author_string or '&amp;' in author_string:
        author_string = unescape(author_string)
    # remove html tags
    author_string = AUTHOR_REMOVE_HTML.sub('', author_string)
    # examine names
    for author in AUTHOR_SPLIT.split(author_string):
        author = trim(author)
        # remove emoji
        author = AUTHOR_EMOJI_REMOVE.sub('', author)
        # remove @username
        author = AUTHOR_TWITTER.sub('', author)
        # replace special characters with space
        author = trim(AUTHOR_REPLACE_JOIN.sub(' ', author))
        author = AUTHOR_REMOVE_NICKNAME.sub('', author)
        # remove special characters
        author = AUTHOR_REMOVE_SPECIAL.sub('', author)
        author = AUTHOR_PREFIX.sub('', author)
        author = AUTHOR_REMOVE_NUMBERS.sub('', author)
        author = AUTHOR_REMOVE_PREPOSITION.sub('', author)
        # skip empty or improbably long strings
        if len(author) == 0 or (
            # simple heuristics, regex or vowel tests also possible
            ' ' not in author and '-' not in author and len(author) >= 50
        ):
            continue
        # title case
        if not author[0].isupper() or sum(1 for c in author if c.isupper()) < 1:
            author = author.title()
        # safety checks
        if author not in new_authors and (len(new_authors) == 0 or all(new_author not in author for new_author in new_authors)):
            new_authors.append(author)
    if len(new_authors) == 0:
        return current_authors
    return '; '.join(new_authors).strip('; ')
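
# Illustrative usage sketch (added comment, output depends on the regexes above):
#   normalize_authors(None, 'Words by Jane Doe, @janedoe')  # -> 'Jane Doe'
#   normalize_authors('Jane Doe', 'John Smith')             # -> 'Jane Doe; John Smith'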

def uniquify_list(l):
    """
    Remove duplicates from a list while keeping order in an efficient way.
    Dictionaries preserve insertion order since Python 3.6.
    https://www.peterbe.com/plog/fastest-way-to-uniquify-a-list-in-python-3.6
    """
    return list(dict.fromkeys(l))
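
# Illustrative usage sketch (added comment):
#   uniquify_list(['a', 'b', 'a', 'c'])  # -> ['a', 'b', 'c']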

@lru_cache(maxsize=1024)
def is_similar_domain(reference, new_string, threshold=0.5):
    "Check whether two short strings, here domain names, are similar enough based on a similarity ratio threshold."
    if new_string != reference:
        new_string = STRIP_EXTENSION.sub("", new_string)
        reference = STRIP_EXTENSION.sub("", reference)
        if SequenceMatcher(None, reference, new_string).ratio() < threshold:
            return False
    return True
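
# Illustrative usage sketch (added comment, the default threshold is a heuristic):
#   is_similar_domain('example.com', 'example.org')    # -> True (same name, other TLD)
#   is_similar_domain('example.com', 'wikipedia.org')  # -> False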

def make_chunks(iterable, n):
    """
    Chunk data into smaller pieces.
    https://docs.python.org/3/library/itertools.html
    """
    it = iter(iterable)
    while True:
        chunk = tuple(islice(it, n))
        if not chunk:
            return
        yield chunk
    # Python 3.8+ with walrus operator
    # while batch := tuple(islice(it, n)):
    #     yield batch
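
# Illustrative usage sketch (added comment):
#   list(make_chunks(range(5), 2))  # -> [(0, 1), (2, 3), (4,)]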