"""
Module bundling all functions needed to scrape metadata from webpages.
"""
import json
import logging
import re
from copy import deepcopy
from html import unescape
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from courlan import (
extract_domain,
get_base_url,
is_valid_url,
normalize_url,
validate_url,
)
from htmldate import find_date
from lxml.etree import XPath
from lxml.html import HtmlElement, tostring
from .htmlprocessing import prune_unwanted_nodes
from .json_metadata import (
extract_json,
extract_json_parse_error,
normalize_authors,
normalize_json,
)
from .settings import Document, set_date_params
from .utils import HTML_STRIP_TAGS, line_processing, load_html, trim
from .xpaths import (
AUTHOR_DISCARD_XPATHS,
AUTHOR_XPATHS,
CATEGORIES_XPATHS,
TAGS_XPATHS,
TITLE_XPATHS,
)
# Names exported by a star-import of this module.
__all__ = ["Document"]
# Module-level logger used by the functions below.
LOGGER = logging.getLogger(__name__)
# Set the level of the "htmldate" logger to WARNING.
logging.getLogger("htmldate").setLevel(logging.WARNING)
# http(s) URL with an optional "www." or "w<digits>." prefix; group 1 captures
# everything up to the first slash.
# NOTE(review): not referenced in the visible part of this module.
META_URL = re.compile(r"https?://(?:www\.|w[0-9]+\.)?([^/]+)")
# Matches either a double-quoted string (group 1) or one whitespace character.
# extract_meta_json() substitutes r"\1", which keeps the quoted strings and
# removes whitespace found outside of them.
JSON_MINIFY = re.compile(r'("(?:\\"|[^"])*")|\s')
# Splits a title into two groups around a separator character which is
# surrounded by whitespace. The first group is optional and greedy, so the
# split happens at the last possible separator.
HTMLTITLE_REGEX = re.compile(
r"^(.+)?\s+[–•·—|⁄*⋆~‹«<›»>:-]\s+(.+)$"
) # part without dots?
# Single and double quotes, removed from tag strings by normalize_tags().
CLEAN_META_TAGS = re.compile(r'["\']')
# Creative Commons license code (group 1) and version number (group 2) as
# they appear in a URL path, e.g. "/by-sa/4.0".
LICENSE_REGEX = re.compile(
r"/(by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero)/([1-9]\.[0-9])"
)
# Case-insensitive textual license mention: "cc" or "creative commons",
# a license code and an optional version number.
TEXT_LICENSE_REGEX = re.compile(
r"(cc|creative commons) (by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero) ?([1-9]\.[0-9])?",
re.I,
)
# Lookup tables for examine_meta(): the attribute values are lowercased
# before being tested against these sets.

# "name" values whose content is handled as author information.
METANAME_AUTHOR = {
"article:author",
"atc-metaauthor",
"author",
"authors",
"byl",
"citation_author",
"creator",
"dc.creator",
"dc.creator.aut",
"dc:creator",
"dcterms.creator",
"dcterms.creator.aut",
"dcsext.author",
"parsely-author",
"rbauthors",
"sailthru.author",
"shareaholic:article_author_name",
} # questionable: twitter:creator
# "name" values whose content is used as description.
METANAME_DESCRIPTION = {
"dc.description",
"dc:description",
"dcterms.abstract",
"dcterms.description",
"description",
"sailthru.description",
"twitter:description",
}
# "name" values whose content is used as site name.
METANAME_PUBLISHER = {
"article:publisher",
"citation_journal_title",
"copyright",
"dc.publisher",
"dc:publisher",
"dcterms.publisher",
"publisher",
"sailthru.publisher",
"rbpubname",
"twitter:site",
} # questionable: citation_publisher
# "name" values whose content is added to the tags.
METANAME_TAG = {
"citation_keywords",
"dcterms.subject",
"keywords",
"parsely-tags",
"shareaholic:keywords",
"tags",
}
# "name" values whose content is used as title.
METANAME_TITLE = {
"citation_title",
"dc.title",
"dcterms.title",
"fb_title",
"headline",
"parsely-title",
"sailthru.title",
"shareaholic:title",
"rbtitle",
"title",
"twitter:title",
}
# NOTE(review): not referenced in the visible part of this module;
# examine_meta() compares against the literal "twitter:url" instead.
METANAME_URL = {"rbmainurl", "twitter:url"}
# Values tested against both the "property" and the "name" attribute to
# find an image reference.
METANAME_IMAGE = {
"image",
"og:image",
"og:image:url",
"og:image:secure_url",
"twitter:image",
"twitter:image:src",
}
# "property" values whose content is handled as author information.
PROPERTY_AUTHOR = {"author", "article:author"}
# "name" values whose content is kept as a backup site name.
# NOTE(review): "twitter:site" is also part of METANAME_PUBLISHER, which
# examine_meta() tests first.
TWITTER_ATTRS = {"twitter:site", "application-name"}
# also interesting: article:section
# Attributes whose presence suppresses the "unknown attribute" debug message
# in examine_meta().
EXTRA_META = {"charset", "http-equiv", "property"}
# Maps OpenGraph property names to the keys of the dictionary returned by
# extract_opengraph().
OG_PROPERTIES = {
"og:title": "title",
"og:description": "description",
"og:site_name": "sitename",
"og:image": "image",
"og:image:url": "image",
"og:image:secure_url": "image",
"og:type": "pagetype",
}
# OpenGraph property names whose content is handled as author information.
OG_AUTHOR = {"og:author", "og:article:author"}
# Element paths tried in this order by extract_url(); the first element with
# a non-empty href attribute is used.
URL_SELECTORS = [
'.//head//link[@rel="canonical"]',
'.//head//base',
'.//head//link[@rel="alternate"][@hreflang="x-default"]'
]
def normalize_tags(tags: str) -> str:
    """Clean a raw tag string taken from a meta element.

    HTML entities are decoded and the text is trimmed, quote characters are
    removed and empty entries of the ", "-separated list are dropped.
    An empty string is returned if nothing is left after trimming.
    """
    cleaned = trim(unescape(tags))
    if cleaned:
        unquoted = CLEAN_META_TAGS.sub(r"", cleaned)
        return ", ".join(part for part in unquoted.split(", ") if part)
    return ""
def check_authors(authors: str, author_blacklist: Set[str]) -> Optional[str]:
    """Check if the authors string corresponds to expected values.

    Args:
        authors: author names separated by semicolons.
        author_blacklist: names to discard; the comparison is
            case-insensitive.

    Returns:
        The remaining names joined by "; ", or None if no name is left.
    """
    blacklist = {name.lower() for name in author_blacklist}
    # Empty segments (e.g. "A;;B", a trailing ";" or an empty input) are
    # dropped so that they neither end up in the output nor make the
    # function return an empty string instead of None.
    kept = [
        name
        for segment in authors.split(";")
        if (name := segment.strip()) and name.lower() not in blacklist
    ]
    return "; ".join(kept) or None
def extract_meta_json(tree: HtmlElement, metadata: Document) -> Document:
    """Update the metadata with the content of embedded JSON scripts.

    Every script element of type "application/ld+json" or
    "application/settings+json" which has text is minified, normalized and
    parsed. Text which cannot be decoded as JSON is handed over to
    extract_json_parse_error() instead.
    """
    json_scripts = tree.xpath(
        './/script[@type="application/ld+json" or @type="application/settings+json"]'
    )
    for script in json_scripts:
        raw_text = script.text
        if raw_text:
            # keep quoted strings (group 1), drop whitespace everywhere else
            minified = JSON_MINIFY.sub(r"\1", raw_text)
            payload = normalize_json(minified)
            try:
                metadata = extract_json(json.loads(payload), metadata)
            except json.JSONDecodeError:
                metadata = extract_json_parse_error(payload, metadata)
    return metadata
def extract_opengraph(tree: HtmlElement) -> Dict[str, Optional[str]]:
    """Collect OpenGraph metadata (https://ogp.me/) from the document head.

    Returns a dictionary with the keys title, author, url, description,
    sitename, image and pagetype. Keys for which no usable meta element has
    been found keep the value None.
    """
    og_data: Dict[str, Optional[str]] = {
        "title": None,
        "author": None,
        "url": None,
        "description": None,
        "sitename": None,
        "image": None,
        "pagetype": None,
    }
    for meta in tree.xpath('.//head/meta[starts-with(@property, "og:")]'):
        og_name = meta.get("property")
        value = meta.get("content")
        # skip missing, empty and whitespace-only values
        if not value or value.isspace():
            continue
        if og_name in OG_PROPERTIES:
            og_data[OG_PROPERTIES[og_name]] = value
        elif og_name == "og:url" and is_valid_url(value):
            og_data["url"] = value
        elif og_name in OG_AUTHOR:
            og_data["author"] = normalize_authors(None, value)
        # og:locale is currently not extracted
    return og_data
def examine_meta(tree: HtmlElement) -> Document:
    """Gather metadata from the meta elements of the document head.

    The result of extract_opengraph() is the starting point. If it does not
    already provide title, author, url, description, sitename and image, the
    meta elements with a content attribute are examined according to their
    property, name or itemprop attribute. Author values are passed to
    normalize_authors() together with the current author, the other fields
    are only filled if they are still empty. The collected tags are stored
    in the tags attribute of the returned document.
    """
    # start with the OpenGraph information
    metadata = Document().from_dict(extract_opengraph(tree))

    # stop here if none of the fields assigned below is missing (tags aside)
    if (
        metadata.title
        and metadata.author
        and metadata.url
        and metadata.description
        and metadata.sitename
        and metadata.image
    ):
        return metadata

    collected_tags: List[str] = []
    fallback_sitename = None

    for meta in tree.iterfind(".//head/meta[@content]"):
        content = HTML_STRIP_TAGS.sub("", meta.get("content", "")).strip()
        if not content:
            continue
        attributes = meta.attrib

        # todo: image info

        if "property" in attributes:
            prop = meta.get("property", "").lower()
            if prop.startswith("og:"):
                # OpenGraph has already been processed
                continue
            if prop == "article:tag":
                collected_tags.append(normalize_tags(content))
            elif prop in PROPERTY_AUTHOR:
                metadata.author = normalize_authors(metadata.author, content)
            elif prop == "article:publisher":
                metadata.sitename = metadata.sitename or content
            elif prop in METANAME_IMAGE:
                metadata.image = metadata.image or content

        elif "name" in attributes:
            name = meta.get("name", "").lower()
            if name in METANAME_AUTHOR:
                metadata.author = normalize_authors(metadata.author, content)
            elif name in METANAME_TITLE:
                metadata.title = metadata.title or content
            elif name in METANAME_DESCRIPTION:
                metadata.description = metadata.description or content
            elif name in METANAME_PUBLISHER:
                metadata.sitename = metadata.sitename or content
            elif name in METANAME_IMAGE:
                metadata.image = metadata.image or content
            elif name in TWITTER_ATTRS or "twitter:app:name" in name:
                # only used if no other site name is found
                fallback_sitename = content
            elif (
                name == "twitter:url"
                and not metadata.url
                and is_valid_url(content)
            ):
                metadata.url = content
            elif name in METANAME_TAG:  # 'page-topic'
                collected_tags.append(normalize_tags(content))

        elif "itemprop" in attributes:
            itemprop = meta.get("itemprop", "").lower()
            if itemprop == "author":
                metadata.author = normalize_authors(metadata.author, content)
            elif itemprop == "description":
                metadata.description = metadata.description or content
            elif itemprop == "headline":
                metadata.title = metadata.title or content
            # to verify: itemprop "name" as a title candidate

        # anything else which carries none of the known extra attributes
        elif not any(key in attributes for key in EXTRA_META):
            LOGGER.debug(
                "unknown attribute: %s",
                tostring(meta, pretty_print=False, encoding="unicode").strip(),
            )

    # use the backup site name if nothing better has been found
    if not metadata.sitename:
        metadata.sitename = fallback_sitename
    metadata.tags = collected_tags
    return metadata
def extract_metainfo(
    tree: HtmlElement, expressions: List[XPath], len_limit: int = 200
) -> Optional[str]:
    """Return the first suitable text found by a series of XPath expressions.

    The expressions are evaluated in order. The trimmed text of a matching
    element is accepted if it is longer than 2 and shorter than len_limit
    characters. None is returned if no candidate qualifies.
    """
    for xpath_expr in expressions:
        candidates = xpath_expr(tree)
        for node in candidates:
            text = trim(" ".join(node.itertext()))
            if not text:
                continue
            if 2 < len(text) < len_limit:
                return text
        # several matches but none of them usable
        if len(candidates) > 1:
            LOGGER.debug(
                "more than one invalid result: %s %s", xpath_expr, len(candidates)
            )
    return None
def examine_title_element(
    tree: HtmlElement,
) -> Tuple[str, Optional[str], Optional[str]]:
    """Read the <title> element of the head and try to split its text.

    Returns a tuple made of the trimmed title text (an empty string if there
    is no title element) and the two groups of HTMLTITLE_REGEX. Both groups
    are None if the title is missing or does not match the expression.
    """
    full_title = ""
    title_node = tree.find(".//head//title")
    if title_node is not None:
        full_title = trim(title_node.text_content())
        match = HTMLTITLE_REGEX.match(full_title)
        if match:
            return full_title, match.group(1), match.group(2)
    LOGGER.debug("no main title found")
    return full_title, None, None
def extract_title(tree: HtmlElement) -> Optional[str]:
    """Extract the document title.

    The candidates are tried in the following order:

    1. the text of the h1 element if the document contains exactly one;
    2. the first result of TITLE_XPATHS;
    3. a segment of the <title> element which does not contain a dot;
    4. the first h1 element with text;
    5. the first h2 element with text;
    6. the whole text of the <title> element, possibly an empty string.

    Heading texts are always passed through trim(), so that the fallbacks
    behave like the single-h1 case and never return raw or whitespace-only
    text content.
    """
    # only one h1-element: take it
    h1_results = tree.findall(".//h1")
    if len(h1_results) == 1:
        title = trim(h1_results[0].text_content())
        if title:
            return title

    # extract using x-paths
    title = extract_metainfo(tree, TITLE_XPATHS) or ""
    if title:
        return title

    # extract using title tag, a dot hints at a site name rather than a title
    title, first, second = examine_title_element(tree)
    for segment in (first, second):
        if segment and "." not in segment:
            return segment

    # fall back on the headings: h1 elements first, then h2 elements
    for heading in [*h1_results, *tree.findall(".//h2")]:
        heading_text = trim(heading.text_content())
        if heading_text:
            return heading_text
    LOGGER.debug("no h2 title found")

    # last resort: text of the <title> element as a whole
    return title
def extract_author(tree: HtmlElement) -> Optional[str]:
    """Extract the document author(s).

    The search runs AUTHOR_XPATHS on a copy of the tree from which the nodes
    matching AUTHOR_DISCARD_XPATHS have been pruned, so the tree passed as
    argument is left untouched. A result is passed through
    normalize_authors() before being returned.
    """
    pruned_copy = prune_unwanted_nodes(deepcopy(tree), AUTHOR_DISCARD_XPATHS)
    found = extract_metainfo(pruned_copy, AUTHOR_XPATHS, len_limit=120)
    if not found:
        return found
    # copyright?
    return normalize_authors(None, found)
def extract_url(tree: HtmlElement, default_url: Optional[str] = None) -> Optional[str]:
    """Extract the URL of the page from link or base elements in the head.

    The selectors of URL_SELECTORS are tried in order and the first non-empty
    href attribute is used. A URL starting with a slash is completed with
    the base URL taken from an OpenGraph or Twitter meta element. The result
    is validated and normalized; default_url is returned if no valid URL
    has been found.
    """
    # lazily look up the candidates, stop at the first non-empty href
    hrefs = (
        node.attrib.get("href")
        for node in map(tree.find, URL_SELECTORS)
        if node is not None
    )
    url = next(filter(None, hrefs), None)

    # fix relative URLs
    if url and url.startswith("/"):
        for meta in tree.iterfind(".//head//meta[@content]"):
            kind = meta.get("name") or meta.get("property") or ""
            if kind.startswith(("og:", "twitter:")):
                base = get_base_url(meta.attrib["content"])
                if base:
                    # prepend the base URL
                    url = base + url
                    break

    # do not return invalid URLs
    if url:
        is_valid, parsed = validate_url(url)
        if is_valid and parsed:
            url = normalize_url(parsed)
        else:
            url = None

    return url or default_url
def extract_sitename(tree: HtmlElement) -> Optional[str]:
    """Derive the name of the site from the main title (if it exists).

    Returns the first segment of the <title> element which contains a dot,
    or None if there is no such segment.
    """
    segments = examine_title_element(tree)[1:]
    for segment in segments:
        if segment and "." in segment:
            return segment
    return None
def extract_catstags(metatype: str, tree: HtmlElement) -> List[str]:
    """Find category and tag information.

    Args:
        metatype: "category" selects CATEGORIES_XPATHS, any other value
            (in practice "tag") selects TAGS_XPATHS.
        tree: parsed HTML document to search.

    Returns:
        The labels processed by line_processing(), without empty values and
        duplicates, in order of appearance.
    """
    results: List[str] = []

    # Match the singular and plural forms as a URL path segment, e.g. /tag/,
    # /tags/, /category/ and /categories/. The previous "[s|ies]?" suffix was
    # a character class: it could not match "/categories/" and accepted
    # unrelated one-letter suffixes such as "/tagi/".
    forms = [re.escape(metatype) + "s?"]
    if metatype.endswith("y"):
        forms.append(re.escape(metatype[:-1]) + "ies")
    url_pattern = re.compile("/(?:" + "|".join(forms) + ")/")

    xpath_expressions = CATEGORIES_XPATHS if metatype == "category" else TAGS_XPATHS

    # search using custom expressions, stop at the first one with results
    for expression in xpath_expressions:
        results.extend(
            elem.text_content()
            for elem in expression(tree)
            # .get() avoids a KeyError on elements without href attribute
            if url_pattern.search(elem.get("href", ""))
        )
        if results:
            break

    # category fallback: look for meta elements in the head
    if metatype == "category" and not results:
        results.extend(
            element.attrib["content"]
            for element in tree.xpath(
                './/head//meta[@property="article:section" or contains(@name, "subject")][@content]'
            )
        )

    # dict.fromkeys() removes duplicates while keeping the order
    return [r for r in dict.fromkeys(line_processing(x) for x in results if x) if r]
def parse_license_element(element: HtmlElement, strict: bool = False) -> Optional[str]:
    """Probe a link for identifiable free license cues.

    The href attribute is parsed first and then the link text.

    Args:
        element: link element to examine.
        strict: if True, the link text is only accepted if it matches
            TEXT_LICENSE_REGEX; otherwise the trimmed text is returned.

    Returns:
        "CC <CODE> <version>" if the href matches LICENSE_REGEX, else
        license information taken from the link text, or None if nothing
        usable has been found.
    """
    # look for Creative Commons elements in the link target
    match = LICENSE_REGEX.search(element.get("href", ""))
    if match:
        return f"CC {match[1].upper()} {match[2]}"

    link_text = element.text
    if not link_text:
        return None

    # check if it could be a CC license
    if strict:
        match = TEXT_LICENSE_REGEX.search(link_text)
        return match[0] if match else None

    # Whitespace-only text (e.g. a link which only wraps an image) carries
    # no license information: return None instead of an empty string so
    # that callers testing for None keep on searching.
    return trim(link_text) or None
def extract_license(tree: HtmlElement) -> Optional[str]:
    """Search the HTML code for license information and parse it.

    Links labeled with rel="license" are examined first (non-strict
    parsing), then the links found in footer elements (strict parsing).
    The first result which is not None is returned.
    """
    # each search is wrapped in a function so that the footer is only
    # queried if the labeled links did not yield anything
    searches = (
        # links labeled as license
        (False, lambda: tree.findall('.//a[@rel="license"][@href]')),
        # footer elements which may contain CC links
        (
            True,
            lambda: tree.xpath(
                './/footer//a[@href]|.//div[contains(@class, "footer") or contains(@id, "footer")]//a[@href]'
            ),
        ),
    )
    for strict, find_links in searches:
        for link in find_links():
            license_info = parse_license_element(link, strict=strict)
            if license_info is not None:
                return license_info
    return None