Source code for trafilatura.sitemaps

"""
Deriving link info from sitemaps.
"""

## This file is available from https://github.com/adbar/trafilatura
## under GNU GPL v3 license


import logging
import re
from itertools import islice
from typing import List, Optional

from courlan import (clean_url, extract_domain, filter_urls, fix_relative_urls,
                     get_hostinfo, lang_filter)

from .downloads import fetch_url, is_live_page
from .settings import MAX_LINKS, MAX_SITEMAPS_SEEN
from .utils import is_similar_domain

# import urllib.robotparser # Python >= 3.8
# ROBOT_PARSER = urllib.robotparser.RobotFileParser()


LOGGER = logging.getLogger(__name__)

LINK_REGEX = re.compile(r'<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>')
XHTML_REGEX = re.compile(r'<xhtml:link.+?>', re.DOTALL)
HREFLANG_REGEX = re.compile(r'href=["\'](.+?)["\']')
WHITELISTED_PLATFORMS = re.compile(r'(?:blogger|blogspot|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\.')

SITEMAP_FORMAT = re.compile(r'^.{0,5}<\?xml|<sitemap|<urlset')
DETECT_SITEMAP_LINK = re.compile(r'\.xml(\..{2,4})?$|\.xml[?#]')
DETECT_LINKS = re.compile(r'https?://[^\s<"]+')
SCRUB_REGEX = re.compile(r'\?.*$|#.*$')
POTENTIAL_SITEMAP = re.compile(r'\.xml\b')  # |\bsitemap\b

GUESSES = ['sitemap.xml.gz', 'sitemap', 'sitemap_index.xml', 'sitemap_news.xml']
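
# For instance, LINK_REGEX captures the URL inside a <loc> element
# (illustrative, "example.org" is a placeholder):
#   LINK_REGEX.findall("<loc>https://example.org/page</loc>")  ->  ["https://example.org/page"]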


class SitemapObject:
    "Store all necessary information on sitemap download and processing."
    __slots__ = ["base_url", "content", "domain", "sitemap_url", "sitemap_urls", "target_lang", "urls"]

    def __init__(self, base_url: str, domain: str, sitemap_url: str, target_lang: Optional[str] = None) -> None:
        self.base_url: str = base_url
        self.content: str = ""
        self.domain: str = domain
        self.sitemap_url: str = sitemap_url
        self.sitemap_urls: List[str] = []
        self.target_lang: Optional[str] = target_lang
        self.urls: List[str] = []

    def fetch(self) -> None:
        "Fetch a sitemap over the network."
        LOGGER.debug('fetching sitemap: %s', self.sitemap_url)
        self.content = fetch_url(self.sitemap_url) or ""  # fall back to an empty string so self.content stays a str

    def handle_link(self, link: str) -> None:
        """Examine a link and determine if it's valid and if it leads to
           a sitemap or a web page."""
        if link == self.sitemap_url:  # safety check
            return
        # fix, check, clean and normalize
        link = fix_relative_urls(self.base_url, link)
        link = clean_url(link, self.target_lang)

        if link is None or not lang_filter(link, self.target_lang):
            return

        newdomain = extract_domain(link, fast=True)
        if newdomain is None:
            LOGGER.error("couldn't extract domain: %s", link)
            return

        # discard links pointing to other domains, but make an exception for popular hosting platforms
        # and tolerate differences between subdomains and domains
        if not is_similar_domain(self.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
            LOGGER.warning('link discarded, diverging domain names: %s %s', self.domain, newdomain)
            return

        if DETECT_SITEMAP_LINK.search(link):
            self.sitemap_urls.append(link)
        else:
            self.urls.append(link)

    def extract_sitemap_langlinks(self) -> None:
        "Extract links corresponding to a given target language."
        if 'hreflang=' not in self.content:
            return
        # the regex is compiled here because it depends on the target language
        lang_regex = re.compile(rf"hreflang=[\"']({self.target_lang}.*?|x-default)[\"']", re.DOTALL)
        # extract
        for attrs in (m[0] for m in islice(XHTML_REGEX.finditer(self.content), MAX_LINKS)):
            if lang_regex.search(attrs):
                lang_match = HREFLANG_REGEX.search(attrs)
                if lang_match:
                    self.handle_link(lang_match[1])
        LOGGER.debug('%s sitemaps and %s links with hreflang found for %s', len(self.sitemap_urls), len(self.urls), self.sitemap_url)
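
    # Example of the markup targeted above (illustrative, placeholder URL):
    #   <xhtml:link rel="alternate" hreflang="de" href="https://example.org/de/page"/>
    # with target_lang="de", the href value is passed on to handle_link()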

    def extract_sitemap_links(self) -> None:
        "Extract sitemap links and web page links from a sitemap file."
        # extract
        for match in (m[1] for m in islice(LINK_REGEX.finditer(self.content), MAX_LINKS)):
            # m[1] is the URL captured inside the <loc> element
            self.handle_link(match)
        LOGGER.debug('%s sitemaps and %s links found for %s', len(self.sitemap_urls), len(self.urls), self.sitemap_url)

    def process(self) -> None:
        "Download a sitemap and extract the links it contains."
        plausible = is_plausible_sitemap(self.sitemap_url, self.content)
        # safeguard
        if not plausible:
            return
        # try to extract links from TXT file
        if not SITEMAP_FORMAT.match(self.content):
            for match in (m[0] for m in islice(DETECT_LINKS.finditer(self.content), MAX_LINKS)):
                self.handle_link(match)
            return
        # process XML sitemap
        if self.target_lang is not None:
            self.extract_sitemap_langlinks()
            if self.sitemap_urls or self.urls:
                return
        self.extract_sitemap_links()
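
# Illustrative usage sketch (not part of the module): "example.org" is a
# placeholder and the URL below is assumed to really serve a sitemap.
#
#     sitemap = SitemapObject(base_url="https://example.org",
#                             domain="example.org",
#                             sitemap_url="https://example.org/sitemap.xml")
#     sitemap.fetch()
#     sitemap.process()
#     # sitemap.urls then holds page links, sitemap.sitemap_urls nested sitemaps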


def is_plausible_sitemap(url: str, contents: Optional[str]) -> bool:
    '''Check if the sitemap corresponds to an expected format,
       i.e. TXT or XML.'''
    if contents is None:
        return False

    # strip query and fragments
    url = SCRUB_REGEX.sub('', url)

    # check content
    if (
        POTENTIAL_SITEMAP.search(url)
        and (not isinstance(contents, str) or not SITEMAP_FORMAT.match(contents))
    ) or '<html' in contents[:150].lower():
        LOGGER.warning('not a valid XML sitemap: %s', url)
        return False

    return True
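
# Illustrative behavior (placeholder URLs): an HTML page served under a .xml
# address is rejected, while a plain-text list of links passes the check.
#   is_plausible_sitemap("https://example.org/sitemap.xml", "<html><body>Not found</body></html>")  ->  False
#   is_plausible_sitemap("https://example.org/sitemap.txt", "https://example.org/page1\n")  ->  True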


def find_robots_sitemaps(baseurl: str) -> List[str]:
    '''Guess the location of the robots.txt file and try to extract
       sitemap URLs from it.'''
    robotstxt = fetch_url(baseurl + '/robots.txt')
    return extract_robots_sitemaps(robotstxt, baseurl)


def extract_robots_sitemaps(robotstxt: Optional[str], baseurl: str) -> List[str]:
    'Read a robots.txt file and find sitemap links.'
    # sanity check on length (guards against redirects to oversized, unrelated pages)
    if robotstxt is None or len(robotstxt) > 10000:
        return []
    sitemapurls = []
    # source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
    for line in robotstxt.splitlines():
        # remove optional comment and strip line
        i = line.find('#')
        if i >= 0:
            line = line[:i]
        line = line.strip()
        if not line:
            continue
        parts = line.split(':', 1)
        if len(parts) == 2 and parts[0].strip().lower() == "sitemap":
            # urllib.parse.unquote(parts[1].strip())
            candidate = fix_relative_urls(baseurl, parts[1].strip())
            sitemapurls.append(candidate)
    LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
    return sitemapurls
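

# Minimal self-check sketch (illustrative, not part of the library): runs the
# robots.txt parsing above on an inline string, so no network access is needed;
# "example.org" is a placeholder domain.
if __name__ == "__main__":
    SAMPLE_ROBOTS = "User-agent: *\nDisallow: /private/\nSitemap: /sitemap_index.xml"
    # the relative sitemap path should be resolved against the base URL
    print(extract_robots_sitemaps(SAMPLE_ROBOTS, "https://example.org"))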