Source code for trafilatura.downloads

# pylint:disable-msg=E0611,I1101
"""
All functions needed to steer and execute downloads of web documents.
"""


import logging
import random
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

import certifi

try:
    import pycurl
    CURL_SHARE = pycurl.CurlShare()
    # available options:
    # https://curl.se/libcurl/c/curl_share_setopt.html
    CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_DNS)
    CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_SSL_SESSION)
    # not thread-safe
    # CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_CONNECT)
except ImportError:
    pycurl = None

import urllib3
from courlan import UrlStore
from courlan.network import redirection_test

from . import __version__
from .settings import DEFAULT_CONFIG
from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
                    uniquify_list)

NUM_CONNECTIONS = 50
MAX_REDIRECTS = 2

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
HTTP_POOL = None
NO_CERT_POOL = None
RETRY_STRATEGY = None

DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = 'trafilatura/' + __version__ + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT

LOGGER = logging.getLogger(__name__)

RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])


# caching throws an error
# @lru_cache(maxsize=2)
def _parse_config(config):
    'Read and extract HTTP header strings from the configuration file.'
    # load a series of user-agents
    myagents = config.get('DEFAULT', 'USER_AGENTS').strip() or None
    if myagents is not None and myagents != '':
        myagents = myagents.split("\n")
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Cookies
    # todo: support for several cookies?
    mycookie = config.get('DEFAULT', 'COOKIE') or None
    return myagents, mycookie
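
# Illustrative sketch of the settings file section read above (values are
# examples, not shipped defaults): USER_AGENTS lists one user-agent string
# per line, COOKIE holds a single raw "Cookie" header value.
#
# [DEFAULT]
# USER_AGENTS =
#     Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
#     Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
# COOKIE = yummy_cookie=choco; tasty_cookie=strawberry
#
# With such a configuration loaded, _parse_config(config) would return
# (list_of_user_agents, cookie_string).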


def _determine_headers(config, headers=None):
    'Internal function to decide on user-agent string.'
    if config != DEFAULT_CONFIG:
        myagents, mycookie = _parse_config(config)
        headers = {}
        if myagents is not None:
            rnumber = random.randint(0, len(myagents) - 1)
            headers['User-Agent'] = myagents[rnumber]
        if mycookie is not None:
            headers['Cookie'] = mycookie
    return headers or DEFAULT_HEADERS


def _send_request(url, no_ssl, config):
    "Internal function to robustly send a request (SSL or not) and return its result."
    # customize headers
    global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
    if not RETRY_STRATEGY:
        RETRY_STRATEGY = urllib3.util.Retry(
            total=0,
            redirect=MAX_REDIRECTS, # raise_on_redirect=False,
            connect=0,
            backoff_factor=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')/2,
            status_forcelist=[
                429, 499, 500, 502, 503, 504, 509, 520, 521, 522, 523, 524, 525, 526, 527, 530, 598
            ],
            # unofficial: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#Unofficial_codes
        )
    try:
        # TODO: read by streaming chunks (stream=True, iter_content=xx)
        # so we can stop downloading as soon as MAX_FILE_SIZE is reached
        if no_ssl is False:
            # define pool
            if not HTTP_POOL:
                HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), ca_certs=certifi.where(), num_pools=NUM_CONNECTIONS)  # cert_reqs='CERT_REQUIRED'
            # execute request
            response = HTTP_POOL.request('GET', url, headers=_determine_headers(config))
        else:
            # define pool
            if not NO_CERT_POOL:
                NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), cert_reqs='CERT_NONE', num_pools=NUM_CONNECTIONS)
            # execute request
            response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config))
    except urllib3.exceptions.SSLError:
        LOGGER.warning('retrying after SSLError: %s', url)
        return _send_request(url, True, config)
    except Exception as err:
        LOGGER.error('download error: %s %s', url, err)  # sys.exc_info()[0]
    else:
        # necessary for standardization
        return RawResponse(response.data, response.status, response.geturl())
    # catchall
    return None
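
# Rough illustration of the helper above (internal, not part of the public API;
# the URL is only a placeholder): on success it returns a RawResponse namedtuple.
#
# resp = _send_request('https://example.org', no_ssl=False, config=DEFAULT_CONFIG)
# if resp is not None:
#     print(resp.status, resp.url, len(resp.data))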


def _handle_response(url, response, decode, config):
    'Internal function to run safety checks on response result.'
    if response.status != 200:
        LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
    elif response.data is None or len(response.data) < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
        LOGGER.error('too small/incorrect for URL %s', url)
        # raise error instead?
    elif len(response.data) > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
        LOGGER.error('too large: length %s for URL %s', len(response.data), url)
        # raise error instead?
    else:
        return decode_response(response.data) if decode is True else response
    # catchall
    return None


def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
    """Fetches page using urllib3 and decodes the response.

    Args:
        url: URL of the page to fetch.
        decode: Decode response instead of returning urllib3 response object (boolean).
        no_ssl: Don't try to establish a secure connection (to prevent SSLError).
        config: Pass configuration values for output control.

    Returns:
        RawResponse object: data (headers + body), status (HTTP status code) and url,
        or None in case the result is invalid or there was a problem with the network.

    """
    LOGGER.debug('sending request: %s', url)
    if pycurl is None:
        response = _send_request(url, no_ssl, config)
    else:
        response = _send_pycurl_request(url, no_ssl, config)
    if response is not None and response != '':
        return _handle_response(url, response, decode, config)
        # return '' (useful to discard further processing?)
        # return response
    LOGGER.debug('request failed: %s', url)
    return None
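
# Minimal usage sketch for fetch_url (placeholder URL, default settings assumed):
#
# html = fetch_url('https://example.org')                  # decoded string or None
# raw = fetch_url('https://example.org', decode=False)     # RawResponse or None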


def _pycurl_is_live_page(url):
    "Send a basic HTTP HEAD request with pycurl."
    # Initialize pycurl object
    curl = pycurl.Curl()
    # Set the URL and HTTP method (HEAD)
    curl.setopt(pycurl.URL, url.encode('utf-8'))
    curl.setopt(pycurl.CONNECTTIMEOUT, 10)
    # no SSL verification
    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    # Set option to avoid getting the response body
    curl.setopt(curl.NOBODY, True)
    # Perform the request
    try:
        curl.perform()
    except pycurl.error as err:
        LOGGER.debug('pycurl HEAD error: %s %s', url, err)
        return False
    # Get the response code
    page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
    # Clean up
    curl.close()
    return page_exists


def _urllib3_is_live_page(url):
    "Use courlan redirection test (based on urllib3) to send a HEAD request."
    try:
        _ = redirection_test(url)
    except Exception as err:
        LOGGER.debug('urllib3 HEAD error: %s %s', url, err)
        return False
    return True


def is_live_page(url):
    "Send a HTTP HEAD request without taking anything else into account."
    if pycurl is not None:
        return _pycurl_is_live_page(url)
    return _urllib3_is_live_page(url)


def add_to_compressed_dict(inputlist, blacklist=None, url_filter=None, url_store=None, compression=False, verbose=False):
    '''Filter, convert input URLs and add them to domain-aware processing dictionary'''
    if url_store is None:
        url_store = UrlStore(
            compressed=compression,
            strict=False,
            verbose=verbose
        )
    inputlist = uniquify_list(inputlist)
    if blacklist:
        inputlist = [u for u in inputlist if URL_BLACKLIST_REGEX.sub('', u) not in blacklist]
    if url_filter:
        inputlist = [u for u in inputlist if any(f in u for f in url_filter)]
    url_store.add_urls(inputlist)
    return url_store


def load_download_buffer(url_store, sleep_time=5):
    '''Determine threading strategy and draw URLs respecting domain-based back-off rules.'''
    bufferlist = []
    while not bufferlist:
        bufferlist = url_store.get_download_urls(timelimit=sleep_time)
        # add emptiness test or sleep?
        if not bufferlist:
            if url_store.done is True:
                break
            sleep(sleep_time)
    return bufferlist, url_store


def buffered_downloads(bufferlist, download_threads, decode=True):
    '''Download queue consumer, single- or multi-threaded.'''
    with ThreadPoolExecutor(max_workers=download_threads) as executor:
        for chunk in make_chunks(bufferlist, 10000):
            future_to_url = {executor.submit(fetch_url, url, decode): url for url in chunk}
            for future in as_completed(future_to_url):
                # url and download result
                yield future_to_url[future], future.result()


def _send_pycurl_request(url, no_ssl, config):
    '''Experimental function using libcurl and pycurl to speed up downloads'''
    # https://github.com/pycurl/pycurl/blob/master/examples/retriever-multi.py
    # init
    # headerbytes = BytesIO()
    headers = _determine_headers(config)
    headerlist = ['Accept-Encoding: gzip, deflate', 'Accept: */*']
    for header, content in headers.items():
        headerlist.append(header + ': ' + content)

    # prepare curl request
    # https://curl.haxx.se/libcurl/c/curl_easy_setopt.html
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url.encode('utf-8'))
    # share data
    curl.setopt(pycurl.SHARE, CURL_SHARE)
    curl.setopt(pycurl.HTTPHEADER, headerlist)
    # curl.setopt(pycurl.USERAGENT, '')
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, MAX_REDIRECTS)
    curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
    curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
    curl.setopt(pycurl.NOSIGNAL, 1)
    if no_ssl is True:
        curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    else:
        curl.setopt(pycurl.CAINFO, certifi.where())
    curl.setopt(pycurl.MAXFILESIZE, config.getint('DEFAULT', 'MAX_FILE_SIZE'))
    #curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)
    #curl.setopt(pycurl.WRITEDATA, bufferbytes)
    # TCP_FASTOPEN
    # curl.setopt(pycurl.FAILONERROR, 1)
    # curl.setopt(pycurl.ACCEPT_ENCODING, '')

    # send request
    try:
        bufferbytes = curl.perform_rb()
    except pycurl.error as err:
        LOGGER.error('pycurl error: %s %s', url, err)
        # retry in case of SSL-related error
        # see https://curl.se/libcurl/c/libcurl-errors.html
        # errmsg = curl.errstr_raw()
        # additional error codes: 80, 90, 96, 98
        if no_ssl is False and err.args[0] in (35, 54, 58, 59, 60, 64, 66, 77, 82, 83, 91):
            LOGGER.debug('retrying after SSL error: %s %s', url, err)
            return _send_pycurl_request(url, True, config)
        # traceback.print_exc(file=sys.stderr)
        # sys.stderr.flush()
        return None

    # https://github.com/pycurl/pycurl/blob/master/examples/quickstart/response_headers.py
    #respheaders = dict()
    #for header_line in headerbytes.getvalue().decode('iso-8859-1').splitlines(): # re.split(r'\r?\n',
    #    # This will botch headers that are split on multiple lines...
    #    if ':' not in header_line:
    #        continue
    #    # Break the header line into header name and value.
    #    name, value = header_line.split(':', 1)
    #    # Now we can actually record the header name and value.
    #    respheaders[name.strip()] = value.strip()  # name.strip().lower() ## TODO: check

    # status
    respcode = curl.getinfo(curl.RESPONSE_CODE)
    # url
    effective_url = curl.getinfo(curl.EFFECTIVE_URL)
    # additional info
    # ip_info = curl.getinfo(curl.PRIMARY_IP)

    # tidy up
    curl.close()
    return RawResponse(bufferbytes, respcode, effective_url)
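

# End-to-end sketch of the threaded download pipeline defined in this module
# (URLs, thread count and sleep time are illustrative, not library defaults):
#
# urls = ['https://example.org/page1', 'https://example.org/page2']
# url_store = add_to_compressed_dict(urls)
# while not url_store.done:
#     bufferlist, url_store = load_download_buffer(url_store, sleep_time=5)
#     for url, result in buffered_downloads(bufferlist, download_threads=4):
#         if result is not None:
#             print(url, len(result))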