Implement RequestsFetcher class

A concrete implementation of FetcherInterface based on
the Requests library.

Signed-off-by: Teodora Sechkova <tsechkova@vmware.com>
This commit is contained in:
Teodora Sechkova 2020-12-22 19:20:40 +02:00
parent 84957c87b2
commit 41ffe7aab1
No known key found for this signature in database
GPG key ID: 65F78F613EA1914E

View file

@ -16,6 +16,16 @@
"""
import abc
import requests
import six
import logging
import time
import urllib3.exceptions
import tuf.exceptions
import tuf.settings
logger = logging.getLogger(__name__)
class FetcherInterface():
@ -49,3 +59,126 @@ def fetch(self, url, required_length):
A bytes iterator
"""
raise NotImplementedError # pragma: no cover
class RequestsFetcher(FetcherInterface):
    """
    <Purpose>
      A concrete implementation of FetcherInterface based on the Requests
      library.

    <Attributes>
      _sessions:
        A dictionary mapping a 'scheme+hostname' string to the
        requests.Session object used for every URL with that scheme and
        hostname.
    """

    def __init__(self):
        # From http://docs.python-requests.org/en/master/user/advanced/#session-objects:
        #
        # "The Session object allows you to persist certain parameters across
        # requests. It also persists cookies across all requests made from the
        # Session instance, and will use urllib3's connection pooling. So if
        # you're making several requests to the same host, the underlying TCP
        # connection will be reused, which can result in a significant
        # performance increase (see HTTP persistent connection)."
        #
        # NOTE: We use a separate requests.Session per scheme+hostname
        # combination, in order to reuse connections to the same hostname to
        # improve efficiency, but avoiding sharing state between different
        # scheme+hostname combinations to minimize subtle security issues.
        # Some cookies may not be HTTP-safe.
        self._sessions = {}

    def fetch(self, url, required_length):
        """
        <Purpose>
          Fetches the contents of 'url' in chunks, reading at most
          'required_length' bytes from the response body.

          This method is a generator: nothing below runs (not even the
          request itself or the status check) until the caller starts
          iterating over the returned object.

        <Arguments>
          url:
            A URL string; it must contain a scheme and a hostname.

          required_length:
            An integer, the number of bytes to read. Fewer bytes are
            produced if the response body ends early.

        <Exceptions>
          tuf.exceptions.URLParsingError, if no scheme or hostname can be
          extracted from 'url'.

          tuf.exceptions.SlowRetrievalError, if urllib3 reports a read
          timeout while the body is being read.

          Whatever response.raise_for_status() raises for an unsuccessful
          HTTP status (requests.exceptions.HTTPError per the Requests
          documentation).

        <Returns>
          A bytes iterator
        """
        # Get a customized session for each new scheme+hostname combination.
        session = self._get_session(url)

        # Get the requests.Response object for this URL.
        #
        # Defer downloading the response body with stream=True.
        # Always set the timeout. This timeout value is interpreted by
        # requests as:
        #  - connect timeout (max delay before first byte is received)
        #  - read (gap) timeout (max delay between bytes received)
        with session.get(url, stream=True,
                         timeout=tuf.settings.SOCKET_TIMEOUT) as response:
            # Check response status.
            response.raise_for_status()

            try:
                bytes_received = 0

                while True:
                    # We download a fixed chunk of data in every round. This
                    # is so that we can defend against slow retrieval
                    # attacks. Furthermore, we do not wish to download an
                    # extremely large file in one shot.
                    # Before beginning the round, sleep (if set) for a short
                    # amount of time so that the CPU is not hogged in the
                    # while loop.
                    if tuf.settings.SLEEP_BEFORE_ROUND:
                        time.sleep(tuf.settings.SLEEP_BEFORE_ROUND)

                    read_amount = min(
                        tuf.settings.CHUNK_SIZE,
                        required_length - bytes_received)

                    # NOTE: This may not handle some servers adding a
                    # Content-Encoding header, which may cause urllib3 to
                    # misbehave:
                    # https://github.com/pypa/pip/blob/404838abcca467648180b358598c597b74d568c9/src/pip/_internal/download.py#L547-L582
                    data = response.raw.read(read_amount)
                    bytes_received += len(data)

                    # We might have no more data to read. Stop *before*
                    # yielding so that consumers never receive an empty
                    # trailing chunk.
                    if not data:
                        logger.debug('Downloaded %r/%r bytes.',
                                     bytes_received, required_length)
                        # Finally, we signal that the download is complete.
                        break

                    yield data

                    # '>=' rather than '==' so that the loop still ends if
                    # more bytes than requested were ever counted.
                    if bytes_received >= required_length:
                        break

            except urllib3.exceptions.ReadTimeoutError as e:
                raise tuf.exceptions.SlowRetrievalError(str(e))

    def _get_session(self, url):
        """
        <Purpose>
          Returns the requests.Session associated with the scheme+hostname
          combination of 'url', creating, configuring and caching a new one
          the first time a combination is seen.

        <Arguments>
          url:
            A URL string.

        <Exceptions>
          tuf.exceptions.URLParsingError, if no scheme or hostname can be
          extracted from 'url'.

        <Returns>
          A requests.Session object.
        """
        # Use a different requests.Session per scheme+hostname combination,
        # to reuse connections while minimizing subtle security issues.
        parsed_url = six.moves.urllib.parse.urlparse(url)

        if not parsed_url.scheme or not parsed_url.hostname:
            raise tuf.exceptions.URLParsingError(
                'Could not get scheme and hostname from URL: ' + url)

        session_index = parsed_url.scheme + '+' + parsed_url.hostname

        logger.debug('url: %s', url)
        logger.debug('session index: %s', session_index)

        session = self._sessions.get(session_index)

        if session is None:
            session = requests.Session()
            self._sessions[session_index] = session

            # Attach some default headers to every Session.
            requests_user_agent = session.headers['User-Agent']
            # Follows the RFC: https://tools.ietf.org/html/rfc7231#section-5.5.3
            tuf_user_agent = ('tuf/' + tuf.__version__ + ' ' +
                              requests_user_agent)
            session.headers.update({
                # Tell the server not to compress or modify anything.
                # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding#Directives
                'Accept-Encoding': 'identity',
                # The TUF user agent.
                'User-Agent': tuf_user_agent})

            logger.debug('Made new session for %s', session_index)

        else:
            logger.debug('Reusing session for %s', session_index)

        return session