From 41ffe7aab1ac97dcee2d7668acc457ac83c8e45b Mon Sep 17 00:00:00 2001
From: Teodora Sechkova <tsechkova@vmware.com>
Date: Tue, 22 Dec 2020 19:20:40 +0200
Subject: [PATCH] Implement RequestsFetcher class

A concrete implementation of FetcherInterface based on
the Requests library.

Signed-off-by: Teodora Sechkova <tsechkova@vmware.com>
---
 tuf/fetcher.py | 133 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
diff --git a/tuf/fetcher.py b/tuf/fetcher.py
index 5b07a464..9ef09c2b 100644
--- a/tuf/fetcher.py
+++ b/tuf/fetcher.py
@@ -16,6 +16,16 @@
 """
 
 import abc
+import requests
+import six
+import logging
+import time
+
+import urllib3.exceptions
+import tuf.exceptions
+import tuf.settings
+
+logger = logging.getLogger(__name__)
 
 
 class FetcherInterface():
@@ -49,3 +59,126 @@ def fetch(self, url, required_length):
       A bytes iterator
     """
     raise NotImplementedError # pragma: no cover
+
+
+
+class RequestsFetcher(FetcherInterface):
+  """
+  <Purpose>
+    A concrete implementation of FetcherInterface based on the Requests
+    library.
+  """
+
+  def __init__(self):
+    # From http://docs.python-requests.org/en/master/user/advanced/#session-objects:
+    #
+    # "The Session object allows you to persist certain parameters across requests.
+    # It also persists cookies across all requests made from the Session instance,
+    # and will use urllib3's connection pooling. So if you're making several
+    # requests to the same host, the underlying TCP connection will be reused,
+    # which can result in a significant performance increase (see HTTP persistent
+    # connection)."
+    #
+    # NOTE: We use a separate requests.Session per scheme+hostname combination, in
+    # order to reuse connections to the same hostname to improve efficiency, but
+    # avoiding sharing state between different hosts-scheme combinations to
+    # minimize subtle security issues. Some cookies may not be HTTP-safe.
+    self._sessions = {}
+
+
+  def fetch(self, url, required_length):
+    # Get a customized session for each new schema+hostname combination.
+    session = self._get_session(url)
+
+    # Get the requests.Response object for this URL.
+    #
+    # Defer downloading the response body with stream=True.
+    # Always set the timeout. This timeout value is interpreted by requests as:
+    #  - connect timeout (max delay before first byte is received)
+    #  - read (gap) timeout (max delay between bytes received)
+    with session.get(url, stream=True,
+        timeout=tuf.settings.SOCKET_TIMEOUT) as response:
+
+      # Check response status.
+      response.raise_for_status()
+
+      try:
+        bytes_received = 0
+        while True:
+          # We download a fixed chunk of data in every round. This is so that we
+          # can defend against slow retrieval attacks. Furthermore, we do not wish
+          # to download an extremely large file in one shot.
+          # Before beginning the round, sleep (if set) for a short amount of time
+          # so that the CPU is not hogged in the while loop.
+          if tuf.settings.SLEEP_BEFORE_ROUND:
+            time.sleep(tuf.settings.SLEEP_BEFORE_ROUND)
+
+          read_amount = min(
+              tuf.settings.CHUNK_SIZE, required_length - bytes_received)
+
+          # NOTE: This may not handle some servers adding a Content-Encoding
+          # header, which may cause urllib3 to misbehave:
+          # https://github.com/pypa/pip/blob/404838abcca467648180b358598c597b74d568c9/src/pip/_internal/download.py#L547-L582
+          data = response.raw.read(read_amount)
+          bytes_received += len(data)
+
+          yield data
+
+          if bytes_received == required_length:
+            break
+
+          # We might have no more data to read. Check number of bytes downloaded.
+          if not data:
+            logger.debug('Downloaded ' + repr(bytes_received) + '/' +
+              repr(required_length) + ' bytes.')
+
+            # Finally, we signal that the download is complete.
+            break
+
+      except urllib3.exceptions.ReadTimeoutError as e:
+        raise tuf.exceptions.SlowRetrievalError(str(e))
+
+
+
+
+  def _get_session(self, url):
+    """
+    Returns a different customized requests.Session per schema+hostname
+    combination.
+    """
+    # Use a different requests.Session per schema+hostname combination, to
+    # reuse connections while minimizing subtle security issues.
+    parsed_url = six.moves.urllib.parse.urlparse(url)
+
+    if not parsed_url.scheme or not parsed_url.hostname:
+      raise tuf.exceptions.URLParsingError(
+          'Could not get scheme and hostname from URL: ' + url)
+
+    session_index = parsed_url.scheme + '+' + parsed_url.hostname
+
+    logger.debug('url: ' + url)
+    logger.debug('session index: ' + session_index)
+
+    session = self._sessions.get(session_index)
+
+    if not session:
+      session = requests.Session()
+      self._sessions[session_index] = session
+
+      # Attach some default headers to every Session.
+      requests_user_agent = session.headers['User-Agent']
+      # Follows the RFC: https://tools.ietf.org/html/rfc7231#section-5.5.3
+      tuf_user_agent = 'tuf/' + tuf.__version__ + ' ' + requests_user_agent
+      session.headers.update({
+          # Tell the server not to compress or modify anything.
+          # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding#Directives
+          'Accept-Encoding': 'identity',
+          # The TUF user agent.
+          'User-Agent': tuf_user_agent})
+
+      logger.debug('Made new session for ' + session_index)
+
+    else:
+      logger.debug('Reusing session for ' + session_index)
+
+    return session