python-tuf/tuf/download.py

"""
<Program Name>
  download.py

<Started>
  February 21, 2012.  Based on previous version by Geremy Condra.

<Author>
  Konstantin Andrianov
  Vladimir Diaz <vladimir.v.diaz@gmail.com>

<Copyright>
  See LICENSE for licensing information.

<Purpose>
  Download metadata and target files and check their validity.  The hash and
  length of a downloaded file has to match the hash and length supplied by the
  metadata of that file.  The downloaded file is technically a  file-like object
  that will automatically destroys itself once closed.  Note that the file-like
  object, 'tuf.util.TempFile', is returned by the '_download_file()' function.
"""

# Help with Python 3 compatibility, where the print statement is a function, an
# implicit relative import is invalid, and the '/' operator performs true
# division.  Example:  print 'hello world' raises a 'SyntaxError' exception.
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import os
import socket
import logging
import timeit
import ssl

import tuf
import tuf.conf
import tuf.hash
import tuf.util
import tuf.formats
import six

# 'ssl.match_hostname' was added in Python 3.2.  The vendored version is needed
# for Python 2.6 and 2.7.
try:
  from ssl import match_hostname, CertificateError

except ImportError: # pragma: no cover
  from tuf._vendor.ssl_match_hostname import match_hostname, CertificateError

# See 'log.py' to learn how logging is handled in TUF.
logger = logging.getLogger('tuf.download')


def safe_download(url, required_length):
  """
  <Purpose>
    Given the 'url' and 'required_length' of the desired file, open a connection
    to 'url', download it, and return the contents of the file.  Also ensure
    the length of the downloaded file matches 'required_length' exactly.
    tuf.download.unsafe_download() may be called if an upper download limit is
    preferred.

    'tuf.util.TempFile', the file-like object returned, is used instead of
    regular tempfile object because of additional functionality provided, such
    as handling compressed metadata and automatically closing files after
    moving to final destination.

  <Arguments>
    url:
      A URL string that represents the location of the file.  The URI scheme
      component must be one of 'tuf.conf.SUPPORTED_URI_SCHEMES'.

    required_length:
      An integer value representing the length of the file.  This is an exact
      limit.

  <Side Effects>
    A 'tuf.util.TempFile' object is created on disk to store the contents of
    'url'.

  <Exceptions>
    tuf.DownloadLengthMismatchError, if there was a mismatch of observed vs
    expected lengths while downloading the file.

    tuf.FormatError, if any of the arguments are improperly formatted.

    Any other unforeseen runtime exception.

  <Returns>
    A 'tuf.util.TempFile' file-like object that points to the contents of 'url'.
  """

  # Do all of the arguments have the appropriate format?
  # Raise 'tuf.FormatError' if there is a mismatch.
  tuf.formats.URL_SCHEMA.check_match(url)
  tuf.formats.LENGTH_SCHEMA.check_match(required_length)

  # Ensure 'url' specifies one of the URI schemes in
  # 'tuf.conf.SUPPORTED_URI_SCHEMES'.  Be default, ['http', 'https'] is
  # supported.  If the URI scheme of 'url' is empty or "file", files on the
  # local system can be accessed.  Unexpected files may be accessed by
  # compromised metadata (unlikely to happen if targets.json metadata is signed
  # with offline keys).
  parsed_url = six.moves.urllib.parse.urlparse(url)

  if parsed_url.scheme not in tuf.conf.SUPPORTED_URI_SCHEMES:
    message = \
      repr(url) + ' specifies an unsupported URI scheme.  Supported ' + \
      ' URI Schemes: ' + repr(tuf.conf.SUPPORTED_URI_SCHEMES)
    raise tuf.FormatError(message)

  return _download_file(url, required_length, STRICT_REQUIRED_LENGTH=True)


def unsafe_download(url, required_length):
  """
  <Purpose>
    Given the 'url' and 'required_length' of the desired file, open a connection
    to 'url', download it, and return the contents of the file.  Also ensure
    the length of the downloaded file is up to 'required_length', and no larger.
    tuf.download.safe_download() may be called if an exact download limit is
    preferred.

    'tuf.util.TempFile', the file-like object returned, is used instead of
    regular tempfile object because of additional functionality provided, such
    as handling compressed metadata and automatically closing files after
    moving to final destination.

  <Arguments>
    url:
      A URL string that represents the location of the file.  The URI scheme
      component must be one of 'tuf.conf.SUPPORTED_URI_SCHEMES'.

    required_length:
      An integer value representing the length of the file.  This is an upper
      limit.

  <Side Effects>
    A 'tuf.util.TempFile' object is created on disk to store the contents of
    'url'.

  <Exceptions>
    tuf.DownloadLengthMismatchError, if there was a mismatch of observed vs
    expected lengths while downloading the file.

    tuf.FormatError, if any of the arguments are improperly formatted.

    Any other unforeseen runtime exception.

  <Returns>
    A 'tuf.util.TempFile' file-like object that points to the contents of 'url'.
  """

  # Do all of the arguments have the appropriate format?
  # Raise 'tuf.FormatError' if there is a mismatch.
  tuf.formats.URL_SCHEMA.check_match(url)
  tuf.formats.LENGTH_SCHEMA.check_match(required_length)

  # Ensure 'url' specifies one of the URI schemes in
  # 'tuf.conf.SUPPORTED_URI_SCHEMES'.  Be default, ['http', 'https'] is
  # supported.  If the URI scheme of 'url' is empty or "file", files on the
  # local system can be accessed.  Unexpected files may be accessed by
  # compromised metadata (unlikely to happen if targets.json metadata is signed
  # with offline keys).
  parsed_url = six.moves.urllib.parse.urlparse(url)

  if parsed_url.scheme not in tuf.conf.SUPPORTED_URI_SCHEMES:
    message = \
      repr(url) + ' specifies an unsupported URI scheme.  Supported ' + \
      ' URI Schemes: ' + repr(tuf.conf.SUPPORTED_URI_SCHEMES)
    raise tuf.FormatError(message)

  return _download_file(url, required_length, STRICT_REQUIRED_LENGTH=False)


def _download_file(url, required_length, STRICT_REQUIRED_LENGTH=True):
  """
  <Purpose>
    Given the url, hashes and length of the desired file, this function
    opens a connection to 'url' and downloads the file while ensuring its
    length and hashes match 'required_hashes' and 'required_length'.

    tuf.util.TempFile is used instead of regular tempfile object because of
    additional functionality provided by 'tuf.util.TempFile'.

  <Arguments>
    url:
      A URL string that represents the location of the file.

    required_length:
      An integer value representing the length of the file.

    STRICT_REQUIRED_LENGTH:
      A Boolean indicator used to signal whether we should perform strict
      checking of required_length. True by default. We explicitly set this to
      False when we know that we want to turn this off for downloading the
      timestamp metadata, which has no signed required_length.

  <Side Effects>
    A 'tuf.util.TempFile' object is created on disk to store the contents of
    'url'.

  <Exceptions>
    tuf.DownloadLengthMismatchError, if there was a mismatch of observed vs
    expected lengths while downloading the file.

    tuf.FormatError, if any of the arguments are improperly formatted.

    Any other unforeseen runtime exception.

  <Returns>
    A 'tuf.util.TempFile' file-like object that points to the contents of 'url'.
  """

  # Do all of the arguments have the appropriate format?
  # Raise 'tuf.FormatError' if there is a mismatch.
  tuf.formats.URL_SCHEMA.check_match(url)
  tuf.formats.LENGTH_SCHEMA.check_match(required_length)

  # 'url.replace()' is for compatibility with Windows-based systems because
  # they might put back-slashes in place of forward-slashes.  This converts it
  # to the common format.
  url = url.replace('\\', '/')
  logger.info('Downloading: '+str(url))

  # This is the temporary file that we will return to contain the contents of
  # the downloaded file.
  temp_file = tuf.util.TempFile()

  try:
    # Open the connection to the remote file.
    connection = _open_connection(url)

    # We ask the server about how big it thinks this file should be.
    reported_length = _get_content_length(connection)

    # Then, we check whether the required length matches the reported length.
    _check_content_length(reported_length, required_length,
                          STRICT_REQUIRED_LENGTH)

    # Download the contents of the URL, up to the required length, to a
    # temporary file, and get the total number of downloaded bytes.
    total_downloaded = _download_fixed_amount_of_data(connection, temp_file,
                                                      required_length)

    # Does the total number of downloaded bytes match the required length?
    _check_downloaded_length(total_downloaded, required_length,
                             STRICT_REQUIRED_LENGTH=STRICT_REQUIRED_LENGTH)

  except:
    # Close 'temp_file'.  Any written data is lost.
    temp_file.close_temp_file()
    logger.exception('Could not download URL: '+str(url))
    raise

  else:
    return temp_file


def _download_fixed_amount_of_data(connection, temp_file, required_length):
  """
  <Purpose>
    This is a helper function, where the download really happens. While-block
    reads data from connection a fixed chunk of data at a time, or less, until
    'required_length' is reached.

  <Arguments>
    connection:
      The object that the _open_connection returns for communicating with the
      server about the contents of a URL.

    temp_file:
      A temporary file where the contents at the URL specified by the
      'connection' object will be stored.

    required_length:
      The number of bytes that we must download for the file.  This is almost
      always specified by the TUF metadata for the data file in question
      (except in the case of timestamp metadata, in which case we would fix a
      reasonable upper bound).

  <Side Effects>
    Data from the server will be written to 'temp_file'.

  <Exceptions>
    Runtime or network exceptions will be raised without question.

  <Returns>
    total_downloaded:
      The total number of bytes downloaded for the desired file.
  """

  # Tolerate servers with a slow start by ignoring their delivery speed for
  # 'tuf.conf.SLOW_START_GRACE_PERIOD' seconds.  Set 'seconds_spent_receiving'
  # to negative SLOW_START_GRACE_PERIOD seconds, and begin checking the average
  # download speed once it is positive.
  grace_period = -tuf.conf.SLOW_START_GRACE_PERIOD

  # Keep track of total bytes downloaded.
  number_of_bytes_received = 0

  start_time = timeit.default_timer()

  try:
    while True:
      # We download a fixed chunk of data in every round. This is so that we
      # can defend against slow retrieval attacks. Furthermore, we do not wish
      # to download an extremely large file in one shot.
      data = b''
      read_amount = min(tuf.conf.CHUNK_SIZE,
                        required_length - number_of_bytes_received)
      #logger.debug('Reading next chunk...')

      try:
        data = connection.read(read_amount)

      # Python 3.2 returns 'IOError' if the remote file object has timed out.
      except (socket.error, IOError):
        pass

      number_of_bytes_received = number_of_bytes_received + len(data)

      # Data successfully read from the connection.  Store it.
      temp_file.write(data)

      if number_of_bytes_received == required_length:
        break

      stop_time = timeit.default_timer()
      seconds_spent_receiving = stop_time - start_time

      if (seconds_spent_receiving + grace_period) < 0:
        #logger.debug('Ignoring average download speed for another: '+\
                     #str(-seconds_spent_receiving) + ' seconds')
        continue

      # Measure the average download speed.
      average_download_speed = number_of_bytes_received / seconds_spent_receiving

      # If the average download speed is below a certain threshold, we flag
      # this as a possible slow-retrieval attack.
      if average_download_speed < tuf.conf.MIN_AVERAGE_DOWNLOAD_SPEED:
        break

      else:
        logger.debug('Good average download speed: '+\
                     str(average_download_speed) + ' bytes per second')

      # We might have no more data to read. Check number of bytes downloaded.
      if not data:
        message = 'Downloaded '+str(number_of_bytes_received)+'/'+ \
          str(required_length)+' bytes.'
        logger.debug(message)

        # Finally, we signal that the download is complete.
        break

  except:
    raise

  else:
    # This else block returns and skips closing the connection in the finally
    # block, so close the connection here.
    connection.close()
    return number_of_bytes_received

  finally:
    # Whatever happens, make sure that we always close the connection.
    connection.close()


def _get_request(url):
  """
  Wraps the URL to retrieve to protects against "creative"
  interpretation of the RFC: http://bugs.python.org/issue8732

  https://github.com/pypa/pip/blob/d0fa66ecc03ab20b7411b35f7c7b423f31f77761/pip/download.py#L147
  """

  return six.moves.urllib.request.Request(url, headers={'Accept-encoding': 'identity'})


def _get_opener(scheme=None):
  """
  Build a urllib2 opener based on whether the user now wants SSL.

  https://github.com/pypa/pip/blob/d0fa66ecc03ab20b7411b35f7c7b423f31f77761/pip/download.py#L178
  """

  if scheme == "https":
    assert os.path.isfile(tuf.conf.ssl_certificates)

    # If we are going over https, use an opener which will provide SSL
    # certificate verification.
    https_handler = VerifiedHTTPSHandler()
    opener = six.moves.urllib.request.build_opener(https_handler)

    # Strip out HTTPHandler to prevent MITM spoof.
    for handler in opener.handlers:
      if isinstance(handler, six.moves.urllib.request.HTTPHandler):
        opener.handlers.remove(handler)

  else:
    # Otherwise, use the default opener.
    opener = six.moves.urllib.request.build_opener()

  return opener


def _open_connection(url):
  """
  <Purpose>
    Helper function that opens a connection to the url. urllib2 supports http,
    ftp, and file. In python (2.6+) where the ssl module is available, urllib2
    also supports https.

    TODO: Determine whether this follows http redirects and decide if we like
    that. For example, would we not want to allow redirection from ssl to
     non-ssl urls?

  <Arguments>
    url:
      URL string (e.g., 'http://...' or 'ftp://...' or 'file://...')

  <Exceptions>
    None.

  <Side Effects>
    Opens a connection to a remote server.

  <Returns>
    File-like object.
  """

  # urllib2.Request produces a Request object that allows for a finer control
  # of the requesting process. Request object allows to add headers or data to
  # the HTTP request. For instance, request method add_header(key, val) can be
  # used to change/spoof 'User-Agent' from default Python-urllib/x.y to
  # 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' this can be useful if
  # servers do not recognize connections that originates from
  # Python-urllib/x.y.

  parsed_url = six.moves.urllib.parse.urlparse(url)
  opener = _get_opener(scheme=parsed_url.scheme)
  request = _get_request(url)

  return opener.open(request, timeout = tuf.conf.SOCKET_TIMEOUT)


def _get_content_length(connection):
  """
  <Purpose>
    A helper function that gets the purported file length from server.

  <Arguments>
    connection:
      The object that the _open_connection function returns for communicating
      with the server about the contents of a URL.

  <Side Effects>
    No known side effects.

  <Exceptions>
    Runtime exceptions will be suppressed but logged.

  <Returns>
    reported_length:
      The total number of bytes reported by server. If the process fails, we
      return None; otherwise we would return a nonnegative integer.
  """

  try:
    # What is the length of this document according to the HTTP spec?
    reported_length = connection.info().get('Content-Length')

    # Try casting it as a decimal number.
    reported_length = int(reported_length, 10)

    # Make sure that it is a nonnegative integer.
    assert reported_length > -1

  except:
    message = \
      'Could not get content length about ' + str(connection) + ' from server.'
    logger.exception(message)
    reported_length = None

  finally:
    return reported_length


def _check_content_length(reported_length, required_length, strict_length=True):
  """
  <Purpose>
    A helper function that checks whether the length reported by server is
    equal to the length we expected.

  <Arguments>
    reported_length:
      The total number of bytes reported by the server.

    required_length:
      The total number of bytes obtained from (possibly default) metadata.

    strict_length:
      Boolean that indicates whether the required length of the file is an
      exact match, or an upper limit (e.g., downloading a Timestamp file).

  <Side Effects>
    No known side effects.

  <Exceptions>
    No known exceptions.

  <Returns>
    None.
  """

  logger.debug('The server reported a length of '+repr(reported_length)+' bytes.')
  comparison_result = None

  if reported_length < required_length:
    comparison_result = 'less than'

  elif reported_length > required_length:
    comparison_result = 'greater than'

  else:
    comparison_result = 'equal to'

  if strict_length:
    message = 'The reported length is '+comparison_result+' the required '+\
      'length of '+repr(required_length)+' bytes.'
    logger.debug(message)

  else:
    message = 'The reported length is '+comparison_result+' the upper limit '+\
      'of '+repr(required_length)+' bytes.'
    logger.debug(message)


def _check_downloaded_length(total_downloaded, required_length,
                             STRICT_REQUIRED_LENGTH=True):
  """
  <Purpose>
    A helper function which checks whether the total number of downloaded bytes
    matches our expectation.

  <Arguments>
    total_downloaded:
      The total number of bytes supposedly downloaded for the file in question.

    required_length:
      The total number of bytes expected of the file as seen from its metadata.
      The Timestamp role is always downloaded without a known file length, and
      the Root role when the client cannot download any of the required
      top-level roles.  In both cases, 'required_length' is actually an upper
      limit on the length of the downloaded file.

    STRICT_REQUIRED_LENGTH:
      A Boolean indicator used to signal whether we should perform strict
      checking of required_length. True by default. We explicitly set this to
      False when we know that we want to turn this off for downloading the
      timestamp metadata, which has no signed required_length.

  <Side Effects>
    None.

  <Exceptions>
    tuf.DownloadLengthMismatchError, if STRICT_REQUIRED_LENGTH is True and
    total_downloaded is not equal required_length.

  <Returns>
    None.
  """

  if total_downloaded == required_length:
    logger.info('Downloaded '+str(total_downloaded)+' bytes out of the '+\
                'expected '+str(required_length)+ ' bytes.')
  else:
    difference_in_bytes = abs(total_downloaded - required_length)

    # What we downloaded is not equal to the required length, but did we ask
    # for strict checking of required length?
    if STRICT_REQUIRED_LENGTH:
      message = 'Downloaded '+str(total_downloaded)+' bytes, but expected '+\
        str(required_length)+' bytes. There is a difference of '+\
        str(difference_in_bytes)+' bytes.'

      # This must be due to a programming error, and must never happen!
      logger.error(message)
      raise tuf.DownloadLengthMismatchError(required_length, total_downloaded)

    else:
      message = 'Downloaded '+str(total_downloaded)+' bytes out of an upper '+\
        'limit of '+str(required_length)+' bytes.'
      # We specifically disabled strict checking of required length, but we
      # will log a warning anyway. This is useful when we wish to download the
      # Timestamp or Root metadata, for which we have no signed metadata; so,
      # we must guess a reasonable required_length for it.
      logger.info(message)


class VerifiedHTTPSConnection(six.moves.http_client.HTTPSConnection):
  """
  A connection that wraps connections with ssl certificate verification.

  https://github.com/pypa/pip/blob/d0fa66ecc03ab20b7411b35f7c7b423f31f77761/pip/download.py#L72
  """

  def connect(self):

    self.connection_kwargs = {}

    # for > py2.5
    if hasattr(self, 'timeout'):
      self.connection_kwargs.update(timeout = self.timeout)

    # for >= py2.7
    if hasattr(self, 'source_address'):
      self.connection_kwargs.update(source_address = self.source_address)

    sock = socket.create_connection((self.host, self.port), **self.connection_kwargs)

    # for >= py2.7
    if getattr(self, '_tunnel_host', None):
      self.sock = sock
      self._tunnel()

    # set location of certificate authorities
    assert os.path.isfile(tuf.conf.ssl_certificates)
    cert_path = tuf.conf.ssl_certificates

    # TODO: Disallow SSLv2.
    # http://docs.python.org/dev/library/ssl.html#protocol-versions
    # TODO: Select the right ciphers.
    # http://docs.python.org/dev/library/ssl.html#cipher-selection
    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                cert_reqs=ssl.CERT_REQUIRED,
                                ca_certs=cert_path)

    match_hostname(self.sock.getpeercert(), self.host)


class VerifiedHTTPSHandler(six.moves.urllib.request.HTTPSHandler):
  """
  A HTTPSHandler that uses our own VerifiedHTTPSConnection.

  https://github.com/pypa/pip/blob/d0fa66ecc03ab20b7411b35f7c7b423f31f77761/pip/download.py#L109
  """

  def __init__(self, connection_class = VerifiedHTTPSConnection):
    self.specialized_conn_class = connection_class
    six.moves.urllib.request.HTTPSHandler.__init__(self)

  def https_open(self, req):
    return self.do_open(self.specialized_conn_class, req)