mirror of
https://github.com/theupdateframework/python-tuf
synced 2026-05-24 10:08:28 +00:00
In order to avoid re-using session data from an HTTPS connection in an HTTP connection. Some cookies may not be HTTP-safe. Signed-off-by: Sebastien Awwad <sebastien.awwad@gmail.com>
607 lines
21 KiB
Python
Executable file
607 lines
21 KiB
Python
Executable file
#!/usr/bin/env python
|
|
|
|
# Copyright 2012 - 2017, New York University and the TUF contributors
|
|
# SPDX-License-Identifier: MIT OR Apache-2.0
|
|
|
|
"""
|
|
<Program Name>
|
|
download.py
|
|
|
|
<Started>
|
|
February 21, 2012. Based on previous version by Geremy Condra.
|
|
|
|
<Author>
|
|
Konstantin Andrianov
|
|
Vladimir Diaz <vladimir.v.diaz@gmail.com>
|
|
|
|
<Copyright>
|
|
See LICENSE-MIT OR LICENSE for licensing information.
|
|
|
|
<Purpose>
|
|
Download metadata and target files and check their validity. The hash and
|
|
length of a downloaded file has to match the hash and length supplied by the
|
|
metadata of that file. The downloaded file is technically a file-like
|
|
object that will automatically destroy itself once closed.  Note that the
|
|
file-like object, 'securesystemslib.util.TempFile', is returned by the
|
|
'_download_file()' function.
|
|
"""
|
|
|
|
# Help with Python 3 compatibility, where the print statement is a function, an
|
|
# implicit relative import is invalid, and the '/' operator performs true
|
|
# division. Example: print 'hello world' raises a 'SyntaxError' exception.
|
|
from __future__ import print_function
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import unicode_literals
|
|
|
|
import logging
|
|
import time
|
|
import timeit
|
|
|
|
import tuf
|
|
import requests
|
|
|
|
import securesystemslib
|
|
import securesystemslib.util
|
|
import six
|
|
import tuf.exceptions
|
|
|
|
import urllib3.exceptions
|
|
|
|
# See 'log.py' to learn how logging is handled in TUF.
|
|
logger = logging.getLogger('tuf.download')
|
|
|
|
# From http://docs.python-requests.org/en/master/user/advanced/#session-objects:
|
|
#
|
|
# "The Session object allows you to persist certain parameters across requests.
|
|
# It also persists cookies across all requests made from the Session instance,
|
|
# and will use urllib3's connection pooling. So if you're making several
|
|
# requests to the same host, the underlying TCP connection will be reused,
|
|
# which can result in a significant performance increase (see HTTP persistent
|
|
# connection)."
|
|
#
|
|
# NOTE: We use a separate requests.Session per scheme+hostname combination, in
|
|
# order to reuse connections to the same hostname to improve efficiency, but
|
|
# avoid sharing state between different scheme+hostname combinations, to
|
|
# minimize subtle security issues. Some cookies may not be HTTP-safe.
|
|
# Cache of requests.Session objects.  _download_file() keys it by the string
# '<scheme>+<hostname>' of the URL being fetched and stores one Session per
# key, so later downloads for the same scheme and hostname reuse that Session.
_sessions = {}
|
|
|
|
|
|
def safe_download(url, required_length):
  """
  <Purpose>
    Download the file found at 'url' and return it, insisting that exactly
    'required_length' bytes are received.  Use
    tuf.download.unsafe_download() instead when 'required_length' should only
    act as an upper bound on the download size.

    This is a thin wrapper: it validates its arguments and then delegates to
    _download_file() with strict length checking enabled.

  <Arguments>
    url:
      A URL string that represents the location of the file.

    required_length:
      An integer value representing the length of the file.  This is an exact
      limit.

  <Side Effects>
    A 'securesystemslib.util.TempFile' object is created (by
    _download_file()) to store the contents of 'url'.

  <Exceptions>
    tuf.exceptions.DownloadLengthMismatchError, if there was a mismatch of
    observed vs expected lengths while downloading the file.

    securesystemslib.exceptions.FormatError, if any of the arguments are
    improperly formatted.

    Any other unforeseen runtime exception.

  <Returns>
    A 'securesystemslib.util.TempFile' file-like object that points to the
    contents of 'url'.
  """

  # Validate the arguments before touching the network.  check_match() raises
  # 'securesystemslib.exceptions.FormatError' if there is a mismatch.
  formats = securesystemslib.formats
  formats.URL_SCHEMA.check_match(url)
  formats.LENGTH_SCHEMA.check_match(required_length)

  # Strict mode: the number of downloaded bytes must equal 'required_length'.
  downloaded_file = _download_file(
      url, required_length, STRICT_REQUIRED_LENGTH=True)

  return downloaded_file
|
|
|
|
|
|
|
|
|
|
|
|
def unsafe_download(url, required_length):
  """
  <Purpose>
    Download the file found at 'url' and return it, accepting any download
    that is no larger than 'required_length' bytes.  Use
    tuf.download.safe_download() instead when the download must be exactly
    'required_length' bytes long.

    This is a thin wrapper: it validates its arguments and then delegates to
    _download_file() with strict length checking disabled.

  <Arguments>
    url:
      A URL string that represents the location of the file.

    required_length:
      An integer value representing the length of the file.  This is an upper
      limit.

  <Side Effects>
    A 'securesystemslib.util.TempFile' object is created (by
    _download_file()) to store the contents of 'url'.

  <Exceptions>
    tuf.exceptions.SlowRetrievalError, if fewer than 'required_length' bytes
    were received and the average download speed was below the configured
    minimum.

    securesystemslib.exceptions.FormatError, if any of the arguments are
    improperly formatted.

    Any other unforeseen runtime exception.

  <Returns>
    A 'securesystemslib.util.TempFile' file-like object that points to the
    contents of 'url'.
  """

  # Validate the arguments before touching the network.  check_match() raises
  # 'securesystemslib.exceptions.FormatError' if there is a mismatch.
  formats = securesystemslib.formats
  formats.URL_SCHEMA.check_match(url)
  formats.LENGTH_SCHEMA.check_match(required_length)

  # Non-strict mode: 'required_length' is only an upper bound.
  downloaded_file = _download_file(
      url, required_length, STRICT_REQUIRED_LENGTH=False)

  return downloaded_file
|
|
|
|
|
|
|
|
|
|
|
|
def _download_file(url, required_length, STRICT_REQUIRED_LENGTH=True):
  """
  <Purpose>
    Given the url and length of the desired file, open a connection to 'url'
    and download the file into a temporary file.

    If 'STRICT_REQUIRED_LENGTH' is True, the number of downloaded bytes must
    match 'required_length' exactly.  If it is False, 'required_length' is
    treated as an upper limit: at most that many bytes are read, and a
    shorter download is accepted unless the average download speed fell below
    the acceptable rate (in which case a slow retrieval error is raised).

  <Arguments>
    url:
      A URL string that represents the location of the file.

    required_length:
      An integer value representing the length of the file.

    STRICT_REQUIRED_LENGTH:
      A Boolean indicator used to signal whether we should perform strict
      checking of required_length. True by default. We explicitly set this to
      False when we know that we want to turn this off for downloading the
      timestamp metadata, which has no signed required_length.

  <Side Effects>
    A 'securesystemslib.util.TempFile' object is created to store the
    contents of 'url'.  It is closed again if the download fails.

    A requests.Session is created and cached in '_sessions' the first time a
    given scheme+hostname combination is seen.

  <Exceptions>
    tuf.exceptions.DownloadLengthMismatchError, if there was a
    mismatch of observed vs expected lengths while downloading the file.

    tuf.exceptions.SlowRetrievalError, if the download was too slow.

    tuf.exceptions.URLParsingError, if no scheme or hostname can be extracted
    from 'url'.

    securesystemslib.exceptions.FormatError, if any of the arguments are
    improperly formatted.

    Any exception raised by requests (e.g., by Response.raise_for_status()
    for an error status code), and any other unforeseen runtime exception.

  <Returns>
    A 'securesystemslib.util.TempFile' file-like object that points to the
    contents of 'url'.
  """

  # Do all of the arguments have the appropriate format?
  # Raise 'securesystemslib.exceptions.FormatError' if there is a mismatch.
  securesystemslib.formats.URL_SCHEMA.check_match(url)
  securesystemslib.formats.LENGTH_SCHEMA.check_match(required_length)

  # 'url.replace('\\', '/')' is needed for compatibility with Windows-based
  # systems, because they might use back-slashes in place of forward-slashes.
  # This converts it to the common format.  unquote() replaces %xx escapes in a
  # url with their single-character equivalent.  A back-slash may be encoded as
  # %5c in the url, which should also be replaced with a forward slash.
  url = six.moves.urllib.parse.unquote(url).replace('\\', '/')
  logger.info('Downloading: ' + repr(url))

  # This is the temporary file that we will return to contain the contents of
  # the downloaded file.
  temp_file = securesystemslib.util.TempFile()

  # Stays None until a connection has actually been opened, so that the error
  # handler below knows whether there is a response to release.
  response = None

  try:
    # Use a different requests.Session per scheme+hostname combination, to
    # reuse connections while minimizing subtle security issues.
    parsed_url = six.moves.urllib.parse.urlparse(url)

    if not parsed_url.scheme or not parsed_url.hostname:
      raise tuf.exceptions.URLParsingError(
          'Could not get scheme and hostname from URL: ' + url)

    session_index = parsed_url.scheme + '+' + parsed_url.hostname

    logger.debug('url: ' + url)
    logger.debug('session index: ' + session_index)

    session = _sessions.get(session_index)

    if session is None:
      session = requests.Session()
      _sessions[session_index] = session

      # Attach some default headers to every Session.
      requests_user_agent = session.headers['User-Agent']
      # Follows the RFC: https://tools.ietf.org/html/rfc7231#section-5.5.3
      tuf_user_agent = 'tuf/' + tuf.__version__ + ' ' + requests_user_agent
      session.headers.update({
          # Tell the server not to compress or modify anything.
          # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding#Directives
          'Accept-Encoding': 'identity',
          # The TUF user agent.
          'User-Agent': tuf_user_agent})

      logger.debug('Made new session for ' + session_index)

    else:
      logger.debug('Reusing session for ' + session_index)

    # Get the requests.Response object for this URL.
    #
    # Always stream to control how requests are downloaded:
    # http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow
    #
    # Always set the timeout.  This timeout value is interpreted by requests as:
    #  - connect timeout (max delay before first byte is received)
    #  - read (gap) timeout (max delay between bytes received)
    # These are NOT overall/total, wall-clock timeouts for any single read.
    # http://docs.python-requests.org/en/master/user/advanced/#timeouts
    response = session.get(
        url, stream=True, timeout=tuf.settings.SOCKET_TIMEOUT)

    # Check response status.
    response.raise_for_status()

    # We ask the server about how big it thinks this file should be.
    reported_length = _get_content_length(response)

    # Then, we check whether the required length matches the reported length.
    _check_content_length(reported_length, required_length,
        STRICT_REQUIRED_LENGTH)

    # Download the contents of the URL, up to the required length, to a
    # temporary file, and get the total number of downloaded bytes.
    # NOTE: _download_fixed_amount_of_data() closes 'response' itself, whether
    # it succeeds or fails.
    total_downloaded, average_download_speed = \
        _download_fixed_amount_of_data(response, temp_file, required_length)

    # Does the total number of downloaded bytes match the required length?
    _check_downloaded_length(total_downloaded, required_length,
        STRICT_REQUIRED_LENGTH=STRICT_REQUIRED_LENGTH,
        average_download_speed=average_download_speed)

  except Exception:
    # Close 'temp_file'.  Any written data is lost.
    temp_file.close_temp_file()

    # With stream=True the connection is held open until the response is
    # closed.  A failure before the download helper ran (e.g., an error status
    # code) would otherwise leak that connection.  If the helper already
    # closed the response, closing it again is expected to be harmless.
    if response is not None:
      response.close()

    logger.exception('Could not download URL: ' + repr(url))
    raise

  return temp_file
|
|
|
|
|
|
|
|
|
|
|
|
def _download_fixed_amount_of_data(response, temp_file, required_length):
  """
  <Purpose>
    This is a helper function, where the download really happens.  A loop
    reads data from 'response' one chunk (of at most
    tuf.settings.CHUNK_SIZE bytes) at a time and writes it to 'temp_file',
    until 'required_length' bytes were received, the server has no more data,
    or the average download speed drops below
    tuf.settings.MIN_AVERAGE_DOWNLOAD_SPEED.

  <Arguments>
    response:
      The object for communicating with the server about the contents of a
      URL.  Its 'raw' attribute is read from, and it is always closed before
      this function returns or raises.

    temp_file:
      A temporary file where the contents at the URL specified by the
      'response' object will be stored.

    required_length:
      The number of bytes that we must download for the file.  This is almost
      always specified by the TUF metadata for the data file in question
      (except in the case of timestamp metadata, in which case we would fix a
      reasonable upper bound).

  <Side Effects>
    Data from the server will be written to 'temp_file'.
    'response' is closed.

  <Exceptions>
    tuf.exceptions.SlowRetrievalError
      will be raised if urllib3.exceptions.ReadTimeoutError is caught (if the
      download times out).

    Otherwise, runtime or network exceptions will be raised without question.

  <Returns>
    A (total_downloaded, average_download_speed) tuple, where
    'total_downloaded' is the total number of bytes downloaded for the desired
    file and 'average_download_speed' is the speed (in bytes per second) last
    measured during the download attempt.  The speed is only measured after a
    round that did not complete the download, so it is still 0 if all
    'required_length' bytes arrived before the first measurement.
  """

  # Keep track of total bytes downloaded.
  number_of_bytes_received = 0
  average_download_speed = 0

  start_time = timeit.default_timer()

  try:
    while True:
      # We download a fixed chunk of data in every round.  This is so that we
      # can defend against slow retrieval attacks.  Furthermore, we do not wish
      # to download an extremely large file in one shot.
      # Before beginning the round, sleep (if set) for a short amount of time
      # so that the CPU is not hogged in the while loop.
      if tuf.settings.SLEEP_BEFORE_ROUND:
        time.sleep(tuf.settings.SLEEP_BEFORE_ROUND)

      # Never ask for more than what is still missing.
      read_amount = min(
          tuf.settings.CHUNK_SIZE, required_length - number_of_bytes_received)

      # NOTE: This may not handle some servers adding a Content-Encoding
      # header, which may cause urllib3 to misbehave:
      # https://github.com/pypa/pip/blob/404838abcca467648180b358598c597b74d568c9/src/pip/_internal/download.py#L547-L582
      data = response.raw.read(read_amount)

      number_of_bytes_received = number_of_bytes_received + len(data)

      # Data successfully read from the response.  Store it.
      temp_file.write(data)

      if number_of_bytes_received == required_length:
        break

      seconds_spent_receiving = timeit.default_timer() - start_time

      # Measure the average download speed.  A timer with coarse resolution
      # may report that no time has passed at all; avoid dividing by zero in
      # that case.  Any received data then counts as arbitrarily fast, while
      # having received nothing still counts as a speed of zero.
      if seconds_spent_receiving > 0:
        average_download_speed = \
            number_of_bytes_received / seconds_spent_receiving

      elif number_of_bytes_received:
        average_download_speed = float('inf')

      else:
        average_download_speed = 0

      if average_download_speed < tuf.settings.MIN_AVERAGE_DOWNLOAD_SPEED:
        logger.debug('The average download speed dropped below the minimum'
            ' average download speed set in tuf.settings.py.')
        break

      else:
        logger.debug('The average download speed has not dipped below the'
            ' mimimum average download speed set in tuf.settings.py.')

      # We might have no more data to read.  Check number of bytes downloaded.
      if not data:
        logger.debug('Downloaded ' + repr(number_of_bytes_received) + '/' +
            repr(required_length) + ' bytes.')

        # Finally, we signal that the download is complete.
        break

  except urllib3.exceptions.ReadTimeoutError as e:
    # A read timeout is reported to callers as a slow retrieval.
    raise tuf.exceptions.SlowRetrievalError(str(e))

  finally:
    # Whatever happens, make sure that we always close the connection.
    response.close()

  return number_of_bytes_received, average_download_speed
|
|
|
|
|
|
|
|
|
|
|
|
def _get_content_length(response):
  """
  <Purpose>
    A helper function that gets the purported file length from server, i.e.,
    the value of the 'Content-Length' header of 'response'.

  <Arguments>
    response:
      The object for communicating with the server about the contents of a URL.
      Only its 'headers' mapping is consulted.

  <Side Effects>
    No known side effects.

  <Exceptions>
    Runtime exceptions will be suppressed but logged.

  <Returns>
    reported_length:
      The total number of bytes reported by server.  If the server did not
      report a length, or the reported value is not a nonnegative decimal
      integer, we return None; otherwise we return a nonnegative integer.
  """

  try:
    # What is the length of this document according to the HTTP spec?
    header_value = response.headers.get('Content-Length')

    # A server is free to omit the header.  That is not an error, so do not
    # log it as one; simply report that the length is unknown.
    if header_value is None:
      logger.debug('The server did not report a content length'
          ' for ' + str(response) + '.')
      return None

    # Try casting it as a decimal number.
    reported_length = int(header_value, 10)

    # Make sure that it is a nonnegative integer (zero is acceptable).
    if reported_length < 0:
      raise tuf.exceptions.Error('A negative length was reported.')

  except Exception as e:
    logger.exception('Could not get content length'
        ' about ' + str(response) + ' from server: ' + str(e))
    return None

  return reported_length
|
|
|
|
|
|
|
|
|
|
|
|
def _check_content_length(reported_length, required_length, strict_length=True):
  """
  <Purpose>
    A helper function that compares the length reported by the server with
    the length we expected, and logs the outcome.  It is purely informational:
    nothing is enforced here.

  <Arguments>
    reported_length:
      The total number of bytes reported by the server, or None if the server
      did not report a usable length (see _get_content_length()).

    required_length:
      The total number of bytes obtained from (possibly default) metadata.

    strict_length:
      Boolean that indicates whether the required length of the file is an
      exact match, or an upper limit (e.g., downloading a Timestamp file).
      It only affects the wording of the log message.

  <Side Effects>
    No known side effects.

  <Exceptions>
    No known exceptions.

  <Returns>
    None.
  """

  logger.debug('The server reported a length of '+repr(reported_length)+' bytes.')

  # The reported length is None when the server sent no (valid)
  # Content-Length header.  Ordering comparisons between None and an integer
  # raise TypeError on Python 3, which would abort an otherwise fine download,
  # so there is simply nothing to compare in that case.
  if reported_length is None:
    logger.debug('The reported length is unknown; not comparing it with the'
        ' required length of ' + repr(required_length) + ' bytes.')
    return

  if reported_length < required_length:
    comparison_result = 'less than'

  elif reported_length > required_length:
    comparison_result = 'greater than'

  else:
    comparison_result = 'equal to'

  if strict_length:
    logger.debug('The reported length is ' + comparison_result + ' the'
        ' required length of '+repr(required_length)+' bytes.')

  else:
    logger.debug('The reported length is ' + comparison_result + ' the upper'
        ' limit of ' + repr(required_length) + ' bytes.')
|
|
|
|
|
|
|
|
|
|
|
|
def _check_downloaded_length(total_downloaded, required_length,
                             STRICT_REQUIRED_LENGTH=True,
                             average_download_speed=None):
  """
  <Purpose>
    A helper function which checks whether the total number of downloaded bytes
    matches our expectation.

  <Arguments>
    total_downloaded:
      The total number of bytes supposedly downloaded for the file in question.

    required_length:
      The total number of bytes expected of the file as seen from its metadata.
      The Timestamp role is always downloaded without a known file length, and
      the Root role when the client cannot download any of the required
      top-level roles.  In both cases, 'required_length' is actually an upper
      limit on the length of the downloaded file.

    STRICT_REQUIRED_LENGTH:
      A Boolean indicator used to signal whether we should perform strict
      checking of required_length. True by default. We explicitly set this to
      False when we know that we want to turn this off for downloading the
      timestamp metadata, which has no signed required_length.

    average_download_speed:
      The average download speed for the downloaded file, or None (the
      default) if it is unknown, in which case the slow retrieval check is
      skipped.

  <Side Effects>
    None.

  <Exceptions>
    tuf.exceptions.DownloadLengthMismatchError, if
    STRICT_REQUIRED_LENGTH is True and total_downloaded is not equal
    required_length.

    tuf.exceptions.SlowRetrievalError, if total_downloaded is not equal
    required_length and the download was done at less than the acceptable
    download speed (as set in tuf.settings.py).

  <Returns>
    None.
  """

  if total_downloaded == required_length:
    logger.info('Downloaded ' + str(total_downloaded) + ' bytes out of the'
        ' expected ' + str(required_length) + ' bytes.')
    return

  # What we downloaded is not equal to the required length, but did we ask
  # for strict checking of required length?
  if STRICT_REQUIRED_LENGTH:
    difference_in_bytes = abs(total_downloaded - required_length)

    logger.error('Downloaded ' + str(total_downloaded) + ' bytes, but'
        ' expected ' + str(required_length) + ' bytes. There is a difference'
        ' of ' + str(difference_in_bytes) + ' bytes.')

    logger.debug('Average download speed: ' + repr(average_download_speed))
    logger.debug('Minimum average download speed: ' + repr(tuf.settings.MIN_AVERAGE_DOWNLOAD_SPEED))

  # If the average download speed is below a certain threshold, we flag this
  # as a possible slow-retrieval attack, in strict and non-strict mode alike.
  # An unknown speed (None) cannot be ordered against a number on Python 3
  # (TypeError), so it is not judged at all.
  if average_download_speed is None:
    logger.debug('The average download speed is unknown; skipping the slow'
        ' retrieval check.')

  elif average_download_speed < tuf.settings.MIN_AVERAGE_DOWNLOAD_SPEED:
    raise tuf.exceptions.SlowRetrievalError(average_download_speed)

  else:
    logger.debug('Good average download speed: ' +
        repr(average_download_speed) + ' bytes per second')

  if STRICT_REQUIRED_LENGTH:
    raise tuf.exceptions.DownloadLengthMismatchError(required_length, total_downloaded)

  # We specifically disabled strict checking of required length, so a shorter
  # download is only logged.  This is useful when we wish to download the
  # Timestamp or Root metadata, for which we have no signed metadata; so,
  # we must guess a reasonable required_length for it.
  logger.info('Downloaded ' + str(total_downloaded) + ' bytes out of an'
      ' upper limit of ' + str(required_length) + ' bytes.')
|