Update issues #151 and #137.

Update docstrings and comments and complete the initial implementation of issue 151. Adjust logger level for tuf.download._check_downloaded_length(). Initial implementation of issue 137.
2026-05-24 10:08:28 +00:00 · 2014-01-21 14:42:28 -05:00 · 2014-01-21 14:42:28 -05:00 · 5d1906a239
commit 5d1906a239
parent 52fdb2ea5f
3 changed files with 196 additions and 26 deletions
--- a/tuf/download.py
+++ b/tuf/download.py
@ -645,7 +645,7 @@ def _check_downloaded_length(total_downloaded, required_length,
  """

  if total_downloaded == required_length:
-    logger.warn('Downloaded '+str(total_downloaded)+' bytes out of the '+\
+    logger.info('Downloaded '+str(total_downloaded)+' bytes out of the '+\
                'expected '+str(required_length)+ ' bytes.')
  else:
    difference_in_bytes = abs(total_downloaded-required_length)
@ -668,7 +668,7 @@ def _check_downloaded_length(total_downloaded, required_length,
      # will log a warning anyway. This is useful when we wish to download the
      # Timestamp or Root metadata, for which we have no signed metadata; so,
      # we must guess a reasonable required_length for it.
-      logger.warn(message)
+      logger.info(message)



--- a/tuf/formats.py
+++ b/tuf/formats.py
@ -155,6 +155,9 @@
 # The minimum number of bits for an RSA key.  Must be 2048 bits and greater.
 RSAKEYBITS_SCHEMA = SCHEMA.Integer(lo=2048)

+# The number of bins used to delegate to hashed roles.
+NUMBINS_SCHEMA = SCHEMA.Integer(lo=16)
+
 # A PyCrypto signature.
 PYCRYPTOSIGNATURE_SCHEMA = SCHEMA.AnyString()

--- a/tuf/libtuf.py
+++ b/tuf/libtuf.py
@ -55,6 +55,10 @@
 # are the recommended minimum and are good from the present through 2030.
 DEFAULT_RSA_KEY_BITS = 3072

+# The algorithm used by the repository to generate the hashes of the
+# target filepaths when it organizes targets into hashed bins.
+HASH_FUNCTION = 'sha256'
+
 # The extension of TUF metadata.
 METADATA_EXTENSION = '.txt'

@ -469,7 +473,7 @@ def get_filepaths_in_directory(self, files_directory, recursive_walk=False,
      None.

    <Returns>
-      A list of absolute paths to target files in the given files_directory.
+      A list of absolute paths to target files in the given 'files_directory'.
    """

    # Do the arguments have the correct format?
@ -1977,6 +1981,7 @@ def delegate(self, rolename, public_keys, list_of_targets,
    self._delegated_roles[rolename] = new_targets_object


+
  def revoke(self, rolename):
    """
    <Purpose>
@ -2036,6 +2041,135 @@ def revoke(self, rolename):



+  def delegate_hashed_bins(self, list_of_targets, keys_of_hashed_bins,
+                           number_of_bins=1024):
+    """
+    <Purpose>
+      Split the large number of target files of 'list_of_targets' into
+      multiple delegated roles (hashed bins).  The size of all the delegated
+      roles will be nearly equal.  The updater client will use "lazy bin walk"
+      to find a target file's hashed bin destination.  The parent role lists
+      the hashed bins as either a direct delegation, or as a path hash prefix
+      of another hashed bin. See the following link for more information:
+      http://www.python.org/dev/peps/pep-0458/#metadata-scalability
+
+      >>>
+      >>>
+      >>>
+
+    <Arguments>
+      list_of_targets:
+        The target filepaths of the targets that should be stored in the hashed
+        bins (i.e., delegated roles).
+
+      keys_of_hashed_bins:
+        The public keys of the delegated roles.
+
+      number_of_bins:
+        The number of delegated roles listed in the parent role's
+        'delegations' field.  Must be a power of 2 and at least 16.  Each
+        delegated role is responsible for one or more path hash prefixes.
+
+    <Exceptions>
+      tuf.FormatError, if the arguments are improperly formatted,
+        'number_of_bins' is not a power of 2, or one of the targets
+        in 'list_of_targets' is not located under the repository's targets
+        directory.
+
+    <Side Effects>
+      Delegates multiple target roles from the current parent role, one
+      self.delegate() call per hashed bin role.
+
+    <Returns>
+      None.
+    """
+
+    # Do the arguments have the correct format?
+    # Ensure the arguments have the appropriate number of objects and object
+    # types, and that all dict keys are properly named.
+    # Raise 'tuf.FormatError' if there is a mismatch.
+    tuf.formats.PATHS_SCHEMA.check_match(list_of_targets)
+    tuf.formats.ANYKEYLIST_SCHEMA.check_match(keys_of_hashed_bins)
+    tuf.formats.NUMBINS_SCHEMA.check_match(number_of_bins)
+
+    # Strip the '0x' from the Python hex representation.
+    prefix_length = len(hex(number_of_bins - 1)[2:])
+    max_number_of_bins = 16 ** prefix_length
+
+    # For simplicity, ensure that we can evenly distribute 'max_number_of_bins'
+    # over 'number_of_bins' (i.e., 'number_of_bins' must be a power of 2).
+    if max_number_of_bins % number_of_bins != 0:
+      message = 'The number of bins argument must be a power of 2.'
+      raise tuf.FormatError(message)
+
+    logger.info('There are '+str(len(list_of_targets))+' total targets.')
+
+    # Store the target paths that fall into each bin.
+    target_paths_in_bin = {}
+    for bin_index in xrange(max_number_of_bins):
+      target_paths_in_bin[bin_index] = []
+
+    # Assign every path to its bin.  Ensure every target is located under the
+    # repository's targets directory.
+    for target_path in list_of_targets:
+      if not target_path.startswith(self._targets_directory+'/'):
+        message = 'A path in the list of targets arguments is not '+\
+          'under the repository\'s targets directory: '+repr(target_path)
+        raise tuf.FormatError(message)
+
+      # Determine the hash prefix of 'target_path' by computing the digest of
+      # its path relative to the targets directory.  Example:
+      # '{repository_root}/targets/file1.txt' -> 'file1.txt'.
+      relative_path = target_path[len(self._targets_directory)+1:]
+      digest_object = tuf.hash.digest(algorithm=HASH_FUNCTION)
+      digest_object.update(relative_path)
+      relative_path_hash = digest_object.hexdigest()
+      relative_path_hash_prefix = relative_path_hash[:prefix_length]
+
+      # 'target_paths_in_bin' store bin indices in base-10, so convert the
+      # 'relative_path_hash_prefix' base-16 (hex) number to a base-10 (dec)
+      # number.
+      bin_index = int(relative_path_hash_prefix, 16)
+
+      # Add 'target_path' (absolute) to its bin; append() modifies in place.
+      target_paths_in_bin[bin_index].append(target_path)
+
+    # Calculate how many path hash prefixes each delegated bin role covers.
+    # For example, with a 'bin_offset' of 4 and a 'prefix_length' of 3, the
+    # role "000" is listed with the path hash prefixes "000", "001", "002",
+    # "003", and the next role is "004".
+    bin_offset = max_number_of_bins // number_of_bins
+
+    # The parent role will list bin roles starting from "0" up to
+    # 'max_number_of_bins' in 'bin_offset' increments.  The skipped bins
+    # are listed in the 'path_hash_prefixes' of 'outer_bin_index'.
+    for outer_bin_index in xrange(0, max_number_of_bins, bin_offset):
+      # The bin index in hex, padded from the left with zeroes up to
+      # 'prefix_length'.
+      bin_rolename = hex(outer_bin_index)[2:].zfill(prefix_length)
+
+      # The hash prefixes covered by 'bin_rolename', and the targets that
+      # hashed into any of those prefixes.
+      path_hash_prefixes = []
+      bin_rolename_targets = []
+
+      for inner_bin_index in xrange(outer_bin_index, outer_bin_index+bin_offset):
+        # 'inner_bin_rolename' in padded hex.  For example, "00b".
+        inner_bin_rolename = hex(inner_bin_index)[2:].zfill(prefix_length)
+        path_hash_prefixes.append(inner_bin_rolename)
+        bin_rolename_targets.extend(target_paths_in_bin[inner_bin_index])
+
+      # Delegate from the parent role to each 'bin_rolename'
+      # (i.e., outer_bin_index).
+      self.delegate(bin_rolename, keys_of_hashed_bins,
+                    list_of_targets=bin_rolename_targets,
+                    path_hash_prefixes=path_hash_prefixes)
+
+      message = 'Delegated from '+repr(self.rolename)+' to '+repr(bin_rolename)
+      logger.debug(message)
+
+
+
  @property
  def delegations(self):
    """
@ -2704,6 +2838,8 @@ def load_repository(repository_directory):

 def _load_top_level_metadata(repository, top_level_filenames):
  """
+  Load the metadata of the Root, Timestamp, Targets, and Release roles.
+  At a minimum, the Root role must exist and be successfully loaded.
  """

  root_filename = top_level_filenames[ROOT_FILENAME] 
@ -2716,7 +2852,7 @@ def _load_top_level_metadata(repository, top_level_filenames):
  release_metadata = None
  timestamp_metadata = None
  
-  # ROOT.txt 
+  # Load ROOT.txt.  A Root role file without a digest is always written. 
  if os.path.exists(root_filename):
    # Initialize the key and role metadata of the top-level roles.
    signable = tuf.util.load_json_file(root_filename)
@ -2725,6 +2861,7 @@ def _load_top_level_metadata(repository, top_level_filenames):
    tuf.keydb.create_keydb_from_root_metadata(root_metadata)
    tuf.roledb.create_roledb_from_root_metadata(root_metadata)

+    # Load Root's roleinfo and update 'tuf.roledb'.
    roleinfo = tuf.roledb.get_roleinfo('root')
    roleinfo['signatures'] = []
    for signature in signable['signatures']:
@ -2737,19 +2874,22 @@ def _load_top_level_metadata(repository, top_level_filenames):
    _check_if_partial_loaded('root', signable, roleinfo)
    tuf.roledb.update_roleinfo('root', roleinfo)

+    # Ensure the 'consistent_snapshots' field is extracted.
    consistent_snapshots = root_metadata['consistent_snapshots']
  
  else:
    message = 'Cannot load the required root file: '+repr(root_filename)
    raise tuf.RepositoryError(message)
  
-  # TIMESTAMP.txt
+  # Load TIMESTAMP.txt.  A Timestamp role file without a digest is always
+  # written. 
  if os.path.exists(timestamp_filename):
    signable = tuf.util.load_json_file(timestamp_filename)
    timestamp_metadata = signable['signed']  
    for signature in signable['signatures']:
      repository.timestamp.add_signature(signature)

+    # Load Timestamp's roleinfo and update 'tuf.roledb'.
    roleinfo = tuf.roledb.get_roleinfo('timestamp')
    roleinfo['expires'] = timestamp_metadata['expires']
    roleinfo['version'] = timestamp_metadata['version']
@ -2762,7 +2902,8 @@ def _load_top_level_metadata(repository, top_level_filenames):
  else:
    pass
  
-  # RELEASE.txt
+  # Load RELEASE.txt.  A consistent snapshot of Release must be calculated
+  # if 'consistent_snapshots' is True.
  if consistent_snapshots:
    release_hashes = timestamp_metadata['meta'][RELEASE_FILENAME]['hashes']
    release_digest = random.choice(release_hashes.values())
@ -2776,6 +2917,7 @@ def _load_top_level_metadata(repository, top_level_filenames):
    for signature in signable['signatures']:
      repository.release.add_signature(signature)

+    # Load Release's roleinfo and update 'tuf.roledb'.
    roleinfo = tuf.roledb.get_roleinfo('release')
    roleinfo['expires'] = release_metadata['expires']
    roleinfo['version'] = release_metadata['version']
@ -2788,7 +2930,8 @@ def _load_top_level_metadata(repository, top_level_filenames):
  else:
    pass 

-  # TARGETS.txt
+  # Load TARGETS.txt.  A consistent snapshot of Targets must be calculated if
+  # 'consistent_snapshots' is True.
  if consistent_snapshots:
    targets_hashes = release_metadata['meta'][TARGETS_FILENAME]['hashes']
    targets_digest = random.choice(targets_hashes.values())
@ -3360,25 +3503,22 @@ def get_metadata_file_info(filename):



-def get_target_hash(self, target_filepath, hash_function='sha256'):
+def get_target_hash(self, target_filepath):
  """
  <Purpose>
    Compute the hash of 'target_filepath'. This is useful in conjunction with
    the "path_hash_prefixes" attribute in a delegated targets role, which
    tells us which paths it is implicitly responsible for.
+    
+    The repository may optionally organize targets into hashed bins to ease
+    target delegations and role metadata management.  The use of consistent
+    hashing allows for a uniform distribution of targets into bins. 

  <Arguments>
    target_filepath:
      The path to the target file on the repository. This will be relative to
      the 'targets' (or equivalent) directory on a given mirror.

-    hash_function:
-      The algorithm used by the repository to generate the hashes of the
-      target filepaths.  The repository may optionally organize targets into
-      hashed bins to ease target delegations and role metadata management.
-      The use of consistent hashing allows for a uniform distribution of
-      targets into bins. 
-
  <Exceptions>
    None.
 
@ -3391,16 +3531,16 @@ def get_target_hash(self, target_filepath, hash_function='sha256'):

  # Calculate the hash of the filepath to determine which bin to find the 
  # target.  The client currently assumes the repository uses
-  # 'hash_function' to generate hashes.
+  # 'HASH_FUNCTION' to generate hashes.

-  digest_object = tuf.hash.digest(hash_function)
+  digest_object = tuf.hash.digest(HASH_FUNCTION)

  try:
    digest_object.update(target_filepath)
  except UnicodeEncodeError:
    # Sometimes, there are Unicode characters in target paths. We assume a
    # UTF-8 encoding and try to hash that.
-    digest_object = tuf.hash.digest(hash_function)
+    digest_object = tuf.hash.digest(HASH_FUNCTION)
    encoded_target_filepath = target_filepath.encode('utf-8')
    digest_object.update(encoded_target_filepath)

@ -4012,10 +4152,15 @@ def write_metadata_file(metadata, filename, compressions, consistent_snapshots):
      shutil.copy(written_filename, consistent_filename)
   
   
-  # Generate the compressed versions of 'metadata', if necessary.
+  # Generate the compressed versions of 'metadata', if necessary.  A compressed
+  # file may be written (without needing to write the uncompressed version) if
+  # the repository maintainer adds compression after writing the
+  # uncompressed version.
  for compression in compressions:
    file_object = None 
-    
+   
+    # Ignore the empty string that signifies non-compression.  The uncompressed
+    # file was previously written above, if necessary.
    if not len(compression):
      continue

@ -4031,7 +4176,10 @@ def write_metadata_file(metadata, filename, compressions, consistent_snapshots):
    
    else:
      raise tuf.FormatError('Unknown compression algorithm: '+repr(compression))
-    
+   
+    # Save the compressed version, ensuring an unchanged file is not re-saved.
+    # Re-saving the same compressed version may cause its digest to unexpectedly
+    # change (gzip includes a timestamp) even though content has not changed.
    _write_compressed_metadata(file_object, compressed_filename,
                               consistent_snapshots)
  return written_filename
@ -4043,35 +4191,54 @@ def write_metadata_file(metadata, filename, compressions, consistent_snapshots):
 def _write_compressed_metadata(file_object, compressed_filename,
                               consistent_snapshots):
  """
+  Write compressed versions of metadata, ensuring compressed files that have
+  not changed are not re-written, the digest of the compressed file is properly
+  added to the compressed filename, and consistent snapshots are also saved.
+  Ensure compressed files are written to a temporary location, and then
+  moved to their destinations.
  """
-  
+ 
+  # If a consistent snapshot is unneeded, 'file_object' may be simply moved to
+  # 'compressed_filename' if not already written.
  if not consistent_snapshots:
    if not os.path.exists(compressed_filename):
      file_object.move(compressed_filename)
    
+    # The temporary file must be closed if 'file_object.move()' is not used.
+    # tuf.util.TempFile() automatically closes the temp file when move() is
+    # called
    else:
      file_object.close_temp_file()
-  
+ 
+  # Consistent snapshots = True.  Ensure the file's digest is included in the
+  # compressed filename written, provided it does not already exist.
  else:
    compressed_content = file_object.read()
    new_digests = []
    consistent_filenames = []
-    
+   
+    # Multiple snapshots may be written if the repository uses multiple
+    # hash algorithms.  Generate the digest of the compressed content.
    for hash_algorithm in tuf.conf.REPOSITORY_HASH_ALGORITHMS:
      digest_object = tuf.hash.digest(hash_algorithm)
      digest_object.update(compressed_content)
      new_digests.append(digest_object.hexdigest())
-    
+   
+    # Attach each digest to the compressed consistent snapshot filename.
    for new_digest in new_digests:
      dirname, basename = os.path.split(compressed_filename)
      digest_and_filename = new_digest + '.' + basename
      consistent_filenames.append(os.path.join(dirname, digest_and_filename))
-    
+   
+    # Move the 'tuf.util.TempFile' object to one of the filenames so that it is
+    # saved and the temporary file closed.  Any remaining consistent snapshots
+    # may still need to be copied or linked. 
    compressed_filename = consistent_filenames.pop()
    if not os.path.exists(compressed_filename):
      logger.info('Saving ' + repr(compressed_filename))
      file_object.move(compressed_filename)

+    # Save any remaining compressed consistent snapshots.
    for consistent_filename in consistent_filenames:
      if not os.path.exists(consistent_filename):
        logger.info('Saving ' + repr(consistent_filename))