# -*- coding: iso-8859-1 -*-
# vim: set ft=python ts=3 sw=3 expandtab:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
#              C E D A R
#          S O L U T I O N S       "Software done right."
#           S O F T W A R E
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# Copyright (c) 2004-2008,2010,2015 Kenneth J. Pronovici.
# All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# Version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Copies of the GNU General Public License are available from
# the Free Software Foundation website, http://www.gnu.org/.
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# Author   : Kenneth J. Pronovici <pronovic@ieee.org>
# Language : Python 3 (>= 3.4)
# Project  : Cedar Backup, release 3
# Purpose  : Provides filesystem-related objects.
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
########################################################################
# Module documentation
########################################################################
"""
Provides filesystem-related objects.
:author: Kenneth J. Pronovici <pronovic@ieee.org>
"""
########################################################################
# Imported modules
########################################################################
# System modules
import os
import re
import math
import logging
import tarfile
import hashlib
# Cedar Backup modules
from CedarBackup3.knapsack import firstFit, bestFit, worstFit, alternateFit
from CedarBackup3.util import AbsolutePathList, UnorderedList, RegexList
from CedarBackup3.util import removeKeys, displayBytes, calculateFileAge, encodePath, dereferenceLink
########################################################################
# Module-wide variables
########################################################################
logger = logging.getLogger("CedarBackup3.log.filesystem")
########################################################################
# FilesystemList class definition
########################################################################
[docs]class FilesystemList(list):
   ######################
   # Class documentation
   ######################
   """
   Represents a list of filesystem items.
   This is a generic class that represents a list of filesystem items.  Callers
   can add individual files or directories to the list, or can recursively add
   the contents of a directory.  The class also allows for up-front exclusions
   in several forms (all files, all directories, all items matching a pattern,
   all items whose basename matches a pattern, or all directories containing a
   specific "ignore file").  Symbolic links are typically backed up
   non-recursively, i.e. the link to a directory is backed up, but not the
   contents of that link (we don't want to deal with recursive loops, etc.).
   The custom methods such as :any:`addFile` will only add items if they exist on
   the filesystem and do not match any exclusions that are already in place.
   However, since a FilesystemList is a subclass of Python's standard list
   class, callers can also add items to the list in the usual way, using
   methods like ``append()`` or ``insert()``.  No validations apply to items
   added to the list in this way; however, many list-manipulation methods deal
   "gracefully" with items that don't exist in the filesystem, often by
   ignoring them.
   Once a list has been created, callers can remove individual items from the
   list using standard methods like ``pop()`` or ``remove()`` or they can use
   custom methods to remove specific types of entries or entries which match a
   particular pattern.
   *Note:* Regular expression patterns that apply to paths are assumed to be
   bounded at front and back by the beginning and end of the string, i.e. they
   are treated as if they begin with ``^`` and end with ``$``.  This is true
   whether we are matching a complete path or a basename.
   """
   ##############
   # Constructor
   ##############
[docs]   def __init__(self):
      """Initializes a list with no configured exclusions."""
      list.__init__(self)
      self._excludeFiles = False
      self._excludeDirs = False
      self._excludeLinks = False
      self._excludePaths = None
      self._excludePatterns = None
      self._excludeBasenamePatterns = None
      self._ignoreFile = None
      self.excludeFiles = False
      self.excludeLinks = False
      self.excludeDirs = False
      self.excludePaths = []
      self.excludePatterns = RegexList()
      self.excludeBasenamePatterns = RegexList()
      self.ignoreFile = None 
   #############
   # Properties
   #############
   def _setExcludeFiles(self, value):
      """
      Property target used to set the exclude files flag.
      No validations, but we normalize the value to ``True`` or ``False``.
      """
      if value:
         self._excludeFiles = True
      else:
         self._excludeFiles = False
   def _getExcludeFiles(self):
      """
      Property target used to get the exclude files flag.
      """
      return self._excludeFiles
   def _setExcludeDirs(self, value):
      """
      Property target used to set the exclude directories flag.
      No validations, but we normalize the value to ``True`` or ``False``.
      """
      if value:
         self._excludeDirs = True
      else:
         self._excludeDirs = False
   def _getExcludeDirs(self):
      """
      Property target used to get the exclude directories flag.
      """
      return self._excludeDirs
   def _setExcludeLinks(self, value):
      """
      Property target used to set the exclude soft links flag.
      No validations, but we normalize the value to ``True`` or ``False``.
      """
      if value:
         self._excludeLinks = True
      else:
         self._excludeLinks = False
   def _getExcludeLinks(self):
      """
      Property target used to get the exclude soft links flag.
      """
      return self._excludeLinks
   def _setExcludePaths(self, value):
      """
      Property target used to set the exclude paths list.
      A ``None`` value is converted to an empty list.
      Elements do not have to exist on disk at the time of assignment.
      Raises:
         ValueError: If any list element is not an absolute path
      """
      self._excludePaths = AbsolutePathList()
      if value is not None:
         self._excludePaths.extend(value)
   def _getExcludePaths(self):
      """
      Property target used to get the absolute exclude paths list.
      """
      return self._excludePaths
   def _setExcludePatterns(self, value):
      """
      Property target used to set the exclude patterns list.
      A ``None`` value is converted to an empty list.
      """
      self._excludePatterns = RegexList()
      if value is not None:
         self._excludePatterns.extend(value)
   def _getExcludePatterns(self):
      """
      Property target used to get the exclude patterns list.
      """
      return self._excludePatterns
   def _setExcludeBasenamePatterns(self, value):
      """
      Property target used to set the exclude basename patterns list.
      A ``None`` value is converted to an empty list.
      """
      self._excludeBasenamePatterns = RegexList()
      if value is not None:
         self._excludeBasenamePatterns.extend(value)
   def _getExcludeBasenamePatterns(self):
      """
      Property target used to get the exclude basename patterns list.
      """
      return self._excludeBasenamePatterns
   def _setIgnoreFile(self, value):
      """
      Property target used to set the ignore file.
      The value must be a non-empty string if it is not ``None``.
      Raises:
         ValueError: If the value is an empty string
      """
      if value is not None:
         if len(value) < 1:
            raise ValueError("The ignore file must be a non-empty string.")
      self._ignoreFile = value
   def _getIgnoreFile(self):
      """
      Property target used to get the ignore file.
      """
      return self._ignoreFile
   excludeFiles = property(_getExcludeFiles, _setExcludeFiles, None, "Boolean indicating whether files should be excluded.")
   excludeDirs = property(_getExcludeDirs, _setExcludeDirs, None, "Boolean indicating whether directories should be excluded.")
   excludeLinks = property(_getExcludeLinks, _setExcludeLinks, None, "Boolean indicating whether soft links should be excluded.")
   excludePaths = property(_getExcludePaths, _setExcludePaths, None, "List of absolute paths to be excluded.")
   excludePatterns = property(_getExcludePatterns, _setExcludePatterns, None,
                              "List of regular expression patterns (matching complete path) to be excluded.")
   excludeBasenamePatterns = property(_getExcludeBasenamePatterns, _setExcludeBasenamePatterns,
                                      None, "List of regular expression patterns (matching basename) to be excluded.")
   ignoreFile = property(_getIgnoreFile, _setIgnoreFile, None, "Name of file which will cause directory contents to be ignored.")
   ##############
   # Add methods
   ##############
[docs]   def addFile(self, path):
      """
      Adds a file to the list.
      The path must exist and must be a file or a link to an existing file.  It
      will be added to the list subject to any exclusions that are in place.
      Args:
         path (String representing a path on disk): File path to be added to the list
      Returns:
          Number of items added to the list
      Raises:
         ValueError: If path is not a file or does not exist
         ValueError: If the path could not be encoded properly
      """
      path = encodePath(path)
      if not os.path.exists(path) or not os.path.isfile(path):
         logger.debug("Path [%s] is not a file or does not exist on disk.", path)
         raise ValueError("Path is not a file or does not exist on disk.")
      if self.excludeLinks and os.path.islink(path):
         logger.debug("Path [%s] is excluded based on excludeLinks.", path)
         return 0
      if self.excludeFiles:
         logger.debug("Path [%s] is excluded based on excludeFiles.", path)
         return 0
      if path in self.excludePaths:
         logger.debug("Path [%s] is excluded based on excludePaths.", path)
         return 0
      for pattern in self.excludePatterns:
         pattern = encodePath(pattern)  # use same encoding as filenames
         if re.compile(r"^%s$" % pattern).match(path): # safe to assume all are valid due to RegexList
            logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
            return 0
      for pattern in self.excludeBasenamePatterns: # safe to assume all are valid due to RegexList
         pattern = encodePath(pattern)  # use same encoding as filenames
         if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
            logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
            return 0
      self.append(path)
      logger.debug("Added file to list: [%s]", path)
      return 1 
[docs]   def addDir(self, path):
      """
      Adds a directory to the list.
      The path must exist and must be a directory or a link to an existing
      directory.  It will be added to the list subject to any exclusions that
      are in place.  The :any:`ignoreFile` does not apply to this method, only to
      :any:`addDirContents`.
      Args:
         path (String representing a path on disk): Directory path to be added to the list
      Returns:
          Number of items added to the list
      Raises:
         ValueError: If path is not a directory or does not exist
         ValueError: If the path could not be encoded properly
      """
      path = encodePath(path)
      path = normalizeDir(path)
      if not os.path.exists(path) or not os.path.isdir(path):
         logger.debug("Path [%s] is not a directory or does not exist on disk.", path)
         raise ValueError("Path is not a directory or does not exist on disk.")
      if self.excludeLinks and os.path.islink(path):
         logger.debug("Path [%s] is excluded based on excludeLinks.", path)
         return 0
      if self.excludeDirs:
         logger.debug("Path [%s] is excluded based on excludeDirs.", path)
         return 0
      if path in self.excludePaths:
         logger.debug("Path [%s] is excluded based on excludePaths.", path)
         return 0
      for pattern in self.excludePatterns: # safe to assume all are valid due to RegexList
         pattern = encodePath(pattern)  # use same encoding as filenames
         if re.compile(r"^%s$" % pattern).match(path):
            logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
            return 0
      for pattern in self.excludeBasenamePatterns: # safe to assume all are valid due to RegexList
         pattern = encodePath(pattern)  # use same encoding as filenames
         if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
            logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
            return 0
      self.append(path)
      logger.debug("Added directory to list: [%s]", path)
      return 1 
[docs]   def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
      """
      Adds the contents of a directory to the list.
      The path must exist and must be a directory or a link to a directory.
      The contents of the directory (as well as the directory path itself) will
      be recursively added to the list, subject to any exclusions that are in
      place.  If you only want the directory and its immediate contents to be
      added, then pass in ``recursive=False``.
      *Note:* If a directory's absolute path matches an exclude pattern or path,
      or if the directory contains the configured ignore file, then the
      directory and all of its contents will be recursively excluded from the
      list.
      *Note:* If the passed-in directory happens to be a soft link, it will be
      recursed.  However, the linkDepth parameter controls whether any soft
      links *within* the directory will be recursed.  The link depth is
      maximum depth of the tree at which soft links should be followed.  So, a
      depth of 0 does not follow any soft links, a depth of 1 follows only
      links within the passed-in directory, a depth of 2 follows the links at
      the next level down, etc.
      *Note:* Any invalid soft links (i.e.  soft links that point to
      non-existent items) will be silently ignored.
      *Note:* The :any:`excludeDirs` flag only controls whether any given directory
      path itself is added to the list once it has been discovered.  It does
      *not* modify any behavior related to directory recursion.
      *Note:* If you call this method *on a link to a directory* that link will
      never be dereferenced (it may, however, be followed).
      Args:
         path (String representing a path on disk): Directory path whose contents should be added to the list
         recursive (Boolean value): Indicates whether directory contents should be added recursively
         addSelf (Boolean value): Indicates whether the directory itself should be added to the list
         linkDepth (Integer value): Maximum depth of the tree at which soft links should be followed, zero means not to folow
         dereference (Boolean value): Indicates whether soft links, if followed, should be dereferenced
      Returns:
          Number of items recursively added to the list
      Raises:
         ValueError: If path is not a directory or does not exist
         ValueError: If the path could not be encoded properly
      """
      path = encodePath(path)
      path = normalizeDir(path)
      return self._addDirContentsInternal(path, addSelf, recursive, linkDepth, dereference) 
   def _addDirContentsInternal(self, path, includePath=True, recursive=True, linkDepth=0, dereference=False):
      """
      Internal implementation of ``addDirContents``.
      This internal implementation exists due to some refactoring.  Basically,
      some subclasses have a need to add the contents of a directory, but not
      the directory itself.  This is different than the standard ``FilesystemList``
      behavior and actually ends up making a special case out of the first
      call in the recursive chain.  Since I don't want to expose the modified
      interface, ``addDirContents`` ends up being wholly implemented in terms
      of this method.
      The linkDepth parameter controls whether soft links are followed when we
      are adding the contents recursively.  Any recursive calls reduce the
      value by one.  If the value zero or less, then soft links will just be
      added as directories, but will not be followed.  This means that links
      are followed to a *constant depth* starting from the top-most directory.
      There is one difference between soft links and directories: soft links
      that are added recursively are not placed into the list explicitly.  This
      is because if we do add the links recursively, the resulting tar file
      gets a little confused (it has a link and a directory with the same
      name).
      *Note:* If you call this method *on a link to a directory* that link will
      never be dereferenced (it may, however, be followed).
      Args:
         path: Directory path whose contents should be added to the list
         includePath: Indicates whether to include the path as well as contents
         recursive: Indicates whether directory contents should be added recursively
         linkDepth: Depth of soft links that should be followed
         dereference: Indicates whether soft links, if followed, should be dereferenced
      Returns:
          Number of items recursively added to the list
      Raises:
         ValueError: If path is not a directory or does not exist
      """
      added = 0
      if not os.path.exists(path) or not os.path.isdir(path):
         logger.debug("Path [%s] is not a directory or does not exist on disk.", path)
         raise ValueError("Path is not a directory or does not exist on disk.")
      if path in self.excludePaths:
         logger.debug("Path [%s] is excluded based on excludePaths.", path)
         return added
      for pattern in self.excludePatterns: # safe to assume all are valid due to RegexList
         pattern = encodePath(pattern)  # use same encoding as filenames
         if re.compile(r"^%s$" % pattern).match(path):
            logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
            return added
      for pattern in self.excludeBasenamePatterns: # safe to assume all are valid due to RegexList
         pattern = encodePath(pattern)  # use same encoding as filenames
         if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
            logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
            return added
      if self.ignoreFile is not None and os.path.exists(os.path.join(path, self.ignoreFile)):
         logger.debug("Path [%s] is excluded based on ignore file.", path)
         return added
      if includePath:
         added += self.addDir(path)    # could actually be excluded by addDir, yet
      for entry in os.listdir(path):
         entrypath = os.path.join(path, entry)
         if os.path.isfile(entrypath):
            if linkDepth > 0 and dereference:
               derefpath = dereferenceLink(entrypath)
               if derefpath != entrypath:
                  added += self.addFile(derefpath)
            added += self.addFile(entrypath)
         elif os.path.isdir(entrypath):
            if os.path.islink(entrypath):
               if recursive:
                  if linkDepth > 0:
                     newDepth = linkDepth - 1
                     if dereference:
                        derefpath = dereferenceLink(entrypath)
                        if derefpath != entrypath:
                           added += self._addDirContentsInternal(derefpath, True, recursive, newDepth, dereference)
                        added += self.addDir(entrypath)
                     else:
                        added += self._addDirContentsInternal(entrypath, False, recursive, newDepth, dereference)
                  else:
                     added += self.addDir(entrypath)
               else:
                  added += self.addDir(entrypath)
            else:
               if recursive:
                  newDepth = linkDepth - 1
                  added += self._addDirContentsInternal(entrypath, True, recursive, newDepth, dereference)
               else:
                  added += self.addDir(entrypath)
      return added
   #################
   # Remove methods
   #################
[docs]   def removeFiles(self, pattern=None):
      """
      Removes file entries from the list.
      If ``pattern`` is not passed in or is ``None``, then all file entries will
      be removed from the list.  Otherwise, only those file entries matching
      the pattern will be removed.  Any entry which does not exist on disk
      will be ignored (use :any:`removeInvalid` to purge those entries).
      This method might be fairly slow for large lists, since it must check the
      type of each item in the list.  If you know ahead of time that you want
      to exclude all files, then you will be better off setting :any:`excludeFiles`
      to ``True`` before adding items to the list.
      Args:
         pattern: Regular expression pattern representing entries to remove
      Returns:
          Number of entries removed
      Raises:
         ValueError: If the passed-in pattern is not a valid regular expression
      """
      removed = 0
      if pattern is None:
         for entry in self[:]:
            if os.path.exists(entry) and os.path.isfile(entry):
               self.remove(entry)
               logger.debug("Removed path [%s] from list.", entry)
               removed += 1
      else:
         try:
            pattern = encodePath(pattern)  # use same encoding as filenames
            compiled = re.compile(pattern)
         except re.error:
            raise ValueError("Pattern is not a valid regular expression.")
         for entry in self[:]:
            if os.path.exists(entry) and os.path.isfile(entry):
               if compiled.match(entry):
                  self.remove(entry)
                  logger.debug("Removed path [%s] from list.", entry)
                  removed += 1
      logger.debug("Removed a total of %d entries.", removed)
      return removed 
[docs]   def removeDirs(self, pattern=None):
      """
      Removes directory entries from the list.
      If ``pattern`` is not passed in or is ``None``, then all directory entries
      will be removed from the list.  Otherwise, only those directory entries
      matching the pattern will be removed.  Any entry which does not exist on
      disk will be ignored (use :any:`removeInvalid` to purge those entries).
      This method might be fairly slow for large lists, since it must check the
      type of each item in the list.  If you know ahead of time that you want
      to exclude all directories, then you will be better off setting
      :any:`excludeDirs` to ``True`` before adding items to the list (note that this
      will not prevent you from recursively adding the *contents* of
      directories).
      Args:
         pattern: Regular expression pattern representing entries to remove
      Returns:
          Number of entries removed
      Raises:
         ValueError: If the passed-in pattern is not a valid regular expression
      """
      removed = 0
      if pattern is None:
         for entry in self[:]:
            if os.path.exists(entry) and os.path.isdir(entry):
               self.remove(entry)
               logger.debug("Removed path [%s] from list.", entry)
               removed += 1
      else:
         try:
            pattern = encodePath(pattern)  # use same encoding as filenames
            compiled = re.compile(pattern)
         except re.error:
            raise ValueError("Pattern is not a valid regular expression.")
         for entry in self[:]:
            if os.path.exists(entry) and os.path.isdir(entry):
               if compiled.match(entry):
                  self.remove(entry)
                  logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
                  removed += 1
      logger.debug("Removed a total of %d entries.", removed)
      return removed 
[docs]   def removeLinks(self, pattern=None):
      """
      Removes soft link entries from the list.
      If ``pattern`` is not passed in or is ``None``, then all soft link entries
      will be removed from the list.  Otherwise, only those soft link entries
      matching the pattern will be removed.  Any entry which does not exist on
      disk will be ignored (use :any:`removeInvalid` to purge those entries).
      This method might be fairly slow for large lists, since it must check the
      type of each item in the list.  If you know ahead of time that you want
      to exclude all soft links, then you will be better off setting
      :any:`excludeLinks` to ``True`` before adding items to the list.
      Args:
         pattern: Regular expression pattern representing entries to remove
      Returns:
          Number of entries removed
      Raises:
         ValueError: If the passed-in pattern is not a valid regular expression
      """
      removed = 0
      if pattern is None:
         for entry in self[:]:
            if os.path.exists(entry) and os.path.islink(entry):
               self.remove(entry)
               logger.debug("Removed path [%s] from list.", entry)
               removed += 1
      else:
         try:
            pattern = encodePath(pattern)  # use same encoding as filenames
            compiled = re.compile(pattern)
         except re.error:
            raise ValueError("Pattern is not a valid regular expression.")
         for entry in self[:]:
            if os.path.exists(entry) and os.path.islink(entry):
               if compiled.match(entry):
                  self.remove(entry)
                  logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
                  removed += 1
      logger.debug("Removed a total of %d entries.", removed)
      return removed 
[docs]   def removeMatch(self, pattern):
      """
      Removes from the list all entries matching a pattern.
      This method removes from the list all entries which match the passed in
      ``pattern``.  Since there is no need to check the type of each entry, it
      is faster to call this method than to call the :any:`removeFiles`,
      :any:`removeDirs` or :any:`removeLinks` methods individually.  If you know which
      patterns you will want to remove ahead of time, you may be better off
      setting :any:`excludePatterns` or :any:`excludeBasenamePatterns` before adding
      items to the list.
      *Note:* Unlike when using the exclude lists, the pattern here is *not*
      bounded at the front and the back of the string.  You can use any pattern
      you want.
      Args:
         pattern: Regular expression pattern representing entries to remove
      Returns:
          Number of entries removed
      Raises:
         ValueError: If the passed-in pattern is not a valid regular expression
      """
      try:
         pattern = encodePath(pattern)  # use same encoding as filenames
         compiled = re.compile(pattern)
      except re.error:
         raise ValueError("Pattern is not a valid regular expression.")
      removed = 0
      for entry in self[:]:
         if compiled.match(entry):
            self.remove(entry)
            logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
            removed += 1
      logger.debug("Removed a total of %d entries.", removed)
      return removed 
[docs]   def removeInvalid(self):
      """
      Removes from the list all entries that do not exist on disk.
      This method removes from the list all entries which do not currently
      exist on disk in some form.  No attention is paid to whether the entries
      are files or directories.
      Returns:
          Number of entries removed
      """
      removed = 0
      for entry in self[:]:
         if not os.path.exists(entry):
            self.remove(entry)
            logger.debug("Removed path [%s] from list.", entry)
            removed += 1
      logger.debug("Removed a total of %d entries.", removed)
      return removed 
   ##################
   # Utility methods
   ##################
[docs]   def normalize(self):
      """Normalizes the list, ensuring that each entry is unique."""
      orig = len(self)
      self.sort()
      dups = list(filter(lambda x, self=self: self[x] == self[x+1], list(range(0, len(self) - 1))))
      items = list(map(lambda x, self=self: self[x], dups))
      list(map(self.remove, items))
      new = len(self)
      logger.debug("Completed normalizing list; removed %d items (%d originally, %d now).", new-orig, orig, new) 
[docs]   def verify(self):
      """
      Verifies that all entries in the list exist on disk.
      Returns:
          ``True`` if all entries exist, ``False`` otherwise
      """
      for entry in self:
         if not os.path.exists(entry):
            logger.debug("Path [%s] is invalid; list is not valid.", entry)
            return False
      logger.debug("All entries in list are valid.")
      return True  
########################################################################
# SpanItem class definition
########################################################################
[docs]class SpanItem(object): # pylint: disable=R0903
   """
   Item returned by :any:`BackupFileList.generateSpan`.
   """
[docs]   def __init__(self, fileList, size, capacity, utilization):
      """
      Create object.
      Args:
         fileList: List of files
         size: Size (in bytes) of files
         utilization: Utilization, as a percentage (0-100)
      """
      self.fileList = fileList
      self.size = size
      self.capacity = capacity
      self.utilization = utilization  
########################################################################
# BackupFileList class definition
########################################################################
[docs]class BackupFileList(FilesystemList): # pylint: disable=R0904
   ######################
   # Class documentation
   ######################
   """
   List of files to be backed up.
   A BackupFileList is a :any:`FilesystemList` containing a list of files to be
   backed up.  It only contains files, not directories (soft links are treated
   like files).  On top of the generic functionality provided by
   :any:`FilesystemList`, this class adds functionality to keep a hash (checksum)
   for each file in the list, and it also provides a method to calculate the
   total size of the files in the list and a way to export the list into tar
   form.
   """
   ##############
   # Constructor
   ##############
[docs]   def __init__(self):
      """Initializes a list with no configured exclusions."""
      FilesystemList.__init__(self) 
   ################################
   # Overridden superclass methods
   ################################
[docs]   def addDir(self, path):
      """
      Adds a directory to the list.
      Note that this class does not allow directories to be added by themselves
      (a backup list contains only files).  However, since links to directories
      are technically files, we allow them to be added.
      This method is implemented in terms of the superclass method, with one
      additional validation: the superclass method is only called if the
      passed-in path is both a directory and a link.  All of the superclass's
      existing validations and restrictions apply.
      Args:
         path (String representing a path on disk): Directory path to be added to the list
      Returns:
          Number of items added to the list
      Raises:
         ValueError: If path is not a directory or does not exist
         ValueError: If the path could not be encoded properly
      """
      path = encodePath(path)
      path = normalizeDir(path)
      if os.path.isdir(path) and not os.path.islink(path):
         return 0
      else:
         return FilesystemList.addDir(self, path) 
   ##################
   # Utility methods
   ##################
[docs]   def totalSize(self):
      """
      Returns the total size among all files in the list.
      Only files are counted.
      Soft links that point at files are ignored.
      Entries which do not exist on disk are ignored.
      Returns:
          Total size, in bytes
      """
      total = 0.0
      for entry in self:
         if os.path.isfile(entry) and not os.path.islink(entry):
            total += float(os.stat(entry).st_size)
      return total 
[docs]   def generateSizeMap(self):
      """
      Generates a mapping from file to file size in bytes.
      The mapping does include soft links, which are listed with size zero.
      Entries which do not exist on disk are ignored.
      Returns:
          Dictionary mapping file to file size
      """
      table = { }
      for entry in self:
         if os.path.islink(entry):
            table[entry] = 0.0
         elif os.path.isfile(entry):
            table[entry] = float(os.stat(entry).st_size)
      return table 
[docs]   def generateDigestMap(self, stripPrefix=None):
      """
      Generates a mapping from file to file digest.
      Currently, the digest is an SHA hash, which should be pretty secure.  In
      the future, this might be a different kind of hash, but we guarantee that
      the type of the hash will not change unless the library major version
      number is bumped.
      Entries which do not exist on disk are ignored.
      Soft links are ignored.  We would end up generating a digest for the file
      that the soft link points at, which doesn't make any sense.
      If ``stripPrefix`` is passed in, then that prefix will be stripped from
      each key when the map is generated.  This can be useful in generating two
      "relative" digest maps to be compared to one another.
      Args:
         stripPrefix (String with any contents): Common prefix to be stripped from paths
      Returns:
          Dictionary mapping file to digest value
      @see: :any:`removeUnchanged`
      """
      table = { }
      if stripPrefix is not None:
         for entry in self:
            if os.path.isfile(entry) and not os.path.islink(entry):
               table[entry.replace(stripPrefix, "", 1)] = BackupFileList._generateDigest(entry)
      else:
         for entry in self:
            if os.path.isfile(entry) and not os.path.islink(entry):
               table[entry] = BackupFileList._generateDigest(entry)
      return table 
   @staticmethod
   def _generateDigest(path):
      """
      Generates an SHA digest for a given file on disk.
      The original code for this function used this simplistic implementation,
      which requires reading the entire file into memory at once in order to
      generate a digest value::
         sha.new(open(path).read()).hexdigest()
      Not surprisingly, this isn't an optimal solution.  The U{Simple file
      hashing <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259109>}
      Python Cookbook recipe describes how to incrementally generate a hash
      value by reading in chunks of data rather than reading the file all at
      once.  The recipe relies on the the ``update()`` method of the various
      Python hashing algorithms.
      In my tests using a 110 MB file on CD, the original implementation
      requires 111 seconds.  This implementation requires only 40-45 seconds,
      which is a pretty substantial speed-up.
      Experience shows that reading in around 4kB (4096 bytes) at a time yields
      the best performance.  Smaller reads are quite a bit slower, and larger
      reads don't make much of a difference.  The 4kB number makes me a little
      suspicious, and I think it might be related to the size of a filesystem
      read at the hardware level.  However, I've decided to just hardcode 4096
      until I have evidence that shows it's worthwhile making the read size
      configurable.
      Args:
         path: Path to generate digest for
      Returns:
          ASCII-safe SHA digest for the file
      Raises:
         OSError: If the file cannot be opened
      """
      # pylint: disable=C0103,E1101
      s = hashlib.sha1()
      with open(path, mode="rb") as f:
         readBytes = 4096  # see notes above
         while readBytes > 0:
            readString = f.read(readBytes)
            s.update(readString)
            readBytes = len(readString)
      digest = s.hexdigest()
      logger.debug("Generated digest [%s] for file [%s].", digest, path)
      return digest
[docs]   def generateFitted(self, capacity, algorithm="worst_fit"):
      """
      Generates a list of items that fit in the indicated capacity.
      Sometimes, callers would like to include every item in a list, but are
      unable to because not all of the items fit in the space available.  This
      method returns a copy of the list, containing only the items that fit in
      a given capacity.  A copy is returned so that we don't lose any
      information if for some reason the fitted list is unsatisfactory.
      The fitting is done using the functions in the knapsack module.  By
      default, the first fit algorithm is used, but you can also choose
      from best fit, worst fit and alternate fit.
      Args:
         capacity (Integer, in bytes): Maximum capacity among the files in the new list
         algorithm (One of "first_fit", "best_fit", "worst_fit", "alternate_fit"): Knapsack (fit) algorithm to use
      Returns:
          Copy of list with total size no larger than indicated capacity
      Raises:
         ValueError: If the algorithm is invalid
      """
      table = self._getKnapsackTable()
      function = BackupFileList._getKnapsackFunction(algorithm)
      return function(table, capacity)[0] 
[docs]   def generateSpan(self, capacity, algorithm="worst_fit"):
      """
      Splits the list of items into sub-lists that fit in a given capacity.
      Sometimes, callers need split to a backup file list into a set of smaller
      lists.  For instance, you could use this to "span" the files across a set
      of discs.
      The fitting is done using the functions in the knapsack module.  By
      default, the first fit algorithm is used, but you can also choose
      from best fit, worst fit and alternate fit.
      *Note:* If any of your items are larger than the capacity, then it won't
      be possible to find a solution.  In this case, a value error will be
      raised.
      Args:
         capacity (Integer, in bytes): Maximum capacity among the files in the new list
         algorithm (One of "first_fit", "best_fit", "worst_fit", "alternate_fit"): Knapsack (fit) algorithm to use
      Returns:
          List of :any:`SpanItem` objects
      Raises:
         ValueError: If the algorithm is invalid
         ValueError: If it's not possible to fit some items
      """
      spanItems = []
      function = BackupFileList._getKnapsackFunction(algorithm)
      table = self._getKnapsackTable(capacity)
      iteration = 0
      while len(table) > 0:
         iteration += 1
         fit = function(table, capacity)
         if len(fit[0]) == 0:
            # Should never happen due to validations in _convertToKnapsackForm(), but let's be safe
            raise ValueError("After iteration %d, unable to add any new items." % iteration)
         removeKeys(table, fit[0])
         utilization = (float(fit[1])/float(capacity))*100.0
         item = SpanItem(fit[0], fit[1], capacity, utilization)
         spanItems.append(item)
      return spanItems 
   def _getKnapsackTable(self, capacity=None):
      """
      Converts the list into the form needed by the knapsack algorithms.
      Returns:
          Dictionary mapping file name to tuple of (file path, file size)
      """
      table = { }
      for entry in self:
         if os.path.islink(entry):
            table[entry] = (entry, 0.0)
         elif os.path.isfile(entry):
            size = float(os.stat(entry).st_size)
            if capacity is not None:
               if size > capacity:
                  raise ValueError("File [%s] cannot fit in capacity %s." % (entry, displayBytes(capacity)))
            table[entry] = (entry, size)
      return table
   @staticmethod
   def _getKnapsackFunction(algorithm):
      """
      Returns a reference to the function associated with an algorithm name.
      Algorithm name must be one of "first_fit", "best_fit", "worst_fit", "alternate_fit"
      Args:
         algorithm: Name of the algorithm
      Returns:
          Reference to knapsack function
      Raises:
         ValueError: If the algorithm name is unknown
      """
      if algorithm == "first_fit":
         return firstFit
      elif algorithm == "best_fit":
         return bestFit
      elif algorithm == "worst_fit":
         return worstFit
      elif algorithm == "alternate_fit":
         return alternateFit
      else:
         raise ValueError("Algorithm [%s] is invalid." % algorithm)
[docs]   def generateTarfile(self, path, mode='tar', ignore=False, flat=False):
      """
      Creates a tar file containing the files in the list.
      By default, this method will create uncompressed tar files.  If you pass
      in mode ``'targz'``, then it will create gzipped tar files, and if you
      pass in mode ``'tarbz2'``, then it will create bzipped tar files.
      The tar file will be created as a GNU tar archive, which enables extended
      file name lengths, etc.  Since GNU tar is so prevalent, I've decided that
      the extra functionality out-weighs the disadvantage of not being
      "standard".
      If you pass in ``flat=True``, then a "flat" archive will be created, and
      all of the files will be added to the root of the archive.  So, the file
      ``/tmp/something/whatever.txt`` would be added as just ``whatever.txt``.
      By default, the whole method call fails if there are problems adding any
      of the files to the archive, resulting in an exception.  Under these
      circumstances, callers are advised that they might want to call
      :any:`removeInvalid` and then attempt to extract the tar file a second
      time, since the most common cause of failures is a missing file (a file
      that existed when the list was built, but is gone again by the time the
      tar file is built).
      If you want to, you can pass in ``ignore=True``, and the method will
      ignore errors encountered when adding individual files to the archive
      (but not errors opening and closing the archive itself).
      We'll always attempt to remove the tarfile from disk if an exception will
      be thrown.
      *Note:* No validation is done as to whether the entries in the list are
      files, since only files or soft links should be in an object like this.
      However, to be safe, everything is explicitly added to the tar archive
      non-recursively so it's safe to include soft links to directories.
      *Note:* The Python ``tarfile`` module, which is used internally here, is
      supposed to deal properly with long filenames and links.  In my testing,
      I have found that it appears to be able to add long really long filenames
      to archives, but doesn't do a good job reading them back out, even out of
      an archive it created.  Fortunately, all Cedar Backup does is add files
      to archives.
      Args:
         path (String representing a path on disk): Path of tar file to create on disk
         mode (One of either ``'tar'``, ``'targz'`` or ``'tarbz2'``): Tar creation mode
         ignore (Boolean): Indicates whether to ignore certain errors
         flat (Boolean): Creates "flat" archive by putting all items in root
      Raises:
         ValueError: If mode is not valid
         ValueError: If list is empty
         ValueError: If the path could not be encoded properly
         TarError: If there is a problem creating the tar file
      """
      # pylint: disable=E1101
      path = encodePath(path)
      if len(self) == 0: raise ValueError("Empty list cannot be used to generate tarfile.")
      if mode == 'tar': tarmode = "w:"
      elif mode == 'targz': tarmode = "w:gz"
      elif mode == 'tarbz2': tarmode = "w:bz2"
      else: raise ValueError("Mode [%s] is not valid." % mode)
      try:
         tar = tarfile.open(path, tarmode)
         try:
            tar.format = tarfile.GNU_FORMAT
         except AttributeError:
            tar.posix = False
         for entry in self:
            try:
               if flat:
                  tar.add(entry, arcname=os.path.basename(entry), recursive=False)
               else:
                  tar.add(entry, recursive=False)
            except tarfile.TarError as e:
               if not ignore:
                  raise e
               logger.info("Unable to add file [%s]; going on anyway.", entry)
            except OSError as e:
               if not ignore:
                  raise tarfile.TarError(e)
               logger.info("Unable to add file [%s]; going on anyway.", entry)
         tar.close()
      except tarfile.ReadError as e:
         try: tar.close()
         except: pass
         if os.path.exists(path):
            try: os.remove(path)
            except: pass
         raise tarfile.ReadError("Unable to open [%s]; maybe directory doesn't exist?" % path)
      except tarfile.TarError as e:
         try: tar.close()
         except: pass
         if os.path.exists(path):
            try: os.remove(path)
            except: pass
         raise e 
[docs]   def removeUnchanged(self, digestMap, captureDigest=False):
      """
      Removes unchanged entries from the list.
      This method relies on a digest map as returned from :any:`generateDigestMap`.
      For each entry in ``digestMap``, if the entry also exists in the current
      list *and* the entry in the current list has the same digest value as in
      the map, the entry in the current list will be removed.
      This method offers a convenient way for callers to filter unneeded
      entries from a list.  The idea is that a caller will capture a digest map
      from ``generateDigestMap`` at some point in time (perhaps the beginning of
      the week), and will save off that map using ``pickle`` or some other
      method.  Then, the caller could use this method sometime in the future to
      filter out any unchanged files based on the saved-off map.
      If ``captureDigest`` is passed-in as ``True``, then digest information will
      be captured for the entire list before the removal step occurs using the
      same rules as in :any:`generateDigestMap`.  The check will involve a lookup
      into the complete digest map.
      If ``captureDigest`` is passed in as ``False``, we will only generate a
      digest value for files we actually need to check, and we'll ignore any
      entry in the list which isn't a file that currently exists on disk.
      The return value varies depending on ``captureDigest``, as well.  To
      preserve backwards compatibility, if ``captureDigest`` is ``False``, then
      we'll just return a single value representing the number of entries
      removed.  Otherwise, we'll return a tuple of C{(entries removed, digest
      map)}.  The returned digest map will be in exactly the form returned by
      :any:`generateDigestMap`.
      *Note:* For performance reasons, this method actually ends up rebuilding
      the list from scratch.  First, we build a temporary dictionary containing
      all of the items from the original list.  Then, we remove items as needed
      from the dictionary (which is faster than the equivalent operation on a
      list).  Finally, we replace the contents of the current list based on the
      keys left in the dictionary.  This should be transparent to the caller.
      Args:
         digestMap (Map as returned from :any:`generateDigestMap`): Dictionary mapping file name to digest value
         captureDigest (Boolean): Indicates that digest information should be captured
      Returns:
          Results as discussed above (format varies based on arguments)
      """
      if captureDigest:
         removed = 0
         table = {}
         captured = {}
         for entry in self:
            if os.path.isfile(entry) and not os.path.islink(entry):
               table[entry] = BackupFileList._generateDigest(entry)
               captured[entry] = table[entry]
            else:
               table[entry] = None
         for entry in list(digestMap.keys()):
            if entry in table:
               if table[entry] is not None:  # equivalent to file/link check in other case
                  digest = table[entry]
                  if digest == digestMap[entry]:
                     removed += 1
                     del table[entry]
                     logger.debug("Discarded unchanged file [%s].", entry)
         self[:] = list(table.keys())
         return (removed, captured)
      else:
         removed = 0
         table = {}
         for entry in self:
            table[entry] = None
         for entry in list(digestMap.keys()):
            if entry in table:
               if os.path.isfile(entry) and not os.path.islink(entry):
                  digest = BackupFileList._generateDigest(entry)
                  if digest == digestMap[entry]:
                     removed += 1
                     del table[entry]
                     logger.debug("Discarded unchanged file [%s].", entry)
         self[:] = list(table.keys())
         return removed  
########################################################################
# PurgeItemList class definition
########################################################################
[docs]class PurgeItemList(FilesystemList): # pylint: disable=R0904
   ######################
   # Class documentation
   ######################
   """
   List of files and directories to be purged.
   A PurgeItemList is a :any:`FilesystemList` containing a list of files and
   directories to be purged.  On top of the generic functionality provided by
   :any:`FilesystemList`, this class adds functionality to remove items that are
   too young to be purged, and to actually remove each item in the list from
   the filesystem.
   The other main difference is that when you add a directory's contents to a
   purge item list, the directory itself is not added to the list.  This way,
   if someone asks to purge within in ``/opt/backup/collect``, that directory
   doesn't get removed once all of the files within it is gone.
   """
   ##############
   # Constructor
   ##############
[docs]   def __init__(self):
      """Initializes a list with no configured exclusions."""
      FilesystemList.__init__(self) 
   ##############
   # Add methods
   ##############
[docs]   def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
      """
      Adds the contents of a directory to the list.
      The path must exist and must be a directory or a link to a directory.
      The contents of the directory (but *not* the directory path itself) will
      be recursively added to the list, subject to any exclusions that are in
      place.  If you only want the directory and its contents to be added, then
      pass in ``recursive=False``.
      *Note:* If a directory's absolute path matches an exclude pattern or path,
      or if the directory contains the configured ignore file, then the
      directory and all of its contents will be recursively excluded from the
      list.
      *Note:* If the passed-in directory happens to be a soft link, it will be
      recursed.  However, the linkDepth parameter controls whether any soft
      links *within* the directory will be recursed.  The link depth is
      maximum depth of the tree at which soft links should be followed.  So, a
      depth of 0 does not follow any soft links, a depth of 1 follows only
      links within the passed-in directory, a depth of 2 follows the links at
      the next level down, etc.
      *Note:* Any invalid soft links (i.e.  soft links that point to
      non-existent items) will be silently ignored.
      *Note:* The :any:`excludeDirs` flag only controls whether any given soft link
      path itself is added to the list once it has been discovered.  It does
      *not* modify any behavior related to directory recursion.
      *Note:* The :any:`excludeDirs` flag only controls whether any given directory
      path itself is added to the list once it has been discovered.  It does
      *not* modify any behavior related to directory recursion.
      *Note:* If you call this method *on a link to a directory* that link will
      never be dereferenced (it may, however, be followed).
      Args:
         path (String representing a path on disk): Directory path whose contents should be added to the list
         recursive (Boolean value): Indicates whether directory contents should be added recursively
         addSelf: Ignored in this subclass
         linkDepth (Integer value, where zero means not to follow any soft links): Depth of soft links that should be followed
         dereference (Boolean value): Indicates whether soft links, if followed, should be dereferenced
      Returns:
          Number of items recursively added to the list
      Raises:
         ValueError: If path is not a directory or does not exist
         ValueError: If the path could not be encoded properly
      """
      path = encodePath(path)
      path = normalizeDir(path)
      return super(PurgeItemList, self)._addDirContentsInternal(path, False, recursive, linkDepth, dereference) 
   ##################
   # Utility methods
   ##################
[docs]   def removeYoungFiles(self, daysOld):
      """
      Removes from the list files younger than a certain age (in days).
      Any file whose "age" in days is less than (``<``) the value of the
      ``daysOld`` parameter will be removed from the list so that it will not be
      purged later when :any:`purgeItems` is called.  Directories and soft links
      will be ignored.
      The "age" of a file is the amount of time since the file was last used,
      per the most recent of the file's ``st_atime`` and ``st_mtime`` values.
      *Note:* Some people find the "sense" of this method confusing or
      "backwards".  Keep in mind that this method is used to remove items
      *from the list*, not from the filesystem!  It removes from the list
      those items that you would *not* want to purge because they are too
      young.  As an example, passing in ``daysOld`` of zero (0) would remove
      from the list no files, which would result in purging all of the files
      later.  I would be happy to make a synonym of this method with an
      easier-to-understand "sense", if someone can suggest one.
      Args:
         daysOld (Integer value >= 0): Minimum age of files that are to be kept in the list
      Returns:
          Number of entries removed
      """
      removed = 0
      daysOld = int(daysOld)
      if daysOld < 0:
         raise ValueError("Days old value must be an integer >= 0.")
      for entry in self[:]:
         if os.path.isfile(entry) and not os.path.islink(entry):
            try:
               ageInDays = calculateFileAge(entry)
               ageInWholeDays = math.floor(ageInDays)
               if ageInWholeDays < 0: ageInWholeDays = 0
               if ageInWholeDays < daysOld:
                  removed += 1
                  self.remove(entry)
            except OSError:
               pass
      return removed 
[docs]   def purgeItems(self):
      """
      Purges all items in the list.
      Every item in the list will be purged.  Directories in the list will
      *not* be purged recursively, and hence will only be removed if they are
      empty.  Errors will be ignored.
      To faciliate easy removal of directories that will end up being empty,
      the delete process happens in two passes: files first (including soft
      links), then directories.
      Returns:
          Tuple containing count of (files, dirs) removed
      """
      files = 0
      dirs = 0
      for entry in self:
         if os.path.exists(entry) and (os.path.isfile(entry) or os.path.islink(entry)):
            try:
               os.remove(entry)
               files += 1
               logger.debug("Purged file [%s].", entry)
            except OSError:
               pass
      for entry in self:
         if os.path.exists(entry) and os.path.isdir(entry) and not os.path.islink(entry):
            try:
               os.rmdir(entry)
               dirs += 1
               logger.debug("Purged empty directory [%s].", entry)
            except OSError:
               pass
      return (files, dirs)  
########################################################################
# Public functions
########################################################################
##########################
# normalizeDir() function
##########################
[docs]def normalizeDir(path):
   """
   Normalizes a directory name.
   For our purposes, a directory name is normalized by removing the trailing
   path separator, if any.  This is important because we want directories to
   appear within lists in a consistent way, although from the user's
   perspective passing in ``/path/to/dir/`` and ``/path/to/dir`` are equivalent.
   Args:
      path (String representing a path on disk): Path to be normalized
   Returns:
       Normalized path, which should be equivalent to the original
   """
   if path != os.sep and path[-1:] == os.sep:
      return path[:-1]
   return path 
#############################
# compareContents() function
#############################
[docs]def compareContents(path1, path2, verbose=False):
   """
   Compares the contents of two directories to see if they are equivalent.
   The two directories are recursively compared.  First, we check whether they
   contain exactly the same set of files.  Then, we check to see every given
   file has exactly the same contents in both directories.
   This is all relatively simple to implement through the magic of
   :any:`BackupFileList.generateDigestMap`, which knows how to strip a path prefix
   off the front of each entry in the mapping it generates.  This makes our
   comparison as simple as creating a list for each path, then generating a
   digest map for each path and comparing the two.
   If no exception is thrown, the two directories are considered identical.
   If the ``verbose`` flag is ``True``, then an alternate (but slower) method is
   used so that any thrown exception can indicate exactly which file caused the
   comparison to fail.  The thrown ``ValueError`` exception distinguishes
   between the directories containing different files, and containing the same
   files with differing content.
   *Note:* Symlinks are *not* followed for the purposes of this comparison.
   Args:
      path1 (String representing a path on disk): First path to compare
      path2 (String representing a path on disk): First path to compare
      verbose (Boolean): Indicates whether a verbose response should be given
   Raises:
      ValueError: If a directory doesn't exist or can't be read
      ValueError: If the two directories are not equivalent
      IOError: If there is an unusual problem reading the directories
   """
   try:
      path1List = BackupFileList()
      path1List.addDirContents(path1)
      path1Digest = path1List.generateDigestMap(stripPrefix=normalizeDir(path1))
      path2List = BackupFileList()
      path2List.addDirContents(path2)
      path2Digest = path2List.generateDigestMap(stripPrefix=normalizeDir(path2))
      compareDigestMaps(path1Digest, path2Digest, verbose)
   except IOError as e:
      logger.error("I/O error encountered during consistency check.")
      raise e 
[docs]def compareDigestMaps(digest1, digest2, verbose=False):
   """
   Compares two digest maps and throws an exception if they differ.
   Args:
      digest1 (Digest as returned from BackupFileList.generateDigestMap()): First digest to compare
      digest2 (Digest as returned from BackupFileList.generateDigestMap()): Second digest to compare
      verbose (Boolean): Indicates whether a verbose response should be given
   Raises:
      ValueError: If the two directories are not equivalent
   """
   if not verbose:
      if digest1 != digest2:
         raise ValueError("Consistency check failed.")
   else:
      list1 = UnorderedList(list(digest1.keys()))
      list2 = UnorderedList(list(digest2.keys()))
      if list1 != list2:
         raise ValueError("Directories contain a different set of files.")
      for key in list1:
         if digest1[key] != digest2[key]:
            raise ValueError("File contents for [%s] vary between directories." % key)