#!/usr/bin/env python
#
# $File: md5sumd.py $
# $LastChangedDate: 2013-01-30 18:29:35 -0600 (Wed, 30 Jan 2013) $
#
# Copyright (C) 2013 Bo Peng (bpeng@mdanderson.org)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

try:
    # for Python 2.7 and higher, Python3.2 and higher
    import argparse
    use_argparse = True
except ImportError:
    # for earlier versions of Python, Python 3.0, 3.1 etc
    import optparse
    use_argparse = False
#
import hashlib
import gzip
import os
import sys
import signal
import time
import tempfile
import operator
try:
    # only needed to query the terminal width; unavailable on some platforms
    from fcntl import ioctl
    from array import array
    import termios
except ImportError:
    pass

# index access also works where sys.version_info is a plain tuple
PY3 = sys.version_info[0] >= 3

# files of at least this size (64M) are only partially hashed, and contribute
# at most this many bytes to the progress bar
_SIZE_LIMIT = 2 ** 26
# read buffer of 1M
_BLOCK_SIZE = 2 ** 20


def _encode(text):
    '''Return text in the form expected by the binary streams used here:
    utf-8 encoded bytes under Python 3, the unchanged string under Python 2
    (where a str is already a byte string).'''
    return text.encode('utf-8') if PY3 else text


class ProgressBar:
    '''A text-based progress bar written to the standard error stream.'''

    def __init__(self, message, totalCount):
        '''Create a progress bar and draw it immediately.

        message:    text displayed in front of the bar.
        totalCount: value of the counter that corresponds to 100%.
        '''
        self.message = message
        self.count = 0
        self.totalCount = totalCount
        self.start_time = None
        self.last_time = None
        # get terminal width
        self.handle_resize()
        try:
            signal.signal(signal.SIGWINCH, self.handle_resize)
        except Exception:
            # best effort: SIGWINCH does not exist on all platforms and
            # handlers can only be installed from the main thread
            pass
        self.outputProgress()

    def handle_resize(self, signum=None, frame=None):
        'Tries to catch resize signals sent from the terminal.'
        try:
            h, w = array('h', ioctl(sys.stderr, termios.TIOCGWINSZ, '\0' * 8))[:2]
            self.term_width = w
        except Exception:
            # best effort: the modules might not have been imported, or
            # stderr might not be a terminal; fall back to a fixed width
            self.term_width = 79

    def update(self, count):
        '''make count progress'''
        self.count += count
        self.outputProgress()

    def speedMessage(self, cps):
        '''Return a human readable description of a speed of cps counts
        per second.'''
        if cps > 1000000:
            return ' {:.1f}M/s'.format(cps / 1000000)
        elif cps > 1000:
            return ' {:.1f}K/s'.format(cps / 1000)
        elif cps > 0.05:
            return ' {:.1f}/s'.format(cps)
        elif cps > 1e-6:
            return ' {:.1f}s each'.format(1. / cps)
        else:
            return ' 0.0/s'

    def outputProgress(self):
        '''Output progress'''
        if not self.start_time:
            self.start_time = time.time()
            self.last_time = self.start_time
        cur_time = time.time()
        # stop update progress bar more than once per second.
        if self.count > 0 and self.count != self.totalCount and cur_time - self.last_time < 1:
            return
        # message, percentage, bar, count, speed, time left, (unused)
        msg = [self.message + ':', '', '', ' ', '', '', '']
        self.last_time = cur_time
        second_elapsed = cur_time - self.start_time
        perc = 1 if self.totalCount == 0 else min(1, float(self.count) / self.totalCount)
        if second_elapsed >= 0.0001 and self.count > 0:
            msg[4] = self.speedMessage(self.count / second_elapsed)
            # estimated time left
            time_left = 0 if perc == 0 else (second_elapsed / perc * (1 - perc))
            msg[5] += ' in {}{}'.format(
                '' if time_left < 86400 else '{} days '.format(int(time_left / 86400)),
                time.strftime('%H:%M:%S', time.gmtime(time_left)))
        if self.count > 0:
            msg[3] = ' {:,}'.format(int(self.count))
        # percentage
        msg[1] = ' {:5.1f}%'.format(perc * 100)
        # the bar takes whatever space is left on the line
        width = self.term_width - sum([len(x) for x in msg])
        if width > 5:
            front = int(perc * (width - 5))
            back = width - 5 - front
            msg[2] = ' [{}>{}]'.format('=' * front, ' ' * back)
        # use stderr to avoid messing up process output
        sys.stderr.write('\r' + ''.join(msg))
        # the line has no trailing newline, so flush to make it visible
        sys.stderr.flush()

    def done(self):
        '''Finish, output a new line'''
        self.count = self.totalCount
        #
        msg = [self.message + ':', ' 100%', '', ' {:,}'.format(self.count), '', '']
        #
        second_elapsed = time.time() - self.start_time
        msg[4] = self.speedMessage(0 if second_elapsed < 0.0001 else (self.count) / second_elapsed)
        msg[5] += ' in {}{}'.format(
            '' if second_elapsed < 86400 else '{} days '.format(int(second_elapsed / 86400)),
            time.strftime('%H:%M:%S', time.gmtime(second_elapsed)))
        width = self.term_width - sum([len(x) for x in msg])
        if width > 4:
            msg[2] = '[{}]'.format('=' * int(width - 4))
        sys.stderr.write('\r' + ''.join(msg) + '\n')
        sys.stderr.flush()


def probeDir(basedir):
    '''Return the number and total size of files that will be processed by
    calculateDirMD5. Within a directory, each file contributes at most 64M
    to the size because that is what calculateDirMD5 reports to its
    call back. A single file is reported with its full size.'''
    if os.path.isfile(basedir):
        return (1, os.path.getsize(basedir))
    count = 0
    filesize = 0
    for item in os.listdir(basedir):
        fullname = os.path.join(basedir, item)
        if os.path.isfile(fullname):
            count += 1
            filesize += min(_SIZE_LIMIT, os.path.getsize(fullname))
        elif os.path.isdir(fullname):
            item_count, item_filesize = probeDir(fullname)
            count += item_count
            filesize += item_filesize
    return count, filesize


def calculateFileMD5(filename):
    '''Calculate md5 for specified file. Files smaller than 64M are hashed
    completely, larger files are hashed partially from their head and tail.

    Returns the hexadecimal digest. The program exits with an error message
    if the file cannot be read.'''
    md5 = hashlib.md5()
    filesize = os.path.getsize(filename)
    try:
        if filesize < _SIZE_LIMIT:
            # for file less than 64M, use all its content
            with open(filename, 'rb') as f:
                while True:
                    data = f.read(_BLOCK_SIZE)
                    if not data:
                        break
                    md5.update(data)
        else:
            # otherwise, use the first 32M and data from the last 32M.
            #
            # NOTE: the loop stops before the 64th block is added, so only
            # the first 31M of the last 32M are hashed. This is kept as is
            # because changing it would invalidate existing checksums.
            count = 64
            with open(filename, 'rb') as f:
                while True:
                    data = f.read(_BLOCK_SIZE)
                    count -= 1
                    if count == 32:
                        # the first 32M have been read, jump to the last 32M
                        f.seek(-2 ** 25, 2)
                    if not data or count == 0:
                        break
                    md5.update(data)
    except IOError as e:
        sys.exit('Failed to read {}: {}'.format(filename, e))
    return md5.hexdigest()


def _manifest_line(item, basedir):
    '''Return the line that describes manifest entry item in the signature
    of basedir. Each line has md5, type, nfiles, ndirs, sfiles,
    total_nfiles, total_sfiles and name. To make the md5 work for both
    windows and linux, the name is relative to basedir and uses / slash.'''
    name = os.path.normpath(os.path.relpath(item[0], basedir)).replace('\\', '/')
    return '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
        item[1], item[2], item[3], item[4], item[5], item[6], item[7], name)


def calculateDirMD5(basedir, call_back):
    '''Calculate MD5 signature of specified directory.

    call_back is used to report progress. If it is not None, it is called
    with the number of bytes (at most 64M) accounted for each file.

    Returns (md5, nfiles, ndirs, sfiles, total_nfiles, total_sfiles,
    manifest) where manifest is a list of tuples (fullname, md5, type,
    nfiles, ndirs, sfiles, total_nfiles, total_sfiles) for all files
    (type '-') and directories (type 'd') under basedir, sorted by md5.
    The numbers are derived from the complete manifest, which already
    contains the entries of all subdirectories.'''
    manifest = []
    for item in os.listdir(basedir):
        fullname = os.path.join(basedir, item)
        # if the item is a file, collect md5 and other information
        # items that are neither files nor directories are ignored
        if os.path.isfile(fullname):
            filesize = os.path.getsize(fullname)
            manifest.append((fullname, calculateFileMD5(fullname), '-', 1, 0,
                filesize, 1, filesize))
            # advance the progress bar
            if call_back is not None:
                call_back(min(_SIZE_LIMIT, filesize))
        elif os.path.isdir(fullname):
            # for a directory, call calculateDirMD5 recursively
            md5, nfiles, ndirs, sfiles, total_nfiles, total_sfiles, manifests = \
                calculateDirMD5(fullname, call_back)
            # folder md5 is required because name of an empty folder will
            # otherwise not be recorded
            manifest.append((fullname, md5, 'd', nfiles, ndirs, sfiles,
                total_nfiles, total_sfiles))
            manifest.extend(manifests)
    # sort the entries to avoid folder md5 affected by file/directory order
    # here we sort by md5 key because sorting by filename can be unstable due
    # to different encoding of filenames obtained by python2/python3 from
    # different operating systems. The final output is sorted by filename
    # though, because that will make the output easier to read
    manifest.sort(key=operator.itemgetter(1))
    # calculate folder md5 from a temporary manifest file, which is written
    # in wb mode to avoid \n to \r\n translation under windows. In python 3
    # the lines are encoded in utf-8 so that the MD5 is the same across
    # platforms regardless of what os.listdir returns for non-ascii names.
    tmp = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    try:
        try:
            for item in manifest:
                tmp.write(_encode(_manifest_line(item, basedir)))
        finally:
            tmp.close()
        # this is the md5 for the whole directory
        md5 = calculateFileMD5(tmp.name)
    finally:
        # remove the temporary file even if it could not be written or read
        os.remove(tmp.name)
    #
    nfiles = len([x for x in manifest if x[2] == '-'])
    ndirs = len(manifest) - nfiles
    sfiles = sum([x[5] for x in manifest if x[2] == '-'])
    total_nfiles = sum([x[6] for x in manifest])
    total_sfiles = sum([x[7] for x in manifest])
    return md5, nfiles, ndirs, sfiles, total_nfiles, total_sfiles, manifest


def checkDir(basedir, checksum, call_back):
    '''Validate a file or directory against its md5 signature.

    basedir:   name of the file or directory, which must be a key of checksum.
    checksum:  dictionary that maps names to lists [md5, type, size]. Names
               of items under a directory use / as path separator.
    call_back: passed to calculateDirMD5 to report progress, can be None.

    Returns (succ, messages) where succ is True if the signature matches and
    messages is a list of lines to be reported. If detailed signatures of
    the content of a mismatching directory are available, the messages
    list files and directories that were added, modified or removed.'''
    if os.path.isfile(basedir):
        md5 = calculateFileMD5(basedir)
        if md5 == checksum[basedir][0]:
            return True, ['{}: OK'.format(basedir)]
        else:
            return False, ['{}: FAILED'.format(basedir)]
    md5, nfiles, ndirs, sfiles, total_nfiles, total_sfiles, manifest = \
        calculateDirMD5(basedir, call_back)
    if md5 == checksum[basedir][0]:
        return True, ['{}: OK'.format(basedir)]
    # if something goes wrong, we need to check more.
    # find items under basedir. The separator is part of the prefix so that
    # items of a directory that merely shares the prefix (e.g. data2 for
    # data) are not mistaken for content of basedir.
    prefix = basedir if basedir.endswith('/') else basedir + '/'
    cs = dict((name, value) for name, value in checksum.items()
        if name != basedir and name.startswith(prefix))
    if not cs:
        # no detailed information is available
        return False, ['{}: FAILED'.format(basedir)]
    #
    messages = []
    for line in manifest:
        # names in the checksum file are saved with / slash
        name = line[0].replace('\\', '/')
        if name not in cs:
            if line[2] == 'd':
                messages.append('{}: new directory added.'.format(line[0]))
            else:
                messages.append('{}: new file added.'.format(line[0]))
            continue
        # existing item, remove it from cs so that it is not reported as removed
        if line[1] != cs.pop(name)[0]:
            if line[2] == 'd':
                messages.append('{}: directory modified.'.format(line[0]))
            else:
                messages.append('{}: file modified.'.format(line[0]))
    # whatever remains has a signature but no longer exists; sorted so that
    # the report does not depend on dictionary order
    for item, value in sorted(cs.items()):
        if value[1] == 'd':
            messages.append('{}: directory removed.'.format(item))
        else:
            messages.append('{}: file removed.'.format(item))
    #
    messages.append('{}: FAILED'.format(basedir))
    return False, messages


def _parse_arguments():
    '''Parse the command line and return an object with attributes items
    (list of files and directories), check (name of a checksum file, - for
    the standard input, or None) and verbose.'''
    help_messages = {
        'description': 'A tool that calculates the MD5 '
            'checksum of files and directories, and use it to check the integrity of '
            'these files and directories. It has a interface that is similar to the '
            'md5sum command, with support for checksum of directories. For files larger '
            'than 64M, this command caculates partial checksum based on the first and '
            'last 32Mb of data. ',
        'items': 'Calculate MD5 signature of one or more files and directories and '
            'print MD5 checksums to the standard output.',
        'check': 'Check the content of one or more files and directories using a '
            'file that contains the checksum of these files and directories. Gippped '
            'checksum file is acceptable. If a file is unspecified or is -, read from '
            'the standard input.',
        'verbose': 'If specified, this program will output checksum for all files '
            'when the checksum of a directory is calculated. Such information '
            'will help the --check command to figure out what files have been changed '
            'if a directory checksum mismatch happens. This option will also enable '
            'a progress bar for file scanning.'}
    if use_argparse:
        parser = argparse.ArgumentParser(description=help_messages['description'])
        parser.add_argument('--version', action='version', version='%(prog)s 1.01')
        parser.add_argument('items', nargs='*', metavar='FILE_OR_DIR',
            help=help_messages['items'])
        parser.add_argument('-c', '--check', metavar='CHECKSUM', nargs='?', const='-',
            help=help_messages['check'])
        parser.add_argument('-v', '--verbose', action='store_true',
            help=help_messages['verbose'])
        args = parser.parse_args()
    else:
        parser = optparse.OptionParser(usage=help_messages['description'] +
            "\nusage: %prog [-v|-c] files_or_dirsars\n" + help_messages['items'],
            version='%prog 1.0')
        parser.add_option('-c', '--check', dest='check', help=help_messages['check'],
            metavar='CHECKSUM')
        parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
            default=False, help=help_messages['verbose'])
        (args, pos_args) = parser.parse_args()
        args.items = pos_args
    return args


def _binary_stdout():
    '''Return a stream that writes to the standard output in binary mode,
    which is needed to handle non-ascii file and directory names.'''
    if PY3:
        return sys.stdout.buffer
    if sys.platform == 'win32':
        import msvcrt
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    return sys.stdout


def _write_checksums(items, verbose):
    '''Calculate checksums of files and directories items and write them to
    the standard output. If verbose is True, a progress bar is displayed and
    signatures of all files and directories under a directory are written
    as well.'''
    # all output goes through the same stream to keep the lines in order
    stdout = _binary_stdout()
    for item in items:
        basename = os.path.expanduser(item)
        # if this is a file, easy
        if os.path.isfile(basename):
            stdout.write(_encode('{} {}\n'.format(calculateFileMD5(basename), basename)))
            continue
        if not os.path.isdir(basename):
            sys.exit('{} is not a valid file or directory.'.format(basename))
        if verbose:
            # get number and size of files to be processed
            count, total_filesize = probeDir(basename)
            prog = ProgressBar('Scanning {} files under {}'.format(count, basename),
                total_filesize)
            md5, nfiles, ndirs, sfiles, total_nfiles, total_sfiles, manifest = \
                calculateDirMD5(basename, call_back=prog.update)
            prog.done()
        else:
            md5, nfiles, ndirs, sfiles, total_nfiles, total_sfiles, manifest = \
                calculateDirMD5(basename, None)
        #
        stdout.write(_encode('{} {}\n'.format(md5, item.replace('\\', '/'))))
        if verbose:
            # the manifest is internally sorted by MD5 due to compatibility
            # issues with Python, OS and filesystem. The final output is
            # sorted by filename to make the output easier to read
            manifest.sort()
            stdout.write(_encode(
                '## MD5\ttype\tnum_files\tnum_dirs\tfilesize\ttotal_num_files\ttotal_filesize\tname\n'))
            stdout.write(_encode('#{}\td\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(md5, nfiles,
                ndirs, sfiles, total_nfiles, total_sfiles, basename)))
            for entry in manifest:
                stdout.write(_encode('#{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(entry[1],
                    entry[2], entry[3], entry[4], entry[5], entry[6], entry[7],
                    entry[0].replace('\\', '/'))))
    stdout.flush()


def _read_checksum_file(filename):
    '''Read checksum file filename (- for the standard input, gzipped if the
    name ends with .gz). Returns (signed_dirs, checksum, all_found) where
    signed_dirs lists the existing files and directories to be checked,
    checksum maps names to [md5, type, size] and all_found is False if a
    listed file or directory is missing. The program exits with an error
    message if a line cannot be interpreted.'''
    signed_dirs = []
    checksum = {}
    all_found = True
    if filename == '-':
        checksum_file = sys.stdin
    elif filename.endswith('.gz'):
        checksum_file = gzip.open(filename, 'rb')
    else:
        checksum_file = open(filename, 'rb')
    try:
        for line in checksum_file:
            try:
                # files opened in binary mode yield bytes under python 3
                if not isinstance(line, str):
                    line = line.decode('utf-8')
                # blank lines and the header of detailed signatures are ignored
                if not line.strip() or line.startswith('##'):
                    continue
                if line.startswith('#'):
                    # the details
                    md5, ftype, nfiles, ndirs, sfiles, total_nfiles, total_sfiles, name = \
                        line.strip().split('\t')
                    # md5 will contain a leading #
                    checksum[name] = [md5[1:], ftype, sfiles]
                else:
                    md5, basedir = line.strip().split(' ', 1)
                    if not os.path.isdir(basedir) and not os.path.isfile(basedir):
                        sys.stderr.write('{}: missing file or directory.\n'.format(basedir))
                        all_found = False
                    else:
                        checksum[basedir] = [md5, 'd', 0]
                        signed_dirs.append(basedir)
            except ValueError as e:
                # wrong number of fields, or content that is not valid utf-8
                sys.exit('Invalid manifest line "{}": {}'.format(line.strip(), e))
    finally:
        if filename != '-':
            checksum_file.close()
    return signed_dirs, checksum, all_found


def _check_checksums(filename, verbose):
    '''Check files and directories listed in checksum file filename and
    report results to the standard error. If verbose is True, a progress
    bar is displayed. Returns True if all items pass.'''
    signed_dirs, checksum, all_succ = _read_checksum_file(filename)
    for basedir in signed_dirs:
        if verbose:
            # get number and size of files to be processed
            count, total_filesize = probeDir(basedir)
            prog = ProgressBar(basedir, total_filesize)
            succ, messages = checkDir(basedir, checksum, call_back=prog.update)
            prog.done()
        else:
            succ, messages = checkDir(basedir, checksum, call_back=None)
        all_succ = all_succ and succ
        for msg in messages:
            sys.stderr.write(msg + '\n')
    return all_succ


def main():
    '''Calculate or check checksums as specified on the command line. In
    check mode, the exit status is 0 if all files and directories pass and
    1 otherwise.'''
    args = _parse_arguments()
    #
    if args.items and args.check is not None:
        sys.exit('Cannot specify calculate and check checksum at the same time.\n')
    elif args.items == [] and args.check is None:
        sys.stderr.write('Please specify either one or more directories, or --check option with a checksum file.\n')
        sys.exit(0)
    #
    if args.items:
        _write_checksums(args.items, args.verbose)
    #
    if args.check is not None:
        # sys.exit(True) would signal an error, so convert explicitly
        sys.exit(0 if _check_checksums(args.check, args.verbose) else 1)


if __name__ == '__main__':
    main()