Difference between revisions of "ProvaPratica 2013.06.21"

From Sistemi Operativi
Jump to navigation Jump to search
Line 44: Line 44:
 
             sameSize[fileSize] = sameSize.get(fileSize, []) + [filePath]   
 
             sameSize[fileSize] = sameSize.get(fileSize, []) + [filePath]   
 
   
 
   
# Walk through the directory tree and populate the dictionary
+
# Build a dictionary with key-value pair {MD5 hash - [file name]}
 
def PopulateSameContent(files, sameContent):
 
def PopulateSameContent(files, sameContent):
 
     for filePath in files:
 
     for filePath in files:

Revision as of 23:43, 21 November 2013

[Python 3]

'''
Prova Pratica di Laboratorio di Sistemi Operativi
21 giugno 2013
Esercizio 3

URL: http://www.cs.unibo.it/~renzo/so/pratiche/2013.06.21.pdf

@author: Tommaso Ognibene
'''

import os, sys, hashlib

def Main(argv):
    """Find and report files with identical content under the current directory.

    Files are first grouped by size; only groups holding at least two files
    are then hashed, since files of different sizes cannot be identical.

    Args:
        argv: Command-line argument vector; it must contain only the
            program name.

    Returns:
        0 on success, 1 when unexpected arguments were supplied, so that
        ``sys.exit(Main(sys.argv))`` reports a usage error to the shell.
    """
    # Check number of arguments
    if len(argv) != 1:
        print("The function does not require arguments to be passed in.",
              file=sys.stderr)
        return 1

    # Build a dictionary with key-value pair {file size - [file name]}
    sameSize = {}
    PopulateSameSize(sameSize)

    # Build a dictionary with key-value pair {MD5 hash - [file name]}
    sameContent = {}
    # Largest groups come first, so the first group with fewer than two
    # files means every remaining group can be skipped as well.
    for files in sorted(sameSize.values(), key=len, reverse=True):
        if len(files) < 2:
            break
        PopulateSameContent(files, sameContent)

    # Print results
    PrintResults(sameContent)

    print("Done!")
    return 0
     
# Populate a dictionary with key-value pair {file size - [file name]}
def PopulateSameSize(sameSize):
    """Group every file below the current working directory by its size.

    Files whose size cannot be read (for example a dangling symbolic link or
    a file removed while the tree is being walked) are reported on stderr
    and skipped instead of aborting the whole scan.

    Args:
        sameSize: Dictionary updated in place; maps a size in bytes to the
            list of absolute paths of the files having that size.
    """
    for dirPath, _dirNames, fileNames in os.walk(os.getcwd()):
        for fileName in fileNames:
            filePath = os.path.join(dirPath, fileName)
            try:
                fileSize = os.path.getsize(filePath)
            except OSError as error:
                print("Skipping {0}: {1}".format(filePath, error),
                      file=sys.stderr)
                continue
            # Append in place rather than rebuilding the list for each file.
            sameSize.setdefault(fileSize, []).append(filePath)
 
# Build a dictionary with key-value pair {MD5 hash - [file name]}
def PopulateSameContent(files, sameContent):
    """Group the given files by the MD5 hash of their content.

    Files that cannot be read are reported on stderr and skipped instead of
    aborting the run.

    Args:
        files: Iterable of paths of the files to hash.
        sameContent: Dictionary updated in place; maps the value returned by
            Md5Checksum to the list of file paths, expressed relative to
            the current working directory, that produced it.
    """
    # The working directory does not change during the loop; look it up once.
    baseDir = os.getcwd()
    for filePath in files:
        try:
            md5 = Md5Checksum(filePath)
        except OSError as error:
            print("Skipping {0}: {1}".format(filePath, error),
                  file=sys.stderr)
            continue
        fileRelPath = os.path.relpath(filePath, baseDir)
        # Append in place rather than rebuilding the list for each file.
        sameContent.setdefault(md5, []).append(fileRelPath)

# Compute the MD5 hash of a file
def Md5Checksum(filePath):
    """Return the hexadecimal MD5 digest of the file at *filePath*.

    The file is opened in binary mode and consumed in 8192-byte chunks, so
    its whole content is never held in memory at once.
    """
    with open(filePath, 'rb') as stream:
        digest = hashlib.md5()
        # iter() with a sentinel keeps calling read() until it returns b''
        # (end of file).
        for chunk in iter(lambda: stream.read(8192), b''):
            digest.update(chunk)
        return digest.hexdigest()

# Print out the lists of files having the same content
def PrintResults(sameContent):
    """Print every group of two or more files sharing the same content.

    Args:
        sameContent: Dictionary whose values are lists of file paths; each
            list with more than one entry is printed on its own line as
            ``[path1, path2, ...]``. Lists with a single entry are ignored.
    """
    print('List of files having same content:')
    # "paths" rather than "list", so the builtin type is not shadowed.
    for paths in sameContent.values():
        if len(paths) > 1:
            print("[{0}]".format(", ".join(map(str, paths))))
        
# Run the duplicate finder only when this file is executed as a script;
# whatever Main returns is handed to sys.exit as the process exit status.
if __name__ == "__main__":
    sys.exit(Main(sys.argv))