Difference between revisions of "ProvaPratica 2013.06.21"

From Sistemi Operativi
Jump to navigation Jump to search
Line 54: Line 54:
 
# Break the file in chunks whose size is a multiple of 128
 
# Break the file in chunks whose size is a multiple of 128
 
# This takes advantage of the fact that MD5 has 128-byte digest blocks
 
# This takes advantage of the fact that MD5 has 128-byte digest blocks
# Default block size = Default NTFS cluster size = 4 KB
+
def GetMd5Hash(filePath, blockSize = 2 ** 20):
def GetMd5Hash(filePath, blockSize = 4096):
 
 
     digest = hashlib.md5()
 
     digest = hashlib.md5()
 
     with open(filePath, "rb") as file:
 
     with open(filePath, "rb") as file:

Revision as of 08:44, 25 November 2013

[Python 3]

'''
Prova Pratica di Laboratorio di Sistemi Operativi
21 giugno 2013
Esercizio 3

URL: http://www.cs.unibo.it/~renzo/so/pratiche/2013.06.21.pdf

@author: Tommaso Ognibene
'''

import os, sys, hashlib

def Main(argv):
    """
    Report the groups of files that share the same content.

    argv: command-line vector; it must hold the program name only,
          otherwise a message is printed and nothing else happens.

    Files are grouped by size first, and only the size groups holding at
    least two files are hashed and regrouped by MD5 digest.
    """
    # Anything beyond the program name is rejected
    if len(argv) != 1:
        print("The function does not require arguments to be passed in.")
        return

    # {file size - [file name]}
    sizeGroups = {}
    PopulateSameSize(sizeGroups)

    # Biggest groups first; a file whose size is unique cannot have a twin,
    # so groups with fewer than two members are never hashed
    bySizeDescending = sorted(sizeGroups.values(), key=len, reverse=True)
    candidates = [group for group in bySizeDescending if len(group) >= 2]

    # {MD5 hash - [file name]}
    hashGroups = {}
    for group in candidates:
        PopulateSameContent(group, hashGroups)

    PrintResults(hashGroups)

    print("Done!")
     
def PopulateSameSize(sameSize):
    """
    Group every file found under the current working directory by its size.

    sameSize: dictionary updated in place with entries
              {file size in bytes - [file path]}; each path is built by
              joining the walked directory with the file name.

    Entries whose size cannot be read (for instance a dangling symbolic
    link, or a file removed while the tree is being walked) are skipped so
    that a single unreadable entry does not abort the whole scan.
    """
    for dirPath, _, fileNames in os.walk(os.getcwd()):
        for fileName in fileNames:
            filePath = os.path.join(dirPath, fileName)
            try:
                fileSize = os.path.getsize(filePath)
            except OSError:
                # The size is unknown, so the entry cannot be compared
                continue
            # Append in place instead of rebuilding the list on every insert
            sameSize.setdefault(fileSize, []).append(filePath)
 
def PopulateSameContent(filePaths, sameContent):
    """
    Group the given files by the MD5 digest of their content.

    filePaths:   iterable of file paths to hash.
    sameContent: dictionary updated in place with entries
                 {MD5 hash - [file path relative to the current directory]}.
    """
    for path in filePaths:
        digest = GetMd5Hash(path)
        relative = os.path.relpath(path, os.getcwd())
        known = sameContent.get(digest, [])
        sameContent[digest] = known + [relative]

def GetMd5Hash(filePath, blockSize = 2 ** 20):
    """
    Return the hexadecimal MD5 digest of a file without loading it whole.

    filePath:  path of the file to hash; it is opened in binary mode.
    blockSize: number of bytes read per chunk (default 1 MiB). Any positive
               value yields the same digest: MD5 consumes its input in
               64-byte blocks (producing a 128-bit digest) and the hash
               object buffers partial blocks itself, so the chunk size only
               affects memory usage and the number of reads.

    Raises ValueError when blockSize is not positive: a size of 0 would end
    the read loop at once and return the digest of empty data for any file,
    while a negative size would read the whole file in one go.
    """
    if blockSize <= 0:
        raise ValueError("blockSize must be a positive number of bytes")
    digest = hashlib.md5()
    with open(filePath, "rb") as stream:
        # read() returns b'' at end of file, which stops the iteration
        for chunk in iter(lambda: stream.read(blockSize), b''):
            digest.update(chunk)
    return digest.hexdigest()

def PrintResults(sameContent):
    """
    Print the lists of files having the same content, longest list first.

    sameContent: dictionary {MD5 hash - [file name]}; only the lists with
                 at least two names are printed, one bracketed line each.
    """
    print("Lists of files having same content:")
    groups = list(sameContent.values())
    # Stable descending sort: lists of equal length keep their dict order
    groups.sort(key=len, reverse=True)
    for group in groups:
        if len(group) >= 2:
            print("[{0}]".format(", ".join(group)))
        
# Run the duplicate search only when the file is executed as a script;
# the value returned by Main is handed to sys.exit as the exit status
if __name__ == "__main__":
    sys.exit(Main(sys.argv))





ecco la mia versione:

import os, hashlib

def fileCurrDir():
    """Return the names in the current directory for which os.path.isfile is true."""
    return [entry for entry in os.listdir('.') if os.path.isfile(entry)]

def dictsize(fl=None):
    """
    Group file names by file size.

    fl: iterable of file names; when omitted, the files of the current
        directory are listed at call time. (A default evaluated in the
        signature would be computed only once, when the function is
        defined, and would go stale.)

    Returns a dictionary {file size in bytes: [file names of that size]},
    with names kept in the order in which they were given.
    """
    if fl is None:
        fl = fileCurrDir()
    res = {}
    for f in fl:
        # One stat call per file; setdefault creates the list on first use
        res.setdefault(os.path.getsize(f), []).append(f)
    return res

def dictremsa(a=None):
    """
    Remove from a {key: list} dictionary every item whose list has fewer
    than two elements.

    a: dictionary to prune in place; when omitted, the size dictionary of
       the current directory is built at call time, so that no dictionary
       created at definition time is shared and mutated across calls.

    Returns the same dictionary object, after pruning.
    """
    if a is None:
        a = dictsize()
    # Collect the keys first: a dict must not change size while iterated
    for key in [k for k, names in a.items() if len(names) < 2]:
        del a[key]
    return a

def _md5_of_file(path, block_size=2 ** 20):
    """Return the hexadecimal MD5 digest of the file at path, read in chunks."""
    hasher = hashlib.md5()
    with open(path, 'rb') as stream:
        for chunk in iter(lambda: stream.read(block_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def hashcontrolinsl(l1):
    """
    Print each group of files in l1 that have the same MD5 digest.

    l1: list of file names; it is consumed and is empty on return.

    The last name of the list is taken as reference and compared with all
    the remaining ones. When at least one of them matches, the group
    (reference first, then the matches in list order) is printed and the
    matches are removed from the list so they are not reported again.
    Returns None.
    """
    digests = {}

    def digest(name):
        # Each file is hashed once, however many comparisons involve it
        if name not in digests:
            digests[name] = _md5_of_file(name)
        return digests[name]

    while l1:
        toTest = l1.pop()
        reference = digest(toTest)
        res = [toTest] + [name for name in l1 if digest(name) == reference]
        if len(res) > 1:
            print(res)
            for duplicate in res[1:]:
                l1.remove(duplicate)
		
		

		


def hashcontroltoMajorTom(a=None):
    """
    Report the files having the same content, hashing only the files that
    already passed the "same size" test.

    a: dictionary {file size: [file names]}; when omitted, it is built for
       the current directory at call time rather than once at definition
       time. The entry for size 0, if any, is removed from it.

    Files of size 0 are reported as equal without being hashed; every
    other list of names is handed to hashcontrolinsl.
    """
    if a is None:
        a = dictremsa()
    try:
        # Empty files are all equal by definition
        obviouslyequal = a.pop(0)
    except KeyError:
        pass
    else:
        print("i seguenti file hanno lo stesso contenuto... NULLA!!!!!:\n")
        for oe in obviouslyequal:
            print("{0}".format(oe))
    print("\ni seguenti file contengono qualcosa ma sono uguali:\n")
    for namelist in list(a.values()):
        hashcontrolinsl(namelist)


# Script entry point: called with no argument, so the default size
# dictionary is the one being checked for duplicates
hashcontroltoMajorTom()

-fede