Difference between revisions of "ProvaPratica 2013.06.21"

From Sistemi Operativi
Jump to navigation Jump to search
Line 26: Line 26:
 
     # Build a dictionary with key-value pair {MD5 hash - [file name]}
 
     # Build a dictionary with key-value pair {MD5 hash - [file name]}
 
     sameContent = { }
 
     sameContent = { }
     for files in sorted(sameSize.values(), key = len, reverse = True):
+
     for filePaths in sorted(sameSize.values(), key = len, reverse = True):
         if len(files) < 2:
+
        # No files with same size => No files with same content
            break
+
         if len(filePaths) < 2: break
         PopulateSameContent(files, sameContent)
+
         PopulateSameContent(filePaths, sameContent)
  
 
     # Print results
 
     # Print results
Line 38: Line 38:
 
# Populate a dictionary with key-value pair {file size - [file name]}
 
# Populate a dictionary with key-value pair {file size - [file name]}
 
def PopulateSameSize(sameSize):
 
def PopulateSameSize(sameSize):
     for dirPath, dirNames, fileNames in os.walk(os.getcwd()):
+
     for dirPath, _, fileNames in os.walk(os.getcwd()):
 
         for fileName in fileNames:
 
         for fileName in fileNames:
 
             filePath = os.path.join(dirPath, fileName)
 
             filePath = os.path.join(dirPath, fileName)
Line 44: Line 44:
 
             sameSize[fileSize] = sameSize.get(fileSize, []) + [filePath]   
 
             sameSize[fileSize] = sameSize.get(fileSize, []) + [filePath]   
 
   
 
   
# Populate a dictionary with key-value pair {MD5 hash - [file name]}
+
# Walk through the directory tree and populate the dictionary
def PopulateSameContent(files, sameContent):
+
def PopulateSameContent(filePaths, sameContent):
     for filePath in files:
+
     for filePath in filePaths:
         md5 = Md5Checksum(filePath)
+
         md5 = GetMd5Hash(filePath)
 
         fileRelPath = os.path.relpath(filePath, os.getcwd())
 
         fileRelPath = os.path.relpath(filePath, os.getcwd())
 
         sameContent[md5] = sameContent.get(md5, []) + [fileRelPath]
 
         sameContent[md5] = sameContent.get(md5, []) + [fileRelPath]
  
# Compute the MD5 hash of a file
+
# Get the MD5 hash without loading the whole file to memory
def Md5Checksum(filePath):
+
# Break the file in chunks whose size is a multiple of 128
     with open(filePath, 'rb') as file:
+
# This takes advantage of the fact that MD5 has 128-byte digest blocks
         m = hashlib.md5()
+
# Default block size = Default NTFS cluster size = 4 KB
        while True:
+
def GetMd5Hash(filePath, blockSize = 4096):
            data = file.read(8192)
+
    digest = hashlib.md5()
            if not data:
+
     with open(filePath, "rb") as file:
                break
+
         for chunk in iter(lambda: file.read(blockSize), b''):  
             m.update(data)
+
             digest.update(chunk)
        return m.hexdigest()
+
    return digest.hexdigest()
  
 
# Printout the lists of files having same content
 
# Printout the lists of files having same content
 
def PrintResults(sameContent):
 
def PrintResults(sameContent):
     print('List of files having same content:')
+
     print("Lists of files having same content:")
     for list in sameContent.values():
+
     for files in sorted(sameContent.values(), key = len, reverse = True):
         if len(list) > 1:
+
         if len(files) < 2: break
            print("[{0}]".format(", ".join(str(i) for i in list)))
+
        print("[{0}]".format(", ".join(file for file in files)))
 
          
 
          
 
if __name__ == "__main__":
 
if __name__ == "__main__":

Revision as of 16:10, 24 November 2013

[Python 3]

'''
Prova Pratica di Laboratorio di Sistemi Operativi
21 giugno 2013
Esercizio 3

URL: http://www.cs.unibo.it/~renzo/so/pratiche/2013.06.21.pdf

@author: Tommaso Ognibene
'''

import os, sys, hashlib

def Main(argv):
    """Find and print groups of duplicate files under the current directory."""
    # The script takes no command-line arguments besides its own name.
    if len(argv) != 1:
        print("The function does not require arguments to be passed in.")
        return

    # Group file paths by size: {file size: [file path, ...]}
    sizeGroups = { }
    PopulateSameSize(sizeGroups)

    # Only size groups with at least two members can contain duplicates.
    # Groups come sorted by length, largest first, so the first singleton
    # group means every remaining group is a singleton too.
    contentGroups = { }
    for group in sorted(sizeGroups.values(), key=len, reverse=True):
        if len(group) < 2:
            break
        PopulateSameContent(group, contentGroups)

    PrintResults(contentGroups)

    print("Done!")
     
# Populate a dictionary with key-value pair {file size - [file name]}
def PopulateSameSize(sameSize):
    """Walk the current working directory tree and fill sameSize with
    {file size in bytes: [absolute file path, ...]}."""
    root = os.getcwd()
    for folder, _, names in os.walk(root):
        for name in names:
            path = os.path.join(folder, name)
            sameSize.setdefault(os.path.getsize(path), []).append(path)
 
# Group the given file paths by MD5 digest
def PopulateSameContent(filePaths, sameContent):
    """Fill sameContent with {MD5 hex digest: [path relative to cwd, ...]}
    for every path in filePaths."""
    cwd = os.getcwd()
    for path in filePaths:
        key = GetMd5Hash(path)
        sameContent.setdefault(key, []).append(os.path.relpath(path, cwd))

# Get the MD5 hash without loading the whole file into memory.
# The file is read in chunks whose size is a multiple of 128, matching
# MD5's digest block size; 4 KB is also the default NTFS cluster size.
def GetMd5Hash(filePath, blockSize = 4096):
    """Return the hexadecimal MD5 digest of the file at filePath."""
    md5 = hashlib.md5()
    with open(filePath, "rb") as stream:
        chunk = stream.read(blockSize)
        while chunk:
            md5.update(chunk)
            chunk = stream.read(blockSize)
    return md5.hexdigest()

# Printout the lists of files having same content
def PrintResults(sameContent):
    print("Lists of files having same content:")
    for files in sorted(sameContent.values(), key = len, reverse = True):
        if len(files) < 2: break
        print("[{0}]".format(", ".join(file for file in files)))
        
if __name__ == "__main__":
    # Run only when executed as a script; sys.exit forwards Main's return
    # value (always None here, i.e. exit status 0) to the shell.
    sys.exit(Main(sys.argv))





ecco la mia versione:

import os, hashlib

def fileCurrDir():
	"""Return the names of the regular files in the current directory,
	in the order os.listdir yields them (subdirectories are skipped)."""
	return [entry for entry in os.listdir('.') if os.path.isfile(entry)]

def dictsize(fl=fileCurrDir()):#restituisce un dizionario con key filesize e value lista di filenames aventi size di filesize
	res = {}
	for f in fl:
		if os.path.getsize('{0}'.format(f)) in list(res.keys()):res[os.path.getsize('{0}'.format(f))].append(f);continue
		else:pass
		res[os.path.getsize('{0}'.format(f))] = list()
		res[os.path.getsize('{0}'.format(f))].append(f)
	return res

def dictremsa(a=None):
	"""Remove from dict a every entry whose value list holds fewer than
	two items; a is mutated in place and returned.

	a defaults to dictsize() evaluated at call time.  The original
	signature was ``a=dictsize()``, which ran the whole directory scan at
	import time and shared one dict object across every default call.
	"""
	if a is None:
		a = dictsize()
	# Snapshot the keys: the dict must not change size while iterating.
	for key in list(a.keys()):
		if len(a[key]) < 2:
			a.pop(key)
	return a

def hashcontrolinsl(l1):
	"""Compare the files named in l1 pairwise by MD5 and print each group
	of identical files as a Python list; l1 is consumed (emptied or
	reduced) in the process.

	Fixes from review of the original:
	- ``res`` was an undefined global (``del res[:]`` raised NameError on
	  the first call); it is now a local list created per group.
	- each file is hashed once and memoized instead of once per pair,
	  and files are opened with a context manager so handles are closed.
	"""
	digests = {}

	def _md5(name):
		# Lazily compute and cache the hex digest of one file.
		if name not in digests:
			h = hashlib.md5()
			with open(name, 'rb') as f:
				h.update(f.read())
			digests[name] = h.hexdigest()
		return digests[name]

	while l1:
		toTest = l1.pop()
		res = [toTest]
		for other in l1:
			if _md5(other) == _md5(toTest):
				res.append(other)
		if len(res) > 1:
			print(res)
			# Matched files are reported; drop them from the work list.
			for j in res[1:]:
				l1.remove(j)
		
		

		


def hashcontroltoMajorTom(a=None):
	"""Report duplicate files given a, a dict mapping a file size to the
	list of file names sharing that size (the shape dictremsa returns).

	a defaults to dictremsa() evaluated at call time; the original
	signature ``a=dictremsa()`` triggered the full directory scan at
	import time.  The unused local md5 object was also dropped.
	"""
	if a is None:
		a = dictremsa()
	try:
		# Key 0 holds zero-byte files: trivially identical, no hashing.
		obviouslyequal = a.pop(0)
		print("i seguenti file hanno lo stesso contenuto... NULLA!!!!!:\n")
		for oe in obviouslyequal:
			print("{0}".format(oe))
	except KeyError:
		pass
	values = list(a.values())
	print("\ni seguenti file contengono qualcosa ma sono uguali:\n")
	for namelist in values:
		hashcontrolinsl(namelist)


# Script entry point: report duplicate files in the current directory.
hashcontroltoMajorTom()

-fede