"""Walk a directory tree and record every non-hidden file in MongoDB.

Run as a script with two arguments, a label for the run and the directory
to scan.  One document describing the run is written to the ``discovery``
collection and one document per file is written to the ``file`` collection
of the ``duplicates_finder`` database.
"""
import os
import socket
import sys
import time
import uuid

from pyhashxx import Hashxx
from pymongo import MongoClient

client = MongoClient('docker', 27017)
db = client['duplicates_finder']
db_file = db.file
db_discovery = db.discovery


def hash_file(fname, chunk_size=2 ** 20):
    """Return the Hashxx digest of the file at *fname*.

    The file is read in binary mode, *chunk_size* bytes at a time (1 MiB by
    default), so the whole file is never held in memory at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hasher = Hashxx()
    with open(fname, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.digest()


def process(discovery_id, path):
    """Insert one ``file`` document for every non-hidden file under *path*.

    Files and directories whose name starts with ``.`` are ignored, and
    hidden directories are not descended into.  A file that raises
    ``OSError`` or ``UnicodeEncodeError`` while being inspected, hashed or
    stored is skipped so that one bad entry does not abort the whole scan.

    Args:
        discovery_id: value stored as ``discovery_id`` on every inserted
            document, linking it to the run that found it.
        path: root directory to walk.
    """
    for root, dirs, files in os.walk(path):
        # Slice assignment mutates the list os.walk is iterating from, which
        # is what prunes hidden directories from the traversal itself.
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        for filename in files:
            if filename.startswith('.'):
                continue

            # A separate name keeps the ``path`` argument (the walk root)
            # from being overwritten on every iteration.
            file_path = os.path.join(root, filename)
            extension = os.path.splitext(filename)[1]
            try:
                mod_time = os.path.getmtime(file_path)
                # NOTE: getctime() is the creation time on Windows but the
                # last metadata change time on Unix; it is stored under
                # 'created' either way.
                created_time = os.path.getctime(file_path)
                file_hash = hash_file(file_path)
                size = os.path.getsize(file_path)
                db_file.insert_one({
                    'discovery_id': discovery_id,
                    'path': file_path,
                    'file': filename,
                    'extension': extension,
                    'modified': mod_time,
                    'created': created_time,
                    'size': size,
                    'hash': file_hash,
                })
            except (OSError, UnicodeEncodeError):
                # Best effort: unreadable or vanished files are skipped.
                # NOTE(review): UnicodeEncodeError presumably comes from
                # names that cannot be encoded for storage -- confirm.
                continue


def main():
    """Record a discovery run for the name and path given on the command line.

    Expects ``sys.argv[1]`` to be the run name and ``sys.argv[2]`` the
    directory to scan; any further arguments are ignored.  Exits with a usage
    message when either is missing.
    """
    if len(sys.argv) < 3:
        # sys.exit() with a string prints it to stderr and exits with status 1.
        sys.exit(f"usage: {os.path.basename(sys.argv[0])} NAME PATH")

    start_time = time.time()
    name = sys.argv[1]
    path = sys.argv[2]

    discovery_id = db_discovery.insert_one({
        'start': start_time,
        'hostname': socket.gethostname(),
        'name': name,
        'path': path,
    }).inserted_id

    process(discovery_id, path)

    finished = time.time()
    duration = finished - start_time
    # Complete the run record created above with its end time and duration.
    db_discovery.update_one(
        {'_id': discovery_id},
        {"$set": {"finish": finished, "duration": duration}},
        upsert=False,
    )
    print(f"Processed in {duration} seconds")


if __name__ == "__main__":
    main()