Initial commit

This commit is contained in:
2020-05-08 14:39:22 +01:00
commit 57828567af
1662 changed files with 248701 additions and 0 deletions

78
hashFiles.py Executable file
View File

@@ -0,0 +1,78 @@
import os
from pyhashxx import Hashxx
from pymongo import MongoClient
import socket
import sys
import time
import uuid
# Connect to the MongoDB server reachable under the hostname 'docker' on
# port 27017 and select the database this script writes to.
client = MongoClient(host='docker', port=27017)
db = client['duplicates_finder']

# Collections used below: `file` receives one document per hashed file,
# `discovery` receives one document per scan run.
db_file = db['file']
db_discovery = db['discovery']
def hash_file(fname):
    """Return the Hashxx digest of the contents of the file at *fname*.

    The file is opened in binary mode and fed to the hasher in pieces of
    2 ** 20 bytes, so the whole file is never held in memory at once.
    """
    block_size = 2 ** 20
    digest_builder = Hashxx()
    with open(fname, "rb") as stream:
        while True:
            piece = stream.read(block_size)
            # An empty bytes object signals end of file.
            if piece == b"":
                break
            digest_builder.update(piece)
    return digest_builder.digest()
def process(discovery_id, path):
    """Walk *path* and store one document per visible regular file.

    Hidden entries (names starting with '.') are ignored: hidden directories
    are pruned so they are never descended into, and hidden files are skipped.
    For every remaining regular file a document with its path, name,
    extension, timestamps, size and content hash is inserted into the
    `file` collection, tagged with *discovery_id*.

    Files that cannot be stat'ed, read or stored (OSError or
    UnicodeEncodeError) are skipped with a message on stderr; the walk
    continues with the next file.

    Args:
        discovery_id: Identifier of the discovery document this scan
            belongs to; stored verbatim in every file document.
        path: Root directory to scan.
    """
    import stat  # local import: only needed here to recognise regular files

    for root, dirs, files in os.walk(path):
        # Assigning through the slice mutates the list os.walk holds, which
        # is what stops it from descending into hidden directories.
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        for filename in files:
            if filename.startswith('.'):
                continue

            # Separate name so the `path` argument is not shadowed.
            file_path = os.path.join(root, filename)
            try:
                # One stat call gives a consistent snapshot of size and
                # timestamps instead of three separate syscalls.
                info = os.stat(file_path)

                # FIFOs, sockets and device nodes also appear in `files`;
                # opening them for hashing can block forever or never end.
                if not stat.S_ISREG(info.st_mode):
                    continue

                db_file.insert_one({
                    'discovery_id': discovery_id,
                    'path': file_path,
                    'file': filename,
                    'extension': os.path.splitext(filename)[1],
                    'modified': info.st_mtime,
                    # NOTE: st_ctime is the creation time on Windows but the
                    # last metadata change time on Unix.
                    'created': info.st_ctime,
                    'size': info.st_size,
                    'hash': hash_file(file_path),
                })
            except (OSError, UnicodeEncodeError) as exc:
                # Best effort: report the file and carry on with the scan.
                print(f"Skipping {file_path!r}: {exc!r}", file=sys.stderr)
                continue
if __name__ == "__main__":
    # Usage: hashFiles.py <name> <path>
    #   name: label stored with this discovery run
    #   path: root directory to scan
    # Fail with a usage message instead of an IndexError traceback when
    # arguments are missing; extra arguments are still ignored as before.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <name> <path>")

    start_time = time.time()
    name = sys.argv[1]
    path = sys.argv[2]

    # Register the run first so every file document can reference its id.
    discovery_id = db_discovery.insert_one({
        'start': start_time,
        'hostname': socket.gethostname(),
        'name': name,
        'path': path
    }).inserted_id

    process(discovery_id, path)

    # Record completion time and total duration on the same discovery document.
    finished = time.time()
    duration = finished - start_time
    db_discovery.update_one(
        {'_id': discovery_id},
        {"$set": {"finish": finished, "duration": duration}},
        upsert=False
    )
    print(f"Processed in {duration} seconds")