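"""Duplicate-file discovery worker.

Walks a directory tree, hashes every non-hidden file with xxHash, and stores
one document per file plus a summary "discovery" record in MongoDB, so that
duplicates can later be identified by matching hashes.

Usage: run the script with two arguments, a discovery name and the root path
to scan (see the __main__ block below).
"""
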
import os
import socket
import sys
import time

from pyhashxx import Hashxx
from pymongo import MongoClient

# MongoDB connection and the collections used to record scan results.
client = MongoClient('docker', 27017)
db = client['duplicates_finder']
db_file = db.file
db_discovery = db.discovery


def hash_file(fname):
    """Return the xxHash digest of a file, reading it in 1 MiB chunks."""
    hasher = Hashxx()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(2 ** 20), b""):
            hasher.update(chunk)
    return hasher.digest()


def process(discovery_id, path):
    """Walk the directory tree and insert one document per file into MongoDB."""
    for (root, dirs, files) in os.walk(path):
        # Skip hidden files and prune hidden directories from the walk.
        files = [f for f in files if not f[0] == '.']
        dirs[:] = [d for d in dirs if not d[0] == '.']
        for filename in files:
            try:
                full_path = os.path.join(root, filename)
                extension = os.path.splitext(filename)[1]
                mod_time = os.path.getmtime(full_path)
                created_time = os.path.getctime(full_path)
                file_hash = hash_file(full_path)
                size = os.path.getsize(full_path)

                db_file.insert_one({
                    'discovery_id': discovery_id,
                    'path': full_path,
                    'file': filename,
                    'extension': extension,
                    'modified': mod_time,
                    'created': created_time,
                    'size': size,
                    'hash': file_hash
                })
            except OSError:
                # Unreadable or vanished file: skip it and keep scanning.
                continue
            except UnicodeEncodeError:
                continue


if __name__ == "__main__":
    start_time = time.time()

    # Arguments: a name for this discovery run and the root path to scan.
    name = sys.argv[1]
    path = sys.argv[2]

    # Record the start of the run and keep its id to tag every file document.
    discovery_id = db_discovery.insert_one({
        'start': start_time,
        'hostname': socket.gethostname(),
        'name': name,
        'path': path
    }).inserted_id

    process(discovery_id, path)

    finished = time.time()
    duration = finished - start_time

    # Close out the discovery record with the finish time and total duration.
    db_discovery.update_one(
        {'_id': discovery_id},
        {"$set": {"finish": finished, "duration": duration}},
        upsert=False
    )

    print(f"Processed in {duration} seconds")
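
# Hypothetical follow-up query (not part of this script): once a scan has run,
# duplicates could be listed by grouping the stored file documents on 'hash',
# for example:
#
#   pipeline = [
#       {"$group": {"_id": "$hash", "count": {"$sum": 1}, "paths": {"$push": "$path"}}},
#       {"$match": {"count": {"$gt": 1}}},
#   ]
#   for dup in db_file.aggregate(pipeline):
#       print(dup["paths"])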