Initial commit
This commit is contained in:
78
hashFiles.py
Executable file
78
hashFiles.py
Executable file
@@ -0,0 +1,78 @@
|
||||
import os
|
||||
from pyhashxx import Hashxx
|
||||
from pymongo import MongoClient
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
|
||||
# Module-level MongoDB handles shared by the rest of the script.
# The server is addressed as host 'docker' on port 27017.
client = MongoClient(host='docker', port=27017)

# Database that holds everything this tool stores.
db = client.duplicates_finder

# 'file' receives one document per hashed file; 'discovery' receives one
# document per scan run.
db_file = db['file']
db_discovery = db['discovery']
|
||||
|
||||
|
||||
def hash_file(fname):
    """Return the Hashxx digest of the contents of the file at ``fname``.

    The file is opened in binary mode and fed to the hasher in 1 MiB
    pieces, so the whole file is never held in memory at once.

    Args:
        fname: Path of the file to hash.

    Returns:
        Whatever ``Hashxx.digest()`` returns after all data was consumed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1 << 20  # 1 MiB per read
    state = Hashxx()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            # An empty bytes object signals end of file.
            if block == b"":
                break
            state.update(block)
    return state.digest()
|
||||
|
||||
|
||||
def process(discovery_id, path):
    """Walk ``path`` and store one document per visible file in ``db_file``.

    Entries whose name starts with a dot are ignored: such files are not
    recorded and such directories are not descended into. For every other
    file a document with its path, name, extension, timestamps, size and
    content hash is inserted.

    Files that raise ``OSError`` (e.g. unreadable or removed mid-scan) or
    ``UnicodeEncodeError`` are skipped; the skip is reported on stderr so
    that missing entries can be explained afterwards.

    Args:
        discovery_id: Identifier stored in every document under
            ``discovery_id``.
        path: Root directory handed to ``os.walk``.

    Returns:
        None.
    """
    for root, dirs, files in os.walk(path):
        # Prune in place so os.walk does not descend into hidden directories.
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        for filename in files:
            if filename.startswith('.'):
                continue

            # Separate name: do not rebind the ``path`` parameter.
            file_path = os.path.join(root, filename)
            try:
                # One stat call yields a consistent snapshot of size and
                # timestamps instead of three independent lookups.
                info = os.stat(file_path)
                file_hash = hash_file(file_path)

                db_file.insert_one({
                    'discovery_id': discovery_id,
                    'path': file_path,
                    'file': filename,
                    'extension': os.path.splitext(filename)[1],
                    'modified': info.st_mtime,
                    # NOTE(review): st_ctime is the value os.path.getctime
                    # returned before; on Unix it is the metadata-change
                    # time rather than a creation time - confirm intent.
                    'created': info.st_ctime,
                    'size': info.st_size,
                    'hash': file_hash,
                })
            except (OSError, UnicodeEncodeError) as err:
                # Best-effort scan: keep going, but leave a trace. repr()
                # keeps undecodable file names printable.
                print(f"Skipping {file_path!r}: {err}", file=sys.stderr)
                continue
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Usage: hashFiles.py <name> <path>
    #   name - label stored with the discovery record
    #   path - root directory to scan
    # Additional arguments are ignored.
    if len(sys.argv) < 3:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} <name> <path>")

    start_time = time.time()

    name = sys.argv[1]
    path = sys.argv[2]

    # Register this run first so every file document can reference it.
    discovery_id = db_discovery.insert_one({
        'start': start_time,
        'hostname': socket.gethostname(),
        'name': name,
        'path': path
    }).inserted_id

    try:
        process(discovery_id, path)
    finally:
        # Record the end of the run even if the scan aborted, so the
        # discovery document is never left without finish/duration.
        finished = time.time()
        duration = finished - start_time

        db_discovery.update_one(
            {'_id': discovery_id},
            {"$set": {"finish": finished, "duration": duration}},
            upsert=False
        )

    print(f"Processed in {duration} seconds")
|
||||
Reference in New Issue
Block a user