
_is_file_ok is now _check_file and has been reworked to be more efficient. Also added a check for the case where files happen to share the same name and size but the imageboard (IB) reports a different hash.

Alexander Andreev 2020-11-18 23:47:26 +04:00
parent 8403fcf0f2
commit bb47b50c5f


@@ -5,7 +5,7 @@ from os import remove, stat
 from os.path import exists, join, getsize
 import re
 from typing import List, Callable
-from urllib.request import urlretrieve, URLopener
+from urllib.request import urlretrieve, URLopener, HTTPError
 import hashlib
 from scrapthechan import USER_AGENT
@@ -63,35 +63,46 @@ class Scraper:
         newname = f"{newname[:lbracket]}({int(num)+1})"
         return newname
 
-    def _hash_file(self, filename: str, hash_algo: str = "md5",
+    def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
             blocksize: int = 1048576) -> (str, str):
         """Compute hash of a file."""
-        hash_func = hashlib.new(hash_algo)
-        with open(filename, 'rb') as f:
+        hash_func = hashlib.new(hash_algorithm)
+        with open(filepath, 'rb') as f:
             buf = f.read(blocksize)
             while len(buf) > 0:
                 hash_func.update(buf)
                 buf = f.read(blocksize)
-        return hash_func.hexdigest(), hash_func.digest()
+        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
 
-    def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
+    def _check_file(self, f: FileInfo, filepath: str) -> bool:
         """Check if a file exist and isn't broken."""
         if not exists(filepath):
             return False
         computed_size = getsize(filepath)
-        is_size_match = f.size == computed_size \
-            or f.size == round(computed_size / 1024)
-        hexdig, dig = self._hash_file(filepath, f.hash_algo)
-        is_hash_match = f.hash_value == hexdig \
-            or f.hash_value == b64encode(dig).decode()
-        return is_size_match and is_hash_match
+        if not (f.size == computed_size \
+            or f.size == round(computed_size / 1024)):
+            return False
+        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
+        return f.hash_value == hexdig or f.hash_value == dig
 
     def _download_file(self, f: FileInfo):
         """Download a single file."""
         is_same_filename = False
         filepath = join(self._save_directory, f.name)
-        if self._is_file_ok(f, filepath):
-            return True
+        orig_filepath = filepath
+        if self._check_file(f, filepath):
+            return
         elif exists(filepath):
             is_same_filename = True
             filepath = join(self._save_directory, \
                 self._same_filename(f.name, self._save_directory))
-        self._url_opener.retrieve(f.dlurl, filepath)
+        try:
+            self._url_opener.retrieve(f.download_url, filepath)
+            if is_same_filename:
+                f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
+                f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
+                assert filepath != orig_filepath, 'Filepaths are matching!'
+                if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
+                    remove(filepath)
+        except HTTPError as e:
+            print(e, f.download_url)
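
The duplicate handling added in this commit can be illustrated outside the Scraper class. Below is a minimal sketch (not part of the commit) of the same idea: hash the file that already exists under the name and the freshly downloaded copy with the board's algorithm, and delete the copy when the digests match. Comparing both the hex and the base64 form mirrors the code above, since some imageboard APIs (e.g. 4chan's) report MD5 digests base64-encoded while local tools usually print hex. The helper names hash_file, is_probably_same and drop_duplicate are hypothetical stand-ins for the _hash_file/_download_file logic, not names from the project.

# Minimal sketch of the hash-based duplicate check; assumes md5 as the
# default algorithm and that both files already exist on disk.
from base64 import b64encode
from os import remove
from os.path import exists
import hashlib


def hash_file(filepath: str, hash_algorithm: str = "md5",
        blocksize: int = 1048576) -> (str, str):
    """Return (hex digest, base64 digest) of a file, read block by block."""
    hash_func = hashlib.new(hash_algorithm)
    with open(filepath, 'rb') as f:
        buf = f.read(blocksize)
        while len(buf) > 0:
            hash_func.update(buf)
            buf = f.read(blocksize)
    return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()


def is_probably_same(existing_path: str, candidate_path: str,
        hash_algorithm: str = "md5") -> bool:
    """True when the two files hash to the same digest."""
    e_hex, e_b64 = hash_file(existing_path, hash_algorithm)
    c_hex, c_b64 = hash_file(candidate_path, hash_algorithm)
    return e_hex == c_hex or e_b64 == c_b64


def drop_duplicate(original: str, new_copy: str,
        hash_algorithm: str = "md5") -> None:
    """Remove new_copy if it turned out to be byte-identical to original."""
    if exists(original) and exists(new_copy) and original != new_copy \
            and is_probably_same(original, new_copy, hash_algorithm):
        remove(new_copy)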