1
0

_is_file_ok is now _check_file and has been modified to be more efficient. Also added a check for the case where files happen to share the same name and size, but the IB reported a wrong hash.

This commit is contained in:
Alexander Andreev 2020-11-18 23:47:26 +04:00
parent 8403fcf0f2
commit bb47b50c5f

View File

@ -5,7 +5,7 @@ from os import remove, stat
from os.path import exists, join, getsize from os.path import exists, join, getsize
import re import re
from typing import List, Callable from typing import List, Callable
from urllib.request import urlretrieve, URLopener from urllib.request import urlretrieve, URLopener, HTTPError
import hashlib import hashlib
from scrapthechan import USER_AGENT from scrapthechan import USER_AGENT
@ -63,35 +63,46 @@ class Scraper:
newname = f"{newname[:lbracket]}({int(num)+1})" newname = f"{newname[:lbracket]}({int(num)+1})"
return newname return newname
def _hash_file(self, filename: str, hash_algo: str = "md5", def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
blocksize: int = 1048576) -> (str, str): blocksize: int = 1048576) -> (str, str):
"""Compute hash of a file.""" """Compute hash of a file."""
hash_func = hashlib.new(hash_algo) hash_func = hashlib.new(hash_algorithm)
with open(filename, 'rb') as f: with open(filepath, 'rb') as f:
buf = f.read(blocksize) buf = f.read(blocksize)
while len(buf) > 0: while len(buf) > 0:
hash_func.update(buf) hash_func.update(buf)
buf = f.read(blocksize) buf = f.read(blocksize)
return hash_func.hexdigest(), hash_func.digest() return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
def _check_file(self, f: FileInfo, filepath: str) -> bool:
    """Tell whether the file at ``filepath`` exists and matches ``f``.

    The size is compared first so that a mismatching file is rejected
    without being hashed. ``f.size`` may equal either the size on disk in
    bytes or that size divided by 1024 and rounded. ``f.hash_value`` may
    equal either the hexadecimal digest or the Base64 digest returned by
    ``_hash_file``.
    """
    if not exists(filepath):
        return False
    size_on_disk = getsize(filepath)
    if f.size not in (size_on_disk, round(size_on_disk / 1024)):
        return False
    hex_digest, b64_digest = self._hash_file(filepath, f.hash_algorithm)
    return f.hash_value in (hex_digest, b64_digest)
def _download_file(self, f: FileInfo):
    """Download a single file into the save directory.

    Nothing is downloaded when a file named ``f.name`` already exists
    there and passes ``_check_file``. When a file with that name exists
    but fails the check, the download is saved under the alternative name
    produced by ``_same_filename``. Afterwards the two files are hashed,
    and the new copy is removed if it turns out to be identical to the
    existing one.

    An ``HTTPError`` raised by the download is printed together with the
    URL and not re-raised.

    Raises:
        AssertionError -- if the alternative name resolves to the path of
        the existing file.
    """
    orig_filepath = join(self._save_directory, f.name)
    if self._check_file(f, orig_filepath):
        return

    filepath = orig_filepath
    name_taken = exists(orig_filepath)
    if name_taken:
        filepath = join(self._save_directory,
            self._same_filename(f.name, self._save_directory))
        # Raised explicitly rather than with an ``assert`` statement: the
        # guard must survive ``python -O``, and it has to run before the
        # download so that the existing file is never overwritten and
        # then removed by the duplicate check below.
        if filepath == orig_filepath:
            raise AssertionError('Filepaths are matching!')

    try:
        self._url_opener.retrieve(f.download_url, filepath)
    except HTTPError as e:
        print(e, f.download_url)
        return

    if name_taken:
        # Both digests in each pair derive from the same hash, so comparing
        # the pairs is the same as comparing either element.
        existing = self._hash_file(orig_filepath, f.hash_algorithm)
        fetched = self._hash_file(filepath, f.hash_algorithm)
        if existing == fetched:
            remove(filepath)