_is_file_ok is now _check_file and has been modified to be more efficient. Also added a check for files that happen to share the same name and size, but for which the IB reported a different hash.
This commit is contained in:
parent
8403fcf0f2
commit
bb47b50c5f
@ -5,7 +5,7 @@ from os import remove, stat
|
|||||||
from os.path import exists, join, getsize
|
from os.path import exists, join, getsize
|
||||||
import re
|
import re
|
||||||
from typing import List, Callable
|
from typing import List, Callable
|
||||||
from urllib.request import urlretrieve, URLopener
|
from urllib.request import urlretrieve, URLopener, HTTPError
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
from scrapthechan import USER_AGENT
|
from scrapthechan import USER_AGENT
|
||||||
@ -63,35 +63,46 @@ class Scraper:
|
|||||||
newname = f"{newname[:lbracket]}({int(num)+1})"
|
newname = f"{newname[:lbracket]}({int(num)+1})"
|
||||||
return newname
|
return newname
|
||||||
|
|
||||||
def _hash_file(self, filename: str, hash_algo: str = "md5",
|
def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
|
||||||
blocksize: int = 1048576) -> (str, str):
|
blocksize: int = 1048576) -> (str, str):
|
||||||
"""Compute hash of a file."""
|
"""Compute hash of a file."""
|
||||||
hash_func = hashlib.new(hash_algo)
|
hash_func = hashlib.new(hash_algorithm)
|
||||||
with open(filename, 'rb') as f:
|
with open(filepath, 'rb') as f:
|
||||||
buf = f.read(blocksize)
|
buf = f.read(blocksize)
|
||||||
while len(buf) > 0:
|
while len(buf) > 0:
|
||||||
hash_func.update(buf)
|
hash_func.update(buf)
|
||||||
buf = f.read(blocksize)
|
buf = f.read(blocksize)
|
||||||
return hash_func.hexdigest(), hash_func.digest()
|
return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
|
||||||
|
|
||||||
def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
|
def _check_file(self, f: FileInfo, filepath: str) -> bool:
|
||||||
"""Check if a file exist and isn't broken."""
|
"""Check if a file exist and isn't broken."""
|
||||||
if not exists(filepath):
|
if not exists(filepath):
|
||||||
return False
|
return False
|
||||||
computed_size = getsize(filepath)
|
computed_size = getsize(filepath)
|
||||||
is_size_match = f.size == computed_size \
|
if not (f.size == computed_size \
|
||||||
or f.size == round(computed_size / 1024)
|
or f.size == round(computed_size / 1024)):
|
||||||
hexdig, dig = self._hash_file(filepath, f.hash_algo)
|
return False
|
||||||
is_hash_match = f.hash_value == hexdig \
|
hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
|
||||||
or f.hash_value == b64encode(dig).decode()
|
return f.hash_value == hexdig or f.hash_value == dig
|
||||||
return is_size_match and is_hash_match
|
|
||||||
|
|
||||||
def _download_file(self, f: FileInfo):
|
def _download_file(self, f: FileInfo):
|
||||||
"""Download a single file."""
|
"""Download a single file."""
|
||||||
|
is_same_filename = False
|
||||||
filepath = join(self._save_directory, f.name)
|
filepath = join(self._save_directory, f.name)
|
||||||
if self._is_file_ok(f, filepath):
|
orig_filepath = filepath
|
||||||
return True
|
if self._check_file(f, filepath):
|
||||||
|
return
|
||||||
elif exists(filepath):
|
elif exists(filepath):
|
||||||
|
is_same_filename = True
|
||||||
filepath = join(self._save_directory, \
|
filepath = join(self._save_directory, \
|
||||||
self._same_filename(f.name, self._save_directory))
|
self._same_filename(f.name, self._save_directory))
|
||||||
self._url_opener.retrieve(f.dlurl, filepath)
|
try:
|
||||||
|
self._url_opener.retrieve(f.download_url, filepath)
|
||||||
|
if is_same_filename:
|
||||||
|
f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
|
||||||
|
f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
|
||||||
|
assert filepath != orig_filepath, 'Filepaths are matching!'
|
||||||
|
if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
|
||||||
|
remove(filepath)
|
||||||
|
except HTTPError as e:
|
||||||
|
print(e, f.download_url)
|
||||||
|
Loading…
Reference in New Issue
Block a user