1
0

Improved error handling, retries for damaged files.

This commit is contained in:
Alexander Andreev 2020-11-19 01:26:19 +04:00
parent 21837c5335
commit 7b2fcf0899

View File

@ -15,97 +15,115 @@ __all__ = ["Scraper"]
class Scraper:
    """Base class for all scrapers that will actually do the job.

    Arguments:
    save_directory -- a path to a directory where files will be saved;
    files -- a list of FileInfo objects;
    download_progress_callback -- a callback function that will be called
    for each file started downloading. This base class only stores it;
    invoking it is left to the code that drives the downloads.
    """

    # How many times a download is attempted before a file that keeps
    # failing verification is given up on.
    _DOWNLOAD_ATTEMPTS = 3

    def __init__(self, save_directory: str, files: List[FileInfo],
                 download_progress_callback: Callable[[int], None] = None) -> None:
        self._save_directory = save_directory
        self._files = files
        # NOTE(review): if this is urllib.request.URLopener, it is a legacy
        # interface that is deprecated in the standard library -- confirm and
        # consider migrating.
        self._url_opener = URLopener()
        self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
        self._url_opener.version = USER_AGENT
        self._progress_callback = download_progress_callback

    def run(self):
        """Start scraping. Must be overridden by subclasses."""
        raise NotImplementedError

    def _same_filename(self, filename: str, path: str) -> str:
        """Check if there is a file with same name. If so then add incremental
        number enclosed in brackets to a name of a new one.

        Arguments:
        filename -- a desired name of a file;
        path -- a directory the name is checked against.

        Returns a name that doesn't exist in `path` yet: `name.ext` turns
        into `name(1).ext`, `name(1).ext` into `name(2).ext` and so on. Names
        without an extension are handled the same way.
        """
        newname = filename
        while exists(join(path, newname)):
            stem, dot, ext = newname.rpartition(".")
            if not dot:
                # No extension at all: the whole name is the stem.
                stem = newname
            suffix = dot + ext if dot else ""
            lbracket = stem.rfind("(")
            # Only a trailing "(<digits>)" is treated as a counter.
            if lbracket != -1 and stem.endswith(")"):
                counter = stem[lbracket + 1:-1]
            else:
                counter = ""
            # isdecimal() accepts exactly what int() is able to parse.
            if counter.isdecimal():
                newname = f"{stem[:lbracket]}({int(counter) + 1}){suffix}"
            else:
                # Always change the name here, otherwise the loop never ends.
                newname = f"{stem}(1){suffix}"
        return newname

    def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
                   blocksize: int = 1048576) -> (str, str):
        """Compute hash of a file.

        Arguments:
        filepath -- a path to a file to hash;
        hash_algorithm -- a name of an algorithm passed to hashlib.new();
        blocksize -- a size of a chunk read from the file at once.

        Returns a pair: a hexadecimal digest and a Base64 encoded digest.
        """
        hash_func = hashlib.new(hash_algorithm)
        with open(filepath, 'rb') as f:
            # Read in chunks so that big files are never loaded as a whole.
            for buf in iter(lambda: f.read(blocksize), b""):
                hash_func.update(buf)
        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()

    def _check_file(self, f: FileInfo, filepath: str) -> bool:
        """Check if a file exist and isn't broken.

        A file is considered fine when its size equals `f.size` (compared
        both as is and divided by 1024 and rounded) and its hash equals
        `f.hash_value` (either hexadecimal or Base64 form).
        """
        if not exists(filepath):
            return False
        computed_size = getsize(filepath)
        # presumably some sources report the size in kilobytes -- confirm
        if f.size not in (computed_size, round(computed_size / 1024)):
            return False
        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
        return f.hash_value in (hexdig, dig)

    @staticmethod
    def _remove_if_exists(filepath: str) -> None:
        """Remove a file if it's there, do nothing otherwise."""
        if exists(filepath):
            remove(filepath)

    def _download_file(self, f: FileInfo):
        """Download a single file.

        Nothing is done if a valid copy of the file is already saved. If a
        different file with the same name exists, the new one is saved under
        a name produced by _same_filename(). A downloaded file that fails
        the check is removed and downloaded again, up to _DOWNLOAD_ATTEMPTS
        times. HTTP and connection errors are printed, a partially written
        file is removed and no further attempts are made.
        """
        filepath = join(self._save_directory, f.name)
        orig_filepath = filepath
        if self._check_file(f, filepath):
            return
        is_same_filename = exists(filepath)
        if is_same_filename:
            filepath = join(self._save_directory,
                            self._same_filename(f.name, self._save_directory))
        try:
            downloaded = False
            for _ in range(self._DOWNLOAD_ATTEMPTS):
                self._url_opener.retrieve(f.download_url, filepath)
                if self._check_file(f, filepath):
                    downloaded = True
                    break
                print(filepath, f.size, f.hash_value)
                self._remove_if_exists(filepath)
            # Compare with the older file only when there is a verified new
            # one: after failed attempts the new file doesn't exist anymore.
            if downloaded and is_same_filename:
                f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
                f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
                if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
                    remove(filepath)
        except HTTPError as e:
            print("HTTP Error", e.code, e.reason, f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionResetError:
            print("Connection reset for", f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionRefusedError:
            print("Connection refused for", f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionAbortedError:
            print("Connection aborted for", f.download_url)
            self._remove_if_exists(filepath)