1
0

Compare commits

..

2 Commits

2 changed files with 111 additions and 91 deletions

View File

@ -4,8 +4,10 @@
### Added ### Added
- For 2ch.hk, a check for whether a file is a sticker was added; - For 2ch.hk, a check for whether a file is a sticker was added;
- Encoding for `!op.txt` file was explicitly set to `utf-8`; - Encoding for `!op.txt` file was explicitly set to `utf-8`;
- (before) Handling of HTTP errors and connection-reset errors was added, so the program won't crash if a file doesn't exist or isn't accessible for any other reason;
- (after) Handling of connection errors was added, so the program won't crash if a file doesn't exist or isn't accessible for any other reason; if any damaged files were created, they are removed;
- (after) Added 3 retries if a file was damaged during downloading;
- To a scraper was added matching of hashes of two files that happen to share - To a scraper was added matching of hashes of two files that happen to share
same name and size, but hash reported by an imageboard is not the same as of same name and size, but hash reported by an imageboard is not the same as of
a file. It results in excessive downloading and hash calculations. Hopefully, a file. It results in excessive downloading and hash calculations. Hopefully,

View File

@ -15,97 +15,115 @@ __all__ = ["Scraper"]
class Scraper:
    """Base class for all scrapers that will actually do the job.

    Arguments:
    save_directory -- a path to a directory where files will be saved;
    files -- a list of FileInfo objects;
    download_progress_callback -- a callback function that will be called
    for each file started downloading (it is only stored here; calling it
    is left to the code that drives the downloads).
    """

    def __init__(self, save_directory: str, files: "List[FileInfo]",
                 download_progress_callback: Callable[[int], None] = None) -> None:
        self._save_directory = save_directory
        self._files = files
        self._url_opener = URLopener()
        # The User-Agent is set both as a header and as the opener version.
        self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
        self._url_opener.version = USER_AGENT
        self._progress_callback = download_progress_callback

    def run(self):
        """Start scraping. Must be implemented by a subclass."""
        raise NotImplementedError

    def _same_filename(self, filename: str, path: str) -> str:
        """Return a name based on `filename` that is not taken in `path`.

        If a file with the same name exists, an incremental number enclosed
        in brackets is added before the extension (or at the end when there
        is no extension): "a.txt" -> "a(1).txt" -> "a(2).txt".

        Arguments:
        filename -- a desired name of a file;
        path -- a directory that is checked for already existing files.
        """
        newname = filename
        while exists(join(path, newname)):
            if "." in newname:
                stem, extension = newname.rsplit(".", 1)
                suffix = f".{extension}"
            else:
                stem, suffix = newname, ""
            lbracket = stem.rfind("(")
            counter = stem[lbracket + 1:-1] if lbracket != -1 else ""
            # Only a trailing "(<digits>)" is an existing counter. Any other
            # bracket belongs to the name itself, so a fresh counter is
            # appended; this also guarantees that the loop makes progress.
            if stem.endswith(")") and counter.isdecimal():
                newname = f"{stem[:lbracket]}({int(counter) + 1}){suffix}"
            else:
                newname = f"{stem}(1){suffix}"
        return newname

    def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
                   blocksize: int = 1048576) -> (str, str):
        """Compute hash of a file.

        The file is read in chunks of `blocksize` bytes. Returns a pair of
        the hexadecimal digest and the base64 encoded digest.
        """
        hash_func = hashlib.new(hash_algorithm)
        with open(filepath, 'rb') as stream:
            for chunk in iter(lambda: stream.read(blocksize), b""):
                hash_func.update(chunk)
        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()

    def _check_file(self, f: "FileInfo", filepath: str) -> bool:
        """Check if a file exist and isn't broken.

        A file is considered fine when its size matches `f.size` and its
        hash (hexadecimal or base64 form) matches `f.hash_value`.
        """
        if not exists(filepath):
            return False
        computed_size = getsize(filepath)
        # The expected size is accepted either as an exact byte count or as
        # the byte count divided by 1024 and rounded (presumably because
        # some imageboards report sizes in kilobytes -- verify).
        if f.size != computed_size and f.size != round(computed_size / 1024):
            return False
        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
        return f.hash_value == hexdig or f.hash_value == dig

    def _remove_if_exists(self, filepath: str) -> None:
        """Remove a possibly damaged or partially downloaded file."""
        if exists(filepath):
            remove(filepath)

    def _download_file(self, f: "FileInfo", retries: int = 3):
        """Download a single file.

        Nothing is downloaded if a valid copy already exists. If another
        file occupies the name, the download is saved under a free name
        and removed again when it turns out to have the same hash as the
        existing file.

        Arguments:
        f -- a FileInfo object describing the file;
        retries -- how many times a download is attempted before giving up
        on a file that keeps arriving damaged.
        """
        is_same_filename = False
        filepath = join(self._save_directory, f.name)
        orig_filepath = filepath
        if self._check_file(f, filepath):
            return
        elif exists(filepath):
            is_same_filename = True
            filepath = join(self._save_directory,
                            self._same_filename(f.name, self._save_directory))
        try:
            downloaded = False
            for _ in range(retries):
                self._url_opener.retrieve(f.download_url, filepath)
                if self._check_file(f, filepath):
                    downloaded = True
                    break
                print(filepath, f.size, f.hash_value)
                remove(filepath)
            # When every attempt failed the damaged file is already removed,
            # so there is nothing left to compare with the existing one.
            if downloaded and is_same_filename:
                if self._hash_file(orig_filepath, f.hash_algorithm) \
                        == self._hash_file(filepath, f.hash_algorithm):
                    remove(filepath)
        except HTTPError as e:
            print("HTTP Error", e.code, e.reason, f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionResetError:
            print("Connection reset for", f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionRefusedError:
            print("Connection refused for", f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionAbortedError:
            print("Connection aborted for", f.download_url)
            self._remove_if_exists(filepath)