diff --git a/scrapthechan/scraper.py b/scrapthechan/scraper.py
index 2b93377..016f377 100644
--- a/scrapthechan/scraper.py
+++ b/scrapthechan/scraper.py
@@ -5,7 +5,7 @@
 from os import remove, stat
 from os.path import exists, join, getsize
 import re
 from typing import List, Callable
-from urllib.request import urlretrieve, URLopener
+from urllib.request import urlretrieve, URLopener, HTTPError
 import hashlib
 from scrapthechan import USER_AGENT
@@ -63,35 +63,46 @@ class Scraper:
 		newname = f"{newname[:lbracket]}({int(num)+1})"
 		return newname
 
-	def _hash_file(self, filename: str, hash_algo: str = "md5",
+	def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
 		blocksize: int = 1048576) -> (str, str):
 		"""Compute hash of a file."""
-		hash_func = hashlib.new(hash_algo)
-		with open(filename, 'rb') as f:
+		hash_func = hashlib.new(hash_algorithm)
+		with open(filepath, 'rb') as f:
 			buf = f.read(blocksize)
 			while len(buf) > 0:
 				hash_func.update(buf)
 				buf = f.read(blocksize)
-		return hash_func.hexdigest(), hash_func.digest()
+		return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
 
-	def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
+	def _check_file(self, f: FileInfo, filepath: str) -> bool:
 		"""Check if a file exist and isn't broken."""
 		if not exists(filepath):
 			return False
 		computed_size = getsize(filepath)
-		is_size_match = f.size == computed_size \
-			or f.size == round(computed_size / 1024)
-		hexdig, dig = self._hash_file(filepath, f.hash_algo)
-		is_hash_match = f.hash_value == hexdig \
-			or f.hash_value == b64encode(dig).decode()
-		return is_size_match and is_hash_match
+		if not (f.size == computed_size \
+			or f.size == round(computed_size / 1024)):
+			return False
+		hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
+		return f.hash_value == hexdig or f.hash_value == dig
 
 	def _download_file(self, f: FileInfo):
 		"""Download a single file."""
+		is_same_filename = False
 		filepath = join(self._save_directory, f.name)
-		if self._is_file_ok(f, filepath):
-			return True
+		orig_filepath = filepath
+		if self._check_file(f, filepath):
+			return
 		elif exists(filepath):
+			is_same_filename = True
 			filepath = join(self._save_directory, \
 				self._same_filename(f.name, self._save_directory))
-		self._url_opener.retrieve(f.dlurl, filepath)
+		try:
+			self._url_opener.retrieve(f.download_url, filepath)
+			if is_same_filename:
+				f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
+				f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
+				assert filepath != orig_filepath, 'Filepaths are matching!'
+				if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
+					remove(filepath)
+		except HTTPError as e:
+			print(e, f.download_url)