"""Base class for all scrapers that will actually do the job.""" from base64 import b64encode from os import remove, stat from os.path import exists, join, getsize import re from typing import List, Callable from urllib.request import urlretrieve, URLopener, HTTPError, URLError import hashlib from http.client import HTTPException from scrapthechan import USER_AGENT from scrapthechan.fileinfo import FileInfo __all__ = ["Scraper"] class Scraper: """Base class for all scrapers that will actually do the job. Arguments: save_directory -- a path to a directory where file will be saved; files -- a list of FileInfo objects; download_progress_callback -- a callback function that will be called for each file started downloading. """ def __init__(self, save_directory: str, files: List[FileInfo], download_progress_callback: Callable[[int], None] = None) -> None: self._save_directory = save_directory self._files = files self._url_opener = URLopener() self._url_opener.addheaders = [('User-Agent', USER_AGENT)] self._url_opener.version = USER_AGENT self._progress_callback = download_progress_callback def run(self): raise NotImplementedError def _same_filename(self, filename: str, path: str) -> str: """Check if there is a file with same name. If so then add incremental number enclosed in brackets to a name of a new one.""" newname = filename while exists(join(path, newname)): has_extension = newname.rfind(".") != -1 if has_extension: l, r = newname.rsplit(".", 1) lbracket = l.rfind("(") if lbracket == -1: newname = f"{l}(1).{r}" else: num = l[lbracket+1:-1] if num.isnumeric(): newname = f"{l[:lbracket]}({int(num)+1}).{r}" else: newname = f"{l}(1).{r}" else: lbracket = l.rfind("(") if lbracket == -1: newname = f"{newname}(1)" else: num = newname[lbracket+1:-1] if num.isnumeric(): newname = f"{newname[:lbracket]}({int(num)+1})" return newname def _hash_file(self, filepath: str, hash_algorithm: str = "md5", blocksize: int = 1048576) -> (str, str): """Compute hash of a file.""" if hash_algorithm is None: return None hash_func = hashlib.new(hash_algorithm) with open(filepath, 'rb') as f: buf = f.read(blocksize) while len(buf) > 0: hash_func.update(buf) buf = f.read(blocksize) return hash_func.hexdigest(), b64encode(hash_func.digest()).decode() def _check_file(self, f: FileInfo, filepath: str) -> bool: """Check if a file exist and isn't broken.""" if not exists(filepath): return False computed_size = getsize(filepath) if not (f.size == computed_size \ or f.size == round(computed_size / 1024)): return False if not f.hash_algorithm is None: hexdig, dig = self._hash_file(filepath, f.hash_algorithm) return f.hash_value == hexdig or f.hash_value == dig return True def _download_file(self, f: FileInfo): """Download a single file.""" is_same_filename = False filepath = join(self._save_directory, f.name) orig_filepath = filepath if self._check_file(f, filepath): return elif exists(filepath): is_same_filename = True filepath = join(self._save_directory, \ self._same_filename(f.name, self._save_directory)) try: retries = 3 while retries > 0: self._url_opener.retrieve(f.download_url, filepath) if not self._check_file(f, filepath): remove(filepath) retries -= 1 else: break if retries == 0: print(f"Cannot retrieve {f.download_url}, {filepath}.") return if is_same_filename: _, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm) _, f2_dig = self._hash_file(filepath, f.hash_algorithm) if f1_dig == f2_dig: remove(filepath) except FileNotFoundError as e: print("File Not Found", filepath) except HTTPError as e: print("HTTP Error", e.code, e.reason, f.download_url) if exists(filepath): remove(filepath) except HTTPException: print("HTTP Exception for", f.download_url) if exists(filepath): remove(filepath) except URLError as e: print("URL Error for", f.download_url) if exists(filepath): remove(filepath) except ConnectionResetError: print("Connection reset for", f.download_url) if exists(filepath): remove(filepath) except ConnectionRefusedError: print("Connection refused for", f.download_url) if exists(filepath): remove(filepath) except ConnectionAbortedError: print("Connection aborted for", f.download_url) if exists(filepath): remove(filepath)