ScrapTheChan/scrapthechan/scraper.py

"""Base class for all scrapers that will actually do the job."""

import hashlib
import re
from base64 import b64encode
from http.client import HTTPException
from os import remove, stat
from os.path import exists, getsize, join
from typing import Callable, List, Optional, Tuple
from urllib.request import HTTPError, URLError, URLopener, urlretrieve

from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo

__all__ = ["Scraper"]


class Scraper:
	"""Base class for all scrapers that will actually do the job.

	Subclasses are expected to override run(); this class only provides the
	shared helpers used to name, verify and download files.

	Arguments:
		save_directory             -- a path to a directory where files will
			be saved;
		files                      -- a list of FileInfo objects;
		download_progress_callback -- an optional callback function intended
			to be called for each file that starts downloading. This class
			only stores it (as _progress_callback) and never calls it itself.
	"""

	# Matches a stem that ends with a "(N)" counter: group 1 is the text in
	# front of the counter, group 2 is the number itself.
	_COUNTER_SUFFIX = re.compile(r"(.*)\(([0-9]+)\)", re.DOTALL)

	# How many times a file is fetched before giving up on it.
	_DOWNLOAD_ATTEMPTS = 3

	def __init__(self, save_directory: str, files: List[FileInfo],
		download_progress_callback: Optional[Callable[[int], None]] = None
		) -> None:
		self._save_directory = save_directory
		self._files = files
		# NOTE(review): URLopener has been deprecated since Python 3.3 and is
		# gone from the newest Python releases. It is kept because
		# _download_file() (and possibly subclasses) rely on its retrieve().
		self._url_opener = URLopener()
		# Identify every request with the project's user agent.
		self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
		self._url_opener.version = USER_AGENT
		self._progress_callback = download_progress_callback

	def run(self):
		"""Entry point of a scraper; must be overridden by a subclass."""
		raise NotImplementedError

	def _same_filename(self, filename: str, path: str) -> str:
		"""Return a name based on `filename` that is not taken inside `path`.

		If `filename` is free it is returned unchanged. Otherwise a counter
		in brackets is placed in front of the extension ("a.jpg" becomes
		"a(1).jpg") and increased until an unused name is found. A name that
		already ends with such a counter continues from it ("a(3).jpg"
		becomes "a(4).jpg").

		Arguments:
			filename -- desired name of the file;
			path     -- directory the name must be unique in.
		"""
		stem, dot, extension = filename.rpartition(".")
		if not dot:
			# rpartition() puts everything in the last item when there is no
			# dot at all, i.e. the name has no extension.
			stem, extension = filename, ""
		suffix = f".{extension}" if dot else ""

		numbered = self._COUNTER_SUFFIX.fullmatch(stem)
		if numbered:
			base, counter = numbered.group(1), int(numbered.group(2))
		else:
			# Brackets that do not hold a number belong to the name itself.
			base, counter = stem, 0

		candidate = filename
		while exists(join(path, candidate)):
			counter += 1
			candidate = f"{base}({counter}){suffix}"
		return candidate

	def _hash_file(self, filepath: str, hash_algorithm: Optional[str] = "md5",
		blocksize: int = 1048576) -> Optional[Tuple[str, str]]:
		"""Compute hash of a file.

		Arguments:
			filepath       -- path to the file to hash;
			hash_algorithm -- an algorithm name accepted by hashlib.new();
			blocksize      -- amount of bytes read from the file at once.

		Returns a tuple (hexadecimal digest, base64-encoded digest), or None
		if hash_algorithm is None.
		"""
		if hash_algorithm is None:
			return None
		hash_func = hashlib.new(hash_algorithm)
		with open(filepath, 'rb') as f:
			# read() returns b"" at the end of the file, which stops iter().
			for block in iter(lambda: f.read(blocksize), b""):
				hash_func.update(block)
		return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()

	def _check_file(self, f: FileInfo, filepath: str) -> bool:
		"""Check if a file exists and isn't broken.

		The file at `filepath` is considered good when its size matches
		f.size and, if f.hash_algorithm is set, its hash matches f.hash_value.
		"""
		if not exists(filepath):
			return False
		actual_size = getsize(filepath)
		# f.size is accepted either as a number of bytes or as that number
		# divided by 1024 and rounded.
		size_matches = (f.size == actual_size
			or f.size == round(actual_size / 1024))
		if not size_matches:
			return False
		if f.hash_algorithm is None:
			return True
		# f.hash_value may be either a hex digest or a base64 one.
		hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
		return f.hash_value == hexdig or f.hash_value == dig

	def _download_file(self, f: FileInfo):
		"""Download a single file into the save directory.

		Nothing is done if a valid copy is already there. If a different
		file occupies the name, the download is stored under a free name
		produced by _same_filename(). A download that fails _check_file() is
		removed and retried, up to _DOWNLOAD_ATTEMPTS times in total.

		Errors are reported with print() and never propagate for the
		exception types handled below; after a network-related error the
		possibly incomplete file is removed.
		"""
		filepath = join(self._save_directory, f.name)
		orig_filepath = filepath
		if self._check_file(f, filepath):
			return
		is_same_filename = exists(filepath)
		if is_same_filename:
			filepath = join(self._save_directory,
				self._same_filename(f.name, self._save_directory))
		try:
			for _ in range(self._DOWNLOAD_ATTEMPTS):
				self._url_opener.retrieve(f.download_url, filepath)
				if self._check_file(f, filepath):
					break
				remove(filepath)
			else:
				# Every attempt produced a file that failed the check.
				print(f"Cannot retrieve {f.download_url}, {filepath}.")
				return
			if is_same_filename:
				# Without an algorithm _hash_file() returns None, which can't
				# be unpacked, so fall back to its default algorithm: any
				# hash is good enough to compare two local files.
				algorithm = (() if f.hash_algorithm is None
					else (f.hash_algorithm,))
				_, f1_dig = self._hash_file(orig_filepath, *algorithm)
				_, f2_dig = self._hash_file(filepath, *algorithm)
				if f1_dig == f2_dig:
					# The new file duplicates the one already present.
					remove(filepath)
		except FileNotFoundError:
			print("File Not Found", filepath)
			return
		except HTTPError as e:
			# Must precede URLError, which HTTPError is a subclass of.
			print("HTTP Error", e.code, e.reason, f.download_url)
		except HTTPException:
			print("HTTP Exception for", f.download_url)
		except URLError:
			print("URL Error for", f.download_url)
		except ConnectionResetError:
			print("Connection reset for", f.download_url)
		except ConnectionRefusedError:
			print("Connection refused for", f.download_url)
		except ConnectionAbortedError:
			print("Connection aborted for", f.download_url)
		else:
			return
		# Only network-related failures get here: drop what may be a
		# partially written file.
		if exists(filepath):
			remove(filepath)