ScrapTheChan/scrapthechan/scraper.py

"""Base class for all scrapers that will actually do the job."""

from base64 import b64encode
from os import remove, stat
from os.path import exists, join, getsize
import re
from typing import List, Callable
from urllib.request import urlretrieve, URLopener, HTTPError, URLError
import hashlib
from http.client import HTTPException

from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo

__all__ = ["Scraper"]


class Scraper:
	"""Base class for all scrapers that will actually do the job.

	run() raises NotImplementedError here and has to be provided by a
	subclass. The remaining methods are helpers that download a file, verify
	it against its FileInfo metadata and pick a free name when a file with
	the same name is already present in the save directory.

	Arguments:
		save_directory             -- a path to a directory where files will
									  be saved;
		files                      -- a list of FileInfo objects;
		download_progress_callback -- a callback function that will be called
									  for each file started downloading. This
									  base class only stores it; it is not
									  called by any method defined here.
	"""

	# How many times a download is attempted before giving up on a file
	# whose content keeps failing verification.
	_DOWNLOAD_ATTEMPTS = 3

	# Matches a name stem that ends with a "(<number>)" collision counter.
	_COUNTER_SUFFIX = re.compile(r"(.*)\(([0-9]+)\)", re.DOTALL)

	def __init__(self, save_directory: str, files: List[FileInfo],
		download_progress_callback: Callable[[int], None] = None) -> None:
		self._save_directory = save_directory
		self._files = files
		# NOTE(review): URLopener is a legacy interface, deprecated since
		# Python 3.3; consider migrating to urllib.request.build_opener().
		self._url_opener = URLopener()
		# The User-Agent is set both as an explicit header and as the
		# opener's version string.
		self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
		self._url_opener.version = USER_AGENT
		self._progress_callback = download_progress_callback

	def run(self):
		"""Start scraping. Must be overridden by a subclass."""
		raise NotImplementedError

	def _same_filename(self, filename: str, path: str) -> str:
		"""Return a name based on filename that is not taken yet in path.

		If no file called filename exists in path it is returned unchanged.
		Otherwise an incremental number enclosed in brackets is added right
		before the extension (or at the end when there is no extension):
		"a.txt" -> "a(1).txt" -> "a(2).txt", "README" -> "README(1)".
		A trailing "(<number>)" is treated as an existing counter and is
		incremented; anything else in brackets is left as a part of the name.
		"""
		newname = filename
		while exists(join(path, newname)):
			stem, dot, extension = newname.rpartition(".")
			if not dot:
				# No extension: rpartition puts the whole name in the last
				# slot, so move it back into the stem.
				stem, extension = newname, ""
			counter = self._COUNTER_SUFFIX.fullmatch(stem)
			if counter:
				stem = f"{counter.group(1)}({int(counter.group(2)) + 1})"
			else:
				stem = f"{stem}(1)"
			newname = f"{stem}{dot}{extension}"
		return newname

	def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
				   blocksize: int = 1048576) -> (str, str):
		"""Compute hash of a file.

		The file is read in chunks of blocksize bytes, so it is never loaded
		into memory as a whole.

		Returns a tuple (hex digest, base64-encoded digest), or None if
		hash_algorithm is None. Raises whatever hashlib.new() and open()
		raise for an unknown algorithm or an unreadable file.
		"""
		if hash_algorithm is None:
			return None
		hash_func = hashlib.new(hash_algorithm)
		with open(filepath, 'rb') as f:
			# iter() with a sentinel stops at the first empty read (EOF).
			for chunk in iter(lambda: f.read(blocksize), b""):
				hash_func.update(chunk)
		return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()

	def _check_file(self, f: FileInfo, filepath: str) -> bool:
		"""Check if a file exists and isn't broken.

		A file is considered fine when its size matches f.size and, if
		f.hash_algorithm is set, its hash matches f.hash_value given either
		as a hex digest or as a base64-encoded digest.
		"""
		if not exists(filepath):
			return False
		computed_size = getsize(filepath)
		# f.size is accepted either as an exact byte count or as the byte
		# count divided by 1024 and rounded -- presumably some sources report
		# sizes in KiB; confirm against the FileInfo producers.
		size_matches = f.size == computed_size \
			or f.size == round(computed_size / 1024)
		if not size_matches:
			return False
		if f.hash_algorithm is not None:
			hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
			return f.hash_value == hexdig or f.hash_value == dig
		return True

	def _remove_if_exists(self, filepath: str) -> None:
		"""Delete filepath if it is there, e.g. a partially downloaded file."""
		if exists(filepath):
			remove(filepath)

	def _download_file(self, f: FileInfo):
		"""Download a single file.

		Nothing is downloaded if a valid copy is already in the save
		directory. If a file with the same name exists but fails the check,
		the download goes under a free name from _same_filename() and is
		dropped again when it turns out to be identical to the existing one.
		A download that fails verification is removed and retried, up to
		_DOWNLOAD_ATTEMPTS attempts in total.

		Network related errors are reported to stdout and are not raised;
		whatever was written for the failed download is removed.
		"""
		orig_filepath = join(self._save_directory, f.name)
		if self._check_file(f, orig_filepath):
			return
		filepath = orig_filepath
		# The name is taken by a file that did not pass the check: keep it
		# and save the download next to it.
		is_same_filename = exists(orig_filepath)
		if is_same_filename:
			filepath = join(self._save_directory,
				self._same_filename(f.name, self._save_directory))
		try:
			for _ in range(self._DOWNLOAD_ATTEMPTS):
				self._url_opener.retrieve(f.download_url, filepath)
				if self._check_file(f, filepath):
					break
				remove(filepath)
			else:
				# The loop ran out of attempts without a valid file.
				print(f"Cannot retrieve {f.download_url}, {filepath}.")
				return
			if is_same_filename:
				# Both files are local, so any algorithm works for telling
				# them apart. Fall back to md5 when the FileInfo carries
				# none: _hash_file() returns None for a None algorithm and
				# unpacking that would raise TypeError.
				algorithm = f.hash_algorithm or "md5"
				_, f1_dig = self._hash_file(orig_filepath, algorithm)
				_, f2_dig = self._hash_file(filepath, algorithm)
				if f1_dig == f2_dig:
					remove(filepath)
		except FileNotFoundError:
			print("File Not Found", filepath)
		except HTTPError as e:
			# HTTPError is a subclass of URLError, so it has to come first.
			print("HTTP Error", e.code, e.reason, f.download_url)
			self._remove_if_exists(filepath)
		except HTTPException:
			print("HTTP Exception for", f.download_url)
			self._remove_if_exists(filepath)
		except URLError:
			print("URL Error for", f.download_url)
			self._remove_if_exists(filepath)
		except ConnectionResetError:
			print("Connection reset for", f.download_url)
			self._remove_if_exists(filepath)
		except ConnectionRefusedError:
			print("Connection refused for", f.download_url)
			self._remove_if_exists(filepath)
		except ConnectionAbortedError:
			print("Connection aborted for", f.download_url)
			self._remove_if_exists(filepath)
Moved User-Agent off to __init__ in its own variable. 2020-07-20 04:31:27 +04:00			`"""Base class for all scrapers that will actually do the job."""`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00
			`from base64 import b64encode`
			`from os import remove, stat`
			`from os.path import exists, join, getsize`
			`import re`
			`from typing import List, Callable`
Added HTTP and URL exceptions handling. 2021-04-28 02:47:41 +04:00			`from urllib.request import urlretrieve, URLopener, HTTPError, URLError`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00			`import hashlib`
Added HTTP and URL exceptions handling. 2021-04-28 02:47:41 +04:00			`from http.client import HTTPException`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00
Moved User-Agent off to __init__ in its own variable. 2020-07-20 04:31:27 +04:00			`from scrapthechan import USER_AGENT`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00			`from scrapthechan.fileinfo import FileInfo`

			`__all__ = ["Scraper"]`


			`class Scraper:`
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`"""Base class for all scrapers that will actually do the job.`

			`Arguments:`
			`save_directory -- a path to a directory where file will be`
			`saved;`
			`files -- a list of FileInfo objects;`
			`download_progress_callback -- a callback function that will be called`
			`for each file started downloading.`
			`"""`
			`def __init__(self, save_directory: str, files: List[FileInfo],`
			`download_progress_callback: Callable[[int], None] = None) -> None:`
			`self._save_directory = save_directory`
			`self._files = files`
			`self._url_opener = URLopener()`
			`self._url_opener.addheaders = [('User-Agent', USER_AGENT)]`
			`self._url_opener.version = USER_AGENT`
			`self._progress_callback = download_progress_callback`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`def run(self):`
			`raise NotImplementedError`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`def _same_filename(self, filename: str, path: str) -> str:`
			`"""Check if there is a file with same name. If so then add incremental`
			`number enclosed in brackets to a name of a new one."""`
			`newname = filename`
			`while exists(join(path, newname)):`
			`has_extension = newname.rfind(".") != -1`
			`if has_extension:`
			`l, r = newname.rsplit(".", 1)`
			`lbracket = l.rfind("(")`
			`if lbracket == -1:`
			`newname = f"{l}(1).{r}"`
			`else:`
			`num = l[lbracket+1:-1]`
			`if num.isnumeric():`
			`newname = f"{l[:lbracket]}({int(num)+1}).{r}"`
			`else:`
			`newname = f"{l}(1).{r}"`
			`else:`
			`lbracket = l.rfind("(")`
			`if lbracket == -1:`
			`newname = f"{newname}(1)"`
			`else:`
			`num = newname[lbracket+1:-1]`
			`if num.isnumeric():`
			`newname = f"{newname[:lbracket]}({int(num)+1})"`
			`return newname`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`def _hash_file(self, filepath: str, hash_algorithm: str = "md5",`
			`blocksize: int = 1048576) -> (str, str):`
			`"""Compute hash of a file."""`
Added HTTP and URL exceptions handling. 2021-04-28 02:47:41 +04:00			`if hash_algorithm is None:`
			`return None`
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`hash_func = hashlib.new(hash_algorithm)`
			`with open(filepath, 'rb') as f:`
			`buf = f.read(blocksize)`
			`while len(buf) > 0:`
			`hash_func.update(buf)`
			`buf = f.read(blocksize)`
			`return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`def _check_file(self, f: FileInfo, filepath: str) -> bool:`
			`"""Check if a file exist and isn't broken."""`
			`if not exists(filepath):`
			`return False`
			`computed_size = getsize(filepath)`
			`if not (f.size == computed_size \`
			`or f.size == round(computed_size / 1024)):`
			`return False`
Added HTTP and URL exceptions handling. 2021-04-28 02:47:41 +04:00			`if not f.hash_algorithm is None:`
			`hexdig, dig = self._hash_file(filepath, f.hash_algorithm)`
			`return f.hash_value == hexdig or f.hash_value == dig`
Added a missing return True statement in _check_file 2021-05-03 02:30:31 +04:00			`return True`
Initial commit with all the files. 2020-07-08 22:53:39 +04:00
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`def _download_file(self, f: FileInfo):`
			`"""Download a single file."""`
			`is_same_filename = False`
			`filepath = join(self._save_directory, f.name)`
			`orig_filepath = filepath`
			`if self._check_file(f, filepath):`
			`return`
			`elif exists(filepath):`
			`is_same_filename = True`
			`filepath = join(self._save_directory, \`
			`self._same_filename(f.name, self._save_directory))`
			`try:`
			`retries = 3`
			`while retries > 0:`
			`self._url_opener.retrieve(f.download_url, filepath)`
			`if not self._check_file(f, filepath):`
			`remove(filepath)`
			`retries -= 1`
			`else:`
			`break`
Removed excessive comparison of hash. Added message when file cannot be retrieved. 2021-05-04 03:56:59 +04:00			`if retries == 0:`
			`print(f"Cannot retrieve {f.download_url}, {filepath}.")`
			`return`
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`if is_same_filename:`
Removed excessive comparison of hash. Added message when file cannot be retrieved. 2021-05-04 03:56:59 +04:00			`_, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)`
			`_, f2_dig = self._hash_file(filepath, f.hash_algorithm)`
			`if f1_dig == f2_dig:`
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`remove(filepath)`
Removed excessive comparison of hash. Added message when file cannot be retrieved. 2021-05-04 03:56:59 +04:00			`except FileNotFoundError as e:`
			`print("File Not Found", filepath)`
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`except HTTPError as e:`
			`print("HTTP Error", e.code, e.reason, f.download_url)`
			`if exists(filepath):`
			`remove(filepath)`
Added HTTP and URL exceptions handling. 2021-04-28 02:47:41 +04:00			`except HTTPException:`
			`print("HTTP Exception for", f.download_url)`
			`if exists(filepath):`
			`remove(filepath)`
			`except URLError as e:`
			`print("URL Error for", f.download_url)`
			`if exists(filepath):`
			`remove(filepath)`
Improved error handling, retries for damaged files. 2020-11-19 01:26:19 +04:00			`except ConnectionResetError:`
			`print("Connection reset for", f.download_url)`
			`if exists(filepath):`
			`remove(filepath)`
			`except ConnectionRefusedError:`
			`print("Connection refused for", f.download_url)`
			`if exists(filepath):`
			`remove(filepath)`
			`except ConnectionAbortedError:`
			`print("Connection aborted for", f.download_url)`
			`if exists(filepath):`
			`remove(filepath)`