Improved error handling, retries for damaged files.

2020-11-19 01:26:19 +04:00 · 2020-11-19 01:26:19 +04:00 · 7b2fcf0899
commit 7b2fcf0899
parent 21837c5335
1 changed file with 107 additions and 89 deletions
--- a/scrapthechan/scraper.py
+++ b/scrapthechan/scraper.py
@@ -15,97 +15,115 @@ __all__ = ["Scraper"]


 class Scraper:
-    """Base class for all scrapers that will actually do the job.
-    
-    Arguments:
-        save_directory             -- a path to a directory where file will be
-                                      saved;
-        files                      -- a list of FileInfo objects;
-        download_progress_callback -- a callback function that will be called
-                                      for each file started downloading.
-    """
-    def __init__(self, save_directory: str, files: List[FileInfo],
-        download_progress_callback: Callable[[int], None] = None) -> None:
-        self._save_directory = save_directory
-        self._files = files
-        self._url_opener = URLopener()
-        self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
-        self._url_opener.version = USER_AGENT
-        self._progress_callback = download_progress_callback
+	"""Base class for all scrapers that will actually do the job.
+	
+	Arguments:
+		save_directory             -- a path to a directory where file will be
+									  saved;
+		files                      -- a list of FileInfo objects;
+		download_progress_callback -- a callback function that will be called
+									  for each file started downloading.
+	"""
+	def __init__(self, save_directory: str, files: List[FileInfo],
+		download_progress_callback: Callable[[int], None] = None) -> None:
+		self._save_directory = save_directory
+		self._files = files
+		self._url_opener = URLopener()
+		self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
+		self._url_opener.version = USER_AGENT
+		self._progress_callback = download_progress_callback

-    def run(self):
-        raise NotImplementedError
+	def run(self):
+		raise NotImplementedError

-    def _same_filename(self, filename: str, path: str) -> str:
-        """Check if there is a file with same name. If so then add incremental
-        number enclosed in brackets to a name of a new one."""
-        newname = filename
-        while exists(join(path, newname)):
-            has_extension = newname.rfind(".") != -1
-            if has_extension:
-                l, r = newname.rsplit(".", 1)
-                lbracket = l.rfind("(")
-                if lbracket == -1:
-                    newname = f"{l}(1).{r}"
-                else:
-                    num = l[lbracket+1:-1]
-                    if num.isnumeric():
-                        newname = f"{l[:lbracket]}({int(num)+1}).{r}"
-                    else:
-                        newname = f"{l}(1).{r}"
-            else:
-                lbracket = l.rfind("(")
-                if lbracket == -1:
-                    newname = f"{newname}(1)"
-                else:
-                    num = newname[lbracket+1:-1]
-                    if num.isnumeric():
-                        newname = f"{newname[:lbracket]}({int(num)+1})"
-        return newname
+	def _same_filename(self, filename: str, path: str) -> str:
+		"""Check if there is a file with same name. If so then add incremental
+		number enclosed in brackets to a name of a new one."""
+		newname = filename
+		while exists(join(path, newname)):
+			has_extension = newname.rfind(".") != -1
+			if has_extension:
+				l, r = newname.rsplit(".", 1)
+				lbracket = l.rfind("(")
+				if lbracket == -1:
+					newname = f"{l}(1).{r}"
+				else:
+					num = l[lbracket+1:-1]
+					if num.isnumeric():
+						newname = f"{l[:lbracket]}({int(num)+1}).{r}"
+					else:
+						newname = f"{l}(1).{r}"
+			else:
+				lbracket = l.rfind("(")
+				if lbracket == -1:
+					newname = f"{newname}(1)"
+				else:
+					num = newname[lbracket+1:-1]
+					if num.isnumeric():
+						newname = f"{newname[:lbracket]}({int(num)+1})"
+		return newname

-    def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
-                   blocksize: int = 1048576) -> (str, str):
-        """Compute hash of a file."""
-        hash_func = hashlib.new(hash_algorithm)
-        with open(filepath, 'rb') as f:
-            buf = f.read(blocksize)
-            while len(buf) > 0:
-                hash_func.update(buf)
-                buf = f.read(blocksize)
-        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
+	def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
+				   blocksize: int = 1048576) -> (str, str):
+		"""Compute hash of a file."""
+		hash_func = hashlib.new(hash_algorithm)
+		with open(filepath, 'rb') as f:
+			buf = f.read(blocksize)
+			while len(buf) > 0:
+				hash_func.update(buf)
+				buf = f.read(blocksize)
+		return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()

-    def _check_file(self, f: FileInfo, filepath: str) -> bool:
-        """Check if a file exist and isn't broken."""
-        if not exists(filepath):
-            return False
-        computed_size = getsize(filepath)
-        if not (f.size == computed_size \
-                or f.size == round(computed_size / 1024)):
-            return False
-        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
-        return f.hash_value == hexdig or f.hash_value == dig
+	def _check_file(self, f: FileInfo, filepath: str) -> bool:
+		"""Check if a file exist and isn't broken."""
+		if not exists(filepath):
+			return False
+		computed_size = getsize(filepath)
+		if not (f.size == computed_size \
+				or f.size == round(computed_size / 1024)):
+			return False
+		hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
+		return f.hash_value == hexdig or f.hash_value == dig

-    def _download_file(self, f: FileInfo):
-        """Download a single file."""
-        is_same_filename = False
-        filepath = join(self._save_directory, f.name)
-        orig_filepath = filepath
-        if self._check_file(f, filepath):
-            return
-        elif exists(filepath):
-            is_same_filename = True
-            filepath = join(self._save_directory, \
-                self._same_filename(f.name, self._save_directory))
-        try:
-            self._url_opener.retrieve(f.download_url, filepath)
-            if is_same_filename:
-                f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
-                f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
-                assert filepath != orig_filepath, 'Filepaths are matching!'
-                if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
-                    remove(filepath)
-        except HTTPError as e:
-            print(e, f.download_url)
-        except ConnectionResetError:
-            print("Remote host reset connection for", f.download_url, \
-                  "Try again later.")
+	def _download_file(self, f: FileInfo):
+		"""Download a single file."""
+		is_same_filename = False
+		filepath = join(self._save_directory, f.name)
+		orig_filepath = filepath
+		if self._check_file(f, filepath):
+			return
+		elif exists(filepath):
+			is_same_filename = True
+			filepath = join(self._save_directory, \
+				self._same_filename(f.name, self._save_directory))
+		try:
+			retries = 3
+			while retries > 0:
+				self._url_opener.retrieve(f.download_url, filepath)
+				if not self._check_file(f, filepath):
+					print(filepath, f.size, f.hash_value)
+					remove(filepath)
+					retries -= 1
+				else:
+					break
+			if is_same_filename:
+				f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
+				f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
+				if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
+					remove(filepath)
+		except HTTPError as e:
+			print("HTTP Error", e.code, e.reason, f.download_url)
+			if exists(filepath):
+				remove(filepath)
+		except ConnectionResetError:
+			print("Connection reset for", f.download_url)
+			if exists(filepath):
+				remove(filepath)
+		except ConnectionRefusedError:
+			print("Connection refused for", f.download_url)
+			if exists(filepath):
+				remove(filepath)
+		except ConnectionAbortedError:
+			print("Connection aborted for", f.download_url)
+			if exists(filepath):
+				remove(filepath)