Improved error handling and added retries for damaged files.
This commit is contained in:
parent
21837c5335
commit
7b2fcf0899
@ -97,15 +97,33 @@ class Scraper:
|
||||
filepath = join(self._save_directory, \
|
||||
self._same_filename(f.name, self._save_directory))
|
||||
try:
|
||||
retries = 3
|
||||
while retries > 0:
|
||||
self._url_opener.retrieve(f.download_url, filepath)
|
||||
if not self._check_file(f, filepath):
|
||||
print(filepath, f.size, f.hash_value)
|
||||
remove(filepath)
|
||||
retries -= 1
|
||||
else:
|
||||
break
|
||||
if is_same_filename:
|
||||
f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
|
||||
f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
|
||||
assert filepath != orig_filepath, 'Filepaths are matching!'
|
||||
if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
|
||||
remove(filepath)
|
||||
except HTTPError as e:
|
||||
print(e, f.download_url)
|
||||
print("HTTP Error", e.code, e.reason, f.download_url)
|
||||
if exists(filepath):
|
||||
remove(filepath)
|
||||
except ConnectionResetError:
|
||||
print("Remote host reset connection for", f.download_url, \
|
||||
"Try again later.")
|
||||
print("Connection reset for", f.download_url)
|
||||
if exists(filepath):
|
||||
remove(filepath)
|
||||
except ConnectionRefusedError:
|
||||
print("Connection refused for", f.download_url)
|
||||
if exists(filepath):
|
||||
remove(filepath)
|
||||
except ConnectionAbortedError:
|
||||
print("Connection aborted for", f.download_url)
|
||||
if exists(filepath):
|
||||
remove(filepath)
|
||||
|
Loading…
Reference in New Issue
Block a user