From 6022c9929adff8ba3937437891bfeb5e402e1690 Mon Sep 17 00:00:00 2001
From: "Alexander \"Arav\" Andreev"
Date: Wed, 28 Apr 2021 02:47:41 +0400
Subject: [PATCH] Added HTTP and URL exceptions handling.

---
 scrapthechan/scraper.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/scrapthechan/scraper.py b/scrapthechan/scraper.py
index 025f5c1..dfa1937 100644
--- a/scrapthechan/scraper.py
+++ b/scrapthechan/scraper.py
@@ -5,8 +5,9 @@ from os import remove, stat
 from os.path import exists, join, getsize
 import re
 from typing import List, Callable
-from urllib.request import urlretrieve, URLopener, HTTPError
+from urllib.request import urlretrieve, URLopener, HTTPError, URLError
 import hashlib
+from http.client import HTTPException
 
 from scrapthechan import USER_AGENT
 from scrapthechan.fileinfo import FileInfo
@@ -66,6 +67,8 @@ class Scraper:
 	def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
 		blocksize: int = 1048576) -> (str, str):
 		"""Compute hash of a file."""
+		if hash_algorithm is None:
+			return None
 		hash_func = hashlib.new(hash_algorithm)
 		with open(filepath, 'rb') as f:
 			buf = f.read(blocksize)
@@ -82,8 +85,10 @@
 		if not (f.size == computed_size \
 			or f.size == round(computed_size / 1024)):
 			return False
-		hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
-		return f.hash_value == hexdig or f.hash_value == dig
+		if f.hash_algorithm is not None:
+			hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
+			return f.hash_value == hexdig or f.hash_value == dig
+		return True
 
 	def _download_file(self, f: FileInfo):
 		"""Download a single file."""
@@ -101,7 +106,6 @@
 		while retries > 0:
 			self._url_opener.retrieve(f.download_url, filepath)
 			if not self._check_file(f, filepath):
-				print(filepath, f.size, f.hash_value)
 				remove(filepath)
 				retries -= 1
 			else:
@@ -115,6 +119,14 @@
 			print("HTTP Error", e.code, e.reason, f.download_url)
 			if exists(filepath):
 				remove(filepath)
+		except HTTPException:
+			print("HTTP Exception for", f.download_url)
+			if exists(filepath):
+				remove(filepath)
+		except URLError as e:
+			print("URL Error", e.reason, "for", f.download_url)
+			if exists(filepath):
+				remove(filepath)
 		except ConnectionResetError:
 			print("Connection reset for", f.download_url)
 			if exists(filepath):