Commit: Added HTTP and URL exceptions handling.

This commit is contained in:
Alexander Andreev 2021-04-28 02:47:41 +04:00
parent f79abcc310
commit 6022c9929a
Signed by: Arav
GPG Key ID: 610DF2574456329F

View File

@@ -5,8 +5,9 @@ from os import remove, stat
 from os.path import exists, join, getsize
 import re
 from typing import List, Callable
-from urllib.request import urlretrieve, URLopener, HTTPError
+from urllib.request import urlretrieve, URLopener, HTTPError, URLError
 import hashlib
+from http.client import HTTPException
 
 from scrapthechan import USER_AGENT
 from scrapthechan.fileinfo import FileInfo
@@ -66,6 +67,8 @@ class Scraper:
     def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
             blocksize: int = 1048576) -> (str, str):
         """Compute hash of a file."""
+        if hash_algorithm is None:
+            return None
         hash_func = hashlib.new(hash_algorithm)
         with open(filepath, 'rb') as f:
             buf = f.read(blocksize)
@@ -82,6 +85,7 @@ class Scraper:
         if not (f.size == computed_size \
                 or f.size == round(computed_size / 1024)):
             return False
+        if not f.hash_algorithm is None:
             hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
         return f.hash_value == hexdig or f.hash_value == dig
@@ -101,7 +105,6 @@ class Scraper:
         while retries > 0:
             self._url_opener.retrieve(f.download_url, filepath)
             if not self._check_file(f, filepath):
-                print(filepath, f.size, f.hash_value)
                 remove(filepath)
                 retries -= 1
             else:
@@ -115,6 +118,14 @@ class Scraper:
             print("HTTP Error", e.code, e.reason, f.download_url)
             if exists(filepath):
                 remove(filepath)
+        except HTTPException:
+            print("HTTP Exception for", f.download_url)
+            if exists(filepath):
+                remove(filepath)
+        except URLError as e:
+            print("URL Error for", f.download_url)
+            if exists(filepath):
+                remove(filepath)
         except ConnectionResetError:
             print("Connection reset for", f.download_url)
             if exists(filepath):
                 remove(filepath)