
Added HTTP and URL exception handling.

This commit is contained in:
Alexander Andreev 2021-04-28 02:47:41 +04:00
parent f79abcc310
commit 6022c9929a
Signed by: Arav
GPG Key ID: 610DF2574456329F
1 changed file with 15 additions and 4 deletions
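
For context: HTTPError is a subclass of URLError, so it must be caught before URLError to get its own handler, and HTTPException (from http.client) sits in a separate hierarchy covering low-level protocol failures that urllib does not wrap. A minimal sketch of the resulting pattern follows; the fetch helper is illustrative only, not part of this repository:

# Minimal sketch, not project code: handler ordering for urllib downloads.
from os import remove
from os.path import exists
from http.client import HTTPException
from urllib.request import urlretrieve, HTTPError, URLError

def fetch(url: str, filepath: str) -> bool:
    """Download url to filepath; drop the partial file on failure."""
    try:
        urlretrieve(url, filepath)
        return True
    except HTTPError as e:    # server answered with an error status
        print("HTTP Error", e.code, e.reason, url)
    except HTTPException:     # broken HTTP exchange (e.g. BadStatusLine)
        print("HTTP Exception for", url)
    except URLError as e:     # no connection, DNS failure, bad URL, ...
        print("URL Error for", url, e.reason)
    if exists(filepath):
        remove(filepath)      # never keep a broken partial download
    return False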


@@ -5,8 +5,9 @@ from os import remove, stat
 from os.path import exists, join, getsize
 import re
 from typing import List, Callable
-from urllib.request import urlretrieve, URLopener, HTTPError
+from urllib.request import urlretrieve, URLopener, HTTPError, URLError
 import hashlib
+from http.client import HTTPException
 
 from scrapthechan import USER_AGENT
 from scrapthechan.fileinfo import FileInfo
@@ -66,6 +67,8 @@ class Scraper:
     def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
             blocksize: int = 1048576) -> (str, str):
         """Compute hash of a file."""
+        if hash_algorithm is None:
+            return None
         hash_func = hashlib.new(hash_algorithm)
         with open(filepath, 'rb') as f:
             buf = f.read(blocksize)
@@ -82,8 +85,9 @@ class Scraper:
         if not (f.size == computed_size \
                 or f.size == round(computed_size / 1024)):
             return False
-        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
-        return f.hash_value == hexdig or f.hash_value == dig
+        if not f.hash_algorithm is None:
+            hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
+            return f.hash_value == hexdig or f.hash_value == dig
 
     def _download_file(self, f: FileInfo):
         """Download a single file."""
@@ -101,7 +105,6 @@ class Scraper:
             while retries > 0:
                 self._url_opener.retrieve(f.download_url, filepath)
                 if not self._check_file(f, filepath):
-                    print(filepath, f.size, f.hash_value)
                     remove(filepath)
                     retries -= 1
                 else:
@@ -115,6 +118,14 @@ class Scraper:
             print("HTTP Error", e.code, e.reason, f.download_url)
             if exists(filepath):
                 remove(filepath)
+        except HTTPException:
+            print("HTTP Exception for", f.download_url)
+            if exists(filepath):
+                remove(filepath)
+        except URLError as e:
+            print("URL Error for", f.download_url)
+            if exists(filepath):
+                remove(filepath)
         except ConnectionResetError:
             print("Connection reset for", f.download_url)
             if exists(filepath):