2020-07-20 04:31:27 +04:00
|
|
|
"""Base class for all scrapers that will actually do the job."""
|
2020-07-08 22:53:39 +04:00
|
|
|
|
|
|
|
from base64 import b64encode
|
|
|
|
from os import remove, stat
|
|
|
|
from os.path import exists, join, getsize
|
|
|
|
import re
|
|
|
|
from typing import List, Callable
|
2021-04-28 02:47:41 +04:00
|
|
|
from urllib.request import urlretrieve, URLopener, HTTPError, URLError
|
2020-07-08 22:53:39 +04:00
|
|
|
import hashlib
|
2021-04-28 02:47:41 +04:00
|
|
|
from http.client import HTTPException
|
2020-07-08 22:53:39 +04:00
|
|
|
|
2020-07-20 04:31:27 +04:00
|
|
|
from scrapthechan import USER_AGENT
|
2020-07-08 22:53:39 +04:00
|
|
|
from scrapthechan.fileinfo import FileInfo
|
|
|
|
|
|
|
|
# Public API of this module: only Scraper is exported by a star-import.
__all__ = ["Scraper"]
|
|
|
|
|
|
|
|
|
|
|
|
class Scraper:
    """Base class for all scrapers that will actually do the job.

    The class itself only provides helpers for naming, verifying and
    downloading files; run() must be overridden by subclasses.

    Arguments:
    save_directory -- a path to a directory where file will be
    saved;
    files -- a list of FileInfo objects;
    download_progress_callback -- a callback function that will be called
    for each file started downloading.
    """

    def __init__(self, save_directory: str, files: List[FileInfo],
                 download_progress_callback: Callable[[int], None] = None) -> None:
        self._save_directory = save_directory
        self._files = files
        # NOTE(review): URLopener is a legacy interface (deprecated since
        # Python 3.3); confirm the supported interpreter versions before
        # upgrading Python.
        self._url_opener = URLopener()
        # Identify every request with the project's own User-Agent string.
        self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
        self._url_opener.version = USER_AGENT
        self._progress_callback = download_progress_callback

    def run(self):
        """Entry point to be overridden by concrete scrapers.

        The base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    @staticmethod
    def _remove_if_exists(filepath: str) -> None:
        """Delete filepath, doing nothing when it is already absent."""
        try:
            remove(filepath)
        except FileNotFoundError:
            pass

    def _same_filename(self, filename: str, path: str) -> str:
        """Check if there is a file with same name. If so then add incremental
        number enclosed in brackets to a name of a new one.

        The number is placed right before the last extension, or at the very
        end when the name has no extension:
        "a.txt" -> "a(1).txt" -> "a(2).txt", "a" -> "a(1)" -> "a(2)".
        A name is returned unchanged when nothing in path uses it.

        Arguments:
        filename -- desired name of the file;
        path -- directory in which the name has to be unique.
        """
        newname = filename
        while exists(join(path, newname)):
            stem, dot, extension = newname.rpartition(".")
            if not dot:
                # No "." at all: rpartition() leaves the whole name in the
                # last slot, so move it back into the stem.
                stem, extension = newname, ""
            # Only a stem that really ends with "(<digits>)" carries a
            # counter; anything else (e.g. "pic(x)") gets a fresh "(1)".
            counter = re.fullmatch(r"(.*)\(([0-9]+)\)", stem, re.DOTALL)
            if counter is None:
                stem = f"{stem}(1)"
            else:
                stem = f"{counter.group(1)}({int(counter.group(2)) + 1})"
            newname = f"{stem}{dot}{extension}"
        return newname

    def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
                   blocksize: int = 1048576) -> (str, str):
        """Compute hash of a file.

        The file is read in chunks of blocksize bytes so that it never has
        to be held in memory as a whole.

        Arguments:
        filepath -- path to the file to hash;
        hash_algorithm -- algorithm name understood by hashlib.new();
        blocksize -- number of bytes read per chunk.

        Returns a tuple (hexadecimal digest, base64-encoded digest), or None
        when hash_algorithm is None.
        """
        if hash_algorithm is None:
            return None
        hash_func = hashlib.new(hash_algorithm)
        with open(filepath, 'rb') as f:
            # read() yields b"" at end of file, which stops the iteration.
            for chunk in iter(lambda: f.read(blocksize), b""):
                hash_func.update(chunk)
        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()

    def _check_file(self, f: FileInfo, filepath: str) -> bool:
        """Check if a file exist and isn't broken.

        A file is accepted when its size on disk matches f.size, either
        exactly in bytes or after dividing by 1024 and rounding, and, if
        f.hash_algorithm is set, when f.hash_value equals the hexadecimal or
        the base64 digest of the file.
        """
        if not exists(filepath):
            return False
        computed_size = getsize(filepath)
        if f.size not in (computed_size, round(computed_size / 1024)):
            return False
        if f.hash_algorithm is None:
            # Nothing to verify the content against; size is all we have.
            return True
        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
        return f.hash_value in (hexdig, dig)

    def _download_file(self, f: FileInfo):
        """Download a single file.

        Nothing is downloaded when a valid copy already exists. When a
        different file occupies the target name, the download is stored
        under a numbered name instead. A download that fails verification is
        retried, three attempts in total. Network and file errors are
        reported on stdout and a partially written file is removed; they
        are not propagated to the caller.
        """
        is_same_filename = False
        filepath = join(self._save_directory, f.name)
        orig_filepath = filepath
        if self._check_file(f, filepath):
            return
        if exists(filepath):
            is_same_filename = True
            filepath = join(self._save_directory,
                            self._same_filename(f.name, self._save_directory))
        try:
            for _ in range(3):
                self._url_opener.retrieve(f.download_url, filepath)
                if self._check_file(f, filepath):
                    break
                remove(filepath)
            else:
                # Every attempt produced a file that failed verification.
                print(f"Cannot retrieve {f.download_url}, {filepath}.")
                return
            # Without a hash algorithm _hash_file() returns None, which
            # cannot be unpacked, so the duplicate check is skipped then.
            if is_same_filename and f.hash_algorithm is not None:
                _, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
                _, f2_dig = self._hash_file(filepath, f.hash_algorithm)
                if f1_dig == f2_dig:
                    remove(filepath)
        except FileNotFoundError:
            print("File Not Found", filepath)
        # HTTPError is a subclass of URLError, so it has to be handled first.
        except HTTPError as e:
            print("HTTP Error", e.code, e.reason, f.download_url)
            self._remove_if_exists(filepath)
        except HTTPException:
            print("HTTP Exception for", f.download_url)
            self._remove_if_exists(filepath)
        except URLError:
            print("URL Error for", f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionResetError:
            print("Connection reset for", f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionRefusedError:
            print("Connection refused for", f.download_url)
            self._remove_if_exists(filepath)
        except ConnectionAbortedError:
            print("Connection aborted for", f.download_url)
            self._remove_if_exists(filepath)
|