Compare commits

2 Commits

2 changed files with 111 additions and 91 deletions

View File

@@ -4,8 +4,10 @@
### Added
- For 2ch.hk, a check for whether a file is a sticker was added;
- Encoding for the `!op.txt` file was explicitly set to `utf-8` (see the
sketch after this list);
- Handling of HTTP errors and connection reset errors was added, so the
program won't crash if a file doesn't exist or isn't accessible for any
other reason;
- Handling of connection errors was added, so the program won't crash if a
file doesn't exist or isn't accessible for any other reason; any damaged
files that were created will be removed;
- Added 3 retries if a file was damaged during downloading;
- Matching of the hashes of two files that happen to share the same name and
size, while the hash reported by the imageboard differs from the file's
actual hash, was added to the scraper. Such files cause excessive
downloading and hash calculations. Hopefully,
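
A minimal sketch of the `!op.txt` encoding change noted above; the helper
name and the way the text arrives are assumptions for illustration, not this
project's actual code:

from os.path import join

def write_op_text(save_directory: str, text: str) -> None:
    # An explicit encoding keeps the output independent of the platform's
    # locale default, which can otherwise raise UnicodeEncodeError on
    # non-ASCII thread text.
    with open(join(save_directory, "!op.txt"), "w", encoding="utf-8") as f:
        f.write(text)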

View File

@@ -15,97 +15,115 @@ __all__ = ["Scraper"]
class Scraper:
    """Base class for all scrapers that will actually do the job.

    Arguments:
        save_directory -- a path to the directory where files will be
            saved;
        files -- a list of FileInfo objects;
        download_progress_callback -- a callback function that will be
            called for each file whose download has started.
    """
    def __init__(self, save_directory: str, files: List[FileInfo],
                 download_progress_callback: Callable[[int], None] = None) -> None:
        self._save_directory = save_directory
        self._files = files
        self._url_opener = URLopener()
        self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
        self._url_opener.version = USER_AGENT
        self._progress_callback = download_progress_callback
"""Base class for all scrapers that will actually do the job.
Arguments:
save_directory -- a path to a directory where file will be
saved;
files -- a list of FileInfo objects;
download_progress_callback -- a callback function that will be called
for each file started downloading.
"""
def __init__(self, save_directory: str, files: List[FileInfo],
download_progress_callback: Callable[[int], None] = None) -> None:
self._save_directory = save_directory
self._files = files
self._url_opener = URLopener()
self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
self._url_opener.version = USER_AGENT
self._progress_callback = download_progress_callback
    def run(self):
        raise NotImplementedError
    def _same_filename(self, filename: str, path: str) -> str:
        """Check if there is a file with the same name. If so, add an
        incremental number enclosed in brackets to the name of the new one."""
        newname = filename
        while exists(join(path, newname)):
            has_extension = newname.rfind(".") != -1
            if has_extension:
                l, r = newname.rsplit(".", 1)
                lbracket = l.rfind("(")
                if lbracket == -1:
                    newname = f"{l}(1).{r}"
                else:
                    num = l[lbracket+1:-1]
                    if num.isnumeric():
                        newname = f"{l[:lbracket]}({int(num)+1}).{r}"
                    else:
                        newname = f"{l}(1).{r}"
            else:
                # No extension: apply the same numbering to the whole name.
                lbracket = newname.rfind("(")
                if lbracket == -1:
                    newname = f"{newname}(1)"
                else:
                    num = newname[lbracket+1:-1]
                    if num.isnumeric():
                        newname = f"{newname[:lbracket]}({int(num)+1})"
                    else:
                        newname = f"{newname}(1)"
        return newname
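
    # For illustration: if "cat.jpg" already exists on disk,
    # _same_filename("cat.jpg", path) returns "cat(1).jpg"; if "cat(1).jpg"
    # exists as well, it returns "cat(2).jpg"; an extensionless "notes"
    # becomes "notes(1)".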
    def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
                   blocksize: int = 1048576) -> (str, str):
        """Compute hash of a file."""
        hash_func = hashlib.new(hash_algorithm)
        with open(filepath, 'rb') as f:
            buf = f.read(blocksize)
            while len(buf) > 0:
                hash_func.update(buf)
                buf = f.read(blocksize)
        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
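
    # _hash_file returns both the hex digest and the base64-encoded digest,
    # presumably because imageboards report hashes in different forms
    # (4chan, for instance, reports base64-encoded MD5); _check_file below
    # accepts either form.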
    def _check_file(self, f: FileInfo, filepath: str) -> bool:
        """Check that a file exists and isn't broken."""
        if not exists(filepath):
            return False
        computed_size = getsize(filepath)
        # The reported size may be in kilobytes rather than bytes
        # (e.g. on 2ch.hk), so both interpretations are accepted.
        if not (f.size == computed_size
                or f.size == round(computed_size / 1024)):
            return False
        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
        return f.hash_value == hexdig or f.hash_value == dig
    def _download_file(self, f: FileInfo):
        """Download a single file."""
        is_same_filename = False
        filepath = join(self._save_directory, f.name)
        orig_filepath = filepath
        if self._check_file(f, filepath):
            # Already downloaded and intact: nothing to do.
            return
        elif exists(filepath):
            is_same_filename = True
            filepath = join(self._save_directory,
                            self._same_filename(f.name, self._save_directory))
        try:
            # Retry up to 3 times if the downloaded file turns out damaged;
            # a damaged copy is removed before each retry.
            retries = 3
            while retries > 0:
                self._url_opener.retrieve(f.download_url, filepath)
                if not self._check_file(f, filepath):
                    print(filepath, f.size, f.hash_value)
                    remove(filepath)
                    retries -= 1
                else:
                    break
            if not exists(filepath):
                # Every retry produced a damaged file; give up on this one.
                return
            if is_same_filename:
                # Two files share a name and size: if their hashes match as
                # well, the new copy is a duplicate and is removed.
                f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
                f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
                if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
                    remove(filepath)
        except HTTPError as e:
            print("HTTP Error", e.code, e.reason, f.download_url)
            if exists(filepath):
                remove(filepath)
        except ConnectionResetError:
            print("Connection reset for", f.download_url)
            if exists(filepath):
                remove(filepath)
        except ConnectionRefusedError:
            print("Connection refused for", f.download_url)
            if exists(filepath):
                remove(filepath)
        except ConnectionAbortedError:
            print("Connection aborted for", f.download_url)
            if exists(filepath):
                remove(filepath)
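
For reference, a minimal sketch of how the base class above is meant to be
used; the ThreadScraper subclass, the origin of `files`, and the callback are
assumptions for illustration, not code from this repository:

class ThreadScraper(Scraper):
    """Downloads every file in the list and reports progress."""
    def run(self):
        for index, file_info in enumerate(self._files, start=1):
            # The callback is invoked once per file whose download starts.
            if self._progress_callback is not None:
                self._progress_callback(index)
            self._download_file(file_info)

# `files` would come from a board-specific thread parser.
scraper = ThreadScraper("downloads", files,
                        download_progress_callback=lambda n: print("file", n))
scraper.run()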