
Compare commits


5 Commits

5 changed files with 37 additions and 14 deletions


@@ -1,5 +1,11 @@
 # Changelog

+## 0.4.1 - 2020-12-08
+### Fixed
+- HTTPException from http.client and URLError from urllib.request
+  are now handled;
+- 2ch.hk's sticker handling.
+
 ## 0.4.0 - 2020-11-18
 ### Added
 - For 2ch.hk, a check for whether a file is a sticker was added;
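Taken together, the 0.4.1 entries describe the error-handling pattern sketched below. This is a minimal illustration only, not the project's code; the real change is in the Scraper diff further down, and `fetch`, `download_url`, and `filepath` are invented names:

    from http.client import HTTPException
    from urllib.request import urlretrieve
    from urllib.error import HTTPError, URLError

    def fetch(download_url: str, filepath: str) -> bool:
        """Fetch one file; report network failures instead of crashing."""
        try:
            urlretrieve(download_url, filepath)
            return True
        except HTTPError as e:
            print("HTTP Error", e.code, e.reason, download_url)
        except HTTPException:
            print("HTTP Exception for", download_url)
        except URLError:
            print("URL Error for", download_url)
        return False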


@@ -1,7 +1,7 @@
 build: scrapthechan README.md setup.cfg
 	python setup.py sdist bdist_wheel
 install:
-	python -m pip install --upgrade dist/scrapthechan-0.4.0-py3-none-any.whl --user
+	python -m pip install --upgrade dist/scrapthechan-0.4.1-py3-none-any.whl --user
 uninstall:
 	# We change directory so pip uninstall will run; it would fail otherwise.
 	@cd ~/


@@ -1,5 +1,5 @@
-__date__ = "18 November 2020"
-__version__ = "0.4.0"
+__date__ = "8 December 2020"
+__version__ = "0.4.1"
 __author__ = "Alexander \"Arav\" Andreev"
 __email__ = "me@arav.top"
 __copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"


@@ -36,15 +36,21 @@ class DvachParser(Parser):
         if not 'files' in post: return None
         files = []
         for f in post['files']:
-            if 'sticker' in f:
-                continue
-            if match(r"^image\.\w+$", f['fullname']) is None:
-                fullname = f['fullname']
+            if not 'sticker' in f:
+                if match(r"^image\.\w+$", f['fullname']) is None:
+                    fullname = f['fullname']
+                else:
+                    fullname = f['name']
             else:
                 fullname = f['name']
             # Here's the same thing as with 4chan. 2ch.hk also has an md5
             # field, so it is completely fine to hardcode `hash_algo`.
-            files.append(FileInfo(fullname, f['size'],
-                f"{self.__url_file_link}{f['path']}",
-                f['md5'], 'md5'))
+            if 'md5' in f:
+                files.append(FileInfo(fullname, f['size'],
+                    f"{self.__url_file_link}{f['path']}",
+                    f['md5'], 'md5'))
+            else:
+                files.append(FileInfo(fullname, f['size'],
+                    f"{self.__url_file_link}{f['path']}",
+                    None, None))
         return files
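A minimal sketch of what the new branching does with a file entry. The helper and the sample dicts are invented for illustration; only the field names ('sticker', 'fullname', 'name') come from the diff above:

    from re import match

    def pick_fullname(f: dict) -> str:
        # Stickers carry no user-supplied 'fullname', and server-generated
        # names like 'image.jpg' are replaced with the internal 'name'.
        if not 'sticker' in f:
            if match(r"^image\.\w+$", f['fullname']) is None:
                return f['fullname']
        return f['name']

    print(pick_fullname({'fullname': 'cat.jpg', 'name': '15873.jpg'}))    # cat.jpg
    print(pick_fullname({'fullname': 'image.jpg', 'name': '15874.jpg'}))  # 15874.jpg
    print(pick_fullname({'sticker': 1, 'name': '15875.png'}))             # 15875.png

Previously a sticker post was skipped entirely (`continue`); now it falls through to `f['name']` and, lacking an md5 field, is recorded with `None` for both hash value and algorithm.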


@@ -5,8 +5,9 @@ from os import remove, stat
 from os.path import exists, join, getsize
 import re
 from typing import List, Callable
-from urllib.request import urlretrieve, URLopener, HTTPError
+from urllib.request import urlretrieve, URLopener, HTTPError, URLError
 import hashlib
+from http.client import HTTPException

 from scrapthechan import USER_AGENT
 from scrapthechan.fileinfo import FileInfo
@@ -66,6 +67,8 @@ class Scraper:
     def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
             blocksize: int = 1048576) -> (str, str):
         """Compute hash of a file."""
+        if hash_algorithm is None:
+            return None
         hash_func = hashlib.new(hash_algorithm)
         with open(filepath, 'rb') as f:
             buf = f.read(blocksize)
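The hunk cuts off mid-function; for context, a chunked file hash generally completes like the sketch below. The base64 second return value is an assumption, made to explain the `(str, str)` pair and the two-way comparison in `_check_file` (4chan's API reports MD5 in base64, while 2ch.hk's is hex):

    import hashlib
    from base64 import b64encode

    def hash_file(filepath: str, hash_algorithm: str = "md5",
            blocksize: int = 1048576) -> (str, str):
        """Hash a file block by block so memory use stays flat."""
        if hash_algorithm is None:
            return None
        hash_func = hashlib.new(hash_algorithm)
        with open(filepath, 'rb') as f:
            buf = f.read(blocksize)
            while buf:
                hash_func.update(buf)
                buf = f.read(blocksize)
        # Assumption: hex digest for hex-style hashes, base64 for 4chan-style.
        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()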
@@ -82,8 +85,9 @@ class Scraper:
         if not (f.size == computed_size \
                 or f.size == round(computed_size / 1024)):
             return False
-        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
-        return f.hash_value == hexdig or f.hash_value == dig
+        if not f.hash_algorithm is None:
+            hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
+            return f.hash_value == hexdig or f.hash_value == dig

     def _download_file(self, f: FileInfo):
         """Download a single file."""
@@ -101,7 +105,6 @@ class Scraper:
         while retries > 0:
             self._url_opener.retrieve(f.download_url, filepath)
             if not self._check_file(f, filepath):
-                print(filepath, f.size, f.hash_value)
                 remove(filepath)
                 retries -= 1
             else:
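The removed `print` was leftover debug output; the surrounding retry loop works as in this standalone sketch, where `download_with_retries` and its `check_file` parameter are invented stand-ins for the class's methods:

    from os import remove
    from urllib.request import urlretrieve

    def download_with_retries(url: str, filepath: str, check_file,
            retries: int = 3) -> bool:
        """Re-fetch until the file verifies or the attempts run out."""
        while retries > 0:
            urlretrieve(url, filepath)
            if not check_file(filepath):
                remove(filepath)  # corrupt or truncated copy; try again
                retries -= 1
            else:
                return True
        return False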
@@ -115,6 +118,14 @@ class Scraper:
             print("HTTP Error", e.code, e.reason, f.download_url)
             if exists(filepath):
                 remove(filepath)
+        except HTTPException:
+            print("HTTP Exception for", f.download_url)
+            if exists(filepath):
+                remove(filepath)
+        except URLError as e:
+            print("URL Error for", f.download_url)
+            if exists(filepath):
+                remove(filepath)
         except ConnectionResetError:
             print("Connection reset for", f.download_url)
             if exists(filepath):
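One detail worth noting about the handler order: `HTTPError` is a subclass of `URLError`, so the more specific handler has to stay first or it would never fire. `HTTPException` and `ConnectionResetError` belong to separate hierarchies, so their position is free. This is quick to confirm:

    from urllib.error import HTTPError, URLError
    print(issubclass(HTTPError, URLError))             # True
    print(issubclass(ConnectionResetError, URLError))  # False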