Compare commits
5 Commits
f79abcc310
...
4f6f56ae7b
Author | SHA1 | Date |
---|---|---|
Alexander Andreev | 4f6f56ae7b | |
Alexander Andreev | 503eb9959b | |
Alexander Andreev | cb2e0d77f7 | |
Alexander Andreev | 93e442939a | |
Alexander Andreev | 6022c9929a |
|
@ -1,5 +1,11 @@
|
|||
# Changelog
|
||||
|
||||
## 0.4.1 - 2020-12-08
|
||||
### Fixed
|
||||
- Now HTTPException from http.client and URLError from urllib.request
|
||||
are handled;
|
||||
- 2ch.hk's stickers handling.
|
||||
|
||||
## 0.4.0 - 2020-11-18
|
||||
### Added
|
||||
- For 2ch.hk, a check for whether a file is a sticker was added;
|
||||
|
|
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
build: scrapthechan README.md setup.cfg
|
||||
python setup.py sdist bdist_wheel
|
||||
install:
|
||||
python -m pip install --upgrade dist/scrapthechan-0.4.0-py3-none-any.whl --user
|
||||
python -m pip install --upgrade dist/scrapthechan-0.4.1-py3-none-any.whl --user
|
||||
uninstall:
|
||||
# We change directory so pip uninstall will run, it'll fail otherwise.
|
||||
@cd ~/
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
__date__ = "18 November 2020"
|
||||
__version__ = "0.4.0"
|
||||
__date__ = "8 December 2020"
|
||||
__version__ = "0.4.1"
|
||||
__author__ = "Alexander \"Arav\" Andreev"
|
||||
__email__ = "me@arav.top"
|
||||
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"
|
||||
|
|
|
@ -36,15 +36,21 @@ class DvachParser(Parser):
|
|||
if not 'files' in post: return None
|
||||
files = []
|
||||
for f in post['files']:
|
||||
if 'sticker' in f:
|
||||
continue
|
||||
if match(r"^image\.\w+$", f['fullname']) is None:
|
||||
fullname = f['fullname']
|
||||
if not 'sticker' in f:
|
||||
if match(r"^image\.\w+$", f['fullname']) is None:
|
||||
fullname = f['fullname']
|
||||
else:
|
||||
fullname = f['name']
|
||||
else:
|
||||
fullname = f['name']
|
||||
# Here's same thing as 4chan. 2ch.hk also has md5 field, so it is
|
||||
# completely fine to hardcode `hash_algo`.
|
||||
files.append(FileInfo(fullname, f['size'],
|
||||
f"{self.__url_file_link}{f['path']}",
|
||||
f['md5'], 'md5'))
|
||||
if 'md5' in f:
|
||||
files.append(FileInfo(fullname, f['size'],
|
||||
f"{self.__url_file_link}{f['path']}",
|
||||
f['md5'], 'md5'))
|
||||
else:
|
||||
files.append(FileInfo(fullname, f['size'],
|
||||
f"{self.__url_file_link}{f['path']}",
|
||||
None, None))
|
||||
return files
|
||||
|
|
|
@ -5,8 +5,9 @@ from os import remove, stat
|
|||
from os.path import exists, join, getsize
|
||||
import re
|
||||
from typing import List, Callable
|
||||
from urllib.request import urlretrieve, URLopener, HTTPError
|
||||
from urllib.request import urlretrieve, URLopener, HTTPError, URLError
|
||||
import hashlib
|
||||
from http.client import HTTPException
|
||||
|
||||
from scrapthechan import USER_AGENT
|
||||
from scrapthechan.fileinfo import FileInfo
|
||||
|
@ -66,6 +67,8 @@ class Scraper:
|
|||
def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
|
||||
blocksize: int = 1048576) -> (str, str):
|
||||
"""Compute hash of a file."""
|
||||
if hash_algorithm is None:
|
||||
return None
|
||||
hash_func = hashlib.new(hash_algorithm)
|
||||
with open(filepath, 'rb') as f:
|
||||
buf = f.read(blocksize)
|
||||
|
@ -82,8 +85,9 @@ class Scraper:
|
|||
if not (f.size == computed_size \
|
||||
or f.size == round(computed_size / 1024)):
|
||||
return False
|
||||
hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
|
||||
return f.hash_value == hexdig or f.hash_value == dig
|
||||
if not f.hash_algorithm is None:
|
||||
hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
|
||||
return f.hash_value == hexdig or f.hash_value == dig
|
||||
|
||||
def _download_file(self, f: FileInfo):
|
||||
"""Download a single file."""
|
||||
|
@ -101,7 +105,6 @@ class Scraper:
|
|||
while retries > 0:
|
||||
self._url_opener.retrieve(f.download_url, filepath)
|
||||
if not self._check_file(f, filepath):
|
||||
print(filepath, f.size, f.hash_value)
|
||||
remove(filepath)
|
||||
retries -= 1
|
||||
else:
|
||||
|
@ -115,6 +118,14 @@ class Scraper:
|
|||
print("HTTP Error", e.code, e.reason, f.download_url)
|
||||
if exists(filepath):
|
||||
remove(filepath)
|
||||
except HTTPException:
|
||||
print("HTTP Exception for", f.download_url)
|
||||
if exists(filepath):
|
||||
remove(filepath)
|
||||
except URLError as e:
|
||||
print("URL Error for", f.download_url)
|
||||
if exists(filepath):
|
||||
remove(filepath)
|
||||
except ConnectionResetError:
|
||||
print("Connection reset for", f.download_url)
|
||||
if exists(filepath):
|
||||
|
|
Loading…
Reference in New Issue