1
0
Fork 0

Compare commits

...

7 Commits

12 changed files with 86 additions and 53 deletions

View File

@ -1,6 +1,26 @@
# Changelog
## 0.3 - 2020-09-09
## 0.4.0 - 2020-11-18
### Added
- For 2ch.hk, a check for whether a file is a sticker was added;
- Encoding for `!op.txt` file was explicitly set to `utf-8`;
- Handling of HTTP errors was added, so now the program won't crash if a file
doesn't exist or isn't accessible for any other reason;
- Matching of hashes was added to the scraper for two files that happen to share
the same name and size but whose hash reported by an imageboard is not the same
as the file's. Such a mismatch results in excessive downloading and hash
calculations. Hopefully, that is only the case for 2ch.hk.
### Changed
- FileInfo class is now a frozen dataclass for memory efficiency.
### Fixed
- Found that the arguments of the match function that matches the `image.ext`
pattern were swapped all over the parsers;
- Also for 2ch.hk, the checks for `sub` and `com` were changed to `subject` and
`comment`.
## 0.3.0 - 2020-09-09
### Added
- Parser for lolifox.cc.

View File

@ -1,7 +1,7 @@
build: scrapthechan README.md setup.cfg
python setup.py sdist bdist_wheel
install:
python -m pip install --upgrade dist/scrapthechan-0.3.0-py3-none-any.whl --user
python -m pip install --upgrade dist/scrapthechan-0.4.0-py3-none-any.whl --user
uninstall:
# We change directory so pip uninstall will run, it'll fail otherwise.
@cd ~/

View File

@ -1,8 +1,8 @@
This is a tool for scraping files from imageboards' threads.
It extracts the files from a JSON version of a thread. And then downloads 'em
in a specified output directory or if it isn't specified then creates following
directory hierarchy in a working directory:
It extracts the files from a JSON representation of a thread and then downloads
them into a specified output directory; if one isn't specified, it creates the
following directory hierarchy in the working directory:
<imageboard name>
|-<board name>
@ -25,8 +25,8 @@ separately. E.g. `4chan b 1100500`.
`-o`, `--output-dir` -- output directory where all files will be dumped to.
`--no-op` -- by default OP's post will be saved in a `!op.txt` file. This flag
disables this behaviour. I decided to put an `!` in the name so this file will
be at the top of a directory listing.
disables this behaviour. An exclamation mark `!` in the name makes this file
appear at the top of a directory listing.
`-v`, `--version` prints the version of the program, and `-h`, `--help` prints
help for a program.
@ -37,4 +37,4 @@ help for a program.
- [lainchan.org](https://lainchan.org) since 0.1.0
- [2ch.hk](https://2ch.hk) since 0.1.0
- [8kun.top](https://8kun.top) since 0.2.2
- [lolifox.cc](https://lolifox.cc) since 0.3
- [lolifox.cc](https://lolifox.cc) since 0.3.0

View File

@ -1,5 +1,5 @@
__date__ = "9 September 2020"
__version__ = "0.3.0"
__date__ = "18 November 2020"
__version__ = "0.4.0"
__author__ = "Alexander \"Arav\" Andreev"
__email__ = "me@arav.top"
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"

View File

@ -109,7 +109,7 @@ def main() -> None:
if parser.op is None:
print("No text's there.")
elif not exists(join(save_dir, "!op.txt")):
with open(join(save_dir, "!op.txt"), 'w') as opf:
with open(join(save_dir, "!op.txt"), 'w', encoding='utf-8') as opf:
opf.write(f"{parser.op}\n")
print("Done.")
else:

View File

@ -1,23 +1,23 @@
"""FileInfo object stores all needed information about a file."""
"""FileInfo object stores information about a file."""
from dataclasses import dataclass
__all__ = ["FileInfo"]
@dataclass(frozen=True)
class FileInfo:
"""Stores all needed information about a file.
"""Stores information about a file.
Arguments:
- `name` -- name of a file;
- `size` -- size of a file;
- `dlurl` -- full download URL for a file;
- `hash_value` -- hash sum of a file;
- `hash_algo` -- hash algorithm used (e.g. md5).
"""
def __init__(self, name: str, size: int, dlurl: str,
hash_value: str, hash_algo: str) -> None:
self.name = name
self.size = size
self.dlurl = dlurl
self.hash_value = hash_value
self.hash_algo = hash_algo
Fields:
- `name` -- name of a file;
- `size` -- size of a file;
- `download_url` -- full download URL for a file;
- `hash_value` -- hash sum of a file;
- `hash_algorithm` -- hash algorithm used (e.g. md5).
"""
name: str
size: int
download_url: str
hash_value: str
hash_algorithm: str

View File

@ -26,9 +26,9 @@ class DvachParser(Parser):
@property
def op(self) -> Optional[str]:
op = ""
if 'sub' in self._op_post:
if 'subject' in self._op_post:
op = f"{self._op_post['subject']}\n"
if 'com' in self._op_post:
if 'comment' in self._op_post:
op += self._op_post['comment']
return op if not op == "" else None
@ -36,7 +36,9 @@ class DvachParser(Parser):
if not 'files' in post: return None
files = []
for f in post['files']:
if match(f['fullname'], r"^image\.\w{1,4}$") is None:
if 'sticker' in f:
continue
if match(r"^image\.\w+$", f['fullname']) is None:
fullname = f['fullname']
else:
fullname = f['name']

View File

@ -38,7 +38,7 @@ class EightKunParser(Parser):
dlfname = f"{post['tim']}{post['ext']}"
if "filename" in post:
if match(post['filename'], r"^image\.\w{1,4}$") is None:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
@ -52,7 +52,7 @@ class EightKunParser(Parser):
for f in post["extra_files"]:
dlfname = f"{f['tim']}{f['ext']}"
if "filename" in post:
if match(post['filename'], r"^image\.\w+$") is None:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"

View File

@ -38,7 +38,7 @@ class FourChanParser(Parser):
dlfname = f"{post['tim']}{post['ext']}"
if "filename" in post:
if match(post['filename'], r"^image\.\w{1,4}$") is None:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"

View File

@ -41,7 +41,7 @@ class LainchanParser(Parser):
dlfname = f"{post['tim']}{post['ext']}"
if "filename" in post:
if match(post['filename'], r"^image\.\w{1,4}$") is None:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
@ -55,7 +55,7 @@ class LainchanParser(Parser):
for f in post["extra_files"]:
dlfname = f"{f['tim']}{f['ext']}"
if "filename" in post:
if match(post['filename'], r"^image\.\w+$") is None:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"

View File

@ -24,7 +24,7 @@ class LolifoxParser(Parser):
@property
def imageboard(self) -> str:
return "lolifox.cc"
@property
def op(self) -> Optional[str]:
op = ""
@ -40,7 +40,7 @@ class LolifoxParser(Parser):
dlfname = f"{post['tim']}{post['ext']}"
if "filename" in post:
if match(post['filename'], r"^image\.\w{1,4}$") is None:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
@ -54,7 +54,7 @@ class LolifoxParser(Parser):
for f in post["extra_files"]:
dlfname = f"{f['tim']}{f['ext']}"
if "filename" in post:
if match(post['filename'], r"^image\.\w+$") is None:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"

View File

@ -5,7 +5,7 @@ from os import remove, stat
from os.path import exists, join, getsize
import re
from typing import List, Callable
from urllib.request import urlretrieve, URLopener
from urllib.request import urlretrieve, URLopener, HTTPError
import hashlib
from scrapthechan import USER_AGENT
@ -63,35 +63,46 @@ class Scraper:
newname = f"{newname[:lbracket]}({int(num)+1})"
return newname
def _hash_file(self, filename: str, hash_algo: str = "md5",
def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
blocksize: int = 1048576) -> (str, str):
"""Compute hash of a file."""
hash_func = hashlib.new(hash_algo)
with open(filename, 'rb') as f:
hash_func = hashlib.new(hash_algorithm)
with open(filepath, 'rb') as f:
buf = f.read(blocksize)
while len(buf) > 0:
hash_func.update(buf)
buf = f.read(blocksize)
return hash_func.hexdigest(), hash_func.digest()
return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
def _check_file(self, f: FileInfo, filepath: str) -> bool:
"""Check if a file exist and isn't broken."""
if not exists(filepath):
return False
computed_size = getsize(filepath)
is_size_match = f.size == computed_size \
or f.size == round(computed_size / 1024)
hexdig, dig = self._hash_file(filepath, f.hash_algo)
is_hash_match = f.hash_value == hexdig \
or f.hash_value == b64encode(dig).decode()
return is_size_match and is_hash_match
if not (f.size == computed_size \
or f.size == round(computed_size / 1024)):
return False
hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
return f.hash_value == hexdig or f.hash_value == dig
def _download_file(self, f: FileInfo):
"""Download a single file."""
is_same_filename = False
filepath = join(self._save_directory, f.name)
if self._is_file_ok(f, filepath):
return True
orig_filepath = filepath
if self._check_file(f, filepath):
return
elif exists(filepath):
is_same_filename = True
filepath = join(self._save_directory, \
self._same_filename(f.name, self._save_directory))
self._url_opener.retrieve(f.dlurl, filepath)
try:
self._url_opener.retrieve(f.download_url, filepath)
if is_same_filename:
f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
assert filepath != orig_filepath, 'Filepaths are matching!'
if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
remove(filepath)
except HTTPError as e:
print(e, f.download_url)