Compare commits
7 Commits
2043fc277f
...
6dab626084
Author | SHA1 | Date |
---|---|---|
Alexander Andreev | 6dab626084 | |
Alexander Andreev | 86b6278657 | |
Alexander Andreev | 7754a90313 | |
Alexander Andreev | bb47b50c5f | |
Alexander Andreev | 8403fcf0f2 | |
Alexander Andreev | 647a787974 | |
Alexander Andreev | 6a54b88498 |
22
CHANGELOG.md
22
CHANGELOG.md
|
@ -1,6 +1,26 @@
|
|||
# Changelog
|
||||
|
||||
## 0.3 - 2020-09-09
|
||||
## 0.4.0 - 2020-11-18
|
||||
### Added
|
||||
- For 2ch.hk, a check for whether a file is a sticker was added;
|
||||
- Encoding for `!op.txt` file was explicitly set to `utf-8`;
|
||||
- Handling of HTTP errors was added, so now the program won't crash if a file doesn't
|
||||
exist or is not accessible for any other reason;
|
||||
- The scraper now matches the hashes of two files that happen to share
|
||||
the same name and size, but the hash reported by an imageboard is not the same as that of
|
||||
a file. It results in excessive downloading and hash calculations. Hopefully,
|
||||
that is only the case for 2ch.hk.
|
||||
|
||||
### Changed
|
||||
- FileInfo class is now a frozen dataclass for memory efficiency.
|
||||
|
||||
### Fixed
|
||||
- Found that the arguments of the match function that matches the `image.ext` pattern
|
||||
were mixed up in places all over the parsers;
|
||||
- Also for 2ch.hk, checking for `sub` and `com` was changed to `subject` and
|
||||
`comment`.
|
||||
|
||||
## 0.3.0 - 2020-09-09
|
||||
### Added
|
||||
- Parser for lolifox.cc.
|
||||
|
||||
|
|
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
build: scrapthechan README.md setup.cfg
|
||||
python setup.py sdist bdist_wheel
|
||||
install:
|
||||
python -m pip install --upgrade dist/scrapthechan-0.3.0-py3-none-any.whl --user
|
||||
python -m pip install --upgrade dist/scrapthechan-0.4.0-py3-none-any.whl --user
|
||||
uninstall:
|
||||
# We change directory so pip uninstall will run, it'll fail otherwise.
|
||||
@cd ~/
|
||||
|
|
12
README.md
12
README.md
|
@ -1,8 +1,8 @@
|
|||
This is a tool for scraping files from imageboards' threads.
|
||||
|
||||
It extracts the files from a JSON version of a thread. And then downloads 'em
|
||||
in a specified output directory or if it isn't specified then creates following
|
||||
directory hierarchy in a working directory:
|
||||
It extracts the files from a JSON representation of a thread. And then downloads
|
||||
'em in a specified output directory or if it isn't specified then creates
|
||||
following directory hierarchy in a working directory:
|
||||
|
||||
<imageboard name>
|
||||
|-<board name>
|
||||
|
@ -25,8 +25,8 @@ separately. E.g. `4chan b 1100500`.
|
|||
`-o`, `--output-dir` -- output directory where all files will be dumped to.
|
||||
|
||||
`--no-op` -- by default OP's post will be saved in a `!op.txt` file. This flag
|
||||
disables this behaviour. I desided to put an `!` in a name so this file will be
|
||||
on the top in a directory listing.
|
||||
disables this behaviour. The exclamation mark `!` in the name is there so that this file
|
||||
will be at the top of a directory listing.
|
||||
|
||||
`-v`, `--version` prints the version of the program, and `-h`, `--help` prints
|
||||
help for a program.
|
||||
|
@ -37,4 +37,4 @@ help for a program.
|
|||
- [lainchan.org](https://lainchan.org) since 0.1.0
|
||||
- [2ch.hk](https://2ch.hk) since 0.1.0
|
||||
- [8kun.top](https://8kun.top) since 0.2.2
|
||||
- [lolifox.cc](https://lolifox.cc) since 0.3
|
||||
- [lolifox.cc](https://lolifox.cc) since 0.3.0
|
|
@ -1,5 +1,5 @@
|
|||
__date__ = "9 September 2020"
|
||||
__version__ = "0.3.0"
|
||||
__date__ = "18 November 2020"
|
||||
__version__ = "0.4.0"
|
||||
__author__ = "Alexander \"Arav\" Andreev"
|
||||
__email__ = "me@arav.top"
|
||||
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"
|
||||
|
|
|
@ -109,7 +109,7 @@ def main() -> None:
|
|||
if parser.op is None:
|
||||
print("No text's there.")
|
||||
elif not exists(join(save_dir, "!op.txt")):
|
||||
with open(join(save_dir, "!op.txt"), 'w') as opf:
|
||||
with open(join(save_dir, "!op.txt"), 'w', encoding='utf-8') as opf:
|
||||
opf.write(f"{parser.op}\n")
|
||||
print("Done.")
|
||||
else:
|
||||
|
|
|
@ -1,23 +1,23 @@
|
|||
"""FileInfo object stores all needed information about a file."""
|
||||
"""FileInfo object stores information about a file."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
__all__ = ["FileInfo"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FileInfo:
|
||||
"""Stores all needed information about a file.
|
||||
"""Stores information about a file.
|
||||
|
||||
Arguments:
|
||||
- `name` -- name of a file;
|
||||
- `size` -- size of a file;
|
||||
- `dlurl` -- full download URL for a file;
|
||||
- `hash_value` -- hash sum of a file;
|
||||
- `hash_algo` -- hash algorithm used (e.g. md5).
|
||||
"""
|
||||
def __init__(self, name: str, size: int, dlurl: str,
|
||||
hash_value: str, hash_algo: str) -> None:
|
||||
self.name = name
|
||||
self.size = size
|
||||
self.dlurl = dlurl
|
||||
self.hash_value = hash_value
|
||||
self.hash_algo = hash_algo
|
||||
Fields:
|
||||
- `name` -- name of a file;
|
||||
- `size` -- size of a file;
|
||||
- `download_url` -- full download URL for a file;
|
||||
- `hash_value` -- hash sum of a file;
|
||||
- `hash_algorithm` -- hash algorithm used (e.g. md5).
|
||||
"""
|
||||
name: str
|
||||
size: int
|
||||
download_url: str
|
||||
hash_value: str
|
||||
hash_algorithm: str
|
||||
|
|
|
@ -26,9 +26,9 @@ class DvachParser(Parser):
|
|||
@property
|
||||
def op(self) -> Optional[str]:
|
||||
op = ""
|
||||
if 'sub' in self._op_post:
|
||||
if 'subject' in self._op_post:
|
||||
op = f"{self._op_post['subject']}\n"
|
||||
if 'com' in self._op_post:
|
||||
if 'comment' in self._op_post:
|
||||
op += self._op_post['comment']
|
||||
return op if not op == "" else None
|
||||
|
||||
|
@ -36,7 +36,9 @@ class DvachParser(Parser):
|
|||
if not 'files' in post: return None
|
||||
files = []
|
||||
for f in post['files']:
|
||||
if match(f['fullname'], r"^image\.\w{1,4}$") is None:
|
||||
if 'sticker' in f:
|
||||
continue
|
||||
if match(r"^image\.\w+$", f['fullname']) is None:
|
||||
fullname = f['fullname']
|
||||
else:
|
||||
fullname = f['name']
|
||||
|
|
|
@ -38,7 +38,7 @@ class EightKunParser(Parser):
|
|||
dlfname = f"{post['tim']}{post['ext']}"
|
||||
|
||||
if "filename" in post:
|
||||
if match(post['filename'], r"^image\.\w{1,4}$") is None:
|
||||
if match(r"^image\.\w+$", post['filename']) is None:
|
||||
filename = dlfname
|
||||
else:
|
||||
filename = f"{post['filename']}{post['ext']}"
|
||||
|
@ -52,7 +52,7 @@ class EightKunParser(Parser):
|
|||
for f in post["extra_files"]:
|
||||
dlfname = f"{f['tim']}{f['ext']}"
|
||||
if "filename" in post:
|
||||
if match(post['filename'], r"^image\.\w+$") is None:
|
||||
if match(r"^image\.\w+$", post['filename']) is None:
|
||||
filename = dlfname
|
||||
else:
|
||||
filename = f"{post['filename']}{post['ext']}"
|
||||
|
|
|
@ -38,7 +38,7 @@ class FourChanParser(Parser):
|
|||
dlfname = f"{post['tim']}{post['ext']}"
|
||||
|
||||
if "filename" in post:
|
||||
if match(post['filename'], r"^image\.\w{1,4}$") is None:
|
||||
if match(r"^image\.\w+$", post['filename']) is None:
|
||||
filename = dlfname
|
||||
else:
|
||||
filename = f"{post['filename']}{post['ext']}"
|
||||
|
|
|
@ -41,7 +41,7 @@ class LainchanParser(Parser):
|
|||
dlfname = f"{post['tim']}{post['ext']}"
|
||||
|
||||
if "filename" in post:
|
||||
if match(post['filename'], r"^image\.\w{1,4}$") is None:
|
||||
if match(r"^image\.\w+$", post['filename']) is None:
|
||||
filename = dlfname
|
||||
else:
|
||||
filename = f"{post['filename']}{post['ext']}"
|
||||
|
@ -55,7 +55,7 @@ class LainchanParser(Parser):
|
|||
for f in post["extra_files"]:
|
||||
dlfname = f"{f['tim']}{f['ext']}"
|
||||
if "filename" in post:
|
||||
if match(post['filename'], r"^image\.\w+$") is None:
|
||||
if match(r"^image\.\w+$", post['filename']) is None:
|
||||
filename = dlfname
|
||||
else:
|
||||
filename = f"{post['filename']}{post['ext']}"
|
||||
|
|
|
@ -24,7 +24,7 @@ class LolifoxParser(Parser):
|
|||
@property
|
||||
def imageboard(self) -> str:
|
||||
return "lolifox.cc"
|
||||
|
||||
|
||||
@property
|
||||
def op(self) -> Optional[str]:
|
||||
op = ""
|
||||
|
@ -40,7 +40,7 @@ class LolifoxParser(Parser):
|
|||
dlfname = f"{post['tim']}{post['ext']}"
|
||||
|
||||
if "filename" in post:
|
||||
if match(post['filename'], r"^image\.\w{1,4}$") is None:
|
||||
if match(r"^image\.\w+$", post['filename']) is None:
|
||||
filename = dlfname
|
||||
else:
|
||||
filename = f"{post['filename']}{post['ext']}"
|
||||
|
@ -54,7 +54,7 @@ class LolifoxParser(Parser):
|
|||
for f in post["extra_files"]:
|
||||
dlfname = f"{f['tim']}{f['ext']}"
|
||||
if "filename" in post:
|
||||
if match(post['filename'], r"^image\.\w+$") is None:
|
||||
if match(r"^image\.\w+$", post['filename']) is None:
|
||||
filename = dlfname
|
||||
else:
|
||||
filename = f"{post['filename']}{post['ext']}"
|
||||
|
|
|
@ -5,7 +5,7 @@ from os import remove, stat
|
|||
from os.path import exists, join, getsize
|
||||
import re
|
||||
from typing import List, Callable
|
||||
from urllib.request import urlretrieve, URLopener
|
||||
from urllib.request import urlretrieve, URLopener, HTTPError
|
||||
import hashlib
|
||||
|
||||
from scrapthechan import USER_AGENT
|
||||
|
@ -63,35 +63,46 @@ class Scraper:
|
|||
newname = f"{newname[:lbracket]}({int(num)+1})"
|
||||
return newname
|
||||
|
||||
def _hash_file(self, filename: str, hash_algo: str = "md5",
|
||||
def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
|
||||
blocksize: int = 1048576) -> (str, str):
|
||||
"""Compute hash of a file."""
|
||||
hash_func = hashlib.new(hash_algo)
|
||||
with open(filename, 'rb') as f:
|
||||
hash_func = hashlib.new(hash_algorithm)
|
||||
with open(filepath, 'rb') as f:
|
||||
buf = f.read(blocksize)
|
||||
while len(buf) > 0:
|
||||
hash_func.update(buf)
|
||||
buf = f.read(blocksize)
|
||||
return hash_func.hexdigest(), hash_func.digest()
|
||||
return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
|
||||
|
||||
def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
|
||||
def _check_file(self, f: FileInfo, filepath: str) -> bool:
|
||||
"""Check if a file exist and isn't broken."""
|
||||
if not exists(filepath):
|
||||
return False
|
||||
computed_size = getsize(filepath)
|
||||
is_size_match = f.size == computed_size \
|
||||
or f.size == round(computed_size / 1024)
|
||||
hexdig, dig = self._hash_file(filepath, f.hash_algo)
|
||||
is_hash_match = f.hash_value == hexdig \
|
||||
or f.hash_value == b64encode(dig).decode()
|
||||
return is_size_match and is_hash_match
|
||||
if not (f.size == computed_size \
|
||||
or f.size == round(computed_size / 1024)):
|
||||
return False
|
||||
hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
|
||||
return f.hash_value == hexdig or f.hash_value == dig
|
||||
|
||||
def _download_file(self, f: FileInfo):
|
||||
"""Download a single file."""
|
||||
is_same_filename = False
|
||||
filepath = join(self._save_directory, f.name)
|
||||
if self._is_file_ok(f, filepath):
|
||||
return True
|
||||
orig_filepath = filepath
|
||||
if self._check_file(f, filepath):
|
||||
return
|
||||
elif exists(filepath):
|
||||
is_same_filename = True
|
||||
filepath = join(self._save_directory, \
|
||||
self._same_filename(f.name, self._save_directory))
|
||||
self._url_opener.retrieve(f.dlurl, filepath)
|
||||
try:
|
||||
self._url_opener.retrieve(f.download_url, filepath)
|
||||
if is_same_filename:
|
||||
f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
|
||||
f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
|
||||
assert filepath != orig_filepath, 'Filepaths are matching!'
|
||||
if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
|
||||
remove(filepath)
|
||||
except HTTPError as e:
|
||||
print(e, f.download_url)
|
||||
|
|
Loading…
Reference in New Issue