Initial commit with all the files.
commit a5028162d8
5 .gitignore vendored Normal file
@@ -0,0 +1,5 @@
.vscode/
build/
dist/
*.egg-info/
__pycache__
11 CHANGELOG.md Normal file
@@ -0,0 +1,11 @@
# Changelog

## 0.1.0 - 2020-07-08

### Added

- JSON parsers for 4chan.org, lainchan.org and 2ch.hk.
- Basic straightforward scraper that downloads files one by one.

### Issues

- 2ch.hk: I can't figure out what exactly it reports as the size and hash of a
file. Example: a file may have a size of 127798 bytes (125K), but 2ch reports
150, and the reported hash doesn't equal the computed one.
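
For anyone poking at the same discrepancy, a minimal check like the one below
compares a local file's byte size, KiB-rounded size, hex MD5, and
base64-encoded MD5 against whatever the API reports. The file path and the
reported values are hypothetical placeholders, not taken from 2ch.hk.

```python
"""Sketch: compare a downloaded file against size/hash values from an API."""
from base64 import b64encode
from hashlib import md5
from os.path import getsize

path = "example.jpg"     # hypothetical local file
reported_size = 150      # value as it appears in the thread JSON
reported_hash = "..."    # value as it appears in the thread JSON

digest = md5(open(path, "rb").read())
print("bytes:", getsize(path), "KiB (rounded):", round(getsize(path) / 1024))
print("size match:", reported_size in (getsize(path), round(getsize(path) / 1024)))
print("hash match:", reported_hash in (digest.hexdigest(),
	b64encode(digest.digest()).decode()))
```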
21 COPYING Normal file
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2020 Alexander "Arav" Andreev <me@arav.top>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
13 Makefile Normal file
@@ -0,0 +1,13 @@
build: scrapthechan README.md setup.cfg
	python setup.py sdist bdist_wheel

install:
	python -m pip install --upgrade dist/scrapthechan-0.1.0-py3-none-any.whl --user

uninstall:
	# We change directory so pip uninstall will run; it'll fail otherwise.
	# (Each recipe line runs in its own shell, so the cd and the uninstall
	# have to be a single command.)
	@cd ~/ && python -m pip uninstall scrapthechan

clean:
	rm -rf __pycache__ scrapthechan/__pycache__ scrapthechan/parsers/__pycache__ \
		scrapthechan.egg-info build

.PHONY: build install uninstall clean
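
For reference, the intended flow with these targets would be something like
the following; the `--user` install matches what the install target does:

```bash
make build      # produce sdist and wheel under dist/
make install    # pip-install the built wheel for the current user
```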
33 README.md Normal file
@@ -0,0 +1,33 @@
This is a tool for scraping files from imageboards' threads.

It extracts the files from a JSON version of a thread and downloads them
into a specified output directory. If none is specified, it creates the
following directory hierarchy in the working directory:

    <imageboard name>
    |-<board name>
      |-<thread>
        |-[!op.txt]
        |-...
        |-...

# Usage

```bash
scrapthechan [<url> | <imageboard> <board> <thread>] [-o,--output-dir] [--no-op]
             [-v,--version] [-h,--help]
```

There are two ways to pass a thread. One is by passing a full URL of a thread
(the `<url>` argument); the other is by passing the thread in three components:
`<imageboard>` is the name of the website (e.g. 4chan), `<board>` is the name
of a board (e.g. wg), and `<thread>` is the number of a thread on that board.
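
Both forms below would scrape the same hypothetical thread (the thread number
is made up for illustration):

```bash
scrapthechan https://boards.4chan.org/wg/thread/12345678
scrapthechan 4chan wg 12345678 -o wallpapers
```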

`-o`, `--output-dir` -- the output directory where all the files will be saved.

`--no-op` -- by default OP's post is saved in a `!op.txt` file; this flag
disables that behaviour. I decided to put a `!` in the name so the file stays
at the top of a directory listing.

`-v`, `--version` prints the version of the program, and `-h`, `--help` prints
its help message.
13 scrapthechan/__init__.py Normal file
@@ -0,0 +1,13 @@
__date__ = "8 July 2020"
__version__ = "0.1.0"
__author__ = "Alexander \"Arav\" Andreev"
__email__ = "me@arav.top"
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"
__license__ = \
"""This program is licensed under the terms of the MIT license.
For a copy see COPYING file in a directory of the program, or
see <https://opensource.org/licenses/MIT>"""

VERSION = \
	f"ScrapTheChan ver. {__version__} ({__date__})\n\n{__copyright__}\n"\
	f"\n{__license__}"
0 scrapthechan/cli/__init__.py Normal file
116 scrapthechan/cli/scraper.py Normal file
@@ -0,0 +1,116 @@
from argparse import ArgumentParser
from os import makedirs
from os.path import join, exists
from re import search
from sys import argv
from typing import List

from scrapthechan import VERSION
from scrapthechan.parser import Parser, ParserThreadNotFoundError
from scrapthechan.parsers import get_parser_by_url, get_parser_by_site, \
	SUPPORTED_IMAGEBOARDS
from scrapthechan.scrapers.basicscraper import BasicScraper


__all__ = ["main"]


USAGE = \
"""Usage: scrapthechan [OPTIONS] (URL | IMAGEBOARD BOARD THREAD)

Options:
\t-h,--help -- print this help and exit;
\t-v,--version -- print program's version and exit;
\t-o,--output-dir -- directory where to place scraped files. By default
\t    the following structure will be created in the current directory:
\t    <imageboard>/<board>/<thread>;
\t-N,--no-op -- by default OP's post will be written in a !op.txt file.
\t    This option disables this behaviour;

Supported imageboards: 4chan.org, 2ch.hk, lainchan.org
"""


def parse_common_arguments(args: str) -> dict:
	"""Parses the help and version flags that short-circuit everything else."""
	r = r"(?P<help>-h|--help)|(?P<version>-v|--version)"
	argd = search(r, args)
	if argd is not None:
		argd = argd.groupdict()
		return {
			"help": argd["help"] is not None,
			"version": argd["version"] is not None }
	return None

def parse_arguments(args: str) -> dict:
	"""Parses a thread URL or an `imageboard board thread` triple, along
	with the -o/--output-dir and -N/--no-op options."""
	rlink = r"^(https?:\/\/)?(?P<site>[\w.-]+)[ \/](?P<board>\w+)(\S+)?[ \/](?P<thread>\w+)"
	link = search(rlink, args)
	if link is not None:
		link = link.groupdict()
	out_dir = search(r"(?=(-o|--output-dir) (?P<outdir>\S+))", args)
	return {
		"site": None if link is None else link["site"],
		"board": None if link is None else link["board"],
		"thread": None if link is None else link["thread"],
		"no-op": search(r"-N|--no-op", args) is not None,
		"output-dir": None if out_dir is None \
			else out_dir.groupdict()["outdir"] }

def main() -> None:
	cargs = parse_common_arguments(' '.join(argv[1:]))
	if cargs is not None:
		if cargs["help"]:
			print(USAGE)
			exit()
		elif cargs["version"]:
			print(VERSION)
			exit()

	args = parse_arguments(' '.join(argv[1:]))
	if args is None \
		or args["site"] is None or args["board"] is None \
		or args["thread"] is None:
		print(USAGE)
		exit()

	try:
		parser = get_parser_by_site(args["site"], args["board"], args["thread"])
	except NotImplementedError as ex:
		print(f"{str(ex)}.")
		print(f"Supported image boards are {', '.join(SUPPORTED_IMAGEBOARDS)}")
		exit()
	except ParserThreadNotFoundError:
		print("Thread no longer exists.")
		exit()

	flen = len(parser.files)

	print(f"There are {flen} files in this thread.")

	if args["output-dir"] is not None:
		save_dir = args["output-dir"]
	else:
		save_dir = join(parser.imageboard, parser.board,
			parser.thread)

	print(f"They will be saved in {save_dir}.")

	makedirs(save_dir, exist_ok=True)

	if not args["no-op"]:
		print("Writing OP... ", end='')
		if not exists(join(save_dir, "!op.txt")):
			with open(join(save_dir, "!op.txt"), 'w') as opf:
				opf.write(f"{parser.op}\n")
			print("Done.")
		else:
			print("Exists.")

	scraper = BasicScraper(save_dir, parser.files, \
		lambda i: print(f"{i}/{flen}", end="\r"))
	scraper.run()


if __name__ == "__main__":
	main()
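
A quick interactive check of the regex-based argument parsing above; the URL
and thread number are hypothetical:

```python
from scrapthechan.cli.scraper import parse_arguments

args = parse_arguments("https://boards.4chan.org/wg/thread/12345678 -o wp")
print(args["site"], args["board"], args["thread"], args["output-dir"])
# boards.4chan.org wg 12345678 wp
```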
23 scrapthechan/fileinfo.py Normal file
@@ -0,0 +1,23 @@
"""FileInfo object stores all needed information about a file."""


__all__ = ["FileInfo"]


class FileInfo:
	"""Stores all needed information about a file.

	Arguments:
	- `name` -- name of a file;
	- `size` -- size of a file;
	- `dlurl` -- full download URL for a file;
	- `hash_value` -- hash sum of a file;
	- `hash_algo` -- hash algorithm used (e.g. md5).
	"""
	def __init__(self, name: str, size: int, dlurl: str,
			hash_value: str, hash_algo: str) -> None:
		self.name = name
		self.size = size
		self.dlurl = dlurl
		self.hash_value = hash_value
		self.hash_algo = hash_algo
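
Constructing one by hand looks like this; all the values are hypothetical and
would normally come from a parser:

```python
from scrapthechan.fileinfo import FileInfo

f = FileInfo("wallpaper.jpg", 127798,
	"https://i.4cdn.org/wg/1594200000000.jpg",
	"d41d8cd98f00b204e9800998ecf8427e", "md5")
print(f.name, f.size, f.hash_algo)
```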
81 scrapthechan/parser.py Normal file
@@ -0,0 +1,81 @@
"""Base `Parser` class for JSON parsers to inherit."""

from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, urlretrieve

from scrapthechan.fileinfo import FileInfo


__all__ = ["Parser", "ParserThreadNotFoundError"]


class ParserThreadNotFoundError(Exception):
	pass


class Parser:
	"""Base class for all parsers.

	It fetches the JSON of a specified thread and collects all the files
	from it into a list of `FileInfo` objects.
	It also extracts OP's post, which may come in handy for bulk scraping.

	Arguments:
	board -- name of a board on an imageboard;
	thread -- name of a thread inside a board;
	posts -- list of posts as dictionaries exported from JSON;
	skip_posts -- number of posts to skip.

	All the extracted files will be stored as `FileInfo` objects."""

	__url_thread_json: str = "https://example.org/{board}/{thread}.json"
	__url_file_link: str = None

	def __init__(self, board: str, thread: str, posts: List[dict],
			skip_posts: Optional[int] = None) -> None:
		self._board = board
		self._thread = thread
		self._op_post = posts[0]
		if skip_posts is not None:
			posts = posts[skip_posts:]
		self._files = list(chain.from_iterable(filter(None, \
			map(self._parse_post, posts))))

	@property
	def imageboard(self) -> str:
		"""Returns image board's name."""
		raise NotImplementedError

	@property
	def board(self) -> str:
		"""Returns a name of a board of an image board."""
		return self._board

	@property
	def thread(self) -> str:
		"""Returns a name of a thread from a board."""
		return self._thread

	@property
	def op(self) -> str:
		"""Returns OP's post as a combination of subject and comment
		separated by a new line."""
		raise NotImplementedError

	@property
	def files(self) -> List[FileInfo]:
		"""Returns a list of retrieved files as `FileInfo` objects."""
		return self._files

	def _get_json(self, thread_url: str) -> dict:
		"""Gets the JSON version of a thread and converts it to a dictionary."""
		try:
			with urlopen(thread_url) as url:
				return loads(url.read().decode('utf-8'))
		except Exception:
			raise ParserThreadNotFoundError

	def _parse_post(self, post: dict) -> List[FileInfo]:
		"""Parses a single post and extracts files into `FileInfo` objects."""
		raise NotImplementedError
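
The subclassing contract is easiest to see as a minimal sketch; the class
name and JSON layout here are hypothetical, and the concrete parsers below
are the real implementations:

```python
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser

class ExampleChanParser(Parser):
	"""Hypothetical parser: fetch the thread JSON, hand the post list to
	the base class, and implement the abstract members."""
	__url_thread_json = "https://example.org/{board}/res/{thread}.json"

	def __init__(self, board: str, thread: str,
			skip_posts: Optional[int] = None) -> None:
		posts = self._get_json(self.__url_thread_json.format(
			board=board, thread=thread))['posts']
		super().__init__(board, thread, posts, skip_posts)

	@property
	def imageboard(self) -> str:
		return "example.org"

	def _parse_post(self, post: dict) -> Optional[List[FileInfo]]:
		return None  # extract FileInfo objects from a post dict here
```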
34 scrapthechan/parsers/__init__.py Normal file
@@ -0,0 +1,34 @@
"""Here are defined the JSON parsers for imageboards."""
from re import search
from typing import List

from scrapthechan.parser import Parser


__all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"]


SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk"]


def get_parser_by_url(url: str) -> Parser:
	"""Parses a URL to extract the site name, board and thread, then returns
	an initialised Parser object for the detected imageboard."""
	URLRX = r"https?:\/\/(?P<s>[\w\.]+)\/(?P<b>\w+)\/(?:\w+)?\/(?P<t>\w+)"
	site, board, thread = search(URLRX, url).groups()
	return get_parser_by_site(site, board, thread)

def get_parser_by_site(site: str, board: str, thread: str) -> Parser:
	"""Returns an initialised parser for `site` with `board` and `thread`."""
	if site in ['boards.4chan.org', 'boards.4channel.org',
		'4chan', '4chan.org']:
		from .fourchan import FourChanParser
		return FourChanParser(board, thread)
	elif site in ['lainchan.org', 'lainchan']:
		from .lainchan import LainchanParser
		return LainchanParser(board, thread)
	elif site in ['2ch.hk', '2ch']:
		from .dvach import DvachParser
		return DvachParser(board, thread)
	else:
		raise NotImplementedError(f"Parser for {site} is not implemented")
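
Selecting a parser from a full thread URL then comes down to one call; the
board and thread number here are hypothetical:

```python
from scrapthechan.parsers import get_parser_by_url

parser = get_parser_by_url("https://lainchan.org/art/res/1234.html")
print(parser.imageboard, parser.board, parser.thread)  # lainchan.org art 1234
```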
43 scrapthechan/parsers/dvach.py Normal file
@@ -0,0 +1,43 @@
from re import match
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser

__all__ = ["DvachParser"]


class DvachParser(Parser):
	"""JSON parser for 2ch.hk image board."""

	__url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
	__url_file_link = "https://2ch.hk"

	def __init__(self, board: str, thread: str,
			skip_posts: Optional[int] = None) -> None:
		posts = self._get_json(self.__url_thread_json.format(board=board, \
			thread=thread))['threads'][0]['posts']
		super(DvachParser, self).__init__(board, thread, posts, skip_posts)

	@property
	def imageboard(self) -> str:
		return "2ch.hk"

	@property
	def op(self) -> str:
		return f"{self._op_post['subject']}\n{self._op_post['comment']}"

	def _parse_post(self, post) -> Optional[List[FileInfo]]:
		if 'files' not in post: return None
		files = []
		for f in post['files']:
			# re.match takes the pattern first; keep the original name unless
			# it is just a generic "image.<ext>" placeholder.
			if match(r"^image\.\w+$", f['fullname']) is None:
				fullname = f['fullname']
			else:
				fullname = f['name']
			# Here's same thing as 4chan. 2ch.hk also has md5 field, so it is
			# completely fine to hardcode `hash_algo`.
			files.append(FileInfo(fullname, f['size'],
				f"{self.__url_file_link}{f['path']}",
				f['md5'], 'md5'))
		return files
49 scrapthechan/parsers/fourchan.py Normal file
@@ -0,0 +1,49 @@
from re import match
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser

__all__ = ["FourChanParser"]


class FourChanParser(Parser):
	"""JSON parser for 4chan.org image board."""

	__url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json"
	__url_file_link = "https://i.4cdn.org/{board}/{filename}"

	def __init__(self, board: str, thread: str,
			skip_posts: Optional[int] = None) -> None:
		posts = self._get_json(self.__url_thread_json.format(board=board, \
			thread=thread))['posts']
		super(FourChanParser, self).__init__(board, thread, posts, skip_posts)

	@property
	def imageboard(self) -> str:
		return "4chan.org"

	@property
	def op(self) -> str:
		if 'sub' in self._op_post:
			return f"{self._op_post['sub']}\n{self._op_post['com']}"
		else:
			return self._op_post['com']

	def _parse_post(self, post: dict) -> List[FileInfo]:
		if 'tim' not in post: return None

		dlfname = f"{post['tim']}{post['ext']}"

		# re.match takes the pattern first; keep the original name unless it
		# is a generic "image.<ext>" placeholder or missing entirely.
		if "filename" in post \
			and match(r"^image\.\w+$", post['filename']) is None:
			filename = f"{post['filename']}{post['ext']}"
		else:
			filename = dlfname

		# Hash algorithm is hardcoded since it is highly unlikely that it will
		# be changed in the foreseeable future. And if it changes, this line
		# will necessarily be updated anyway.
		return [FileInfo(filename, post['fsize'],
			self.__url_file_link.format(board=self.board, filename=dlfname),
			post['md5'], 'md5')]
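
Note that the `md5` field in 4chan's JSON is a base64-encoded digest rather
than the usual hex string, which is why `_is_file_ok` in scraper.py later in
this commit compares both forms. A small verification sketch, with a
hypothetical file name and reported value:

```python
from base64 import b64encode
from hashlib import md5

def matches_reported_md5(path: str, reported: str) -> bool:
	# Compare the base64 form of our digest with the API's value.
	with open(path, "rb") as f:
		return b64encode(md5(f.read()).digest()).decode() == reported

print(matches_reported_md5("1594200000000.jpg", "1B2M2Y8AsgTpgAmY7PhCfg=="))
```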
57 scrapthechan/parsers/lainchan.py Normal file
@@ -0,0 +1,57 @@
from re import match
from typing import List, Optional

from scrapthechan.parser import Parser
from scrapthechan.fileinfo import FileInfo

__all__ = ["LainchanParser"]


class LainchanParser(Parser):
	"""JSON parser for lainchan.org image board.
	Its JSON structure is identical to 4chan.org's, so this parser mirrors
	the 4chan.org one, redefining only the URLs and the handling of a post's
	extra files.
	"""

	__url_thread_json = "https://lainchan.org/{board}/res/{thread}.json"
	__url_file_link = "https://lainchan.org/{board}/src/{filename}"

	def __init__(self, board: str, thread: str,
			skip_posts: Optional[int] = None) -> None:
		posts = self._get_json(self.__url_thread_json.format(board=board, \
			thread=thread))['posts']
		super(LainchanParser, self).__init__(board, thread, posts, skip_posts)

	@property
	def imageboard(self) -> str:
		return "lainchan.org"

	def _parse_post(self, post) -> List[FileInfo]:
		if 'tim' not in post: return None

		dlfname = f"{post['tim']}{post['ext']}"

		# Keep the original name unless it is a generic "image.<ext>"
		# placeholder or missing entirely.
		if "filename" in post \
			and match(r"^image\.\w+$", post['filename']) is None:
			filename = f"{post['filename']}{post['ext']}"
		else:
			filename = dlfname

		files = []
		files.append(FileInfo(filename, post['fsize'],
			self.__url_file_link.format(board=self.board, filename=dlfname),
			post['md5'], 'md5'))

		if "extra_files" in post:
			for f in post["extra_files"]:
				dlfname = f"{f['tim']}{f['ext']}"
				# Same naming rule, but against the extra file's own fields,
				# not the post's.
				if "filename" in f \
					and match(r"^image\.\w+$", f['filename']) is None:
					filename = f"{f['filename']}{f['ext']}"
				else:
					filename = dlfname
				dlurl = self.__url_file_link.format(board=self.board, \
					filename=dlfname)
				files.append(FileInfo(filename, f['fsize'], \
					dlurl, f['md5'], 'md5'))
		return files
96 scrapthechan/scraper.py Normal file
@@ -0,0 +1,96 @@
"""Base Scraper implementation."""

from base64 import b64encode
from os import remove, stat
from os.path import exists, join, getsize
import re
from typing import List, Callable
from urllib.request import urlretrieve, URLopener
import hashlib

from scrapthechan import __version__
from scrapthechan.fileinfo import FileInfo

__all__ = ["Scraper"]


class Scraper:
	"""Base scraper implementation.

	Arguments:
	save_directory -- a path to a directory where files will be saved;
	files -- a list of FileInfo objects;
	download_progress_callback -- a callback function that will be called
		for each file started downloading.
	"""
	def __init__(self, save_directory: str, files: List[FileInfo],
			download_progress_callback: Callable[[int], None] = None) -> None:
		self._save_directory = save_directory
		self._files = files
		self._url_opener = URLopener()
		self._url_opener.version = f"ScrapTheChan/{__version__}"
		self._progress_callback = download_progress_callback

	def run(self):
		raise NotImplementedError

	def _same_filename(self, filename: str, path: str) -> str:
		"""Check if there is a file with the same name. If so, add an
		incremental number enclosed in parentheses to the name of the new
		one, e.g. file.jpg -> file(1).jpg -> file(2).jpg."""
		newname = filename
		while exists(join(path, newname)):
			has_extension = newname.rfind(".") != -1
			if has_extension:
				l, r = newname.rsplit(".", 1)
				lbracket = l.rfind("(")
				if lbracket == -1:
					newname = f"{l}(1).{r}"
				else:
					num = l[lbracket+1:-1]
					if num.isnumeric():
						newname = f"{l[:lbracket]}({int(num)+1}).{r}"
					else:
						newname = f"{l}(1).{r}"
			else:
				lbracket = newname.rfind("(")
				if lbracket == -1:
					newname = f"{newname}(1)"
				else:
					num = newname[lbracket+1:-1]
					if num.isnumeric():
						newname = f"{newname[:lbracket]}({int(num)+1})"
					else:
						newname = f"{newname}(1)"
		return newname

	def _hash_file(self, filename: str, hash_algo: str = "md5",
			blocksize: int = 1048576) -> (str, str):
		"""Compute a hash of a file, returning both hex and raw digests."""
		hash_func = hashlib.new(hash_algo)
		with open(filename, 'rb') as f:
			buf = f.read(blocksize)
			while len(buf) > 0:
				hash_func.update(buf)
				buf = f.read(blocksize)
		return hash_func.hexdigest(), hash_func.digest()

	def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
		"""Check that a file exists and isn't broken. The size is also
		compared against its KiB-rounded form, and the hash against its
		base64 form, to accommodate how different imageboards report them."""
		if not exists(filepath):
			return False
		computed_size = getsize(filepath)
		is_size_match = f.size == computed_size \
			or f.size == round(computed_size / 1024)
		hexdig, dig = self._hash_file(filepath, f.hash_algo)
		is_hash_match = f.hash_value == hexdig \
			or f.hash_value == b64encode(dig).decode()
		return is_size_match and is_hash_match

	def _download_file(self, f: FileInfo):
		"""Download a single file."""
		filepath = join(self._save_directory, f.name)
		if self._is_file_ok(f, filepath):
			return True
		elif exists(filepath):
			filepath = join(self._save_directory, \
				self._same_filename(f.name, self._save_directory))
		self._url_opener.retrieve(f.dlurl, filepath)
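
The renaming scheme is easiest to see in isolation; here is a throwaway check
against a temporary directory (the file names are made up):

```python
from os.path import join
from tempfile import TemporaryDirectory

from scrapthechan.scraper import Scraper

with TemporaryDirectory() as d:
	s = Scraper(d, [])
	open(join(d, "cat.jpg"), "w").close()
	print(s._same_filename("cat.jpg", d))  # cat(1).jpg
	open(join(d, "cat(1).jpg"), "w").close()
	print(s._same_filename("cat.jpg", d))  # cat(2).jpg
```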
0 scrapthechan/scrapers/__init__.py Normal file
15 scrapthechan/scrapers/basicscraper.py Normal file
@@ -0,0 +1,15 @@
"""Implementation of a basic sequential single-threaded scraper that
downloads files one by one."""

from scrapthechan.scraper import Scraper

__all__ = ["BasicScraper"]


class BasicScraper(Scraper):
	def run(self):
		"""Download files one by one."""
		for i, f in enumerate(self._files, start=1):
			if self._progress_callback is not None:
				self._progress_callback(i)
			self._download_file(f)
42 setup.cfg Normal file
@@ -0,0 +1,42 @@
[metadata]
name = scrapthechan
version = attr: scrapthechan.__version__
description =
	Scrape the files posted in a thread on an imageboard. Currently supports
	4chan.org, lainchan.org and 2ch.hk.
long_description = file: README.md
long_description_content_type = text/markdown
author = Alexander "Arav" Andreev
author_email = me@arav.top
url = https://arav.top
keywords =
	scraper
	imageboard
	4chan
	2ch
	lainchan
license = MIT
license_file = COPYING
classifiers =
	Development Status :: 2 - Pre-Alpha
	Environment :: Console
	Intended Audience :: End Users/Desktop
	License :: OSI Approved :: MIT License
	Natural Language :: English
	Operating System :: OS Independent
	Programming Language :: Python :: 3.7
	Programming Language :: Python :: 3.8
	Topic :: Utilities

[options]
zip_safe = False
python_requires = >=3.7
include_package_data = True
packages = find:

[options.package_data]
* = COPYING, README.md

[options.entry_points]
console_scripts =
	scrapthechan = scrapthechan.cli.scraper:main