from os import makedirs
from os.path import join, exists
from re import search
from sys import argv, exit
from typing import Optional

from scrapthechan import VERSION
from scrapthechan.parser import ThreadNotFoundError
from scrapthechan.parsers import get_parser_by_site, SUPPORTED_IMAGEBOARDS
from scrapthechan.scrapers.threadedscraper import ThreadedScraper


__all__ = ["main"]


USAGE: str = \
f"""Usage: scrapthechan [OPTIONS] (URL | IMAGEBOARD BOARD THREAD)

Options:
\t-h,--help -- print this help and exit;
\t-v,--version -- print the program's version and exit;
\t-o,--output-dir -- directory where to place scraped files. By default
\t the following structure will be created in the current directory:
\t <imageboard>/<board>/<thread>;
\t-N,--no-op -- by default OP's post will be written to the !op.txt file.
\t This option disables that behaviour;
\t-S,--skip-posts <num> -- skip the given number of posts.

Arguments:
\tURL -- URL of a thread;
\tIMAGEBOARD -- name of an imageboard. E.g. 4chan;
\tBOARD -- short name of a board. E.g. b;
\tTHREAD -- ID of a thread. E.g. 100500.

Supported imageboards: {', '.join(SUPPORTED_IMAGEBOARDS)}.
"""


def parse_common_arguments(args: str) -> Optional[dict]:
    """Parse the -h/--help and -v/--version flags out of `args`.

    Returns a dict with boolean "help" and "version" keys, or None
    if neither flag is present."""
    r = r"(?P<help>-h|--help)|(?P<version>-v|--version)"
    match = search(r, args)
    if match is not None:
        flags = match.groupdict()
        return {
            "help": flags["help"] is not None,
            "version": flags["version"] is not None }
    return None
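
# Hand-checked examples of the flag parsing above (not doctests):
#     parse_common_arguments("-h")        -> {"help": True, "version": False}
#     parse_common_arguments("--version") -> {"help": False, "version": True}
#     parse_common_arguments("-o out")    -> None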


def parse_arguments(args: str) -> dict:
    """Parse a thread reference (either a URL or an `IMAGEBOARD BOARD
    THREAD` triple) and the scraping options out of `args`.

    All keys are always present; a value is None (False for "no-op")
    when the corresponding piece wasn't found in `args`."""
    rlink = r"^(https?:\/\/)?(?P<site>[\w.-]+)[ \/](?P<board>\w+)(\S+)?[ \/](?P<thread>\w+)"
    link = search(rlink, args)
    if link is not None:
        link = link.groupdict()
    out_dir = search(r"(?=(-o|--output-dir) (?P<outdir>\S+))", args)
    skip_posts = search(r"(?=(-S|--skip-posts) (?P<skip>\d+))", args)
    return {
        "site": None if link is None else link["site"],
        "board": None if link is None else link["board"],
        "thread": None if link is None else link["thread"],
        "skip-posts": None if skip_posts is None else int(skip_posts.group("skip")),
        "no-op": search(r"-N|--no-op", args) is not None,
        "output-dir": None if out_dir is None else out_dir.group("outdir") }
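
# Hand-traced examples of what the link regex extracts:
#     parse_arguments("4chan b 100500")
#         -> site "4chan", board "b", thread "100500"
#     parse_arguments("https://boards.4chan.org/b/thread/100500 -N")
#         -> site "boards.4chan.org", board "b", thread "100500", no-op True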


def main() -> None:
    if len(argv) == 1:
        print(USAGE)
        exit()

    # -h/--help and -v/--version take priority over everything else.
    cargs = parse_common_arguments(' '.join(argv[1:]))
    if cargs is not None:
        if cargs["help"]:
            print(USAGE)
            exit()
        elif cargs["version"]:
            print(VERSION)
            exit()

    args = parse_arguments(' '.join(argv[1:]))
    # parse_arguments() always returns all the keys, so check the values:
    # they are None when no thread reference was found on the command line.
    if args["site"] is None or args["board"] is None \
            or args["thread"] is None:
        print(USAGE)
        exit()

    try:
        if args["skip-posts"] is not None:
            parser = get_parser_by_site(args["site"], args["board"],
                args["thread"], args["skip-posts"])
        else:
            parser = get_parser_by_site(args["site"], args["board"],
                args["thread"])
    except NotImplementedError as ex:
        print(f"{str(ex)}.")
        print(f"Supported image boards are {', '.join(SUPPORTED_IMAGEBOARDS)}.")
        exit()
    except ThreadNotFoundError as e:
        print(f"Thread {args['site']}/{args['board']}/{args['thread']} " \
            f"not found. Reason: {e.reason}")
        exit()

    files_count = len(parser.files)

    if args["output-dir"] is not None:
        save_dir = args["output-dir"]
    else:
        save_dir = join(parser.imageboard, parser.board, parser.thread)

    print(f"{files_count} files in " \
        f"{args['site']}/{args['board']}/{args['thread']}. " \
        f"They're going to {save_dir}. ", end="")

    makedirs(save_dir, exist_ok=True)

    # Write OP's post into !op.txt unless -N/--no-op was given or the
    # file is already there from a previous run.
    if not args["no-op"]:
        if parser.op is None:
            print("OP's empty.")
        elif not exists(join(save_dir, "!op.txt")):
            with open(join(save_dir, "!op.txt"), 'w', encoding='utf-8') as opf:
                opf.write(f"{parser.op}\n")
            print("OP's written.")
        else:
            print("OP exists.")

    scraper = ThreadedScraper(save_dir, parser.files,
        lambda i: print(f"{i}/{files_count}", end="\r"))
    scraper.run()


if __name__ == "__main__":
    main()
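
# A sample session might look like this (the message format comes from
# main() above; the file count is made up):
#
#     $ scrapthechan 4chan b 100500
#     42 files in 4chan/b/100500. They're going to 4chan/b/100500. OP's written.
#     42/42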