ScrapTheChan/scrapthechan/cli/scraper.py

from argparse import ArgumentParser
from os import makedirs
from os.path import join, exists
from re import search
from sys import argv
from typing import List

from scrapthechan import VERSION
from scrapthechan.parser import Parser, ParserThreadNotFoundError
from scrapthechan.parsers import get_parser_by_url, get_parser_by_site, \
    SUPPORTED_IMAGEBOARDS
from scrapthechan.scrapers.basicscraper import BasicScraper

__all__ = ["main"]

USAGE = \
"""Usage: scrapthechan [OPTIONS] (URL|)
Options:
\t-h,--help -- print this help and exit;
\t-v,--version -- print program's version and exit;
\t-o,--output-dir -- directory where scraped files will be placed. By default
\t the following structure will be created in the current directory:
\t <imageboard>/<board>/<thread>;
\t-N,--no-op -- by default the OP's post is written to the !op.txt file. This
\t option disables that behaviour;
Supported imageboards: 4chan.org, 2ch.hk, lainchan.org
"""
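
# Example invocations (the thread ID below is made up; it only illustrates the
# two accepted forms -- a full thread URL, or "<site> <board> <thread>"):
#   scrapthechan https://boards.4chan.org/g/thread/12345678
#   scrapthechan 4chan.org g 12345678 -o g-12345678 --no-op
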
def parse_common_arguments(args: str) -> dict:
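    """Look for the -h/--help and -v/--version flags in the raw argument
    string.

    Returns a dict with boolean "help" and "version" keys when either flag is
    present, otherwise None.
    """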
    r = r"(?P<help>-h|--help)|(?P<version>-v|--version)"
    argd = search(r, args)
    if argd is not None:
        argd = argd.groupdict()
        return {
            "help": argd["help"] is not None,
            "version": argd["version"] is not None }
    return None

def parse_arguments(args: str) -> dict:
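    """Extract the thread location and scraper options from the raw argument
    string.

    The thread may be given as a full URL or as "<site> <board> <thread>";
    any field that cannot be matched is returned as None.
    """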
    rlink = r"^(https?:\/\/)?(?P<site>[\w.-]+)[ \/](?P<board>\w+)(\S+)?[ \/](?P<thread>\w+)"
    link = search(rlink, args)
    if link is not None:
        link = link.groupdict()
    out_dir = search(r"(?=(-o|--output-dir) (?P<outdir>\S+))", args)
    return {
        "site": None if link is None else link["site"],
        "board": None if link is None else link["board"],
        "thread": None if link is None else link["thread"],
        "no-op": search(r"-N|--no-op", args) is not None,
        "output-dir": None if out_dir is None
            else out_dir.groupdict()["outdir"] }

def main() -> None:
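    """Entry point: handle -h/-v, pick a parser for the requested imageboard,
    optionally save the OP's post, and download every file in the thread.
    """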
    cargs = parse_common_arguments(' '.join(argv[1:]))
    if cargs is not None:
        if cargs["help"]:
            print(USAGE)
            exit()
        elif cargs["version"]:
            print(VERSION)
            exit()

    args = parse_arguments(' '.join(argv[1:]))
    if args is None or args["site"] is None \
            or args["board"] is None or args["thread"] is None:
        print(USAGE)
        exit()

    # Pick a parser for the detected imageboard and fetch the thread.
    try:
        parser = get_parser_by_site(args["site"], args["board"], args["thread"])
    except NotImplementedError as ex:
        print(f"{str(ex)}.")
        print(f"Supported image boards are {', '.join(SUPPORTED_IMAGEBOARDS)}")
        exit()
    except ParserThreadNotFoundError:
        print("Thread no longer exists.")
        exit()

    flen = len(parser.files)
    print(f"There are {flen} files in this thread.")

    if args["output-dir"] is not None:
        save_dir = args["output-dir"]
    else:
        save_dir = join(parser.imageboard, parser.board,
            parser.thread)
    print(f"They will be saved in {save_dir}.")

    makedirs(save_dir, exist_ok=True)

    # Write the OP's post to !op.txt unless -N/--no-op was given.
    if not args["no-op"]:
        print("Writing OP... ", end='')
        if not exists(join(save_dir, "!op.txt")):
            with open(join(save_dir, "!op.txt"), 'w') as opf:
                opf.write(f"{parser.op}\n")
            print("Done.")
        else:
            print("Exists.")

    # Download all files, reporting "<downloaded>/<total>" progress.
    scraper = BasicScraper(save_dir, parser.files,
        lambda i: print(f"{i}/{flen}", end="\r"))
    scraper.run()

if __name__ == "__main__":
    main()
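
# The module can also be run directly, assuming the scrapthechan package is
# importable (the thread URL is again only an illustration):
#   python -m scrapthechan.cli.scraper https://boards.4chan.org/g/thread/12345678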