commit a5028162d82aba03ef90e5b81e78f5d1a6cad5ac
Author: Alexander "Arav" Andreev <me@arav.top>
Date:   Wed Jul 8 22:53:39 2020 +0400

    Initial commit with all the files.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7bbedd5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.vscode/
+build/
+dist/
+*.egg-info/
+__pycache__
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..db86886
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,11 @@
+# Changelog
+
+## 0.1.0 - 2020-07-08
+### Added
+- JSON parsers for 4chan.org, lainchan.org and 2ch.hk.
+- Basic straightforward scraper that downloads files one by one.
+
+### Issues
+- 2ch.hk: I can't figure out what exactly it reports as the size and hash of
+  a file. Example: a file may have a size of 127798 bytes (125K), but 2ch
+  reports 150, and the reported hash doesn't equal the computed one.
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..be646f1
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2020 Alexander "Arav" Andreev <me@arav.top>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..87bd18b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
+build: scrapthechan README.md setup.cfg
+	python setup.py sdist bdist_wheel
+install:
+	python -m pip install --upgrade dist/scrapthechan-0.1.0-py3-none-any.whl --user
+uninstall:
+	# The two commands are chained with && because every recipe line runs in
+	# its own shell; pip uninstall fails when run from the source directory.
+	@cd ~/ && python -m pip uninstall scrapthechan
+clean:
+	rm -rf __pycache__ scrapthechan/__pycache__ scrapthechan/parsers/__pycache__ \
+	scrapthechan.egg-info build
+
+.PHONY: build install uninstall clean
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f52825e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+This is a tool for scraping files from imageboards' threads.
+
+It extracts the files from a JSON version of a thread and then downloads them
+into a specified output directory; if one isn't specified, it creates the
+following directory hierarchy in the working directory:
+
+    <site>
+     |-<board>
+        |-<thread>
+           |-[!op.txt]
+           |-<file 1>
+           |-...
+           |-<file N>
+
+# Usage
+
+```bash
+scrapthechan [<url> | <site> <board> <thread>] [-o,--output-dir] [--no-op]
+    [-v,--version] [-h,--help]
+```
+
+There are two ways to pass a thread. One is by passing a full URL of a thread
+(the `<url>` argument), and the other is by passing the thread in three
+components: `<site>` is the name of a website (e.g. 4chan), `<board>` is the
+name of a board (e.g. wg), and `<thread>` is the number of a thread on that
+board.
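+
+For example, these two invocations scrape the same thread (the thread number
+here is made up for illustration):
+
+```bash
+scrapthechan https://boards.4chan.org/wg/thread/1234567
+scrapthechan 4chan wg 1234567
+```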
+
+`-o`, `--output-dir` -- output directory where all the files will be dumped.
+
+`--no-op` -- by default OP's post is saved in a `!op.txt` file. This flag
+disables that behaviour. I decided to put a `!` in the name so this file stays
+at the top of a directory listing.
+
+`-v`, `--version` prints the version of the program, and `-h`, `--help` prints
+its help message.
\ No newline at end of file
diff --git a/scrapthechan/__init__.py b/scrapthechan/__init__.py
new file mode 100644
index 0000000..9f143af
--- /dev/null
+++ b/scrapthechan/__init__.py
@@ -0,0 +1,13 @@
+__date__ = "8 July 2020"
+__version__ = "0.1.0"
+__author__ = "Alexander \"Arav\" Andreev"
+__email__ = "me@arav.top"
+__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"
+__license__ = \
+"""This program is licensed under the terms of the MIT license.
+For a copy see the COPYING file in the program's directory, or
+see <https://opensource.org/licenses/MIT>."""
+
+VERSION = \
+    f"ScrapTheChan ver. {__version__} ({__date__})\n\n{__copyright__}\n"\
+    f"\n{__license__}"
diff --git a/scrapthechan/cli/__init__.py b/scrapthechan/cli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapthechan/cli/scraper.py b/scrapthechan/cli/scraper.py
new file mode 100644
index 0000000..db56f63
--- /dev/null
+++ b/scrapthechan/cli/scraper.py
@@ -0,0 +1,116 @@
+from os import makedirs
+from os.path import join, exists
+from re import search
+from sys import argv
+
+from scrapthechan import VERSION
+from scrapthechan.parser import ParserThreadNotFoundError
+from scrapthechan.parsers import get_parser_by_site, SUPPORTED_IMAGEBOARDS
+from scrapthechan.scrapers.basicscraper import BasicScraper
+
+
+__all__ = ["main"]
+
+
+USAGE = \
+"""Usage: scrapthechan [OPTIONS] (<url> | <site> <board> <thread>)
+
+Options:
+\t-h,--help -- print this help and exit;
+\t-v,--version -- print program's version and exit;
+\t-o,--output-dir -- directory where to place scraped files. By default
+\t    the following structure will be created in the current directory:
+\t    <site>/<board>/<thread>;
+\t-N,--no-op -- by default OP's post is written into a !op.txt file. This
+\t    option disables that behaviour;
+
+Supported imageboards: 4chan.org, 2ch.hk, lainchan.org
+"""
+
+
+def parse_common_arguments(args: str) -> dict:
+    r = r"(?P<help>-h|--help)|(?P<version>-v|--version)"
+    argd = search(r, args)
+    if argd is not None:
+        argd = argd.groupdict()
+        return {
+            "help": argd["help"] is not None,
+            "version": argd["version"] is not None }
+    return None
+
+def parse_arguments(args: str) -> dict:
+    rlink = r"^(https?:\/\/)?(?P<site>[\w.-]+)[ \/](?P<board>\w+)(\S+)?[ \/](?P<thread>\w+)"
+    link = search(rlink, args)
+    if link is not None:
+        link = link.groupdict()
+    out_dir = search(r"(?=(-o|--output-dir) (?P<outdir>\S+))", args)
+    return {
+        "site": None if link is None else link["site"],
+        "board": None if link is None else link["board"],
+        "thread": None if link is None else link["thread"],
+        "no-op": search(r"-N|--no-op", args) is not None,
+        "output-dir": None if out_dir is None \
+            else out_dir.groupdict()["outdir"] }
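+
+# A quick illustration of what parse_arguments() returns (the thread number
+# and paths are hypothetical):
+#   parse_arguments("https://boards.4chan.org/wg/thread/1234567 -o out")
+#   -> {"site": "boards.4chan.org", "board": "wg", "thread": "1234567",
+#      "no-op": False, "output-dir": "out"}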
+
+def main() -> None:
+    cargs = parse_common_arguments(' '.join(argv[1:]))
+    if cargs is not None:
+        if cargs["help"]:
+            print(USAGE)
+            exit()
+        elif cargs["version"]:
+            print(VERSION)
+            exit()
+
+    args = parse_arguments(' '.join(argv[1:]))
+    if args is None or args["site"] is None \
+        or args["board"] is None or args["thread"] is None:
+        print(USAGE)
+        exit()
+
+    try:
+        parser = get_parser_by_site(args["site"], args["board"],
+            args["thread"])
+    except NotImplementedError as ex:
+        print(f"{ex}.")
+        print(f"Supported imageboards are {', '.join(SUPPORTED_IMAGEBOARDS)}.")
+        exit()
+    except ParserThreadNotFoundError:
+        print("Thread doesn't exist anymore.")
+        exit()
+
+    flen = len(parser.files)
+    print(f"There are {flen} files in this thread.")
+
+    if args["output-dir"] is not None:
+        save_dir = args["output-dir"]
+    else:
+        save_dir = join(parser.imageboard, parser.board, parser.thread)
+
+    print(f"They will be saved in {save_dir}.")
+
+    makedirs(save_dir, exist_ok=True)
+
+    if not args["no-op"]:
+        print("Writing OP... ", end='')
+        if not exists(join(save_dir, "!op.txt")):
+            with open(join(save_dir, "!op.txt"), 'w') as opf:
+                opf.write(f"{parser.op}\n")
+            print("Done.")
+        else:
+            print("Exists.")
+
+    scraper = BasicScraper(save_dir, parser.files, \
+        lambda i: print(f"{i}/{flen}", end="\r"))
+    scraper.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scrapthechan/fileinfo.py b/scrapthechan/fileinfo.py
new file mode 100644
index 0000000..29e066e
--- /dev/null
+++ b/scrapthechan/fileinfo.py
@@ -0,0 +1,23 @@
+"""FileInfo object stores all needed information about a file."""
+
+
+__all__ = ["FileInfo"]
+
+
+class FileInfo:
+    """Stores all needed information about a file.
+
+    Arguments:
+    - `name` -- name of a file;
+    - `size` -- size of a file;
+    - `dlurl` -- full download URL for a file;
+    - `hash_value` -- hash sum of a file;
+    - `hash_algo` -- hash algorithm used (e.g. md5).
+    """
+    def __init__(self, name: str, size: int, dlurl: str,
+            hash_value: str, hash_algo: str) -> None:
+        self.name = name
+        self.size = size
+        self.dlurl = dlurl
+        self.hash_value = hash_value
+        self.hash_algo = hash_algo
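+
+# Construction sketch (all values are made up for illustration):
+#   f = FileInfo("wallpaper.jpg", 517420,
+#       "https://i.4cdn.org/wg/1594239123456.jpg",
+#       "8d5a0d9267b447d9f0b1d59e2fe92a3e", "md5")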
+ """ + def __init__(self, name: str, size: int, dlurl: str, + hash_value: str, hash_algo: str) -> None: + self.name = name + self.size = size + self.dlurl = dlurl + self.hash_value = hash_value + self.hash_algo = hash_algo diff --git a/scrapthechan/parser.py b/scrapthechan/parser.py new file mode 100644 index 0000000..014a009 --- /dev/null +++ b/scrapthechan/parser.py @@ -0,0 +1,81 @@ +"""Base `Parser` class for JSON parsers to inherit.""" + +from itertools import chain +from json import loads +from re import findall, match +from typing import List, Optional +from urllib.request import urlopen, urlretrieve + +from scrapthechan.fileinfo import FileInfo + + +__all__ = ["Parser", "ParserThreadNotFoundError"] + + +class ParserThreadNotFoundError(Exception): + pass + + +class Parser: + """Base class for all parsers. + It fetches JSON of a specified thread and collects all the files from it + into a list of the `FileInfo` objects. + Also it extracts OP's post, that may come handy if you do bulk scraping. + + Arguments: + board -- is a name of a board on an image board; + thread -- is a name of a thread inside a board; + posts -- is a list of posts in form of dictionaries exported from a JSON; + skip_posts -- number of posts to skip. + + All the extracted files will be stored as the `FileInfo` objects.""" + __url_thread_json: str = "https://example.org/{board}/{thread}.json" + __url_file_link: str = None + + def __init__(self, board: str, thread: str, posts: List[dict], + skip_posts: Optional[int] = None) -> None: + self._board = board + self._thread = thread + self._op_post = posts[0] + if not skip_posts is None: + posts = posts[skip_posts:] + self._files = list(chain.from_iterable(filter(None, \ + map(self._parse_post, posts)))) + + @property + def imageboard(self) -> str: + """Returns image board's name.""" + return NotImplementedError + + @property + def board(self) -> str: + """Returns a name of a board of image board.""" + return self._board + + @property + def thread(self) -> str: + """Returns a name of thread from a board.""" + return self._thread + + @property + def op(self) -> str: + """Returns OP's post as combination of subject and comment separated + by a new line.""" + raise NotImplementedError + + @property + def files(self) -> List[FileInfo]: + """Returns a list of retrieved files as `FileInfo` objects.""" + return self._files + + def _get_json(self, thread_url: str) -> dict: + """Gets JSON version of a thread and converts it in a dictionary.""" + try: + with urlopen(thread_url) as url: + return loads(url.read().decode('utf-8')) + except: + raise ParserThreadNotFoundError + + def _parse_post(self, post: dict) -> List[FileInfo]: + """Parses a single post and extracts files into `FileInfo` object.""" + raise NotImplementedError diff --git a/scrapthechan/parsers/__init__.py b/scrapthechan/parsers/__init__.py new file mode 100644 index 0000000..adc71a1 --- /dev/null +++ b/scrapthechan/parsers/__init__.py @@ -0,0 +1,34 @@ +"""Here are defined the JSON parsers for imageboards.""" +from re import search +from typing import List + +from scrapthechan.parser import Parser + + +__all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"] + + +SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk"] + + +def get_parser_by_url(url: str) -> Parser: + """Parses URL and extracts from it site name, board and thread. 
diff --git a/scrapthechan/parsers/__init__.py b/scrapthechan/parsers/__init__.py
new file mode 100644
index 0000000..adc71a1
--- /dev/null
+++ b/scrapthechan/parsers/__init__.py
@@ -0,0 +1,34 @@
+"""Here are defined the JSON parsers for imageboards."""
+from re import search
+from typing import List
+
+from scrapthechan.parser import Parser
+
+
+__all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"]
+
+
+SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk"]
+
+
+def get_parser_by_url(url: str) -> Parser:
+    """Parses a URL, extracts from it the site name, board and thread, and
+    returns an initialised Parser object for the detected imageboard."""
+    URLRX = r"https?:\/\/(?P<site>[\w\.]+)\/(?P<board>\w+)\/(?:\w+)?\/(?P<thread>\w+)"
+    site, board, thread = search(URLRX, url).groups()
+    return get_parser_by_site(site, board, thread)
+
+def get_parser_by_site(site: str, board: str, thread: str) -> Parser:
+    """Returns an initialised parser for `site` with `board` and `thread`."""
+    if site in ['boards.4chan.org', 'boards.4channel.org',
+            '4chan', '4chan.org']:
+        from .fourchan import FourChanParser
+        return FourChanParser(board, thread)
+    elif site in ['lainchan.org', 'lainchan']:
+        from .lainchan import LainchanParser
+        return LainchanParser(board, thread)
+    elif site in ['2ch.hk', '2ch']:
+        from .dvach import DvachParser
+        return DvachParser(board, thread)
+    else:
+        raise NotImplementedError(f"Parser for {site} is not implemented")
diff --git a/scrapthechan/parsers/dvach.py b/scrapthechan/parsers/dvach.py
new file mode 100644
index 0000000..4e5e6e2
--- /dev/null
+++ b/scrapthechan/parsers/dvach.py
@@ -0,0 +1,43 @@
+from re import match
+from typing import List, Optional
+
+from scrapthechan.fileinfo import FileInfo
+from scrapthechan.parser import Parser
+
+__all__ = ["DvachParser"]
+
+
+class DvachParser(Parser):
+    """JSON parser for the 2ch.hk imageboard."""
+
+    __url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
+    __url_file_link = "https://2ch.hk"
+
+    def __init__(self, board: str, thread: str,
+            skip_posts: Optional[int] = None) -> None:
+        posts = self._get_json(self.__url_thread_json.format(board=board, \
+            thread=thread))['threads'][0]['posts']
+        super(DvachParser, self).__init__(board, thread, posts, skip_posts)
+
+    @property
+    def imageboard(self) -> str:
+        return "2ch.hk"
+
+    @property
+    def op(self) -> str:
+        return f"{self._op_post['subject']}\n{self._op_post['comment']}"
+
+    def _parse_post(self, post) -> Optional[List[FileInfo]]:
+        if 'files' not in post: return None
+        files = []
+        for f in post['files']:
+            # Keep the original name unless it is a generic "image.ext".
+            if match(r"^image\.\w+$", f['fullname']) is None:
+                fullname = f['fullname']
+            else:
+                fullname = f['name']
+            # Same thing as with 4chan: 2ch.hk also has an md5 field, so it
+            # is completely fine to hardcode `hash_algo`.
+            files.append(FileInfo(fullname, f['size'],
+                f"{self.__url_file_link}{f['path']}",
+                f['md5'], 'md5'))
+        return files
diff --git a/scrapthechan/parsers/fourchan.py b/scrapthechan/parsers/fourchan.py
new file mode 100644
index 0000000..ee4d386
--- /dev/null
+++ b/scrapthechan/parsers/fourchan.py
@@ -0,0 +1,49 @@
+from re import match
+from typing import List, Optional
+
+from scrapthechan.fileinfo import FileInfo
+from scrapthechan.parser import Parser
+
+__all__ = ["FourChanParser"]
+
+
+class FourChanParser(Parser):
+    """JSON parser for the 4chan.org imageboard."""
+
+    __url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json"
+    __url_file_link = "https://i.4cdn.org/{board}/{filename}"
+
+    def __init__(self, board: str, thread: str,
+            skip_posts: Optional[int] = None) -> None:
+        posts = self._get_json(self.__url_thread_json.format(board=board, \
+            thread=thread))['posts']
+        super(FourChanParser, self).__init__(board, thread, posts, skip_posts)
+
+    @property
+    def imageboard(self) -> str:
+        return "4chan.org"
+
+    @property
+    def op(self) -> str:
+        if 'sub' in self._op_post:
+            return f"{self._op_post['sub']}\n{self._op_post['com']}"
+        else:
+            return self._op_post['com']
+
+    def _parse_post(self, post: dict) -> List[FileInfo]:
+        if 'tim' not in post: return None
+
+        dlfname = f"{post['tim']}{post['ext']}"
+
+        # Keep the original name unless it is a generic "image.ext".
+        if "filename" in post:
+            if match(r"^image\.\w+$", post['filename']) is None:
+                filename = f"{post['filename']}{post['ext']}"
+            else:
+                filename = dlfname
+        else:
+            filename = dlfname
+
+        # The hash algorithm is hardcoded since it is highly unlikely to
+        # change in the foreseeable future, and if it does, this line will
+        # have to be updated anyway.
+        return [FileInfo(filename, post['fsize'],
+            self.__url_file_link.format(board=self.board, filename=dlfname),
+            post['md5'], 'md5')]
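+
+# Note: 4chan's `md5` field holds a base64-encoded digest rather than hex,
+# which is why Scraper._is_file_ok() checks a file's hash against both the
+# hex digest and its base64 form.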
+ """ + + __url_thread_json = "https://lainchan.org/{board}/res/{thread}.json" + __url_file_link = "https://lainchan.org/{board}/src/{filename}" + + def __init__(self, board: str, thread: str, + skip_posts: Optional[int] = None) -> None: + posts = self._get_json(self.__url_thread_json.format(board=board, \ + thread=thread))['posts'] + super(LainchanParser, self).__init__(board, thread, posts, skip_posts) + + @property + def imageboard(self) -> str: + return "lainchan.org" + + def _parse_post(self, post) -> List[FileInfo]: + if not 'tim' in post: return None + + dlfname = f"{post['tim']}{post['ext']}" + + if "filename" in post: + if match(post['filename'], r"^image\.\w+$") is None: + filename = dlfname + else: + filename = f"{post['filename']}{post['ext']}" + + files = [] + files.append(FileInfo(filename, post['fsize'], + self.__url_file_link.format(board=self.board, filename=dlfname), + post['md5'], 'md5')) + + if "extra_files" in post: + for f in post["extra_files"]: + dlfname = f"{f['tim']}{f['ext']}" + if "filename" in post: + if match(post['filename'], r"^image\.\w+$") is None: + filename = dlfname + else: + filename = f"{post['filename']}{post['ext']}" + dlurl = self.__url_file_link.format(board=self.board, \ + filename=dlfname) + files.append(FileInfo(filename, f['fsize'], \ + dlurl, f['md5'], 'md5')) + return files diff --git a/scrapthechan/scraper.py b/scrapthechan/scraper.py new file mode 100644 index 0000000..fc0a014 --- /dev/null +++ b/scrapthechan/scraper.py @@ -0,0 +1,96 @@ +"""Base Scraper implementation.""" + +from base64 import b64encode +from os import remove, stat +from os.path import exists, join, getsize +import re +from typing import List, Callable +from urllib.request import urlretrieve, URLopener +import hashlib + +from scrapthechan import __version__ +from scrapthechan.fileinfo import FileInfo + +__all__ = ["Scraper"] + + +class Scraper: + """Base scraper implementation. + + Arguments: + save_directory -- a path to a directory where file will be + saved; + files -- a list of FileInfo objects; + download_progress_callback -- a callback function that will be called + for each file started downloading. + """ + def __init__(self, save_directory: str, files: List[FileInfo], + download_progress_callback: Callable[[int], None] = None) -> None: + self._save_directory = save_directory + self._files = files + self._url_opener = URLopener() + self._url_opener.version = f"ScrapTheChan/{__version__}" + self._progress_callback = download_progress_callback + + def run(self): + raise NotImplementedError + + def _same_filename(self, filename: str, path: str) -> str: + """Check if there is a file with same name. 
diff --git a/scrapthechan/scrapers/__init__.py b/scrapthechan/scrapers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapthechan/scrapers/basicscraper.py b/scrapthechan/scrapers/basicscraper.py
new file mode 100644
index 0000000..6c1b430
--- /dev/null
+++ b/scrapthechan/scrapers/basicscraper.py
@@ -0,0 +1,15 @@
+"""Implementation of a basic sequential single-threaded scraper that
+downloads files one by one."""
+
+from scrapthechan.scraper import Scraper
+
+__all__ = ["BasicScraper"]
+
+
+class BasicScraper(Scraper):
+    def run(self):
+        """Downloads files one by one."""
+        for i, f in enumerate(self._files, start=1):
+            if self._progress_callback is not None:
+                self._progress_callback(i)
+            self._download_file(f)
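+
+# Usage sketch (`parser` stands for any initialised Parser instance):
+#   scraper = BasicScraper("out", parser.files,
+#       lambda i: print(f"{i}/{len(parser.files)}", end="\r"))
+#   scraper.run()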
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..82fb1c6
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,42 @@
+[metadata]
+name = scrapthechan
+version = attr: scrapthechan.__version__
+description =
+    Scrape the files posted in a thread on an imageboard. Currently supports
+    4chan.org, lainchan.org and 2ch.hk.
+long_description = file: README.md
+long_description_content_type = text/markdown
+author = Alexander "Arav" Andreev
+author_email = me@arav.top
+url = https://arav.top
+keywords =
+    scraper
+    imageboard
+    4chan
+    2ch
+    lainchan
+license = MIT
+license_file = COPYING
+classifiers =
+    Development Status :: 2 - Pre-Alpha
+    Environment :: Console
+    Intended Audience :: End Users/Desktop
+    License :: OSI Approved :: MIT License
+    Natural Language :: English
+    Operating System :: OS Independent
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Topic :: Utilities
+
+[options]
+zip_safe = False
+python_requires = >=3.7
+include_package_data = True
+packages = find:
+
+[options.package_data]
+* = COPYING, README.md
+
+[options.entry_points]
+console_scripts =
+    scrapthechan = scrapthechan.cli.scraper:main
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..fc1f76c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup
+
+setup()
\ No newline at end of file