From 520d88c76a4547786621ae280613c9b4fd29305b Mon Sep 17 00:00:00 2001 From: "Alexander \"Arav\" Andreev" Date: Mon, 20 Jul 2020 03:45:51 +0400 Subject: [PATCH] Parser for 8kun.top added. And I changed compares in __init__. --- scrapthechan/parsers/__init__.py | 13 ++++--- scrapthechan/parsers/eightkun.py | 63 ++++++++++++++++++++++++++++++++ setup.cfg | 3 +- 3 files changed, 73 insertions(+), 6 deletions(-) create mode 100644 scrapthechan/parsers/eightkun.py diff --git a/scrapthechan/parsers/__init__.py b/scrapthechan/parsers/__init__.py index adc71a1..aaaa774 100644 --- a/scrapthechan/parsers/__init__.py +++ b/scrapthechan/parsers/__init__.py @@ -8,7 +8,8 @@ from scrapthechan.parser import Parser __all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"] -SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk"] +SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk", \ + "8kun.top"] def get_parser_by_url(url: str) -> Parser: @@ -20,15 +21,17 @@ def get_parser_by_url(url: str) -> Parser: def get_parser_by_site(site: str, board: str, thread: str) -> Parser: """Returns an initialised parser for `site` with `board` and `thread`.""" - if site in ['boards.4chan.org', 'boards.4channel.org', - '4chan', '4chan.org']: + if '4chan' in site: from .fourchan import FourChanParser return FourChanParser(board, thread) - elif site in ['lainchan.org', 'lainchan']: + elif 'lainchan' in site: from .lainchan import LainchanParser return LainchanParser(board, thread) - elif site in ['2ch.hk', '2ch']: + elif '2ch' in site: from .dvach import DvachParser return DvachParser(board, thread) + elif '8kun' in site: + from .eightkun import EightKunParser + return EightKunParser(board, thread) else: raise NotImplementedError(f"Parser for {site} is not implemented") diff --git a/scrapthechan/parsers/eightkun.py b/scrapthechan/parsers/eightkun.py new file mode 100644 index 0000000..afc506b --- /dev/null +++ 
"""Parser for threads hosted on the 8kun.top imageboard."""

from re import match
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser

__all__ = ["EightKunParser"]


class EightKunParser(Parser):
    """JSON parser for 8kun.top image board."""

    __url_thread_json = "https://8kun.top/{board}/res/{thread}.json"
    __url_file_link = "https://media.8kun.top/file_store/{filename}"

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        """Download the thread's JSON and pass its posts to the base parser.

        Args:
            board: Board name substituted into the thread JSON URL.
            thread: Thread identifier substituted into the thread JSON URL.
            skip_posts: Forwarded unchanged to ``Parser.__init__``.
        """
        url = self.__url_thread_json.format(board=board, thread=thread)
        posts = self._get_json(url)['posts']
        super().__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        """Name of the imageboard this parser handles."""
        return "8kun.top"

    @property
    def op(self) -> Optional[str]:
        """Subject line (followed by a newline) plus comment of the OP post.

        Returns ``None`` when the resulting text is empty.
        """
        text = ""
        if 'sub' in self._op_post:
            text = f"{self._op_post['sub']}\n"
        if 'com' in self._op_post:
            text += self._op_post['com']
        return text or None

    def _parse_post(self, post: dict) -> Optional[List[FileInfo]]:
        """Collect every file attached to ``post``.

        Args:
            post: One post dictionary taken from the thread JSON.

        Returns:
            ``None`` when the post has no ``tim`` key (no attachment),
            otherwise a list with the post's own file followed by one entry
            per item of ``extra_files``.
        """
        if 'tim' not in post:
            return None

        # The two patterns differ slightly (``\w{1,4}`` vs ``\w+``); both are
        # kept exactly as they were so file naming does not change.
        files = [self._make_file_info(post, r"^image\.\w{1,4}$")]
        for extra in post.get("extra_files") or ():
            files.append(self._make_file_info(extra, r"^image\.\w+$"))
        return files

    def _make_file_info(self, entry: dict, name_pattern: str) -> FileInfo:
        """Build a ``FileInfo`` from one file entry (a post or an extra file).

        Args:
            entry: Dictionary holding ``tim``, ``ext``, ``fsize``, ``md5`` and
                optionally ``filename`` for a single file.
            name_pattern: Regular expression tested against ``filename``.

        Returns:
            The ``FileInfo`` describing where to download the file and the
            name to save it under.
        """
        dlfname = f"{entry['tim']}{entry['ext']}"
        uploaded = entry.get('filename')

        # re.match takes (pattern, string). The file name comes from the
        # remote JSON and must never be compiled as a pattern itself.
        # NOTE(review): the branch mapping is unchanged - the uploaded name is
        # used only when it matches the pattern. Confirm it is not inverted.
        if uploaded is None or match(name_pattern, uploaded) is None:
            filename = dlfname
        else:
            filename = f"{uploaded}{entry['ext']}"

        dlurl = self.__url_file_link.format(board=self.board,
                                            filename=dlfname)
        return FileInfo(filename, entry['fsize'], dlurl, entry['md5'], 'md5')
version = attr: scrapthechan.__version__ description = Scrap the files posted in a thread on an imageboard. Currently supports - 4chan.org, lainchan.org and 2ch.hk. + 4chan.org, lainchan.org, 2ch.hk and 8kun.top. long_description = file: README.md long_description_content_type = text/markdown author = Alexander "Arav" Andreev @@ -15,6 +15,7 @@ keywords = 4chan 2ch lainchan + 8kun.top license = MIT license_file = COPYING classifiers =