diff --git a/scrapthechan/parsers/dvach.py b/scrapthechan/parsers/dvach.py index 85e30a6..b5cafe5 100644 --- a/scrapthechan/parsers/dvach.py +++ b/scrapthechan/parsers/dvach.py @@ -10,31 +10,38 @@ __all__ = ["DvachParser"] class DvachParser(Parser): """JSON parser for 2ch.hk image board.""" - __url_thread_json = "https://2ch.hk/{board}/res/{thread}.json" - __url_file_link = "https://2ch.hk" - def __init__(self, board: str, thread: str, skip_posts: Optional[int] = None) -> None: - posts = self._get_json(self.__url_thread_json.format(board=board, \ - thread=thread))['threads'][0]['posts'] - super(DvachParser, self).__init__(board, thread, posts, skip_posts) + super().__init__(board, thread, skip_posts) + + @property + def json_thread_url(self) -> str: + return "https://2ch.hk/{board}/res/{thread}.json" + + @property + def file_base_url(self) -> str: + return "https://2ch.hk" + + @property + def subject_field(self) -> str: + return "subject" + + @property + def comment_field(self) -> str: + return "comment" @property def imageboard(self) -> str: return "2ch.hk" - @property - def op(self) -> Optional[str]: - op = "" - if 'subject' in self._op_post: - op = f"{self._op_post['subject']}\n" - if 'comment' in self._op_post: - op += self._op_post['comment'] - return op if not op == "" else None + def _extract_posts_list(self, lst: List) -> List[dict]: + return lst['threads'][0]['posts'] def _parse_post(self, post) -> Optional[List[FileInfo]]: if not 'files' in post: return None + files = [] + for f in post['files']: if not 'sticker' in f: if match(r"^image\.\w+$", f['fullname']) is None: @@ -47,10 +54,10 @@ class DvachParser(Parser): # completely fine to hardcode `hash_algo`. if 'md5' in f: files.append(FileInfo(fullname, f['size'], - f"{self.__url_file_link}{f['path']}", + f"{self.file_base_url}{f['path']}", f['md5'], 'md5')) else: files.append(FileInfo(fullname, f['size'], - f"{self.__url_file_link}{f['path']}", + f"{self.file_base_url}{f['path']}", None, None)) return files diff --git a/scrapthechan/parsers/eightkun.py b/scrapthechan/parsers/eightkun.py index 5348c6e..aabe623 100644 --- a/scrapthechan/parsers/eightkun.py +++ b/scrapthechan/parsers/eightkun.py @@ -1,63 +1,25 @@ -from re import match -from typing import List, Optional +from typing import Optional -from scrapthechan.fileinfo import FileInfo -from scrapthechan.parser import Parser +from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser __all__ = ["EightKunParser"] -class EightKunParser(Parser): +class EightKunParser(TinyboardLikeParser): """JSON parser for 8kun.top image board.""" - __url_thread_json = "https://8kun.top/{board}/res/{thread}.json" - __url_file_link = "https://media.8kun.top/file_store/{filename}" - def __init__(self, board: str, thread: str, skip_posts: Optional[int] = None) -> None: - posts = self._get_json(self.__url_thread_json.format(board=board, \ - thread=thread))['posts'] - super(EightKunParser, self).__init__(board, thread, posts, skip_posts) + super().__init__(board, thread, skip_posts) @property def imageboard(self) -> str: return "8kun.top" @property - def op(self) -> Optional[str]: - op = "" - if 'sub' in self._op_post: - op = f"{self._op_post['sub']}\n" - if 'com' in self._op_post: - op += self._op_post['com'] - return op if not op == "" else None + def json_thread_url(self) -> str: + return "https://8kun.top/{board}/res/{thread}.json" - def _parse_post(self, post: dict) -> List[FileInfo]: - if not 'tim' in post: return None - - dlfname = f"{post['tim']}{post['ext']}" - - if "filename" in post: - if match(r"^image\.\w+$", post['filename']) is None: - filename = dlfname - else: - filename = f"{post['filename']}{post['ext']}" - - files = [] - files.append(FileInfo(filename, post['fsize'], - self.__url_file_link.format(board=self.board, filename=dlfname), - post['md5'], 'md5')) - - if "extra_files" in post: - for f in post["extra_files"]: - dlfname = f"{f['tim']}{f['ext']}" - if "filename" in post: - if match(r"^image\.\w+$", post['filename']) is None: - filename = dlfname - else: - filename = f"{post['filename']}{post['ext']}" - dlurl = self.__url_file_link.format(board=self.board, \ - filename=dlfname) - files.append(FileInfo(filename, f['fsize'], \ - dlurl, f['md5'], 'md5')) - return files + @property + def file_base_url(self) -> str: + return "https://media.8kun.top/file_dl/{filename}" diff --git a/scrapthechan/parsers/fourchan.py b/scrapthechan/parsers/fourchan.py index fb60515..632e427 100644 --- a/scrapthechan/parsers/fourchan.py +++ b/scrapthechan/parsers/fourchan.py @@ -1,51 +1,25 @@ -from re import match -from typing import List, Optional +from typing import Optional -from scrapthechan.fileinfo import FileInfo -from scrapthechan.parser import Parser +from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser __all__ = ["FourChanParser"] -class FourChanParser(Parser): +class FourChanParser(TinyboardLikeParser): """JSON parser for 4chan.org image board.""" - __url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json" - __url_file_link = "https://i.4cdn.org/{board}/{filename}" - def __init__(self, board: str, thread: str, skip_posts: Optional[int] = None) -> None: - posts = self._get_json(self.__url_thread_json.format(board=board, \ - thread=thread))['posts'] - super(FourChanParser, self).__init__(board, thread, posts, skip_posts) + super().__init__(board, thread, skip_posts) @property def imageboard(self) -> str: return "4chan.org" @property - def op(self) -> Optional[str]: - op = "" - if 'sub' in self._op_post: - op = f"{self._op_post['sub']}\n" - if 'com' in self._op_post: - op += self._op_post['com'] - return op if not op == "" else None + def json_thread_url(self) -> str: + return "https://a.4cdn.org/{board}/thread/{thread}.json" - def _parse_post(self, post: dict) -> List[FileInfo]: - if not 'tim' in post: return None - - dlfname = f"{post['tim']}{post['ext']}" - - if "filename" in post: - if match(r"^image\.\w+$", post['filename']) is None: - filename = dlfname - else: - filename = f"{post['filename']}{post['ext']}" - - # Hash algorithm is hardcoded since it is highly unlikely that it will - # be changed in foreseeable future. And if it'll change then this line - # will be necessarily updated anyway. - return [FileInfo(filename, post['fsize'], - self.__url_file_link.format(board=self.board, filename=dlfname), - post['md5'], 'md5')] + @property + def file_base_url(self) -> str: + return "https://i.4cdn.org/{board}/{filename}" diff --git a/scrapthechan/parsers/lainchan.py b/scrapthechan/parsers/lainchan.py index 5a45292..687109e 100644 --- a/scrapthechan/parsers/lainchan.py +++ b/scrapthechan/parsers/lainchan.py @@ -1,66 +1,25 @@ -from re import match -from typing import List, Optional +from typing import Optional -from scrapthechan.parser import Parser -from scrapthechan.fileinfo import FileInfo +from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser __all__ = ["LainchanParser"] -class LainchanParser(Parser): - """JSON parser for lainchan.org image board. - JSON structure is identical to 4chan.org's, so this parser is just inherited - from 4chan.org's parser and only needed things are redefined. - """ - - __url_thread_json = "https://lainchan.org/{board}/res/{thread}.json" - __url_file_link = "https://lainchan.org/{board}/src/{filename}" +class LainchanParser(TinyboardLikeParser): + """JSON parser for lainchan.org image board.""" def __init__(self, board: str, thread: str, skip_posts: Optional[int] = None) -> None: - posts = self._get_json(self.__url_thread_json.format(board=board, \ - thread=thread))['posts'] - super(LainchanParser, self).__init__(board, thread, posts, skip_posts) + super().__init__(board, thread, skip_posts) @property def imageboard(self) -> str: return "lainchan.org" - + @property - def op(self) -> Optional[str]: - op = "" - if 'sub' in self._op_post: - op = f"{self._op_post['sub']}\n" - if 'com' in self._op_post: - op += self._op_post['com'] - return op if not op == "" else None + def json_thread_url(self) -> str: + return "https://lainchan.org/{board}/res/{thread}.json" - def _parse_post(self, post) -> List[FileInfo]: - if not 'tim' in post: return None - - dlfname = f"{post['tim']}{post['ext']}" - - if "filename" in post: - if match(r"^image\.\w+$", post['filename']) is None: - filename = dlfname - else: - filename = f"{post['filename']}{post['ext']}" - - files = [] - files.append(FileInfo(filename, post['fsize'], - self.__url_file_link.format(board=self.board, filename=dlfname), - post['md5'], 'md5')) - - if "extra_files" in post: - for f in post["extra_files"]: - dlfname = f"{f['tim']}{f['ext']}" - if "filename" in post: - if match(r"^image\.\w+$", post['filename']) is None: - filename = dlfname - else: - filename = f"{post['filename']}{post['ext']}" - dlurl = self.__url_file_link.format(board=self.board, \ - filename=dlfname) - files.append(FileInfo(filename, f['fsize'], \ - dlurl, f['md5'], 'md5')) - return files + @property + def file_base_url(self) -> str: + return "https://lainchan.org/{board}/src/{filename}"