from re import match
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser


__all__ = ["DvachParser"]


class DvachParser(Parser):
    """JSON parser for 2ch.hk image board."""

    # Template of a thread's JSON endpoint; filled with board and thread.
    __url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
    # Prefix prepended to a file's `path` field to build its download link.
    __url_file_link = "https://2ch.hk"

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        """Fetch a thread's JSON and hand its posts over to the base parser.

        Args:
            board: board name substituted into the thread URL.
            thread: thread identifier substituted into the thread URL.
            skip_posts: forwarded unchanged to the base class constructor.
        """
        thread_url = self.__url_thread_json.format(board=board, thread=thread)
        # The posts are read from the first entry of the `threads` array.
        posts = self._get_json(thread_url)['threads'][0]['posts']
        super().__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        """Name of the image board this parser works with."""
        return "2ch.hk"

    @property
    def op(self) -> Optional[str]:
        """Text of the original post.

        The subject (when present) is followed by a newline and then by the
        comment (when present). None is returned if the resulting text is
        empty.
        """
        text = ""
        if 'subject' in self._op_post:
            text = f"{self._op_post['subject']}\n"
        if 'comment' in self._op_post:
            text += self._op_post['comment']
        return text if text != "" else None

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Collect information about every file attached to a post.

        Args:
            post: a single post object taken from the thread's JSON.

        Returns:
            A list of FileInfo, one per attached file, or None when the post
            has no `files` field or that field is null.
        """
        # A null `files` value is treated the same way as a missing key
        # instead of failing with TypeError on iteration.
        if 'files' not in post or post['files'] is None:
            return None

        files = []
        for f in post['files']:
            # Stickers, as well as files whose `fullname` is a generic
            # "image.<ext>", are named after the `name` field; everything
            # else keeps its `fullname`. `fullname` is only read for
            # non-sticker entries.
            use_name = ('sticker' in f
                        or match(r"^image\.\w+$", f['fullname']) is not None)
            fullname = f['name'] if use_name else f['fullname']

            # When a hash is supplied it comes in the `md5` field, so the
            # algorithm name can be hardcoded. Without it both are None.
            if 'md5' in f:
                hash_value, hash_algo = f['md5'], 'md5'
            else:
                hash_value, hash_algo = None, None

            files.append(FileInfo(fullname, f['size'],
                                  f"{self.__url_file_link}{f['path']}",
                                  hash_value, hash_algo))
        return files