from re import match
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser


__all__ = ["DvachParser"]


class DvachParser(Parser):
    """JSON parser for 2ch.hk image board."""

    __url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
    __url_file_link = "https://2ch.hk"

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        """Fetch the thread's JSON and hand its posts to the base parser.

        Args:
            board: Board name, substituted into the thread JSON URL.
            thread: Thread identifier, substituted into the thread JSON URL.
            skip_posts: Forwarded unchanged to ``Parser.__init__``.
        """
        url = self.__url_thread_json.format(board=board, thread=thread)
        # `_get_json` is not defined in this class; presumably inherited
        # from Parser. The post list is read from the first entry of the
        # response's 'threads' array.
        posts = self._get_json(url)['threads'][0]['posts']
        super(DvachParser, self).__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        """Name of the image board this parser handles."""
        return "2ch.hk"

    @property
    def op(self) -> str:
        """Subject and comment of the OP post, separated by a newline."""
        # `_op_post` is not set in this class; presumably provided by Parser.
        return f"{self._op_post['subject']}\n{self._op_post['comment']}"

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Collect file descriptions from a single post.

        Args:
            post: Post object from the thread JSON (a mapping).

        Returns:
            None if the post has no 'files' key, otherwise a list with one
            FileInfo per entry of ``post['files']``.
        """
        if 'files' not in post:
            return None

        files = []
        for f in post['files']:
            # Keep the 'fullname' field unless it is a generic "image.<ext>"
            # name, in which case fall back to the 'name' field.
            # re.match takes the pattern first and the string second.
            if match(r"^image\.\w+$", f['fullname']) is None:
                fullname = f['fullname']
            else:
                fullname = f['name']
            # 2ch.hk file entries carry an 'md5' field, so it is fine to
            # hardcode the hash algorithm name here.
            files.append(FileInfo(fullname, f['size'],
                                  f"{self.__url_file_link}{f['path']}",
                                  f['md5'], 'md5'))
        return files