from re import match
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser


__all__ = ["DvachParser"]


class DvachParser(Parser):
    """JSON parser for 2ch.hk image board."""

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        """Forward board, thread and skip_posts unchanged to the base parser."""
        super().__init__(board, thread, skip_posts)

    @property
    def json_thread_url(self) -> str:
        """URL template of a thread's JSON document.

        The ``{board}`` and ``{thread}`` placeholders are returned as-is;
        presumably the base class substitutes them -- confirm in ``Parser``.
        """
        return "https://2ch.hk/{board}/res/{thread}.json"

    @property
    def file_base_url(self) -> str:
        """Prefix prepended to every file's ``path`` to build its URL."""
        return "https://2ch.hk"

    @property
    def subject_field(self) -> str:
        """Name of the JSON field that holds a post's subject."""
        return "subject"

    @property
    def comment_field(self) -> str:
        """Name of the JSON field that holds a post's comment."""
        return "comment"

    @property
    def imageboard(self) -> str:
        """Name of the image board this parser handles."""
        return "2ch.hk"

    def _extract_posts_list(self, lst: List) -> List[dict]:
        """Return the posts of the first entry under the ``threads`` key.

        NOTE(review): ``lst`` is annotated as ``List`` but is indexed with a
        string key here, so it is actually a mapping. The annotation is left
        untouched because the base class signature is not visible from here.
        """
        return lst['threads'][0]['posts']

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Collect the files attached to a single post.

        Args:
            post: mapping describing one post of the thread JSON.

        Returns:
            ``None`` when the post has no ``files`` key or its value is
            ``None``; otherwise a list (possibly empty) with one ``FileInfo``
            per entry of ``post['files']``.
        """
        # A missing key and an explicit null are treated alike: iterating
        # over None would otherwise raise TypeError.
        if 'files' not in post or post['files'] is None:
            return None

        files = []
        for f in post['files']:
            # Prefer 'fullname', except for stickers and for names of the
            # form "image.<ext>" (presumably generic placeholder names --
            # TODO confirm); those fall back to 'name'. The sticker check
            # comes first so 'fullname' is never read for sticker entries.
            if ('sticker' not in f
                    and match(r"^image\.\w+$", f['fullname']) is None):
                fullname = f['fullname']
            else:
                fullname = f['name']

            # Same situation as with 4chan: the only hash 2ch.hk provides is
            # the md5 field, so hardcoding the algorithm name is fine.
            if 'md5' in f:
                hash_value, hash_algo = f['md5'], 'md5'
            else:
                hash_value, hash_algo = None, None

            files.append(FileInfo(
                fullname,
                f['size'],
                f"{self.file_base_url}{f['path']}",
                hash_value,
                hash_algo,
            ))
        return files