"""Base `Parser` class for JSON parsers to inherit."""

from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, Request, HTTPError

from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo


__all__ = ["Parser", "ThreadNotFoundError"]


class ThreadNotFoundError(Exception):
    """Raised when the JSON of a thread cannot be fetched over HTTP.

    Arguments:
    reason -- human readable description of the failure.
    """

    def __init__(self, reason: str = ""):
        # Hand the reason to `Exception` as well, so that `str(exc)`,
        # `exc.args` and tracebacks carry it instead of an empty message.
        super().__init__(reason)
        self._reason = reason

    @property
    def reason(self) -> str:
        """Returns the reason this error was raised with."""
        return self._reason


class Parser:
    """Base class for all parsers.
    It fetches JSON of a specified thread and collects all the files from it
    into a list of the `FileInfo` objects.
    Also it extracts OP's post, that may come handy if you do bulk scraping.

    Child classes must override `json_thread_url`, `file_base_url`,
    `imageboard` and `_parse_post`; the base versions raise
    `NotImplementedError`.

    Arguments:
    board -- is a name of a board on an image board;
    thread -- is an id of a thread inside a board;
    skip_posts -- number of posts to skip. It is applied as a slice start
    (`posts[skip_posts:]`) after OP's post has been remembered, so OP's post
    stays available through `op` even when it is skipped.

    All the extracted files will be stored as the `FileInfo` objects.

    Raises `ThreadNotFoundError` if fetching the thread fails with an HTTP
    error, and `IndexError` if the extracted list of posts is empty."""

    def __init__(self, board: str, thread: str,
            skip_posts: Optional[int] = None) -> None:
        self._board: str = board
        self._thread: str = thread

        # The thread is fetched right here, on construction.
        self._posts = self._extract_posts_list(self._get_json())
        # OP's post is taken before skipping so that it is never lost.
        self._op_post: dict = self._posts[0]
        if skip_posts is not None:
            self._posts = self._posts[skip_posts:]

        # `_parse_post` yields either a list of files or a falsy value for a
        # post without files; falsy results are dropped and the rest is
        # flattened into a single list.
        parsed = map(self._parse_post, self._posts)
        self._files = list(chain.from_iterable(filter(None, parsed)))

    @property
    def json_thread_url(self) -> str:
        """URL template of a thread's JSON. It is formatted with the `board`
        and `thread` keyword arguments. Must be overridden."""
        raise NotImplementedError

    @property
    def file_base_url(self) -> str:
        """Must be overridden in child classes."""
        raise NotImplementedError

    @property
    def subject_field(self) -> str:
        """Key of a post's dictionary that holds a subject."""
        return "sub"

    @property
    def comment_field(self) -> str:
        """Key of a post's dictionary that holds a comment."""
        return "com"

    @property
    def imageboard(self) -> str:
        """Returns image board's name."""
        raise NotImplementedError

    @property
    def board(self) -> str:
        """Returns a name of a board of image board."""
        return self._board

    @property
    def thread(self) -> str:
        """Returns a name of thread from a board."""
        return self._thread

    @property
    def op(self) -> Optional[str]:
        """Returns OP's post as combination of subject and comment
        separated by a new line. Returns None if OP's post has neither."""
        op = ""
        if self.subject_field in self._op_post:
            op = f"{self._op_post[self.subject_field]}\n"
        if self.comment_field in self._op_post:
            op += self._op_post[self.comment_field]
        return op if op else None

    @property
    def files(self) -> List[FileInfo]:
        """Returns a list of retrieved files as `FileInfo` objects."""
        return self._files

    def _extract_posts_list(self, lst: List) -> List[dict]:
        """This method must be overridden in child classes where you specify
        a path in a JSON document where posts are stored. E.g., on 4chan this is
        ['posts'], and on 2ch.hk it's ['threads'][0]['posts'].

        The base implementation returns the given object as is."""
        return lst

    def _get_json(self) -> dict:
        """Retrieves a JSON representation of a thread and converts it in
        a dictionary.

        Raises `ThreadNotFoundError` on any HTTP error response; the original
        `HTTPError` is kept as its `__cause__`. Other exceptions propagate
        unchanged."""
        thread_url = self.json_thread_url.format(board=self._board,
            thread=self._thread)
        req = Request(thread_url, headers={'User-Agent': USER_AGENT})
        try:
            with urlopen(req) as url:
                return loads(url.read().decode('utf-8'))
        except HTTPError as e:
            raise ThreadNotFoundError(str(e)) from e

    def _parse_post(self, post: dict) -> Optional[List[FileInfo]]:
        """Parses a single post and extracts files into `FileInfo` object.
        Single object is wrapped in a list for convenient insertion into
        a list. Must be overridden in child classes."""
        raise NotImplementedError