"""Base `Parser` class for JSON parsers to inherit.""" from itertools import chain from json import loads from re import findall, match from typing import List, Optional from urllib.request import urlopen, urlretrieve from scrapthechan.fileinfo import FileInfo __all__ = ["Parser", "ParserThreadNotFoundError"] class ParserThreadNotFoundError(Exception): pass class Parser: """Base class for all parsers. It fetches JSON of a specified thread and collects all the files from it into a list of the `FileInfo` objects. Also it extracts OP's post, that may come handy if you do bulk scraping. Arguments: board -- is a name of a board on an image board; thread -- is a name of a thread inside a board; posts -- is a list of posts in form of dictionaries exported from a JSON; skip_posts -- number of posts to skip. All the extracted files will be stored as the `FileInfo` objects.""" __url_thread_json: str = "https://example.org/{board}/{thread}.json" __url_file_link: str = None def __init__(self, board: str, thread: str, posts: List[dict], skip_posts: Optional[int] = None) -> None: self._board = board self._thread = thread self._op_post = posts[0] if not skip_posts is None: posts = posts[skip_posts:] self._files = list(chain.from_iterable(filter(None, \ map(self._parse_post, posts)))) @property def imageboard(self) -> str: """Returns image board's name.""" return NotImplementedError @property def board(self) -> str: """Returns a name of a board of image board.""" return self._board @property def thread(self) -> str: """Returns a name of thread from a board.""" return self._thread @property def op(self) -> str: """Returns OP's post as combination of subject and comment separated by a new line.""" raise NotImplementedError @property def files(self) -> List[FileInfo]: """Returns a list of retrieved files as `FileInfo` objects.""" return self._files def _get_json(self, thread_url: str) -> dict: """Gets JSON version of a thread and converts it in a dictionary.""" try: with urlopen(thread_url) as url: return loads(url.read().decode('utf-8')) except: raise ParserThreadNotFoundError def _parse_post(self, post: dict) -> List[FileInfo]: """Parses a single post and extracts files into `FileInfo` object.""" raise NotImplementedError