From f3ef07af6850e064668cbed473b2ce50ae05d3b4 Mon Sep 17 00:00:00 2001 From: "Alexander \"Arav\" Andreev" Date: Mon, 3 May 2021 02:38:46 +0400 Subject: [PATCH] Rewrite of Parser class because it was broken. Now there are no problems with inheritance and its subclasses are now more pleasant to write. ThreadNotFoundError now has a reason field. --- scrapthechan/parser.py | 79 +++++++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/scrapthechan/parser.py b/scrapthechan/parser.py index e83f3a4..0e518af 100644 --- a/scrapthechan/parser.py +++ b/scrapthechan/parser.py @@ -4,7 +4,7 @@ from itertools import chain from json import loads from re import findall, match from typing import List, Optional -from urllib.request import urlopen, Request +from urllib.request import urlopen, Request, HTTPError from scrapthechan import USER_AGENT from scrapthechan.fileinfo import FileInfo @@ -14,7 +14,12 @@ __all__ = ["Parser", "ThreadNotFoundError"] class ThreadNotFoundError(Exception): - pass + def __init__(self, reason: str = ""): + self._reason = reason + + @property + def reason(self) -> str: + return self._reason class Parser: @@ -25,28 +30,42 @@ class Parser: Arguments: board -- is a name of a board on an image board; - thread -- is a name of a thread inside a board; - posts -- is a list of posts in form of dictionaries exported from a JSON; + thread -- is an id of a thread inside a board; skip_posts -- number of posts to skip. 
All the extracted files will be stored as the `FileInfo` objects.""" - __url_thread_json: str = "https://example.org/{board}/{thread}.json" - __url_file_link: str = None - def __init__(self, board: str, thread: str, posts: List[dict], + def __init__(self, board: str, thread: str, skip_posts: Optional[int] = None) -> None: - self._board = board - self._thread = thread - self._op_post = posts[0] - if not skip_posts is None: - posts = posts[skip_posts:] + + self._board: str = board + self._thread: str = thread + self._posts = self._extract_posts_list(self._get_json()) + self._op_post: dict = self._posts[0] + self._posts = self._posts[skip_posts:] if not skip_posts is None else self._posts self._files = list(chain.from_iterable(filter(None, \ - map(self._parse_post, posts)))) + map(self._parse_post, self._posts)))) + + @property + def json_thread_url(self) -> str: + raise NotImplementedError + + @property + def file_base_url(self) -> str: + raise NotImplementedError + + @property + def subject_field(self) -> str: + return "sub" + + @property + def comment_field(self) -> str: + return "com" @property def imageboard(self) -> str: """Returns image board's name.""" - return NotImplementedError + raise NotImplementedError @property def board(self) -> str: @@ -62,22 +81,40 @@ class Parser: def op(self) -> str: """Returns OP's post as combination of subject and comment separated by a new line.""" - raise NotImplementedError + op = "" + if self.subject_field in self._op_post: + op = f"{self._op_post[self.subject_field]}\n" + if self.comment_field in self._op_post: + op += self._op_post[self.comment_field] + return op if not op == "" else None @property def files(self) -> List[FileInfo]: """Returns a list of retrieved files as `FileInfo` objects.""" return self._files - def _get_json(self, thread_url: str) -> dict: - """Gets JSON version of a thread and converts it in a dictionary.""" + def _extract_posts_list(self, lst: List) -> List[dict]: + """This method must be overridden 
in child classes where you specify + a path in a JSON document where posts are stored. E.g., on 4chan this is + ['posts'], and on 2ch.hk it's ['threads'][0]['posts'].""" + return lst + + def _get_json(self) -> dict: + """Retrieves a JSON representation of a thread and converts it into + a dictionary.""" try: + thread_url = self.json_thread_url.format(board=self._board, \ + thread=self._thread) req = Request(thread_url, headers={'User-Agent': USER_AGENT}) with urlopen(req) as url: return loads(url.read().decode('utf-8')) - except: - raise ThreadNotFoundError + except HTTPError as e: + raise ThreadNotFoundError(str(e)) + except Exception as e: + raise e - def _parse_post(self, post: dict) -> List[FileInfo]: - """Parses a single post and extracts files into `FileInfo` object.""" + def _parse_post(self, post: dict) -> Optional[List[FileInfo]]: + """Parses a single post and extracts files into `FileInfo` objects. + A single object is wrapped in a list for convenient insertion into + a list.""" raise NotImplementedError