1
0
Fork 0

Rewrite of the Parser class because it was fucked up. Now there are no problems with inheritance, and its subclasses are now more pleasant to write. ThreadNotFoundError now has a reason field.

This commit is contained in:
Alexander Andreev 2021-05-03 02:38:46 +04:00
parent 6373518dc3
commit f3ef07af68
Signed by: Arav
GPG Key ID: 610DF2574456329F
1 changed file with 58 additions and 21 deletions

View File

@ -4,7 +4,7 @@ from itertools import chain
from json import loads from json import loads
from re import findall, match from re import findall, match
from typing import List, Optional from typing import List, Optional
from urllib.request import urlopen, Request from urllib.request import urlopen, Request, HTTPError
from scrapthechan import USER_AGENT from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo from scrapthechan.fileinfo import FileInfo
@ -14,7 +14,12 @@ __all__ = ["Parser", "ThreadNotFoundError"]
class ThreadNotFoundError(Exception): class ThreadNotFoundError(Exception):
pass def __init__(self, reason: str = ""):
self._reason = reason
@property
def reason(self) -> str:
return self._reason
class Parser: class Parser:
@ -25,28 +30,42 @@ class Parser:
Arguments: Arguments:
board -- is a name of a board on an image board; board -- is a name of a board on an image board;
thread -- is a name of a thread inside a board; thread -- is an id of a thread inside a board;
posts -- is a list of posts in form of dictionaries exported from a JSON;
skip_posts -- number of posts to skip. skip_posts -- number of posts to skip.
All the extracted files will be stored as the `FileInfo` objects.""" All the extracted files will be stored as the `FileInfo` objects."""
__url_thread_json: str = "https://example.org/{board}/{thread}.json"
__url_file_link: str = None
def __init__(self, board: str, thread: str, posts: List[dict], def __init__(self, board: str, thread: str,
skip_posts: Optional[int] = None) -> None: skip_posts: Optional[int] = None) -> None:
self._board = board
self._thread = thread self._board: str = board
self._op_post = posts[0] self._thread: str = thread
if not skip_posts is None: self._posts = self._extract_posts_list(self._get_json())
posts = posts[skip_posts:] self._op_post: dict = self._posts[0]
self._posts = self._posts[skip_posts:] if not skip_posts is None else self._posts
self._files = list(chain.from_iterable(filter(None, \ self._files = list(chain.from_iterable(filter(None, \
map(self._parse_post, posts)))) map(self._parse_post, self._posts))))
@property
def json_thread_url(self) -> str:
raise NotImplementedError
@property
def file_base_url(self) -> str:
raise NotImplementedError
@property
def subject_field(self) -> str:
return "sub"
@property
def comment_field(self) -> str:
return "com"
@property @property
def imageboard(self) -> str: def imageboard(self) -> str:
"""Returns image board's name.""" """Returns image board's name."""
return NotImplementedError raise NotImplementedError
@property @property
def board(self) -> str: def board(self) -> str:
@ -62,22 +81,40 @@ class Parser:
def op(self) -> str: def op(self) -> str:
"""Returns OP's post as combination of subject and comment separated """Returns OP's post as combination of subject and comment separated
by a new line.""" by a new line."""
raise NotImplementedError op = ""
if self.subject_field in self._op_post:
op = f"{self._op_post[self.subject_field]}\n"
if self.comment_field in self._op_post:
op += self._op_post[self.comment_field]
return op if not op == "" else None
@property @property
def files(self) -> List[FileInfo]: def files(self) -> List[FileInfo]:
"""Returns a list of retrieved files as `FileInfo` objects.""" """Returns a list of retrieved files as `FileInfo` objects."""
return self._files return self._files
def _get_json(self, thread_url: str) -> dict: def _extract_posts_list(self, lst: List) -> List[dict]:
"""Gets JSON version of a thread and converts it in a dictionary.""" """This method must be overridden in child classes where you specify
a path in a JSON document where posts are stored. E.g., on 4chan this is
['posts'], and on 2ch.hk it's ['threads'][0]['posts']."""
return lst
def _get_json(self) -> dict:
"""Retrieves a JSON representation of a thread and converts it in
a dictionary."""
try: try:
thread_url = self.json_thread_url.format(board=self._board, \
thread=self._thread)
req = Request(thread_url, headers={'User-Agent': USER_AGENT}) req = Request(thread_url, headers={'User-Agent': USER_AGENT})
with urlopen(req) as url: with urlopen(req) as url:
return loads(url.read().decode('utf-8')) return loads(url.read().decode('utf-8'))
except: except HTTPError as e:
raise ThreadNotFoundError raise ThreadNotFoundError(str(e))
except Exception as e:
raise e
def _parse_post(self, post: dict) -> List[FileInfo]: def _parse_post(self, post: dict) -> Optional[List[FileInfo]]:
"""Parses a single post and extracts files into `FileInfo` object.""" """Parses a single post and extracts files into `FileInfo` object.
Single object is wrapped in a list for convenient insertion into
a list."""
raise NotImplementedError raise NotImplementedError