1
0

Rewrite of Parser class because it was fucked up. Now there's no problems with inheritance and its subclasses now more pleasant to write. ThreadNotFoundError now has a reason field.

This commit is contained in:
Alexander Andreev 2021-05-03 02:38:46 +04:00
parent 6373518dc3
commit f3ef07af68
Signed by: Arav
GPG Key ID: 610DF2574456329F

View File

@ -4,7 +4,7 @@ from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, Request
from urllib.request import urlopen, Request, HTTPError
from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo
@ -14,7 +14,12 @@ __all__ = ["Parser", "ThreadNotFoundError"]
class ThreadNotFoundError(Exception):
pass
def __init__(self, reason: str = ""):
self._reason = reason
@property
def reason(self) -> str:
return self._reason
class Parser:
@ -25,28 +30,42 @@ class Parser:
Arguments:
board -- is a name of a board on an image board;
thread -- is a name of a thread inside a board;
posts -- is a list of posts in form of dictionaries exported from a JSON;
thread -- is an id of a thread inside a board;
skip_posts -- number of posts to skip.
All the extracted files will be stored as the `FileInfo` objects."""
__url_thread_json: str = "https://example.org/{board}/{thread}.json"
__url_file_link: str = None
def __init__(self, board: str, thread: str, posts: List[dict],
def __init__(self, board: str, thread: str,
skip_posts: Optional[int] = None) -> None:
self._board = board
self._thread = thread
self._op_post = posts[0]
if not skip_posts is None:
posts = posts[skip_posts:]
self._board: str = board
self._thread: str = thread
self._posts = self._extract_posts_list(self._get_json())
self._op_post: dict = self._posts[0]
self._posts = self._posts[skip_posts:] if not skip_posts is None else self._posts
self._files = list(chain.from_iterable(filter(None, \
map(self._parse_post, posts))))
map(self._parse_post, self._posts))))
@property
def json_thread_url(self) -> str:
raise NotImplementedError
@property
def file_base_url(self) -> str:
raise NotImplementedError
@property
def subject_field(self) -> str:
return "sub"
@property
def comment_field(self) -> str:
return "com"
@property
def imageboard(self) -> str:
"""Returns image board's name."""
return NotImplementedError
raise NotImplementedError
@property
def board(self) -> str:
@ -62,22 +81,40 @@ class Parser:
def op(self) -> str:
"""Returns OP's post as combination of subject and comment separated
by a new line."""
raise NotImplementedError
op = ""
if self.subject_field in self._op_post:
op = f"{self._op_post[self.subject_field]}\n"
if self.comment_field in self._op_post:
op += self._op_post[self.comment_field]
return op if not op == "" else None
@property
def files(self) -> List[FileInfo]:
"""Returns a list of retrieved files as `FileInfo` objects."""
return self._files
def _get_json(self, thread_url: str) -> dict:
"""Gets JSON version of a thread and converts it in a dictionary."""
def _extract_posts_list(self, lst: List) -> List[dict]:
"""This method must be overridden in child classes where you specify
a path in a JSON document where posts are stored. E.g., on 4chan this is
['posts'], and on 2ch.hk it's ['threads'][0]['posts']."""
return lst
def _get_json(self) -> dict:
"""Retrieves a JSON representation of a thread and converts it in
a dictionary."""
try:
thread_url = self.json_thread_url.format(board=self._board, \
thread=self._thread)
req = Request(thread_url, headers={'User-Agent': USER_AGENT})
with urlopen(req) as url:
return loads(url.read().decode('utf-8'))
except:
raise ThreadNotFoundError
except HTTPError as e:
raise ThreadNotFoundError(str(e))
except Exception as e:
raise e
def _parse_post(self, post: dict) -> List[FileInfo]:
"""Parses a single post and extracts files into `FileInfo` object."""
def _parse_post(self, post: dict) -> Optional[List[FileInfo]]:
"""Parses a single post and extracts files into `FileInfo` object.
Single object is wrapped in a list for convenient insertion into
a list."""
raise NotImplementedError