Rewrite of the Parser class because it was badly broken. Now there are no problems with inheritance, and its subclasses are now more pleasant to write. ThreadNotFoundError now has a reason field.
This commit is contained in:
parent
6373518dc3
commit
f3ef07af68
@ -4,7 +4,7 @@ from itertools import chain
|
||||
from json import loads
|
||||
from re import findall, match
|
||||
from typing import List, Optional
|
||||
from urllib.request import urlopen, Request
|
||||
from urllib.request import urlopen, Request, HTTPError
|
||||
|
||||
from scrapthechan import USER_AGENT
|
||||
from scrapthechan.fileinfo import FileInfo
|
||||
@ -14,7 +14,12 @@ __all__ = ["Parser", "ThreadNotFoundError"]
|
||||
|
||||
|
||||
class ThreadNotFoundError(Exception):
    """Raised when a thread could not be retrieved.

    Arguments:
    reason -- optional human-readable explanation of the failure.
    """

    def __init__(self, reason: str = ""):
        # Hand the reason to Exception as well, so that str(error),
        # error.args and tracebacks show it even when it was passed
        # by keyword.
        super().__init__(reason)
        self._reason = reason

    @property
    def reason(self) -> str:
        """Returns the explanation given when the error was raised."""
        return self._reason
|
||||
|
||||
|
||||
class Parser:
|
||||
@ -25,28 +30,42 @@ class Parser:
|
||||
|
||||
Arguments:
|
||||
board -- is a name of a board on an image board;
|
||||
thread -- is a name of a thread inside a board;
|
||||
posts -- is a list of posts in form of dictionaries exported from a JSON;
|
||||
thread -- is an id of a thread inside a board;
|
||||
skip_posts -- number of posts to skip.
|
||||
|
||||
All the extracted files will be stored as the `FileInfo` objects."""
|
||||
__url_thread_json: str = "https://example.org/{board}/{thread}.json"
|
||||
__url_file_link: str = None
|
||||
|
||||
def __init__(self, board: str, thread: str,
             skip_posts: Optional[int] = None) -> None:
    """Retrieves a thread and collects the files found in its posts.

    Arguments:
    board -- is a name of a board on an image board;
    thread -- is an id of a thread inside a board;
    skip_posts -- number of posts to skip.
    """
    self._board: str = board
    self._thread: str = thread
    self._posts = self._extract_posts_list(self._get_json())
    # The OP post is remembered before skipping, so it stays available
    # even when skip_posts cuts it out of the list.
    self._op_post: dict = self._posts[0]
    if skip_posts is not None:
        self._posts = self._posts[skip_posts:]
    # _parse_post may return None for a post; filter(None, ...) drops
    # those (and empty lists) before the per-post lists are flattened.
    self._files = list(chain.from_iterable(
        filter(None, map(self._parse_post, self._posts))))
|
||||
|
||||
@property
def json_thread_url(self) -> str:
    """URL template of a thread's JSON representation.

    It is formatted with `board` and `thread` keyword arguments when
    the thread is retrieved. The base implementation only raises
    NotImplementedError.
    """
    raise NotImplementedError
|
||||
|
||||
@property
def file_base_url(self) -> str:
    """Base URL for files.

    The base implementation only raises NotImplementedError.
    NOTE(review): presumably subclasses build file links from it;
    no use of it is visible here -- confirm.
    """
    raise NotImplementedError
|
||||
|
||||
@property
def subject_field(self) -> str:
    """Key under which a post's subject is looked up in the OP post."""
    return "sub"
|
||||
|
||||
@property
def comment_field(self) -> str:
    """Key under which a post's comment is looked up in the OP post."""
    return "com"
|
||||
|
||||
@property
def imageboard(self) -> str:
    """Returns image board's name."""
    # Raise the exception: returning the NotImplementedError class
    # would hand callers a truthy object instead of signalling that
    # the property is not implemented.
    raise NotImplementedError
|
||||
|
||||
@property
|
||||
def board(self) -> str:
|
||||
@ -62,22 +81,40 @@ class Parser:
|
||||
def op(self) -> Optional[str]:
    """Returns OP's post as combination of subject and comment separated
    by a new line, or None when the OP post has neither of them."""
    text = ""
    if self.subject_field in self._op_post:
        text = f"{self._op_post[self.subject_field]}\n"
    if self.comment_field in self._op_post:
        text += self._op_post[self.comment_field]
    # An empty string means that neither field was present.
    return text or None
|
||||
|
||||
@property
def files(self) -> List[FileInfo]:
    """Returns a list of retrieved files as `FileInfo` objects."""
    return self._files
|
||||
|
||||
def _get_json(self, thread_url: str) -> dict:
|
||||
"""Gets JSON version of a thread and converts it in a dictionary."""
|
||||
def _extract_posts_list(self, lst: dict) -> List[dict]:
    """Returns the list of posts stored inside a thread's JSON document.

    This method must be overridden in child classes to specify the path
    in the JSON document where posts are stored. E.g., on 4chan this is
    ['posts'], and on 2ch.hk it's ['threads'][0]['posts'].

    The base implementation returns its argument unchanged.
    """
    return lst
|
||||
|
||||
def _get_json(self) -> dict:
    """Retrieves a JSON representation of a thread and converts it into
    a dictionary.

    Raises ThreadNotFoundError, carrying the HTTP error text as its
    reason, when the server answers with an HTTP error. Any other
    exception propagates unchanged.
    """
    # Building the URL cannot raise HTTPError, so it stays outside the try.
    thread_url = self.json_thread_url.format(board=self._board,
                                             thread=self._thread)
    req = Request(thread_url, headers={'User-Agent': USER_AGENT})
    try:
        with urlopen(req) as url:
            return loads(url.read().decode('utf-8'))
    except HTTPError as e:
        # Keep the original error chained for easier debugging.
        raise ThreadNotFoundError(str(e)) from e
|
||||
|
||||
def _parse_post(self, post: dict) -> Optional[List[FileInfo]]:
    """Parses a single post and extracts its files as `FileInfo` objects.

    Even a single object is wrapped in a list for convenient insertion
    into the list of files. The base implementation only raises
    NotImplementedError.
    """
    raise NotImplementedError
|
||||
|
Loading…
Reference in New Issue
Block a user