2020-07-08 22:53:39 +04:00
|
|
|
"""Base `Parser` class for JSON parsers to inherit."""
|
|
|
|
|
|
|
|
from itertools import chain
|
|
|
|
from json import loads
|
|
|
|
from re import findall, match
|
|
|
|
from typing import List, Optional
|
2021-05-03 02:38:46 +04:00
|
|
|
from urllib.request import urlopen, Request, HTTPError
|
2020-07-08 22:53:39 +04:00
|
|
|
|
2020-09-09 04:34:41 +04:00
|
|
|
from scrapthechan import USER_AGENT
|
2020-07-08 22:53:39 +04:00
|
|
|
from scrapthechan.fileinfo import FileInfo
|
|
|
|
|
|
|
|
|
2020-07-20 04:32:30 +04:00
|
|
|
# Public names of this module: the base parser class and the exception it
# raises when a thread cannot be fetched.
__all__ = ["Parser", "ThreadNotFoundError"]
|
2020-07-08 22:53:39 +04:00
|
|
|
|
|
|
|
|
2020-07-20 04:32:30 +04:00
|
|
|
class ThreadNotFoundError(Exception):
    """Raised when the JSON of a thread could not be fetched over HTTP.

    Arguments:
    reason -- text describing why the request failed (empty by default)."""

    def __init__(self, reason: str = ""):
        # Hand the reason over to `Exception` too, so `str(exc)`, `exc.args`
        # and tracebacks carry the message instead of being empty.
        super().__init__(reason)
        self._reason = reason

    @property
    def reason(self) -> str:
        """Returns the reason given on construction."""
        return self._reason
|
2020-07-08 22:53:39 +04:00
|
|
|
|
|
|
|
|
|
|
|
class Parser:
    """Base class for all parsers.

    It fetches JSON of a specified thread and collects all the files from it
    into a list of the `FileInfo` objects.
    Also it extracts OP's post, that may come handy if you do bulk scraping.

    Child classes have to override `json_thread_url`, `file_base_url`,
    `imageboard` and `_parse_post`, which raise `NotImplementedError` here.
    Note that all the fetching and parsing happens in the constructor.

    Arguments:
    board -- is a name of a board on an image board;
    thread -- is an id of a thread inside a board;
    skip_posts -- number of posts to skip.

    All the extracted files will be stored as the `FileInfo` objects."""

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        self._board: str = board
        self._thread: str = thread
        self._posts = self._extract_posts_list(self._get_json())
        # OP's post is remembered before any skipping, so `op` is available
        # even when the first posts are dropped.
        self._op_post: dict = self._posts[0]
        if skip_posts is not None:
            self._posts = self._posts[skip_posts:]
        # `_parse_post` gives a list of files or a falsy value for a post
        # without files; falsy results are dropped and the rest is flattened.
        self._files = list(chain.from_iterable(
            filter(None, map(self._parse_post, self._posts))))

    @property
    def json_thread_url(self) -> str:
        """Returns a URL template of a thread's JSON. It is formatted with
        the `board` and `thread` keyword arguments. Must be overridden."""
        raise NotImplementedError

    @property
    def file_base_url(self) -> str:
        """Must be overridden in child classes. Presumably a base URL that
        files' URLs are built from -- it is not used in this class."""
        raise NotImplementedError

    @property
    def subject_field(self) -> str:
        """Returns a key under which a subject is stored in a post."""
        return "sub"

    @property
    def comment_field(self) -> str:
        """Returns a key under which a comment is stored in a post."""
        return "com"

    @property
    def imageboard(self) -> str:
        """Returns image board's name."""
        raise NotImplementedError

    @property
    def board(self) -> str:
        """Returns a name of a board of image board."""
        return self._board

    @property
    def thread(self) -> str:
        """Returns an id of a thread from a board."""
        return self._thread

    @property
    def op(self) -> Optional[str]:
        """Returns OP's post as combination of subject and comment separated
        by a new line, or None if OP's post has neither of them."""
        op = ""
        if self.subject_field in self._op_post:
            op = f"{self._op_post[self.subject_field]}\n"
        if self.comment_field in self._op_post:
            op += self._op_post[self.comment_field]
        return op if op else None

    @property
    def files(self) -> List["FileInfo"]:
        """Returns a list of retrieved files as `FileInfo` objects."""
        return self._files

    def _extract_posts_list(self, lst: List) -> List[dict]:
        """This method must be overridden in child classes where you specify
        a path in a JSON document where posts are stored. E.g., on 4chan this is
        ['posts'], and on 2ch.hk it's ['threads'][0]['posts'].

        It receives whatever `_get_json` returned; by default it is returned
        as is."""
        return lst

    def _get_json(self) -> dict:
        """Retrieves a JSON representation of a thread and converts it in
        a dictionary.

        Raises `ThreadNotFoundError` if the server responds with an HTTP
        error. Any other exception is propagated untouched."""
        thread_url = self.json_thread_url.format(board=self._board,
                                                 thread=self._thread)
        req = Request(thread_url, headers={'User-Agent': USER_AGENT})
        try:
            with urlopen(req) as url:
                return loads(url.read().decode('utf-8'))
        except HTTPError as e:
            # Chain the original error so the HTTP status and the traceback
            # are not lost.
            raise ThreadNotFoundError(str(e)) from e

    def _parse_post(self, post: dict) -> Optional[List["FileInfo"]]:
        """Parses a single post and extracts files into `FileInfo` object.
        Single object is wrapped in a list for convenient insertion into
        a list. Must be overridden in child classes."""
        raise NotImplementedError
|