1
0
ScrapTheChan/scrapthechan/parser.py

121 lines
3.4 KiB
Python
Raw Normal View History

2020-07-08 22:53:39 +04:00
"""Base `Parser` class for JSON parsers to inherit."""
from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, Request, HTTPError
2020-07-08 22:53:39 +04:00
from scrapthechan import USER_AGENT
2020-07-08 22:53:39 +04:00
from scrapthechan.fileinfo import FileInfo
__all__ = ["Parser", "ThreadNotFoundError"]
2020-07-08 22:53:39 +04:00
class ThreadNotFoundError(Exception):
    """Error carrying a textual reason why a thread could not be retrieved.

    Arguments:
    reason -- human-readable description of the failure (empty by default).
    """

    def __init__(self, reason: str = ""):
        # Forward the reason to `Exception` so that `str(exc)` and `exc.args`
        # carry it even when it is supplied as a keyword argument.
        super().__init__(reason)
        self._reason = reason

    @property
    def reason(self) -> str:
        """Returns the reason that was passed at construction."""
        return self._reason
2020-07-08 22:53:39 +04:00
class Parser:
    """Base class for all parsers.

    It fetches JSON of a specified thread and collects all the files from it
    into a list of the `FileInfo` objects.
    Also it extracts OP's post, that may come handy if you do bulk scraping.

    Arguments:
    board -- is a name of a board on an image board;
    thread -- is an id of a thread inside a board;
    skip_posts -- number of posts to skip.

    All the extracted files will be stored as the `FileInfo` objects.

    Child classes have to provide `json_thread_url`, `file_base_url`,
    `imageboard` and `_parse_post`, which raise `NotImplementedError` here,
    and usually `_extract_posts_list` as well."""

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        self._board: str = board
        self._thread: str = thread
        self._posts = self._extract_posts_list(self._get_json())
        # OP's post is taken before skipping, so `op` always describes the
        # first post of the thread regardless of `skip_posts`.
        self._op_post: dict = self._posts[0]
        if skip_posts is not None:
            self._posts = self._posts[skip_posts:]
        # `filter(None, ...)` drops posts for which `_parse_post` returned
        # None or an empty list; the rest are flattened into one list.
        self._files = list(chain.from_iterable(
            filter(None, map(self._parse_post, self._posts))))

    @property
    def json_thread_url(self) -> str:
        """Returns a URL template of a thread's JSON. It is formatted with
        the `board` and `thread` keyword arguments in `_get_json`."""
        raise NotImplementedError

    @property
    def file_base_url(self) -> str:
        """Returns a base URL of files. Not used by this base class;
        presumably child classes build file links from it."""
        raise NotImplementedError

    @property
    def subject_field(self) -> str:
        """Returns a key of a post's subject in a post's dictionary."""
        return "sub"

    @property
    def comment_field(self) -> str:
        """Returns a key of a post's comment in a post's dictionary."""
        return "com"

    @property
    def imageboard(self) -> str:
        """Returns image board's name."""
        raise NotImplementedError

    @property
    def board(self) -> str:
        """Returns a name of a board of image board."""
        return self._board

    @property
    def thread(self) -> str:
        """Returns a name of thread from a board."""
        return self._thread

    @property
    def op(self) -> Optional[str]:
        """Returns OP's post as combination of subject and comment separated
        by a new line, or None if the post has neither of them."""
        op = ""
        if self.subject_field in self._op_post:
            op = f"{self._op_post[self.subject_field]}\n"
        if self.comment_field in self._op_post:
            op += self._op_post[self.comment_field]
        return op or None

    @property
    def files(self) -> List["FileInfo"]:
        """Returns a list of retrieved files as `FileInfo` objects."""
        return self._files

    def _extract_posts_list(self, lst: List) -> List[dict]:
        """This method must be overridden in child classes where you specify
        a path in a JSON document where posts are stored. E.g., on 4chan this is
        ['posts'], and on 2ch.hk it's ['threads'][0]['posts'].

        It receives the decoded JSON document returned by `_get_json`; this
        base implementation returns it unchanged."""
        return lst

    def _get_json(self) -> dict:
        """Retrieves a JSON representation of a thread and converts it in
        a dictionary.

        Raises `ThreadNotFoundError` if the request fails with an HTTP error.
        Any other exception is propagated as is."""
        thread_url = self.json_thread_url.format(
            board=self._board, thread=self._thread)
        req = Request(thread_url, headers={'User-Agent': USER_AGENT})
        try:
            with urlopen(req) as url:
                return loads(url.read().decode('utf-8'))
        except HTTPError as e:
            # Keep the original HTTP error attached as the cause.
            raise ThreadNotFoundError(str(e)) from e

    def _parse_post(self, post: dict) -> Optional[List["FileInfo"]]:
        """Parses a single post and extracts files into `FileInfo` object.
        Single object is wrapped in a list for convenient insertion into
        a list. Must be implemented in child classes."""
        raise NotImplementedError