84 lines
2.4 KiB
Python
84 lines
2.4 KiB
Python
"""Base `Parser` class for JSON parsers to inherit."""
|
|
|
|
from itertools import chain
|
|
from json import loads
|
|
from re import findall, match
|
|
from typing import List, Optional
|
|
from urllib.request import urlopen, Request
|
|
|
|
from scrapthechan import USER_AGENT
|
|
from scrapthechan.fileinfo import FileInfo
|
|
|
|
|
|
# Names made available by a wildcard import of this module.
__all__ = ["Parser", "ThreadNotFoundError"]
|
|
|
|
|
|
class ThreadNotFoundError(Exception):
    """Raised by `Parser._get_json` when a thread's JSON can't be retrieved."""
|
|
|
|
|
|
class Parser:
    """Base class for all parsers.

    It collects all the files from the posts of a thread into a list of
    the `FileInfo` objects by calling `_parse_post` on every post, and keeps
    OP's post, that may come handy if you do bulk scraping. The `_get_json`
    helper is provided to fetch the JSON of a thread.

    Concrete parsers have to implement `imageboard`, `op` and `_parse_post`;
    in this base class they raise `NotImplementedError`.

    Arguments:
    board -- is a name of a board on an image board;
    thread -- is a name of a thread inside a board;
    posts -- is a list of posts in form of dictionaries exported from a JSON,
             it must contain at least one post (the OP's one);
    skip_posts -- number of posts to skip.

    Raises:
    IndexError -- if `posts` is empty, because OP's post can't be taken.

    All the extracted files will be stored as the `FileInfo` objects."""

    # Double underscore makes these names mangled (`_Parser__url_...`), so
    # they are private to this very class and aren't overridden by a subclass
    # attribute with the same spelling.
    __url_thread_json: str = "https://example.org/{board}/{thread}.json"
    # Presumably a template of a link to a file; not set in the base class.
    __url_file_link: Optional[str] = None

    def __init__(self, board: str, thread: str, posts: List[dict],
                 skip_posts: Optional[int] = None) -> None:
        self._board = board
        self._thread = thread
        # OP's post is taken before skipping, so it is kept no matter how
        # many posts are skipped.
        self._op_post = posts[0]
        if skip_posts is not None:
            posts = posts[skip_posts:]
        # Posts for which `_parse_post` returned nothing (None or an empty
        # list) are dropped, the rest is flattened into a single list.
        parsed = map(self._parse_post, posts)
        self._files = list(chain.from_iterable(filter(None, parsed)))

    @property
    def imageboard(self) -> str:
        """Returns image board's name.

        Has to be implemented in a subclass."""
        raise NotImplementedError

    @property
    def board(self) -> str:
        """Returns a name of a board of image board."""
        return self._board

    @property
    def thread(self) -> str:
        """Returns a name of thread from a board."""
        return self._thread

    @property
    def op(self) -> str:
        """Returns OP's post as combination of subject and comment separated
        by a new line.

        Has to be implemented in a subclass."""
        raise NotImplementedError

    @property
    def files(self) -> "List[FileInfo]":
        """Returns a list of retrieved files as `FileInfo` objects."""
        return self._files

    def _get_json(self, thread_url: str) -> dict:
        """Gets JSON version of a thread and converts it in a dictionary.

        The request is sent with the `USER_AGENT` as a User-Agent header and
        the response is decoded as UTF-8.

        Raises:
        ThreadNotFoundError -- if the request, the decoding or the JSON
                               parsing fails; the original error is chained
                               as `__cause__`."""
        try:
            req = Request(thread_url, headers={'User-Agent': USER_AGENT})
            with urlopen(req) as url:
                return loads(url.read().decode('utf-8'))
        # `Exception` instead of a bare `except`, so KeyboardInterrupt and
        # SystemExit aren't turned into a "thread not found" error.
        except Exception as err:
            raise ThreadNotFoundError from err

    def _parse_post(self, post: dict) -> "List[FileInfo]":
        """Parses a single post and extracts files into `FileInfo` object.

        Has to be implemented in a subclass. A falsy result (None or an empty
        list) means that a post has no files, such posts are skipped by
        `__init__`."""
        raise NotImplementedError
|