1
0
ScrapTheChan/scrapthechan/parser.py

84 lines
2.4 KiB
Python

"""Base `Parser` class for JSON parsers to inherit."""
from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, Request
from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo
# Public names exported by a star-import of this module.
__all__ = ["Parser", "ThreadNotFoundError"]
class ThreadNotFoundError(Exception):
    """Signals that a thread's JSON could not be fetched or decoded."""
class Parser:
    """Base class for all parsers.

    Given the posts of a thread (dictionaries decoded from the thread's
    JSON), it runs every post through `_parse_post` and collects the files
    it yields into a flat list of `FileInfo` objects. The first post is
    kept as the OP's post, which may come in handy for bulk scraping.

    Subclasses must implement `imageboard`, `op` and `_parse_post`; the
    base implementations raise `NotImplementedError`. `_get_json` is
    provided as a helper for downloading a thread's JSON.

    Arguments:
    board -- is a name of a board on an image board;
    thread -- is a name of a thread inside a board;
    posts -- is a list of posts in form of dictionaries exported from a JSON;
    skip_posts -- number of posts to skip from the start of `posts`
        (the OP's post is remembered before skipping).

    Raises:
    IndexError -- if `posts` is empty, since the OP's post is taken from
        its first element.
    """

    # Template of a thread's JSON URL. Double-underscore names are mangled
    # per class, so this value is private to `Parser` itself.
    __url_thread_json: str = "https://example.org/{board}/{thread}.json"
    __url_file_link: Optional[str] = None

    def __init__(self, board: str, thread: str, posts: List[dict],
                 skip_posts: Optional[int] = None) -> None:
        self._board = board
        self._thread = thread
        # The OP's post is taken before any skipping so it is always kept.
        self._op_post = posts[0]
        if skip_posts is not None:
            posts = posts[skip_posts:]
        # `_parse_post` may yield nothing for a post without attachments;
        # `filter(None, ...)` drops such falsy results before flattening.
        self._files = list(chain.from_iterable(
            filter(None, map(self._parse_post, posts))))

    @property
    def imageboard(self) -> str:
        """Returns image board's name. Must be overridden by a subclass."""
        raise NotImplementedError

    @property
    def board(self) -> str:
        """Returns a name of a board of image board."""
        return self._board

    @property
    def thread(self) -> str:
        """Returns a name of thread from a board."""
        return self._thread

    @property
    def op(self) -> str:
        """Returns OP's post as combination of subject and comment separated
        by a new line. Must be overridden by a subclass."""
        raise NotImplementedError

    @property
    def files(self) -> List[FileInfo]:
        """Returns a list of retrieved files as `FileInfo` objects."""
        return self._files

    def _get_json(self, thread_url: str) -> dict:
        """Gets JSON version of a thread and converts it in a dictionary.

        The request is sent with the package's `USER_AGENT` header and the
        response body is decoded as UTF-8 before being parsed.

        Raises `ThreadNotFoundError` if the request or the decoding fails
        for any reason; the original error is attached as its cause.
        """
        try:
            req = Request(thread_url, headers={'User-Agent': USER_AGENT})
            with urlopen(req) as response:
                return loads(response.read().decode('utf-8'))
        except Exception as err:
            # `Exception` rather than a bare `except`, so KeyboardInterrupt
            # and SystemExit are not misreported as a missing thread.
            raise ThreadNotFoundError from err

    def _parse_post(self, post: dict) -> List[FileInfo]:
        """Parses a single post and extracts files into `FileInfo` object.

        Must be overridden by a subclass. A falsy result (e.g. `None` or an
        empty list) is treated by the constructor as "no files".
        """
        raise NotImplementedError