1
0
ScrapTheChan/scrapthechan/parser.py

84 lines
2.4 KiB
Python
Raw Normal View History

2020-07-08 22:53:39 +04:00
"""Base `Parser` class for JSON parsers to inherit."""
from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, Request
2020-07-08 22:53:39 +04:00
from scrapthechan import USER_AGENT
2020-07-08 22:53:39 +04:00
from scrapthechan.fileinfo import FileInfo
__all__ = ["Parser", "ThreadNotFoundError"]
2020-07-08 22:53:39 +04:00
class ThreadNotFoundError(Exception):
	"""Raised when JSON of a thread cannot be retrieved."""
class Parser:
	"""Base class for all parsers.

	The constructor receives the posts of a thread (dictionaries exported
	from the thread's JSON), passes every post through `_parse_post` and
	collects the results into a flat list of `FileInfo` objects. The first
	post is kept as OP's post, which may come in handy for bulk scraping.

	The base implementations of `imageboard`, `op` and `_parse_post` only
	raise `NotImplementedError`, so concrete parsers have to override them.
	`_get_json` is a helper that downloads and decodes JSON of a thread.

	Arguments:
	board -- is a name of a board on an image board;
	thread -- is a name of a thread inside a board;
	posts -- is a list of posts in form of dictionaries exported from a JSON;
	skip_posts -- number of leading posts to skip while collecting files.

	Raises:
	IndexError -- if `posts` is empty, because OP's post is taken from it.

	All the extracted files will be stored as the `FileInfo` objects."""

	# NOTE: names with two leading underscores are mangled per class
	# (`_Parser__url_thread_json`). Presumably these are URL templates for
	# concrete parsers -- confirm against the subclasses.
	__url_thread_json: str = "https://example.org/{board}/{thread}.json"
	__url_file_link: Optional[str] = None

	def __init__(self, board: str, thread: str, posts: List[dict],
		skip_posts: Optional[int] = None) -> None:
		self._board = board
		self._thread = thread
		# OP's post is remembered before skipping, so it is always posts[0].
		self._op_post = posts[0]
		if skip_posts is not None:
			posts = posts[skip_posts:]
		# Posts for which `_parse_post` returns a falsy value (None or an
		# empty list) contribute nothing to the resulting list.
		files_per_post = map(self._parse_post, posts)
		self._files = list(chain.from_iterable(filter(None, files_per_post)))

	@property
	def imageboard(self) -> str:
		"""Returns image board's name."""
		# Raise, not return: returning the exception class would hand
		# callers a truthy non-string object instead of signalling an error.
		raise NotImplementedError

	@property
	def board(self) -> str:
		"""Returns a name of a board of image board."""
		return self._board

	@property
	def thread(self) -> str:
		"""Returns a name of thread from a board."""
		return self._thread

	@property
	def op(self) -> str:
		"""Returns OP's post as combination of subject and comment separated
		by a new line."""
		raise NotImplementedError

	@property
	def files(self) -> List[FileInfo]:
		"""Returns a list of retrieved files as `FileInfo` objects."""
		return self._files

	def _get_json(self, thread_url: str) -> dict:
		"""Gets JSON version of a thread and converts it in a dictionary.

		Raises `ThreadNotFoundError` if the request cannot be built or sent,
		or if the response cannot be decoded as UTF-8 JSON. The original
		error is attached as the cause."""
		try:
			req = Request(thread_url, headers={'User-Agent': USER_AGENT})
			with urlopen(req) as response:
				return loads(response.read().decode('utf-8'))
		except Exception as err:
			# `Exception` instead of a bare `except`, so KeyboardInterrupt
			# and SystemExit are not reported as a missing thread.
			raise ThreadNotFoundError from err

	def _parse_post(self, post: dict) -> List[FileInfo]:
		"""Parses a single post and extracts files into `FileInfo` object."""
		raise NotImplementedError