1
0
Fork 0
ScrapTheChan/scrapthechan/parsers/dvach.py

44 lines
1.2 KiB
Python

from re import match
from typing import List, Optional
from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser
# Names exported by a wildcard import of this module.
__all__ = ["DvachParser"]
class DvachParser(Parser):
    """JSON parser for 2ch.hk image board."""

    # URL template of a thread's JSON document; filled with board and thread.
    __url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
    # Prefix prepended to a file's `path` field to build its download link.
    __url_file_link = "https://2ch.hk"

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        """Fetch the thread's JSON and pass its posts to the base parser.

        Args:
            board: Board name substituted into the thread JSON URL.
            thread: Thread identifier substituted into the thread JSON URL.
            skip_posts: Forwarded unchanged to the base ``Parser``
                constructor (presumably the number of leading posts to
                skip -- confirm against ``Parser``).
        """
        thread_url = self.__url_thread_json.format(board=board, thread=thread)
        # The posts live under the first entry of the 'threads' list.
        posts = self._get_json(thread_url)['threads'][0]['posts']
        super().__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        """Return the name of the image board this parser handles."""
        return "2ch.hk"

    @property
    def op(self) -> str:
        """Return the OP post's subject and comment joined by a newline."""
        return f"{self._op_post['subject']}\n{self._op_post['comment']}"

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Collect the files attached to a single post.

        Args:
            post: Mapping describing one post of the thread JSON.

        Returns:
            ``None`` when the post has no 'files' key, otherwise a list
            with one ``FileInfo`` per entry of ``post['files']``.
        """
        if 'files' not in post:
            return None
        files = []
        for f in post['files']:
            # `re.match` takes the pattern first and the string second.
            # A 'fullname' of the generic form "image.<ext>" is replaced
            # by the entry's 'name' field; any other 'fullname' is kept.
            if match(r"^image\.\w+$", f['fullname']) is None:
                fullname = f['fullname']
            else:
                fullname = f['name']
            # Same thing as with 4chan: 2ch.hk also has an md5 field, so it
            # is completely fine to hardcode `hash_algo`.
            files.append(FileInfo(fullname, f['size'],
                                  f"{self.__url_file_link}{f['path']}",
                                  f['md5'], 'md5'))
        return files