64 lines
1.5 KiB
Python
64 lines
1.5 KiB
Python
from re import match
|
|
from typing import List, Optional
|
|
|
|
from scrapthechan.fileinfo import FileInfo
|
|
from scrapthechan.parser import Parser
|
|
|
|
# Public API of this module: only the parser class is exported on
# `from <module> import *`.
__all__ = ["DvachParser"]
|
|
|
|
|
|
class DvachParser(Parser):
    """JSON parser for 2ch.hk image board."""

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        """Forward the board, thread and optional skip count to `Parser`."""
        super().__init__(board, thread, skip_posts)

    @property
    def json_thread_url(self) -> str:
        """URL template of the thread's JSON document."""
        # NOTE(review): this is a plain string, not an f-string; the
        # `{board}` / `{thread}` placeholders are presumably filled in by
        # the base class via `str.format` -- confirm against `Parser`.
        return "https://2ch.hk/{board}/res/{thread}.json"

    @property
    def file_base_url(self) -> str:
        """Scheme and host that every file `path` is appended to."""
        return "https://2ch.hk"

    @property
    def subject_field(self) -> str:
        """Name of the JSON field holding a post's subject."""
        return "subject"

    @property
    def comment_field(self) -> str:
        """Name of the JSON field holding a post's comment."""
        return "comment"

    @property
    def imageboard(self) -> str:
        """Name of the image board this parser handles."""
        return "2ch.hk"

    def _extract_posts_list(self, lst: List) -> List[dict]:
        """Return the posts of the first thread in the decoded JSON.

        Despite the `List` annotation, `lst` is indexed by string key here,
        i.e. it is used as a mapping with a `threads` entry.
        """
        return lst['threads'][0]['posts']

    @staticmethod
    def _pick_filename(f: dict) -> str:
        """Choose which of a file entry's names to use for the saved file.

        `fullname` is used unless the entry is a sticker or its `fullname`
        has the form `image.<ext>`; in both of those cases `name` is used.
        """
        if 'sticker' in f:
            return f['name']
        # NOTE(review): `image.<ext>` looks like a generic placeholder name
        # that would collide between files -- confirm the intent.
        if match(r"^image\.\w+$", f['fullname']) is None:
            return f['fullname']
        return f['name']

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Build a `FileInfo` for every file attached to `post`.

        Returns `None` when the post has no `files` key or when that key
        holds `None`; otherwise returns a (possibly empty) list with one
        `FileInfo` per entry, in the order they appear in the post.
        """
        # Treat an explicit null the same as a missing key instead of
        # failing with a TypeError when iterating over it.
        if 'files' not in post or post['files'] is None:
            return None

        files = []
        for f in post['files']:
            fullname = self._pick_filename(f)

            # Same approach as for 4chan: when an md5 field is present the
            # hash algorithm is known, so it is fine to hardcode it.
            if 'md5' in f:
                hash_value, hash_algo = f['md5'], 'md5'
            else:
                hash_value, hash_algo = None, None

            files.append(FileInfo(fullname, f['size'],
                                  f"{self.file_base_url}{f['path']}",
                                  hash_value, hash_algo))
        return files
|