1
0

IB parsers rewritten according to the fixed Parser class.

This commit is contained in:
Alexander Andreev 2021-05-03 02:40:21 +04:00
parent f3ef07af68
commit 78d4a62c17
Signed by: Arav
GPG Key ID: 610DF2574456329F
4 changed files with 52 additions and 150 deletions

View File

@ -10,31 +10,38 @@ __all__ = ["DvachParser"]
class DvachParser(Parser): class DvachParser(Parser):
"""JSON parser for 2ch.hk image board.""" """JSON parser for 2ch.hk image board."""
__url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
__url_file_link = "https://2ch.hk"
def __init__(self, board: str, thread: str, def __init__(self, board: str, thread: str,
skip_posts: Optional[int] = None) -> None: skip_posts: Optional[int] = None) -> None:
posts = self._get_json(self.__url_thread_json.format(board=board, \ super().__init__(board, thread, skip_posts)
thread=thread))['threads'][0]['posts']
super(DvachParser, self).__init__(board, thread, posts, skip_posts) @property
def json_thread_url(self) -> str:
return "https://2ch.hk/{board}/res/{thread}.json"
@property
def file_base_url(self) -> str:
return "https://2ch.hk"
@property
def subject_field(self) -> str:
return "subject"
@property
def comment_field(self) -> str:
return "comment"
@property @property
def imageboard(self) -> str: def imageboard(self) -> str:
return "2ch.hk" return "2ch.hk"
@property def _extract_posts_list(self, lst: List) -> List[dict]:
def op(self) -> Optional[str]: return lst['threads'][0]['posts']
op = ""
if 'subject' in self._op_post:
op = f"{self._op_post['subject']}\n"
if 'comment' in self._op_post:
op += self._op_post['comment']
return op if not op == "" else None
def _parse_post(self, post) -> Optional[List[FileInfo]]: def _parse_post(self, post) -> Optional[List[FileInfo]]:
if not 'files' in post: return None if not 'files' in post: return None
files = [] files = []
for f in post['files']: for f in post['files']:
if not 'sticker' in f: if not 'sticker' in f:
if match(r"^image\.\w+$", f['fullname']) is None: if match(r"^image\.\w+$", f['fullname']) is None:
@ -47,10 +54,10 @@ class DvachParser(Parser):
# completely fine to hardcode `hash_algo`. # completely fine to hardcode `hash_algo`.
if 'md5' in f: if 'md5' in f:
files.append(FileInfo(fullname, f['size'], files.append(FileInfo(fullname, f['size'],
f"{self.__url_file_link}{f['path']}", f"{self.file_base_url}{f['path']}",
f['md5'], 'md5')) f['md5'], 'md5'))
else: else:
files.append(FileInfo(fullname, f['size'], files.append(FileInfo(fullname, f['size'],
f"{self.__url_file_link}{f['path']}", f"{self.file_base_url}{f['path']}",
None, None)) None, None))
return files return files

View File

@ -1,63 +1,25 @@
from re import match from typing import Optional
from typing import List, Optional
from scrapthechan.fileinfo import FileInfo from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser
from scrapthechan.parser import Parser
__all__ = ["EightKunParser"] __all__ = ["EightKunParser"]
class EightKunParser(Parser): class EightKunParser(TinyboardLikeParser):
"""JSON parser for 8kun.top image board.""" """JSON parser for 8kun.top image board."""
__url_thread_json = "https://8kun.top/{board}/res/{thread}.json"
__url_file_link = "https://media.8kun.top/file_store/{filename}"
def __init__(self, board: str, thread: str, def __init__(self, board: str, thread: str,
skip_posts: Optional[int] = None) -> None: skip_posts: Optional[int] = None) -> None:
posts = self._get_json(self.__url_thread_json.format(board=board, \ super().__init__(board, thread, skip_posts)
thread=thread))['posts']
super(EightKunParser, self).__init__(board, thread, posts, skip_posts)
@property @property
def imageboard(self) -> str: def imageboard(self) -> str:
return "8kun.top" return "8kun.top"
@property @property
def op(self) -> Optional[str]: def json_thread_url(self) -> str:
op = "" return "https://8kun.top/{board}/res/{thread}.json"
if 'sub' in self._op_post:
op = f"{self._op_post['sub']}\n"
if 'com' in self._op_post:
op += self._op_post['com']
return op if not op == "" else None
def _parse_post(self, post: dict) -> List[FileInfo]: @property
if not 'tim' in post: return None def file_base_url(self) -> str:
return "https://media.8kun.top/file_dl/{filename}"
dlfname = f"{post['tim']}{post['ext']}"
if "filename" in post:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
files = []
files.append(FileInfo(filename, post['fsize'],
self.__url_file_link.format(board=self.board, filename=dlfname),
post['md5'], 'md5'))
if "extra_files" in post:
for f in post["extra_files"]:
dlfname = f"{f['tim']}{f['ext']}"
if "filename" in post:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
dlurl = self.__url_file_link.format(board=self.board, \
filename=dlfname)
files.append(FileInfo(filename, f['fsize'], \
dlurl, f['md5'], 'md5'))
return files

View File

@ -1,51 +1,25 @@
from re import match from typing import Optional
from typing import List, Optional
from scrapthechan.fileinfo import FileInfo from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser
from scrapthechan.parser import Parser
__all__ = ["FourChanParser"] __all__ = ["FourChanParser"]
class FourChanParser(Parser): class FourChanParser(TinyboardLikeParser):
"""JSON parser for 4chan.org image board.""" """JSON parser for 4chan.org image board."""
__url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json"
__url_file_link = "https://i.4cdn.org/{board}/{filename}"
def __init__(self, board: str, thread: str, def __init__(self, board: str, thread: str,
skip_posts: Optional[int] = None) -> None: skip_posts: Optional[int] = None) -> None:
posts = self._get_json(self.__url_thread_json.format(board=board, \ super().__init__(board, thread, skip_posts)
thread=thread))['posts']
super(FourChanParser, self).__init__(board, thread, posts, skip_posts)
@property @property
def imageboard(self) -> str: def imageboard(self) -> str:
return "4chan.org" return "4chan.org"
@property @property
def op(self) -> Optional[str]: def json_thread_url(self) -> str:
op = "" return "https://a.4cdn.org/{board}/thread/{thread}.json"
if 'sub' in self._op_post:
op = f"{self._op_post['sub']}\n"
if 'com' in self._op_post:
op += self._op_post['com']
return op if not op == "" else None
def _parse_post(self, post: dict) -> List[FileInfo]: @property
if not 'tim' in post: return None def file_base_url(self) -> str:
return "https://i.4cdn.org/{board}/{filename}"
dlfname = f"{post['tim']}{post['ext']}"
if "filename" in post:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
# Hash algorithm is hardcoded since it is highly unlikely that it will
# be changed in foreseeable future. And if it'll change then this line
# will be necessarily updated anyway.
return [FileInfo(filename, post['fsize'],
self.__url_file_link.format(board=self.board, filename=dlfname),
post['md5'], 'md5')]

View File

@ -1,66 +1,25 @@
from re import match from typing import Optional
from typing import List, Optional
from scrapthechan.parser import Parser from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser
from scrapthechan.fileinfo import FileInfo
__all__ = ["LainchanParser"] __all__ = ["LainchanParser"]
class LainchanParser(Parser): class LainchanParser(TinyboardLikeParser):
"""JSON parser for lainchan.org image board. """JSON parser for lainchan.org image board."""
JSON structure is identical to 4chan.org's, so this parser is just inherited
from 4chan.org's parser and only needed things are redefined.
"""
__url_thread_json = "https://lainchan.org/{board}/res/{thread}.json"
__url_file_link = "https://lainchan.org/{board}/src/{filename}"
def __init__(self, board: str, thread: str, def __init__(self, board: str, thread: str,
skip_posts: Optional[int] = None) -> None: skip_posts: Optional[int] = None) -> None:
posts = self._get_json(self.__url_thread_json.format(board=board, \ super().__init__(board, thread, skip_posts)
thread=thread))['posts']
super(LainchanParser, self).__init__(board, thread, posts, skip_posts)
@property @property
def imageboard(self) -> str: def imageboard(self) -> str:
return "lainchan.org" return "lainchan.org"
@property @property
def op(self) -> Optional[str]: def json_thread_url(self) -> str:
op = "" return "https://lainchan.org/{board}/res/{thread}.json"
if 'sub' in self._op_post:
op = f"{self._op_post['sub']}\n"
if 'com' in self._op_post:
op += self._op_post['com']
return op if not op == "" else None
def _parse_post(self, post) -> List[FileInfo]: @property
if not 'tim' in post: return None def file_base_url(self) -> str:
return "https://lainchan.org/{board}/src/{filename}"
dlfname = f"{post['tim']}{post['ext']}"
if "filename" in post:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
files = []
files.append(FileInfo(filename, post['fsize'],
self.__url_file_link.format(board=self.board, filename=dlfname),
post['md5'], 'md5'))
if "extra_files" in post:
for f in post["extra_files"]:
dlfname = f"{f['tim']}{f['ext']}"
if "filename" in post:
if match(r"^image\.\w+$", post['filename']) is None:
filename = dlfname
else:
filename = f"{post['filename']}{post['ext']}"
dlurl = self.__url_file_link.format(board=self.board, \
filename=dlfname)
files.append(FileInfo(filename, f['fsize'], \
dlurl, f['md5'], 'md5'))
return files