IB parsers rewritten according to the fixed Parser class.
This commit is contained in:
parent
f3ef07af68
commit
78d4a62c17
@ -10,31 +10,38 @@ __all__ = ["DvachParser"]
|
|||||||
class DvachParser(Parser):
|
class DvachParser(Parser):
|
||||||
"""JSON parser for 2ch.hk image board."""
|
"""JSON parser for 2ch.hk image board."""
|
||||||
|
|
||||||
__url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
|
|
||||||
__url_file_link = "https://2ch.hk"
|
|
||||||
|
|
||||||
def __init__(self, board: str, thread: str,
|
def __init__(self, board: str, thread: str,
|
||||||
skip_posts: Optional[int] = None) -> None:
|
skip_posts: Optional[int] = None) -> None:
|
||||||
posts = self._get_json(self.__url_thread_json.format(board=board, \
|
super().__init__(board, thread, skip_posts)
|
||||||
thread=thread))['threads'][0]['posts']
|
|
||||||
super(DvachParser, self).__init__(board, thread, posts, skip_posts)
|
@property
|
||||||
|
def json_thread_url(self) -> str:
|
||||||
|
return "https://2ch.hk/{board}/res/{thread}.json"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def file_base_url(self) -> str:
|
||||||
|
return "https://2ch.hk"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def subject_field(self) -> str:
|
||||||
|
return "subject"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def comment_field(self) -> str:
|
||||||
|
return "comment"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def imageboard(self) -> str:
|
def imageboard(self) -> str:
|
||||||
return "2ch.hk"
|
return "2ch.hk"
|
||||||
|
|
||||||
@property
|
def _extract_posts_list(self, lst: List) -> List[dict]:
|
||||||
def op(self) -> Optional[str]:
|
return lst['threads'][0]['posts']
|
||||||
op = ""
|
|
||||||
if 'subject' in self._op_post:
|
|
||||||
op = f"{self._op_post['subject']}\n"
|
|
||||||
if 'comment' in self._op_post:
|
|
||||||
op += self._op_post['comment']
|
|
||||||
return op if not op == "" else None
|
|
||||||
|
|
||||||
def _parse_post(self, post) -> Optional[List[FileInfo]]:
|
def _parse_post(self, post) -> Optional[List[FileInfo]]:
|
||||||
if not 'files' in post: return None
|
if not 'files' in post: return None
|
||||||
|
|
||||||
files = []
|
files = []
|
||||||
|
|
||||||
for f in post['files']:
|
for f in post['files']:
|
||||||
if not 'sticker' in f:
|
if not 'sticker' in f:
|
||||||
if match(r"^image\.\w+$", f['fullname']) is None:
|
if match(r"^image\.\w+$", f['fullname']) is None:
|
||||||
@ -47,10 +54,10 @@ class DvachParser(Parser):
|
|||||||
# completely fine to hardcode `hash_algo`.
|
# completely fine to hardcode `hash_algo`.
|
||||||
if 'md5' in f:
|
if 'md5' in f:
|
||||||
files.append(FileInfo(fullname, f['size'],
|
files.append(FileInfo(fullname, f['size'],
|
||||||
f"{self.__url_file_link}{f['path']}",
|
f"{self.file_base_url}{f['path']}",
|
||||||
f['md5'], 'md5'))
|
f['md5'], 'md5'))
|
||||||
else:
|
else:
|
||||||
files.append(FileInfo(fullname, f['size'],
|
files.append(FileInfo(fullname, f['size'],
|
||||||
f"{self.__url_file_link}{f['path']}",
|
f"{self.file_base_url}{f['path']}",
|
||||||
None, None))
|
None, None))
|
||||||
return files
|
return files
|
||||||
|
@ -1,63 +1,25 @@
|
|||||||
from re import match
|
from typing import Optional
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from scrapthechan.fileinfo import FileInfo
|
from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser
|
||||||
from scrapthechan.parser import Parser
|
|
||||||
|
|
||||||
__all__ = ["EightKunParser"]
|
__all__ = ["EightKunParser"]
|
||||||
|
|
||||||
|
|
||||||
class EightKunParser(Parser):
|
class EightKunParser(TinyboardLikeParser):
|
||||||
"""JSON parser for 8kun.top image board."""
|
"""JSON parser for 8kun.top image board."""
|
||||||
|
|
||||||
__url_thread_json = "https://8kun.top/{board}/res/{thread}.json"
|
|
||||||
__url_file_link = "https://media.8kun.top/file_store/{filename}"
|
|
||||||
|
|
||||||
def __init__(self, board: str, thread: str,
|
def __init__(self, board: str, thread: str,
|
||||||
skip_posts: Optional[int] = None) -> None:
|
skip_posts: Optional[int] = None) -> None:
|
||||||
posts = self._get_json(self.__url_thread_json.format(board=board, \
|
super().__init__(board, thread, skip_posts)
|
||||||
thread=thread))['posts']
|
|
||||||
super(EightKunParser, self).__init__(board, thread, posts, skip_posts)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def imageboard(self) -> str:
|
def imageboard(self) -> str:
|
||||||
return "8kun.top"
|
return "8kun.top"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def op(self) -> Optional[str]:
|
def json_thread_url(self) -> str:
|
||||||
op = ""
|
return "https://8kun.top/{board}/res/{thread}.json"
|
||||||
if 'sub' in self._op_post:
|
|
||||||
op = f"{self._op_post['sub']}\n"
|
|
||||||
if 'com' in self._op_post:
|
|
||||||
op += self._op_post['com']
|
|
||||||
return op if not op == "" else None
|
|
||||||
|
|
||||||
def _parse_post(self, post: dict) -> List[FileInfo]:
|
@property
|
||||||
if not 'tim' in post: return None
|
def file_base_url(self) -> str:
|
||||||
|
return "https://media.8kun.top/file_dl/{filename}"
|
||||||
dlfname = f"{post['tim']}{post['ext']}"
|
|
||||||
|
|
||||||
if "filename" in post:
|
|
||||||
if match(r"^image\.\w+$", post['filename']) is None:
|
|
||||||
filename = dlfname
|
|
||||||
else:
|
|
||||||
filename = f"{post['filename']}{post['ext']}"
|
|
||||||
|
|
||||||
files = []
|
|
||||||
files.append(FileInfo(filename, post['fsize'],
|
|
||||||
self.__url_file_link.format(board=self.board, filename=dlfname),
|
|
||||||
post['md5'], 'md5'))
|
|
||||||
|
|
||||||
if "extra_files" in post:
|
|
||||||
for f in post["extra_files"]:
|
|
||||||
dlfname = f"{f['tim']}{f['ext']}"
|
|
||||||
if "filename" in post:
|
|
||||||
if match(r"^image\.\w+$", post['filename']) is None:
|
|
||||||
filename = dlfname
|
|
||||||
else:
|
|
||||||
filename = f"{post['filename']}{post['ext']}"
|
|
||||||
dlurl = self.__url_file_link.format(board=self.board, \
|
|
||||||
filename=dlfname)
|
|
||||||
files.append(FileInfo(filename, f['fsize'], \
|
|
||||||
dlurl, f['md5'], 'md5'))
|
|
||||||
return files
|
|
||||||
|
@ -1,51 +1,25 @@
|
|||||||
from re import match
|
from typing import Optional
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from scrapthechan.fileinfo import FileInfo
|
from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser
|
||||||
from scrapthechan.parser import Parser
|
|
||||||
|
|
||||||
__all__ = ["FourChanParser"]
|
__all__ = ["FourChanParser"]
|
||||||
|
|
||||||
|
|
||||||
class FourChanParser(Parser):
|
class FourChanParser(TinyboardLikeParser):
|
||||||
"""JSON parser for 4chan.org image board."""
|
"""JSON parser for 4chan.org image board."""
|
||||||
|
|
||||||
__url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json"
|
|
||||||
__url_file_link = "https://i.4cdn.org/{board}/{filename}"
|
|
||||||
|
|
||||||
def __init__(self, board: str, thread: str,
|
def __init__(self, board: str, thread: str,
|
||||||
skip_posts: Optional[int] = None) -> None:
|
skip_posts: Optional[int] = None) -> None:
|
||||||
posts = self._get_json(self.__url_thread_json.format(board=board, \
|
super().__init__(board, thread, skip_posts)
|
||||||
thread=thread))['posts']
|
|
||||||
super(FourChanParser, self).__init__(board, thread, posts, skip_posts)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def imageboard(self) -> str:
|
def imageboard(self) -> str:
|
||||||
return "4chan.org"
|
return "4chan.org"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def op(self) -> Optional[str]:
|
def json_thread_url(self) -> str:
|
||||||
op = ""
|
return "https://a.4cdn.org/{board}/thread/{thread}.json"
|
||||||
if 'sub' in self._op_post:
|
|
||||||
op = f"{self._op_post['sub']}\n"
|
|
||||||
if 'com' in self._op_post:
|
|
||||||
op += self._op_post['com']
|
|
||||||
return op if not op == "" else None
|
|
||||||
|
|
||||||
def _parse_post(self, post: dict) -> List[FileInfo]:
|
@property
|
||||||
if not 'tim' in post: return None
|
def file_base_url(self) -> str:
|
||||||
|
return "https://i.4cdn.org/{board}/{filename}"
|
||||||
dlfname = f"{post['tim']}{post['ext']}"
|
|
||||||
|
|
||||||
if "filename" in post:
|
|
||||||
if match(r"^image\.\w+$", post['filename']) is None:
|
|
||||||
filename = dlfname
|
|
||||||
else:
|
|
||||||
filename = f"{post['filename']}{post['ext']}"
|
|
||||||
|
|
||||||
# Hash algorithm is hardcoded since it is highly unlikely that it will
|
|
||||||
# be changed in foreseeable future. And if it'll change then this line
|
|
||||||
# will be necessarily updated anyway.
|
|
||||||
return [FileInfo(filename, post['fsize'],
|
|
||||||
self.__url_file_link.format(board=self.board, filename=dlfname),
|
|
||||||
post['md5'], 'md5')]
|
|
||||||
|
@ -1,66 +1,25 @@
|
|||||||
from re import match
|
from typing import Optional
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from scrapthechan.parser import Parser
|
from scrapthechan.parsers.tinyboardlike import TinyboardLikeParser
|
||||||
from scrapthechan.fileinfo import FileInfo
|
|
||||||
|
|
||||||
__all__ = ["LainchanParser"]
|
__all__ = ["LainchanParser"]
|
||||||
|
|
||||||
|
|
||||||
class LainchanParser(Parser):
|
class LainchanParser(TinyboardLikeParser):
|
||||||
"""JSON parser for lainchan.org image board.
|
"""JSON parser for lainchan.org image board."""
|
||||||
JSON structure is identical to 4chan.org's, so this parser is just inherited
|
|
||||||
from 4chan.org's parser and only needed things are redefined.
|
|
||||||
"""
|
|
||||||
|
|
||||||
__url_thread_json = "https://lainchan.org/{board}/res/{thread}.json"
|
|
||||||
__url_file_link = "https://lainchan.org/{board}/src/{filename}"
|
|
||||||
|
|
||||||
def __init__(self, board: str, thread: str,
|
def __init__(self, board: str, thread: str,
|
||||||
skip_posts: Optional[int] = None) -> None:
|
skip_posts: Optional[int] = None) -> None:
|
||||||
posts = self._get_json(self.__url_thread_json.format(board=board, \
|
super().__init__(board, thread, skip_posts)
|
||||||
thread=thread))['posts']
|
|
||||||
super(LainchanParser, self).__init__(board, thread, posts, skip_posts)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def imageboard(self) -> str:
|
def imageboard(self) -> str:
|
||||||
return "lainchan.org"
|
return "lainchan.org"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def op(self) -> Optional[str]:
|
def json_thread_url(self) -> str:
|
||||||
op = ""
|
return "https://lainchan.org/{board}/res/{thread}.json"
|
||||||
if 'sub' in self._op_post:
|
|
||||||
op = f"{self._op_post['sub']}\n"
|
|
||||||
if 'com' in self._op_post:
|
|
||||||
op += self._op_post['com']
|
|
||||||
return op if not op == "" else None
|
|
||||||
|
|
||||||
def _parse_post(self, post) -> List[FileInfo]:
|
@property
|
||||||
if not 'tim' in post: return None
|
def file_base_url(self) -> str:
|
||||||
|
return "https://lainchan.org/{board}/src/{filename}"
|
||||||
dlfname = f"{post['tim']}{post['ext']}"
|
|
||||||
|
|
||||||
if "filename" in post:
|
|
||||||
if match(r"^image\.\w+$", post['filename']) is None:
|
|
||||||
filename = dlfname
|
|
||||||
else:
|
|
||||||
filename = f"{post['filename']}{post['ext']}"
|
|
||||||
|
|
||||||
files = []
|
|
||||||
files.append(FileInfo(filename, post['fsize'],
|
|
||||||
self.__url_file_link.format(board=self.board, filename=dlfname),
|
|
||||||
post['md5'], 'md5'))
|
|
||||||
|
|
||||||
if "extra_files" in post:
|
|
||||||
for f in post["extra_files"]:
|
|
||||||
dlfname = f"{f['tim']}{f['ext']}"
|
|
||||||
if "filename" in post:
|
|
||||||
if match(r"^image\.\w+$", post['filename']) is None:
|
|
||||||
filename = dlfname
|
|
||||||
else:
|
|
||||||
filename = f"{post['filename']}{post['ext']}"
|
|
||||||
dlurl = self.__url_file_link.format(board=self.board, \
|
|
||||||
filename=dlfname)
|
|
||||||
files.append(FileInfo(filename, f['fsize'], \
|
|
||||||
dlurl, f['md5'], 'md5'))
|
|
||||||
return files
|
|
||||||
|
Loading…
Reference in New Issue
Block a user