ScrapTheChan/scrapthechan/parsers/fourchan.py

from re import match
from typing import List, Optional

from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser

# Names exported by `from ... import *`: only the parser class is public.
__all__ = ["FourChanParser"]


class FourChanParser(Parser):
	"""JSON parser for 4chan.org image board.

	On construction the thread's JSON is fetched and its ``posts`` list is
	handed over to the base ``Parser`` together with the board, thread and
	``skip_posts`` values.
	"""

	# URL templates: the thread's JSON document and a single attached file.
	__url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json"
	__url_file_link = "https://i.4cdn.org/{board}/{filename}"

	def __init__(self, board: str, thread: str,
				 skip_posts: Optional[int] = None) -> None:
		"""Fetch the thread's JSON and initialise the base parser.

		:param board: board name substituted into the thread URL.
		:param thread: thread identifier substituted into the thread URL.
		:param skip_posts: passed through unchanged to the base class.
		"""
		# `_get_json` is not defined in this class; presumably it is
		# provided by `Parser` -- confirm against the base class.
		url = self.__url_thread_json.format(board=board, thread=thread)
		posts = self._get_json(url)['posts']
		super().__init__(board, thread, posts, skip_posts)

	@property
	def imageboard(self) -> str:
		"""Name of the image board this parser handles."""
		return "4chan.org"

	@property
	def op(self) -> str:
		"""Text of the opening post.

		Returns the subject (``sub``) and the comment (``com``) joined by a
		newline. Either field may be missing from the post; a missing field
		is skipped instead of raising ``KeyError``, so an opening post with
		neither field yields an empty string.
		"""
		# `_op_post` is not assigned in this class; presumably it is set by
		# `Parser.__init__` -- confirm against the base class.
		parts = [str(self._op_post[key]) for key in ('sub', 'com')
			if key in self._op_post]
		return "\n".join(parts)

	def _parse_post(self, post: dict) -> Optional[List[FileInfo]]:
		"""Build the file description for a single post.

		:param post: one element of the thread's ``posts`` list.
		:return: ``None`` when the post has no ``tim`` key (treated as "no
			file attached"), otherwise a one-element list with a
			``FileInfo`` for the attached file.
		"""
		if 'tim' not in post:
			return None

		# Name under which the file is requested from the file server.
		dlfname = f"{post['tim']}{post['ext']}"

		# Start from the server-side name so `filename` is always bound,
		# even when the post carries no "filename" key at all.
		filename = dlfname
		if "filename" in post:
			# re.match takes the pattern first and the string second.
			# NOTE(review): the branch polarity is kept exactly as it was
			# originally written (only names matching the pattern keep the
			# uploaded name); confirm this is the intended selection.
			if match(r"^image\.\w+$", post['filename']) is not None:
				filename = f"{post['filename']}{post['ext']}"

		# Hash algorithm is hardcoded since it is highly unlikely that it will
		# be changed in foreseeable future. And if it'll change then this line
		# will be necessarily updated anyway.
		return [FileInfo(filename, post['fsize'],
			self.__url_file_link.format(board=self.board, filename=dlfname),
			post['md5'], 'md5')]
Initial commit with all the files. 2020-07-08 22:53:39 +04:00			`from re import match`
			`from typing import List, Optional`

			`from scrapthechan.fileinfo import FileInfo`
			`from scrapthechan.parser import Parser`

			`__all__ = ["FourChanParser"]`


			`class FourChanParser(Parser):`
			`"""JSON parser for 4chan.org image board."""`

			`__url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json"`
			`__url_file_link = "https://i.4cdn.org/{board}/{filename}"`

			`def __init__(self, board: str, thread: str,`
			`skip_posts: Optional[int] = None) -> None:`
			`posts = self._get_json(self.__url_thread_json.format(board=board, \`
			`thread=thread))['posts']`
			`super(FourChanParser, self).__init__(board, thread, posts, skip_posts)`

			`@property`
			`def imageboard(self) -> str:`
			`return "4chan.org"`

			`@property`
			`def op(self) -> str:`
			`if 'sub' in self._op_post:`
			`return f"{self._op_post['sub']}\n{self._op_post['com']}"`
			`else:`
			`return self._op_post['com']`

			`def _parse_post(self, post: dict) -> List[FileInfo]:`
			`if not 'tim' in post: return None`

			`dlfname = f"{post['tim']}{post['ext']}"`

			`if "filename" in post:`
			`if match(post['filename'], r"^image\.\w+$") is None:`
			`filename = dlfname`
			`else:`
			`filename = f"{post['filename']}{post['ext']}"`

			`# Hash algorithm is hardcoded since it is highly unlikely that it will`
			`# be changed in foreseeable future. And if it'll change then this line`
			`# will be necessarily updated anyway.`
			`return [FileInfo(filename, post['fsize'],`
			`self.__url_file_link.format(board=self.board, filename=dlfname),`
			`post['md5'], 'md5')]`