1
0
Fork 0

Added support for lolifox.cc. Fixed User-Agent usage, so it is now applied correctly everywhere.

This commit is contained in:
Alexander Andreev 2020-09-09 04:34:41 +04:00
parent 7825b53121
commit a106d5b739
10 changed files with 93 additions and 22 deletions

View File

@ -1,5 +1,16 @@
# Changelog
## 0.3 - 2020-09-09
### Added
- Parser for lolifox.cc.
### Removed
- BasicScraper. Not needed anymore, there is a faster threaded version.
### Fixed
- Now User-Agent is correctly applied everywhere.
## 0.2.2 - 2020-07-20
### Added
- Parser for 8kun.top.
@ -14,11 +25,13 @@
- Consider that issue with size on 2ch.hk. Usually it really tells the size in
kB. The problem is that sometimes it is just wrong.
## 0.2.1 - 2020-07-18
### Changed
- Now the program tells you which thread doesn't exist or is about to be
scraped. That is useful in batch processing with scripts.
## 0.2.0 - 2020-07-18
### Added
- Threaded version of the scraper, so now it is fast as heck!

View File

@ -1,7 +1,7 @@
build: scrapthechan README.md setup.cfg
python setup.py sdist bdist_wheel
install:
python -m pip install --upgrade dist/scrapthechan-0.2.2-py3-none-any.whl --user
python -m pip install --upgrade dist/scrapthechan-0.3-py3-none-any.whl --user
uninstall:
# We change directory so pip uninstall will run, it'll fail otherwise.
@cd ~/

View File

@ -36,4 +36,5 @@ help for a program.
- [4chan.org](https://4chan.org) since 0.1.0
- [lainchan.org](https://lainchan.org) since 0.1.0
- [2ch.hk](https://2ch.hk) since 0.1.0
- [8kun.top](https://8kun.top) since 0.2.2
- [8kun.top](https://8kun.top) since 0.2.2
- [lolifox.cc](https://lolifox.cc) since 0.3

View File

@ -1,5 +1,5 @@
__date__ = "20 July 2020"
__version__ = "0.2.2"
__date__ = "9 September 2020"
__version__ = "0.3"
__author__ = "Alexander \"Arav\" Andreev"
__email__ = "me@arav.top"
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"

View File

@ -4,8 +4,9 @@ from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, urlretrieve
from urllib.request import urlopen, Request
from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo
@ -71,7 +72,8 @@ class Parser:
def _get_json(self, thread_url: str) -> dict:
    """Gets JSON version of a thread and converts it in a dictionary.

    The request is sent with the package's USER_AGENT header.

    Arguments:
        thread_url -- URL of the thread's JSON document.

    Raises:
        ThreadNotFoundError -- if the request cannot be built or sent, or the
        response cannot be decoded as UTF-8 JSON. The underlying error is
        kept as the exception's cause.
    """
    try:
        # Request() stays inside the try so a malformed URL is still
        # reported as ThreadNotFoundError, exactly as before.
        req = Request(thread_url, headers={'User-Agent': USER_AGENT})
        with urlopen(req) as url:
            return loads(url.read().decode('utf-8'))
    except Exception as err:
        # Exception rather than a bare except: Ctrl+C (KeyboardInterrupt)
        # and SystemExit must not be disguised as a missing thread.
        raise ThreadNotFoundError from err

View File

@ -9,7 +9,7 @@ __all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"]
SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk", \
"8kun.top"]
"8kun.top", "lolifox.cc"]
def get_parser_by_url(url: str) -> Parser:
@ -33,5 +33,8 @@ def get_parser_by_site(site: str, board: str, thread: str) -> Parser:
elif '8kun' in site:
from .eightkun import EightKunParser
return EightKunParser(board, thread)
elif 'lolifox' in site:
from .lolifox import LolifoxParser
return LolifoxParser(board, thread)
else:
raise NotImplementedError(f"Parser for {site} is not implemented")

View File

@ -0,0 +1,65 @@
from re import match
from typing import List, Optional
from scrapthechan.parser import Parser
from scrapthechan.fileinfo import FileInfo
__all__ = ["LolifoxParser"]
class LolifoxParser(Parser):
    """JSON parser for lolifox.cc image board.

    JSON structure is identical to lainchan.org.
    """

    __url_thread_json = "https://lolifox.cc/{board}/res/{thread}.json"
    __url_file_link = "https://lolifox.cc/{board}/src/{filename}"

    # Pattern for uploader-supplied names of the form "image.<ext>".
    # The original code used \w{1,4} for the main file and \w+ for extra
    # files; a single pattern keeps both paths consistent.
    __generic_name_re = r"^image\.\w+$"

    def __init__(self, board: str, thread: str,
                 skip_posts: Optional[int] = None) -> None:
        """Download the thread's JSON and pass its posts to the base parser.

        Arguments:
            board -- board name, substituted into the thread URL.
            thread -- thread number, substituted into the thread URL.
            skip_posts -- forwarded unchanged to Parser.__init__.
        """
        thread_url = self.__url_thread_json.format(board=board, thread=thread)
        posts = self._get_json(thread_url)['posts']
        super(LolifoxParser, self).__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        """Name of the image board this parser handles."""
        return "lolifox.cc"

    @property
    def op(self) -> Optional[str]:
        """Subject and comment of the opening post.

        The subject ('sub'), when present, is followed by a newline and then
        the comment ('com'), when present. Returns None if neither exists.
        """
        op = ""
        if 'sub' in self._op_post:
            op = f"{self._op_post['sub']}\n"
        if 'com' in self._op_post:
            op += self._op_post['com']
        return op if not op == "" else None

    @staticmethod
    def __pick_filename(entry: dict, dlfname: str) -> str:
        """Choose the local file name for one file entry.

        Falls back to the server-side name (dlfname) when the entry has no
        'filename' field, so the result is always defined.
        """
        original = entry.get('filename')
        if original is None:
            return dlfname
        # re.match takes (pattern, string). The original call had them
        # swapped, which compiled the uploader's file name as a regex and
        # could raise re.error on names containing regex metacharacters.
        # NOTE(review): the branch polarity is kept exactly as written
        # (no match -> server name). It looks like it may be inverted
        # relative to the intent - confirm against the lainchan parser.
        if match(LolifoxParser.__generic_name_re, original) is None:
            return dlfname
        return f"{original}{entry['ext']}"

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Collect FileInfo objects for every file attached to a post.

        Arguments:
            post -- dictionary describing one post of the thread JSON.

        Returns None when the post has no 'tim' key (no attachment);
        otherwise a list with the main file followed by each entry of
        'extra_files', if that key is present.
        """
        if 'tim' not in post:
            return None

        # The main attachment lives in the post itself; additional ones are
        # dictionaries with the same keys under "extra_files". Each entry is
        # described by its own fields (the original loop mistakenly reused
        # the post's 'filename' and 'ext' for every extra file).
        entries = [post]
        entries.extend(post.get("extra_files") or [])

        files = []
        for entry in entries:
            dlfname = f"{entry['tim']}{entry['ext']}"
            dlurl = self.__url_file_link.format(board=self.board,
                                                filename=dlfname)
            files.append(FileInfo(self.__pick_filename(entry, dlfname),
                                  entry['fsize'], dlurl, entry['md5'], 'md5'))
        return files

View File

@ -29,6 +29,7 @@ class Scraper:
self._save_directory = save_directory
self._files = files
self._url_opener = URLopener()
self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
self._url_opener.version = USER_AGENT
self._progress_callback = download_progress_callback

View File

@ -1,15 +0,0 @@
"""Implementation of basic sequential one-threaded scraper that downloads
files one by one."""
from scrapthechan.scraper import Scraper
__all__ = ["BasicScraper"]
class BasicScraper(Scraper):
    """Scraper that fetches files sequentially in the calling thread."""

    def run(self):
        """Download files one by one.

        Before each download the progress callback, if one is set, is called
        with the 1-based position of the file about to be fetched.
        """
        position = 0
        for file_info in self._files:
            position += 1
            if self._progress_callback is not None:
                self._progress_callback(position)
            self._download_file(file_info)

View File

@ -14,6 +14,7 @@ keywords =
2ch.hk
lainchan.org
8kun.top
lolifox.cc
license = MIT
license_file = COPYING
classifiers =