Added support for lolifox.cc. Fixed User-Agent usage so it is now applied correctly everywhere.
parent 7825b53121
commit a106d5b739

CHANGELOG.md (13 lines changed)

@@ -1,5 +1,16 @@
 # Changelog
 
+## 0.3 - 2020-09-09
+
+### Added
+- Parser for lolifox.cc.
+
+### Removed
+- BasicScraper. Not needed anymore; there is a faster threaded version.
+
+### Fixed
+- User-Agent is now correctly applied everywhere.
+
 ## 0.2.2 - 2020-07-20
 ### Added
 - Parser for 8kun.top.
@@ -14,11 +25,13 @@
 - Consider that issue with size on 2ch.hk. Usually it really tells the size in
   kB. The problem is that sometimes it is just wrong.
 
 ## 0.2.1 - 2020-07-18
 ### Changed
 - Now the program tells you which thread doesn't exist or is about to be
   scraped. That is useful in batch processing with scripts.
 
 ## 0.2.0 - 2020-07-18
 ### Added
 - Threaded version of the scraper, so now it is fast as heck!

Makefile (2 lines changed)

@@ -1,7 +1,7 @@
 build: scrapthechan README.md setup.cfg
 	python setup.py sdist bdist_wheel
 install:
-	python -m pip install --upgrade dist/scrapthechan-0.2.2-py3-none-any.whl --user
+	python -m pip install --upgrade dist/scrapthechan-0.3-py3-none-any.whl --user
 uninstall:
 	# We change directory so pip uninstall will run, it'll fail otherwise.
 	@cd ~/

@@ -36,4 +36,5 @@ help for a program.
 - [4chan.org](https://4chan.org) since 0.1.0
 - [lainchan.org](https://lainchan.org) since 0.1.0
 - [2ch.hk](https://2ch.hk) since 0.1.0
 - [8kun.top](https://8kun.top) since 0.2.2
+- [lolifox.cc](https://lolifox.cc) since 0.3

@@ -1,5 +1,5 @@
-__date__ = "20 July 2020"
-__version__ = "0.2.2"
+__date__ = "9 September 2020"
+__version__ = "0.3"
 __author__ = "Alexander \"Arav\" Andreev"
 __email__ = "me@arav.top"
 __copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"

@@ -4,8 +4,9 @@ from itertools import chain
 from json import loads
 from re import findall, match
 from typing import List, Optional
-from urllib.request import urlopen, urlretrieve
+from urllib.request import urlopen, Request
 
+from scrapthechan import USER_AGENT
 from scrapthechan.fileinfo import FileInfo

@@ -71,7 +72,8 @@ class Parser:
     def _get_json(self, thread_url: str) -> dict:
         """Gets the JSON version of a thread and converts it into a dictionary."""
         try:
-            with urlopen(thread_url) as url:
+            req = Request(thread_url, headers={'User-Agent': USER_AGENT})
+            with urlopen(req) as url:
                 return loads(url.read().decode('utf-8'))
         except:
             raise ThreadNotFoundError
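
Context for the change above: a bare URL passed to urlopen() goes out with urllib's default "Python-urllib/3.x" agent, while wrapping it in a Request lets the custom header ride along on every JSON fetch. A minimal self-contained sketch of the same pattern (the USER_AGENT value below is a placeholder, not the package's actual string):

```python
from json import loads
from urllib.request import Request, urlopen

USER_AGENT = "scrapthechan/0.3"  # placeholder; any descriptive UA string works

def get_json(url: str) -> dict:
    # Attach the header explicitly; some imageboards reject
    # urllib's default "Python-urllib/3.x" agent.
    req = Request(url, headers={'User-Agent': USER_AGENT})
    with urlopen(req) as resp:
        return loads(resp.read().decode('utf-8'))
```
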
@@ -9,7 +9,7 @@ __all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"]
 
 SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk", \
-    "8kun.top"]
+    "8kun.top", "lolifox.cc"]
 
 def get_parser_by_url(url: str) -> Parser:

@@ -33,5 +33,8 @@ def get_parser_by_site(site: str, board: str, thread: str) -> Parser:
     elif '8kun' in site:
         from .eightkun import EightKunParser
         return EightKunParser(board, thread)
+    elif 'lolifox' in site:
+        from .lolifox import LolifoxParser
+        return LolifoxParser(board, thread)
     else:
         raise NotImplementedError(f"Parser for {site} is not implemented")
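
With the new branch in place, parser selection remains a plain substring dispatch over the site name. Roughly how a caller exercises it (the board and thread values below are invented for illustration):

```python
from scrapthechan.parsers import get_parser_by_site

# Hypothetical board and thread identifiers.
parser = get_parser_by_site("lolifox.cc", "b", "12345")
print(parser.imageboard)  # "lolifox.cc"
```
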
scrapthechan/parsers/lolifox.py (new file, 65 lines)

@@ -0,0 +1,65 @@
+from re import match
+from typing import List, Optional
+
+from scrapthechan.parser import Parser
+from scrapthechan.fileinfo import FileInfo
+
+__all__ = ["LolifoxParser"]
+
+
+class LolifoxParser(Parser):
+    """JSON parser for the lolifox.cc imageboard.
+
+    The JSON structure is identical to lainchan.org.
+    """
+
+    __url_thread_json = "https://lolifox.cc/{board}/res/{thread}.json"
+    __url_file_link = "https://lolifox.cc/{board}/src/{filename}"
+
+    def __init__(self, board: str, thread: str,
+                 skip_posts: Optional[int] = None) -> None:
+        posts = self._get_json(self.__url_thread_json.format(
+            board=board, thread=thread))['posts']
+        super(LolifoxParser, self).__init__(board, thread, posts, skip_posts)
+
+    @property
+    def imageboard(self) -> str:
+        return "lolifox.cc"
+
+    @property
+    def op(self) -> Optional[str]:
+        op = ""
+        if 'sub' in self._op_post:
+            op = f"{self._op_post['sub']}\n"
+        if 'com' in self._op_post:
+            op += self._op_post['com']
+        return op if op != "" else None
+
+    def _parse_post(self, post) -> List[FileInfo]:
+        if 'tim' not in post: return None
+
+        dlfname = f"{post['tim']}{post['ext']}"
+
+        # Fall back to the server-assigned name if no original name is given.
+        filename = dlfname
+        if "filename" in post:
+            # re.match takes the pattern first, then the string.
+            if match(r"^image\.\w{1,4}$", post['filename']) is None:
+                filename = dlfname
+            else:
+                filename = f"{post['filename']}{post['ext']}"
+
+        files = []
+        files.append(FileInfo(filename, post['fsize'],
+            self.__url_file_link.format(board=self.board, filename=dlfname),
+            post['md5'], 'md5'))
+
+        if "extra_files" in post:
+            for f in post["extra_files"]:
+                dlfname = f"{f['tim']}{f['ext']}"
+                filename = dlfname
+                if "filename" in f:
+                    # Read the name from the extra file, not the parent post.
+                    if match(r"^image\.\w+$", f['filename']) is None:
+                        filename = dlfname
+                    else:
+                        filename = f"{f['filename']}{f['ext']}"
+                dlurl = self.__url_file_link.format(board=self.board,
+                    filename=dlfname)
+                files.append(FileInfo(filename, f['fsize'],
+                    dlurl, f['md5'], 'md5'))
+        return files
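
For reference, _parse_post() above consumes vichan/lainchan-style post objects. A sketch of the shape it expects, with all field values invented: tim is the server-assigned file name stem, filename is the uploader's original name without extension, and additional attachments arrive under extra_files:

```python
# Invented sample data, mirroring the fields _parse_post reads.
post = {
    "tim": "1599651200000",      # server-assigned name stem
    "ext": ".png",
    "filename": "diagram",       # original name, extension stripped
    "fsize": 20480,              # file size in bytes
    "md5": "aW52ZW50ZWRNZDVIYXNo",
    "extra_files": [
        {"tim": "1599651200001", "ext": ".jpg", "filename": "photo",
         "fsize": 10240, "md5": "QW5vdGhlckZha2VIYXNo"},
    ],
}
```
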
@@ -29,6 +29,7 @@ class Scraper:
         self._save_directory = save_directory
         self._files = files
         self._url_opener = URLopener()
+        self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
         self._url_opener.version = USER_AGENT
         self._progress_callback = download_progress_callback
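
The added line is the actual fix here: URLopener.__init__ builds addheaders from the class-level version default, so assigning version on the instance afterwards never reaches the headers that are actually sent. A minimal sketch of the idea (placeholder UA string; note URLopener is a legacy API, deprecated since Python 3.3):

```python
from urllib.request import URLopener

USER_AGENT = "scrapthechan/0.3"  # placeholder value

opener = URLopener()
# Replace the default headers outright so the custom agent is sent.
opener.addheaders = [('User-Agent', USER_AGENT)]
opener.version = USER_AGENT  # keep the attribute consistent as well
```
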
@@ -1,15 +0,0 @@
-"""Implementation of basic sequential one-threaded scraper that downloads
-files one by one."""
-
-from scrapthechan.scraper import Scraper
-
-__all__ = ["BasicScraper"]
-
-
-class BasicScraper(Scraper):
-    def run(self):
-        """Download files one by one."""
-        for i, f in enumerate(self._files, start=1):
-            if not self._progress_callback is None:
-                self._progress_callback(i)
-            self._download_file(f)
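
The removed BasicScraper is superseded by the threaded scraper the changelog mentions; that implementation is not part of this diff. A rough sketch of the general approach with concurrent.futures (the names and structure here are assumptions, not the project's actual code):

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, Optional

def run_threaded(files: Iterable, download_file: Callable,
                 progress_callback: Optional[Callable[[int], None]] = None,
                 workers: int = 8) -> None:
    """Download files concurrently instead of one by one."""
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map() yields results in submission order as downloads complete.
        for i, _ in enumerate(pool.map(download_file, files), start=1):
            if progress_callback is not None:
                progress_callback(i)
```
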