From a106d5b7393b77f434613c93019a0b02737cec47 Mon Sep 17 00:00:00 2001 From: "Alexander \"Arav\" Andreev" Date: Wed, 9 Sep 2020 04:34:41 +0400 Subject: [PATCH] Added support for lolifox.cc. Fixed User-Agent usage, so it applied correctly everywhere now. --- CHANGELOG.md | 13 ++++++ Makefile | 2 +- README.md | 3 +- scrapthechan/__init__.py | 4 +- scrapthechan/parser.py | 6 ++- scrapthechan/parsers/__init__.py | 5 ++- scrapthechan/parsers/lolifox.py | 65 +++++++++++++++++++++++++++ scrapthechan/scraper.py | 1 + scrapthechan/scrapers/basicscraper.py | 15 ------- setup.cfg | 1 + 10 files changed, 93 insertions(+), 22 deletions(-) create mode 100644 scrapthechan/parsers/lolifox.py delete mode 100644 scrapthechan/scrapers/basicscraper.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 90333a9..bc8fb24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## 0.3 - 2020-09-09 +### Added +- Parser for lolifox.cc. + +### Removed +- BasicScraper. Not needed anymore, there is a faster threaded version. + +### Fixed +- Now User-Agent is correctly applied everywhere. + + ## 0.2.2 - 2020-07-20 ### Added - Parser for 8kun.top. @@ -14,11 +25,13 @@ - Consider that issue with size on 2ch.hk. Usually it really tells the size in kB. The problem is that sometimes it just wrong. + ## 0.2.1 - 2020-07-18 ### Changed - Now program tells you what thread doesn't exist or about to be scraped. That is useful in batch processing with scripts. + ## 0.2.0 - 2020-07-18 ### Added - Threaded version of the scraper, so now it is fast as heck! diff --git a/Makefile b/Makefile index a8e2af3..40e23c3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ build: scrapthechan README.md setup.cfg python setup.py sdist bdist_wheel install: - python -m pip install --upgrade dist/scrapthechan-0.2.2-py3-none-any.whl --user + python -m pip install --upgrade dist/scrapthechan-0.3-py3-none-any.whl --user uninstall: # We change directory so pip uninstall will run, it'll fail otherwise. 
@cd ~/ diff --git a/README.md b/README.md index cdec9c8..6bf362e 100644 --- a/README.md +++ b/README.md @@ -36,4 +36,5 @@ help for a program. - [4chan.org](https://4chan.org) since 0.1.0 - [lainchan.org](https://lainchan.org) since 0.1.0 - [2ch.hk](https://2ch.hk) since 0.1.0 -- [8kun.top](https://8kun.top) since 0.2.2 \ No newline at end of file +- [8kun.top](https://8kun.top) since 0.2.2 +- [lolifox.cc](https://lolifox.cc) since 0.3 \ No newline at end of file diff --git a/scrapthechan/__init__.py b/scrapthechan/__init__.py index be09bf8..9451756 100644 --- a/scrapthechan/__init__.py +++ b/scrapthechan/__init__.py @@ -1,5 +1,5 @@ -__date__ = "20 July 2020" -__version__ = "0.2.2" +__date__ = "9 September 2020" +__version__ = "0.3" __author__ = "Alexander \"Arav\" Andreev" __email__ = "me@arav.top" __copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>" diff --git a/scrapthechan/parser.py b/scrapthechan/parser.py index 7a23bd9..e83f3a4 100644 --- a/scrapthechan/parser.py +++ b/scrapthechan/parser.py @@ -4,8 +4,9 @@ from itertools import chain from json import loads from re import findall, match from typing import List, Optional -from urllib.request import urlopen, urlretrieve +from urllib.request import urlopen, Request +from scrapthechan import USER_AGENT from scrapthechan.fileinfo import FileInfo @@ -71,7 +72,8 @@ class Parser: def _get_json(self, thread_url: str) -> dict: """Gets JSON version of a thread and converts it in a dictionary.""" try: - with urlopen(thread_url) as url: + req = Request(thread_url, headers={'User-Agent': USER_AGENT}) + with urlopen(req) as url: return loads(url.read().decode('utf-8')) except: raise ThreadNotFoundError diff --git a/scrapthechan/parsers/__init__.py b/scrapthechan/parsers/__init__.py index aaaa774..0fc99b4 100644 --- a/scrapthechan/parsers/__init__.py +++ b/scrapthechan/parsers/__init__.py @@ -9,7 +9,7 @@ __all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"] SUPPORTED_IMAGEBOARDS: List[str] 
= ["4chan.org", "lainchan.org", "2ch.hk", \ - "8kun.top"] + "8kun.top", "lolifox.cc"] def get_parser_by_url(url: str) -> Parser: @@ -33,5 +33,8 @@ def get_parser_by_site(site: str, board: str, thread: str) -> Parser: elif '8kun' in site: from .eightkun import EightKunParser return EightKunParser(board, thread) + elif 'lolifox' in site: + from .lolifox import LolifoxParser + return LolifoxParser(board, thread) else: raise NotImplementedError(f"Parser for {site} is not implemented") diff --git a/scrapthechan/parsers/lolifox.py b/scrapthechan/parsers/lolifox.py new file mode 100644 index 0000000..b0d6f24 --- /dev/null +++ b/scrapthechan/parsers/lolifox.py @@ -0,0 +1,65 @@ +from re import match +from typing import List, Optional + +from scrapthechan.parser import Parser +from scrapthechan.fileinfo import FileInfo + +__all__ = ["LolifoxParser"] + + +class LolifoxParser(Parser): + """JSON parser for lolifox.cc image board. + JSON structure is identical to lainchan.org. + """ + + __url_thread_json = "https://lolifox.cc/{board}/res/{thread}.json" + __url_file_link = "https://lolifox.cc/{board}/src/{filename}" + + def __init__(self, board: str, thread: str, + skip_posts: Optional[int] = None) -> None: + posts = self._get_json(self.__url_thread_json.format(board=board, \ + thread=thread))['posts'] + super(LolifoxParser, self).__init__(board, thread, posts, skip_posts) + + @property + def imageboard(self) -> str: + return "lolifox.cc" + + @property + def op(self) -> Optional[str]: + op = "" + if 'sub' in self._op_post: + op = f"{self._op_post['sub']}\n" + if 'com' in self._op_post: + op += self._op_post['com'] + return op if not op == "" else None + + def _parse_post(self, post) -> List[FileInfo]: + if not 'tim' in post: return None + + dlfname = f"{post['tim']}{post['ext']}" + + if "filename" in post: + if match(r"^image\.\w{1,4}$", post['filename']) is None: + filename = f"{post['filename']}{post['ext']}" + else: + filename = dlfname + + files = [] + 
files.append(FileInfo(filename, post['fsize'], + self.__url_file_link.format(board=self.board, filename=dlfname), + post['md5'], 'md5')) + + if "extra_files" in post: + for f in post["extra_files"]: + dlfname = f"{f['tim']}{f['ext']}" + if "filename" in f: + if match(r"^image\.\w+$", f['filename']) is None: + filename = f"{f['filename']}{f['ext']}" + else: + filename = dlfname + dlurl = self.__url_file_link.format(board=self.board, \ + filename=dlfname) + files.append(FileInfo(filename, f['fsize'], \ + dlurl, f['md5'], 'md5')) + return files diff --git a/scrapthechan/scraper.py b/scrapthechan/scraper.py index bacc8ff..2b93377 100644 --- a/scrapthechan/scraper.py +++ b/scrapthechan/scraper.py @@ -29,6 +29,7 @@ class Scraper: self._save_directory = save_directory self._files = files self._url_opener = URLopener() + self._url_opener.addheaders = [('User-Agent', USER_AGENT)] self._url_opener.version = USER_AGENT self._progress_callback = download_progress_callback diff --git a/scrapthechan/scrapers/basicscraper.py b/scrapthechan/scrapers/basicscraper.py deleted file mode 100644 index 6c1b430..0000000 --- a/scrapthechan/scrapers/basicscraper.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Implementation of basic sequential one-threaded scraper that downloads -files one by one.""" - -from scrapthechan.scraper import Scraper - -__all__ = ["BasicScraper"] - - -class BasicScraper(Scraper): - def run(self): - """Download files one by one.""" - for i, f in enumerate(self._files, start=1): - if not self._progress_callback is None: - self._progress_callback(i) - self._download_file(f) diff --git a/setup.cfg b/setup.cfg index 66a501d..a52aa5d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,6 +14,7 @@ keywords = 2ch.hk lainchan.org 8kun.top + lolifox.cc license = MIT license_file = COPYING classifiers =