1
0
Fork 0

Compare commits

...

4 Commits

8 changed files with 70 additions and 12 deletions

View File

@ -1,5 +1,13 @@
# Changelog
## 0.2.0 - 2020-07-18
### Added
- Threaded version of the scraper, so now it is fast as heck!
### Fixed
- Handled the situation where the OP's post has no comment and/or subject.
## 0.1.0 - 2020-07-08
### Added
- JSON parsers for 4chan.org, lainchan.org and 2ch.hk.

View File

@ -1,7 +1,7 @@
build: scrapthechan README.md setup.cfg
python setup.py sdist bdist_wheel
install:
python -m pip install --upgrade dist/scrapthechan-0.1.0-py3-none-any.whl --user
python -m pip install --upgrade dist/scrapthechan-0.2.0-py3-none-any.whl --user
uninstall:
# We change directory so pip uninstall will run, it'll fail otherwise.
@cd ~/

View File

@ -1,5 +1,5 @@
__date__ = "8 Jule 2020"
__version__ = "0.1.0"
# Package metadata. __copyright__ is derived from __author__ and __email__,
# so those two must be assigned before it.
__date__ = "18 July 2020"
__version__ = "0.2.0"
__author__ = "Alexander \"Arav\" Andreev"
__email__ = "me@arav.top"
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"

View File

@ -9,7 +9,8 @@ from scrapthechan import VERSION
from scrapthechan.parser import Parser, ParserThreadNotFoundError
from scrapthechan.parsers import get_parser_by_url, get_parser_by_site, \
SUPPORTED_IMAGEBOARDS
from scrapthechan.scrapers.basicscraper import BasicScraper
#from scrapthechan.scrapers.basicscraper import BasicScraper
from scrapthechan.scrapers.threadedscraper import ThreadedScraper
__all__ = ["main"]
@ -105,7 +106,9 @@ def main() -> None:
if not args["no-op"]:
print("Writing OP... ", end='')
if not exists(join(save_dir, "!op.txt")):
if parser.op is None:
print("No text's there.")
elif not exists(join(save_dir, "!op.txt")):
with open(join(save_dir, "!op.txt"), 'w') as opf:
opf.write(f"{parser.op}\n")
print("Done.")
@ -113,7 +116,7 @@ def main() -> None:
print("Exists.")
scraper = BasicScraper(save_dir, parser.files, \
scraper = ThreadedScraper(save_dir, parser.files, \
lambda i: print(f"{i}/{flen}", end="\r"))
scraper.run()

View File

@ -24,8 +24,13 @@ class DvachParser(Parser):
return "2ch.hk"
@property
def op(self) -> str:
return f"{self._op_post['subject']}\n{self._op_post['comment']}"
def op(self) -> Optional[str]:
    """Return the text of the opening post, or None when it has none.

    The result is the subject followed by a newline (when a subject is
    present) and then the comment (when a comment is present). If the
    post carries neither field, None is returned instead of an empty
    string.
    """
    post = self._op_post
    op = ""
    # The guards must test the same keys that are read below. Testing
    # 'sub'/'com' while reading 'subject'/'comment' never matched, so the
    # method always returned None.
    if 'subject' in post:
        op = f"{post['subject']}\n"
    if 'comment' in post:
        op += post['comment']
    return op if op != "" else None
def _parse_post(self, post) -> Optional[List[FileInfo]]:
if not 'files' in post: return None

View File

@ -24,11 +24,13 @@ class FourChanParser(Parser):
return "4chan.org"
@property
def op(self) -> str:
def op(self) -> Optional[str]:
op = ""
if 'sub' in self._op_post:
return f"{self._op_post['sub']}\n{self._op_post['com']}"
else:
return self._op_post['com']
op = f"{self._op_post['sub']}\n"
if 'com' in self._op_post:
op += self._op_post['com']
return op if not op == "" else None
def _parse_post(self, post: dict) -> List[FileInfo]:
if not 'tim' in post: return None

View File

@ -25,6 +25,15 @@ class LainchanParser(Parser):
@property
def imageboard(self) -> str:
return "lainchan.org"
@property
def op(self) -> Optional[str]:
    """Text of the opening post, or None if it has neither field.

    Built from the 'sub' field followed by a newline (when present) and
    the 'com' field (when present).
    """
    post = self._op_post
    subject_line = f"{post['sub']}\n" if 'sub' in post else ""
    if 'com' in post:
        text = subject_line + post['com']
    else:
        text = subject_line
    # An empty string means neither field was there.
    return text or None
def _parse_post(self, post) -> List[FileInfo]:
if not 'tim' in post: return None

View File

@ -0,0 +1,31 @@
"""Implementation of a threaded version of a scraper."""
from typing import List, Callable
from multiprocessing import cpu_count, Lock
from multiprocessing.pool import ThreadPool
from scrapthechan.scraper import Scraper
from scrapthechan.fileinfo import FileInfo
__all__ = ["ThreadedScraper"]
class ThreadedScraper(Scraper):
    """Scraper that downloads files concurrently using a thread pool.

    Relies on ``self._files``, ``self._progress_callback`` and
    ``self._download_file`` which are not defined in this class;
    presumably they are provided by the ``Scraper`` base class — verify
    against scrapthechan.scraper.
    """

    def __init__(self, save_directory: str, files: List[FileInfo],
                 download_progress_callback: Callable[[int], None] = None) -> None:
        """Initialise the base scraper and the shared progress counter.

        All three arguments are forwarded unchanged to ``Scraper.__init__``.
        ``download_progress_callback`` may be None; when given it is
        called with the running count of finished files.
        """
        super().__init__(save_directory, files, download_progress_callback)
        # Number of files finished so far; shared between worker threads,
        # so every access goes through the mutex below.
        self._files_downloaded = 0
        self._files_downloaded_mutex = Lock()

    def run(self):
        """Download every file in ``self._files`` and block until done.

        Uses ``cpu_count() * 2`` worker threads. Any exception raised by
        a worker is re-raised here by ``pool.map``.
        """
        pool = ThreadPool(cpu_count() * 2)
        try:
            pool.map(self._thread_run, self._files)
        finally:
            # Release the worker threads even when a download raised.
            pool.close()
            pool.join()

    def _thread_run(self, f: FileInfo):
        """Worker body: download one file, then report progress."""
        # Download first so the counter reflects files that are actually
        # finished rather than files that have merely been started.
        self._download_file(f)
        with self._files_downloaded_mutex:
            self._files_downloaded += 1
            # The callback runs under the lock so that it is called once
            # at a time and sees strictly increasing counts.
            if self._progress_callback is not None:
                self._progress_callback(self._files_downloaded)