Compare commits
4 Commits
0ed1e896ca
...
22961611da
Author | SHA1 | Date
---|---|---
Alexander Andreev | 22961611da |
Alexander Andreev | 195d4d057a |
Alexander Andreev | 87eecf0a09 |
Alexander Andreev | 3223c0721a |
@ -1,5 +1,13 @@
|
|||
# Changelog
|
||||
|
||||
## 0.2.0 - 2020-07-18
|
||||
### Added
|
||||
- Threaded version of the scraper, so now it is fast as heck!
|
||||
|
||||
### Fixed
|
||||
- Handled situation when OP's post has no comment and/or subject.
|
||||
|
||||
|
||||
## 0.1.0 - 2020-07-08
|
||||
### Added
|
||||
- JSON parsers for 4chan.org, lainchan.org and 2ch.hk.
|
||||
|
|
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
build: scrapthechan README.md setup.cfg
|
||||
python setup.py sdist bdist_wheel
|
||||
install:
|
||||
python -m pip install --upgrade dist/scrapthechan-0.1.0-py3-none-any.whl --user
|
||||
python -m pip install --upgrade dist/scrapthechan-0.2.0-py3-none-any.whl --user
|
||||
uninstall:
|
||||
# We change directory so pip uninstall will run, it'll fail otherwise.
|
||||
@cd ~/
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
__date__ = "8 Jule 2020"
|
||||
__version__ = "0.1.0"
|
||||
__date__ = "18 Jule 2020"
|
||||
__version__ = "0.2.0"
|
||||
__author__ = "Alexander \"Arav\" Andreev"
|
||||
__email__ = "me@arav.top"
|
||||
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"
|
||||
|
|
|
@ -9,7 +9,8 @@ from scrapthechan import VERSION
|
|||
from scrapthechan.parser import Parser, ParserThreadNotFoundError
|
||||
from scrapthechan.parsers import get_parser_by_url, get_parser_by_site, \
|
||||
SUPPORTED_IMAGEBOARDS
|
||||
from scrapthechan.scrapers.basicscraper import BasicScraper
|
||||
#from scrapthechan.scrapers.basicscraper import BasicScraper
|
||||
from scrapthechan.scrapers.threadedscraper import ThreadedScraper
|
||||
|
||||
|
||||
__all__ = ["main"]
|
||||
|
@ -105,7 +106,9 @@ def main() -> None:
|
|||
|
||||
if not args["no-op"]:
|
||||
print("Writing OP... ", end='')
|
||||
if not exists(join(save_dir, "!op.txt")):
|
||||
if parser.op is None:
|
||||
print("No text's there.")
|
||||
elif not exists(join(save_dir, "!op.txt")):
|
||||
with open(join(save_dir, "!op.txt"), 'w') as opf:
|
||||
opf.write(f"{parser.op}\n")
|
||||
print("Done.")
|
||||
|
@ -113,7 +116,7 @@ def main() -> None:
|
|||
print("Exists.")
|
||||
|
||||
|
||||
scraper = BasicScraper(save_dir, parser.files, \
|
||||
scraper = ThreadedScraper(save_dir, parser.files, \
|
||||
lambda i: print(f"{i}/{flen}", end="\r"))
|
||||
scraper.run()
|
||||
|
||||
|
|
|
@ -24,8 +24,13 @@ class DvachParser(Parser):
|
|||
return "2ch.hk"
|
||||
|
||||
@property
|
||||
def op(self) -> str:
|
||||
return f"{self._op_post['subject']}\n{self._op_post['comment']}"
|
||||
def op(self) -> Optional[str]:
|
||||
op = ""
|
||||
if 'sub' in self._op_post:
|
||||
op = f"{self._op_post['subject']}\n"
|
||||
if 'com' in self._op_post:
|
||||
op += self._op_post['comment']
|
||||
return op if not op == "" else None
|
||||
|
||||
def _parse_post(self, post) -> Optional[List[FileInfo]]:
|
||||
if not 'files' in post: return None
|
||||
|
|
|
@ -24,11 +24,13 @@ class FourChanParser(Parser):
|
|||
return "4chan.org"
|
||||
|
||||
@property
|
||||
def op(self) -> str:
|
||||
def op(self) -> Optional[str]:
|
||||
op = ""
|
||||
if 'sub' in self._op_post:
|
||||
return f"{self._op_post['sub']}\n{self._op_post['com']}"
|
||||
else:
|
||||
return self._op_post['com']
|
||||
op = f"{self._op_post['sub']}\n"
|
||||
if 'com' in self._op_post:
|
||||
op += self._op_post['com']
|
||||
return op if not op == "" else None
|
||||
|
||||
def _parse_post(self, post: dict) -> List[FileInfo]:
|
||||
if not 'tim' in post: return None
|
||||
|
|
|
@ -25,6 +25,15 @@ class LainchanParser(Parser):
|
|||
@property
|
||||
def imageboard(self) -> str:
|
||||
return "lainchan.org"
|
||||
|
||||
@property
|
||||
def op(self) -> Optional[str]:
|
||||
op = ""
|
||||
if 'sub' in self._op_post:
|
||||
op = f"{self._op_post['sub']}\n"
|
||||
if 'com' in self._op_post:
|
||||
op += self._op_post['com']
|
||||
return op if not op == "" else None
|
||||
|
||||
def _parse_post(self, post) -> List[FileInfo]:
|
||||
if not 'tim' in post: return None
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
"""Implementation of a threaded version of a scraper."""
|
||||
|
||||
from typing import List, Callable
|
||||
from multiprocessing import cpu_count, Lock
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
from scrapthechan.scraper import Scraper
|
||||
from scrapthechan.fileinfo import FileInfo
|
||||
|
||||
__all__ = ["ThreadedScraper"]
|
||||
|
||||
class ThreadedScraper(Scraper):
    """Scraper that downloads files concurrently via a thread pool.

    Uses ``cpu_count() * 2`` worker threads — reasonable for an
    I/O-bound workload where threads mostly wait on the network.
    """

    def __init__(self, save_directory: str, files: List[FileInfo],
        download_progress_callback: Callable[[int], None] = None) -> None:
        """Initialise the scraper.

        :param save_directory: directory the files are written into.
        :param files: list of files to download.
        :param download_progress_callback: optional callable invoked with
            the running count of files handed to workers.
        """
        super().__init__(save_directory, files, download_progress_callback)
        # Shared counter of files dispatched so far; guarded by a mutex
        # because every worker thread updates it.
        self._files_downloaded = 0
        self._files_downloaded_mutex = Lock()

    def run(self) -> None:
        """Download all files, fanning the work out to worker threads."""
        # Context manager terminates the pool even if a worker raises;
        # the previous close()/join() pair leaked threads on error.
        # map() blocks until every file is processed.
        with ThreadPool(cpu_count() * 2) as pool:
            pool.map(self._thread_run, self._files)

    def _thread_run(self, f: FileInfo) -> None:
        """Worker body: report progress, then download one file.

        NOTE: the counter is incremented when a download *starts*, not
        when it finishes — preserved from the original behaviour, so the
        reported number can run ahead of completed downloads.
        """
        with self._files_downloaded_mutex:
            self._files_downloaded += 1
            if self._progress_callback is not None:
                self._progress_callback(self._files_downloaded)
        self._download_file(f)
|
Loading…
Reference in New Issue