1
0
ScrapTheChan/scrapthechan/scrapers/threadedscraper.py

33 lines
955 B
Python
Raw Permalink Normal View History

2020-07-18 04:43:00 +04:00
"""Implementation of a threaded version of a scraper."""
from typing import List, Callable
from multiprocessing import cpu_count, Lock
from multiprocessing.pool import ThreadPool
from scrapthechan.scraper import Scraper
from scrapthechan.fileinfo import FileInfo
2020-07-18 04:43:00 +04:00
# Names exported by `from <this module> import *`.
__all__ = ["ThreadedScraper"]
2020-07-18 04:43:00 +04:00
class ThreadedScraper(Scraper):
    """Scraper that processes its files concurrently on a pool of threads.

    A lock-guarded counter is advanced once per file and passed to the
    progress callback, when one was supplied.
    """

    def __init__(self, save_directory: str, files: List[FileInfo],
                 download_progress_callback: Callable[[int], None] = None) -> None:
        """Set up the base scraper and the shared progress counter.

        All three arguments are forwarded unchanged to ``Scraper.__init__``.

        Args:
            save_directory: Directory argument handed to the base class.
            files: Files to process; presumably stored by the base class
                as ``self._files`` (verify against ``Scraper``).
            download_progress_callback: Optional callable receiving the
                running file count; presumably stored by the base class
                as ``self._progress_callback`` (verify against ``Scraper``).
        """
        super().__init__(save_directory, files, download_progress_callback)
        # Number of files handed to workers so far; shared between threads.
        self._files_downloaded = 0
        # Guards every update of the counter above.
        self._files_downloaded_mutex = Lock()

    def run(self):
        """Process every entry of ``self._files`` on a thread pool.

        The pool is sized at twice the CPU count. The call blocks until
        all files have been handled. Any exception raised by a worker is
        re-raised here by ``pool.map`` after the pool has been shut down.
        """
        pool = ThreadPool(cpu_count() * 2)
        try:
            pool.map(self._thread_run, self._files)
        finally:
            # Shut the pool down on the error path as well, so a failed
            # download does not leave worker threads behind.
            pool.close()
            pool.join()

    def _thread_run(self, f: FileInfo):
        """Report progress for one file (if requested), then download it.

        Args:
            f: The file to pass to ``self._download_file``.
        """
        if self._progress_callback is not None:
            # Increment and report under the lock so each callback
            # invocation observes a distinct, increasing count.
            with self._files_downloaded_mutex:
                self._files_downloaded += 1
                self._progress_callback(self._files_downloaded)
        # The download itself runs outside the lock so that workers
        # proceed in parallel.
        self._download_file(f)