# Provenance (from the original patch header):
#   commit 87eecf0a09bd0e53617406a1cbb0c6dd99b99fc8
#   From: "Alexander \"Arav\" Andreev"
#   Date: Sat, 18 Jul 2020 04:43:00 +0400
#   Subject: [PATCH] New threaded scraper implemented.
#   File: scrapthechan/scrapers/threadedscraper.py (new file)
"""Implementation of a threaded version of a scraper."""

from typing import Callable, List, Optional
from multiprocessing import cpu_count, Lock
from multiprocessing.pool import ThreadPool

from scrapthechan.scraper import Scraper
from scrapthechan.fileinfo import FileInfo

__all__ = ["ThreadedScraper"]


class ThreadedScraper(Scraper):
    """Scraper that hands every file to a pool of worker threads.

    The pool is sized at twice the number of CPUs reported by
    ``cpu_count()``. A running counter, guarded by a lock, is passed to
    the progress callback each time a worker picks up a file.
    """

    def __init__(self, save_directory: str, files: List[FileInfo],
                 download_progress_callback: Optional[Callable[[int], None]] = None
                 ) -> None:
        """Initialise the base scraper and the shared progress counter.

        Args:
            save_directory: forwarded unchanged to ``Scraper.__init__``.
            files: forwarded unchanged to ``Scraper.__init__``.
            download_progress_callback: forwarded unchanged to
                ``Scraper.__init__``; may be ``None``.
        """
        super().__init__(save_directory, files, download_progress_callback)
        # Counter shared by all workers; only touched under the mutex below.
        self._files_downloaded = 0
        self._files_downloaded_mutex = Lock()

    def run(self):
        """Process every entry of ``self._files`` on a thread pool.

        Blocks until all workers have finished. An exception raised by a
        worker is re-raised here by ``pool.map``.
        """
        pool = ThreadPool(cpu_count() * 2)
        try:
            pool.map(self._thread_run, self._files)
        finally:
            # Always shut the pool down, even when map() re-raises a worker
            # exception; otherwise the worker threads are left running.
            pool.close()
            pool.join()

    def _thread_run(self, f: FileInfo):
        """Worker body: bump the counter, report progress, download ``f``.

        NOTE(review): the counter is incremented and reported *before* the
        download starts, so the value handed to the callback counts files
        picked up rather than files completed - confirm this is intended.
        """
        with self._files_downloaded_mutex:
            self._files_downloaded += 1
            # _progress_callback is not set in this class; presumably the
            # base Scraper stores the constructor argument there.
            if self._progress_callback is not None:
                self._progress_callback(self._files_downloaded)
        # Done outside the lock so that downloads can overlap across workers.
        self._download_file(f)