New threaded scraper implemented.
This commit is contained in:
parent
3223c0721a
commit
87eecf0a09
31
scrapthechan/scrapers/threadedscraper.py
Normal file
31
scrapthechan/scrapers/threadedscraper.py
Normal file
@ -0,0 +1,31 @@
|
||||
"""Implementation of a threaded version of a scraper."""
|
||||
|
||||
from typing import List, Callable
|
||||
from multiprocessing import cpu_count, Lock
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
from scrapthechan.scraper import Scraper
|
||||
from scrapthechan.fileinfo import FileInfo
|
||||
|
||||
__all__ = ["ThreadedScraper"]
|
||||
|
||||
class ThreadedScraper(Scraper):
|
||||
def __init__(self, save_directory: str, files: List[FileInfo],
|
||||
download_progress_callback: Callable[[int], None] = None) -> None:
|
||||
super(ThreadedScraper, self).__init__(save_directory, files,
|
||||
download_progress_callback)
|
||||
self._files_downloaded = 0
|
||||
self._files_downloaded_mutex = Lock()
|
||||
|
||||
def run(self):
|
||||
pool = ThreadPool(cpu_count() * 2)
|
||||
pool.map(self._thread_run, self._files)
|
||||
pool.close()
|
||||
pool.join()
|
||||
|
||||
def _thread_run(self, f: FileInfo):
|
||||
with self._files_downloaded_mutex:
|
||||
self._files_downloaded += 1
|
||||
if not self._progress_callback is None:
|
||||
self._progress_callback(self._files_downloaded)
|
||||
self._download_file(f)
|
Loading…
Reference in New Issue
Block a user