BasicScraper is replaced with a threaded one. OP's post handling is also altered.

Version is incremented now and I wrote down the changes.
New threaded scraper implemented.
2020-07-18 04:44:37 +04:00 · 2020-07-18 04:43:45 +04:00 · 2020-07-18 04:43:00 +04:00 · 2020-07-18 04:42:19 +04:00
8 changed files with 70 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,13 @@
 # Changelog

+## 0.2.0 - 2020-07-18
+### Added
+- Threaded version of the scraper, so now it is fast as heck!
+
+### Fixed
+- Handled the situation where OP's post has no comment and/or subject.
+
+
 ## 0.1.0 - 2020-07-08
 ### Added
 - JSON parsers for 4chan.org, lainchan.org and 2ch.hk.
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 build: scrapthechan README.md setup.cfg
 	python setup.py sdist bdist_wheel
 install:
-	python -m pip install --upgrade dist/scrapthechan-0.1.0-py3-none-any.whl --user
+	python -m pip install --upgrade dist/scrapthechan-0.2.0-py3-none-any.whl --user
 uninstall:
 	# We change directory so pip uninstall will run, it'll fail otherwise.
 	@cd ~/
--- a/scrapthechan/init.py
+++ b/scrapthechan/init.py
@ -1,5 +1,5 @@
-__date__ = "8 Jule 2020"
-__version__ = "0.1.0"
+__date__ = "18 Jule 2020"
+__version__ = "0.2.0"
 __author__ = "Alexander \"Arav\" Andreev"
 __email__ = "me@arav.top"
 __copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"
--- a/scrapthechan/cli/scraper.py
+++ b/scrapthechan/cli/scraper.py
@ -9,7 +9,8 @@ from scrapthechan import VERSION
 from scrapthechan.parser import Parser, ParserThreadNotFoundError
 from scrapthechan.parsers import get_parser_by_url, get_parser_by_site, \
 								 SUPPORTED_IMAGEBOARDS
-from scrapthechan.scrapers.basicscraper import BasicScraper
+#from scrapthechan.scrapers.basicscraper import BasicScraper
+from scrapthechan.scrapers.threadedscraper import ThreadedScraper


 __all__ = ["main"]
@ -105,7 +106,9 @@ def main() -> None:

 	if not args["no-op"]:
 		print("Writing OP... ", end='')
-		if not exists(join(save_dir, "!op.txt")):
+		if parser.op is None:
+			print("No text's there.")
+		elif not exists(join(save_dir, "!op.txt")):
 			with open(join(save_dir, "!op.txt"), 'w') as opf:
 				opf.write(f"{parser.op}\n")
 			print("Done.")
@ -113,7 +116,7 @@ def main() -> None:
 			print("Exists.")


-	scraper = BasicScraper(save_dir, parser.files, \
+	scraper = ThreadedScraper(save_dir, parser.files, \
 		lambda i: print(f"{i}/{flen}", end="\r"))
 	scraper.run()

--- a/scrapthechan/parsers/dvach.py
+++ b/scrapthechan/parsers/dvach.py
@ -24,8 +24,13 @@ class DvachParser(Parser):
 		return "2ch.hk"

 	@property
-	def op(self) -> str:
-		return f"{self._op_post['subject']}\n{self._op_post['comment']}"
+	def op(self) -> Optional[str]:
+		op = ""
+		if 'sub' in self._op_post:
+			op = f"{self._op_post['subject']}\n"
+		if 'com' in self._op_post:
+			op += self._op_post['comment']
+		return op if not op == "" else None

 	def _parse_post(self, post) -> Optional[List[FileInfo]]:
 		if not 'files' in post: return None
--- a/scrapthechan/parsers/fourchan.py
+++ b/scrapthechan/parsers/fourchan.py
@ -24,11 +24,13 @@ class FourChanParser(Parser):
 		return "4chan.org"

 	@property
-	def op(self) -> str:
+	def op(self) -> Optional[str]:
+		op = ""
 		if 'sub' in self._op_post:
-			return f"{self._op_post['sub']}\n{self._op_post['com']}"
-		else:
-			return self._op_post['com']
+			op = f"{self._op_post['sub']}\n"
+		if 'com' in self._op_post:
+			op += self._op_post['com']
+		return op if not op == "" else None

 	def _parse_post(self, post: dict) -> List[FileInfo]:
 		if not 'tim' in post: return None
--- a/scrapthechan/parsers/lainchan.py
+++ b/scrapthechan/parsers/lainchan.py
@ -25,6 +25,15 @@ class LainchanParser(Parser):
 	@property
 	def imageboard(self) -> str:
 		return "lainchan.org"
+	
+	@property
+	def op(self) -> Optional[str]:
+		op = ""
+		if 'sub' in self._op_post:
+			op = f"{self._op_post['sub']}\n"
+		if 'com' in self._op_post:
+			op += self._op_post['com']
+		return op if not op == "" else None

 	def _parse_post(self, post) -> List[FileInfo]:
 		if not 'tim' in post: return None
--- a/scrapthechan/scrapers/threadedscraper.py
+++ b/scrapthechan/scrapers/threadedscraper.py
@ -0,0 +1,31 @@
+"""Implementation of a threaded version of a scraper."""
+
+from typing import List, Callable
+from multiprocessing import cpu_count, Lock
+from multiprocessing.pool import ThreadPool
+
+from scrapthechan.scraper import Scraper
+from scrapthechan.fileinfo import FileInfo
+
+__all__ = ["ThreadedScraper"]
+
+class ThreadedScraper(Scraper):
+    def __init__(self, save_directory: str, files: List[FileInfo],
+        download_progress_callback: Callable[[int], None] = None) -> None:
+        super(ThreadedScraper, self).__init__(save_directory, files,
+            download_progress_callback)
+        self._files_downloaded = 0
+        self._files_downloaded_mutex = Lock()
+
+    def run(self):
+        pool = ThreadPool(cpu_count() * 2)
+        pool.map(self._thread_run, self._files)
+        pool.close()
+        pool.join()
+
+    def _thread_run(self, f: FileInfo):
+        with self._files_downloaded_mutex:
+            self._files_downloaded += 1
+            if not self._progress_callback is None:
+                self._progress_callback(self._files_downloaded)
+        self._download_file(f)
Author	SHA1	Message	Date
Alexander Andreev	22961611da	BasicScraper is replaced with a threaded one. OP's post handling is also altered.	2020-07-18 04:44:37 +04:00
Alexander Andreev	195d4d057a	Version is incremented now and I wrote down the changes.	2020-07-18 04:43:45 +04:00
Alexander Andreev	87eecf0a09	New threaded scraper implemented.	2020-07-18 04:43:00 +04:00
Alexander Andreev	3223c0721a	Fixed OP's post property. Handled the situation where the comment and/or subject doesn't exist.	2020-07-18 04:42:19 +04:00