From a106d5b7393b77f434613c93019a0b02737cec47 Mon Sep 17 00:00:00 2001 From: "Alexander \"Arav\" Andreev" Date: Wed, 9 Sep 2020 04:34:41 +0400 Subject: [PATCH] Added support for lolifox.cc. Fixed User-Agent usage, so it applied correctly everywhere now. --- CHANGELOG.md | 13 ++++++ Makefile | 2 +- README.md | 3 +- scrapthechan/__init__.py | 4 +- scrapthechan/parser.py | 6 ++- scrapthechan/parsers/__init__.py | 5 ++- scrapthechan/parsers/lolifox.py | 65 +++++++++++++++++++++++++++ scrapthechan/scraper.py | 1 + scrapthechan/scrapers/basicscraper.py | 15 ------- setup.cfg | 1 + 10 files changed, 93 insertions(+), 22 deletions(-) create mode 100644 scrapthechan/parsers/lolifox.py delete mode 100644 scrapthechan/scrapers/basicscraper.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 90333a9..bc8fb24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## 0.3 - 2020-09-09 +### Added +- Parser for lolifox.cc. + +### Removed +- BasicScraper. Not needed anymore, there is a faster threaded version. + +### Fixed +- Now User-Agent is correctly applied everywhere. + + ## 0.2.2 - 2020-07-20 ### Added - Parser for 8kun.top. @@ -14,11 +25,13 @@ - Consider that issue with size on 2ch.hk. Usually it really tells the size in kB. The problem is that sometimes it just wrong. + ## 0.2.1 - 2020-07-18 ### Changed - Now program tells you what thread doesn't exist or about to be scraped. That is useful in batch processing with scripts. + ## 0.2.0 - 2020-07-18 ### Added - Threaded version of the scraper, so now it is fast as heck! diff --git a/Makefile b/Makefile index a8e2af3..40e23c3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ build: scrapthechan README.md setup.cfg python setup.py sdist bdist_wheel install: - python -m pip install --upgrade dist/scrapthechan-0.2.2-py3-none-any.whl --user + python -m pip install --upgrade dist/scrapthechan-0.3-py3-none-any.whl --user uninstall: # We change directory so pip uninstall will run, it'll fail otherwise. 
@cd ~/ diff --git a/README.md b/README.md index cdec9c8..6bf362e 100644 --- a/README.md +++ b/README.md @@ -36,4 +36,5 @@ help for a program. - [4chan.org](https://4chan.org) since 0.1.0 - [lainchan.org](https://lainchan.org) since 0.1.0 - [2ch.hk](https://2ch.hk) since 0.1.0 -- [8kun.top](https://8kun.top) since 0.2.2 \ No newline at end of file +- [8kun.top](https://8kun.top) since 0.2.2 +- [lolifox.cc](https://lolifox.cc) since 0.3 \ No newline at end of file diff --git a/scrapthechan/__init__.py b/scrapthechan/__init__.py index be09bf8..9451756 100644 --- a/scrapthechan/__init__.py +++ b/scrapthechan/__init__.py @@ -1,5 +1,5 @@ -__date__ = "20 July 2020" -__version__ = "0.2.2" +__date__ = "9 September 2020" +__version__ = "0.3" __author__ = "Alexander \"Arav\" Andreev" __email__ = "me@arav.top" __copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>" diff --git a/scrapthechan/parser.py b/scrapthechan/parser.py index 7a23bd9..e83f3a4 100644 --- a/scrapthechan/parser.py +++ b/scrapthechan/parser.py @@ -4,8 +4,9 @@ from itertools import chain from json import loads from re import findall, match from typing import List, Optional -from urllib.request import urlopen, urlretrieve +from urllib.request import urlopen, Request +from scrapthechan import USER_AGENT from scrapthechan.fileinfo import FileInfo @@ -71,7 +72,8 @@ class Parser: def _get_json(self, thread_url: str) -> dict: """Gets JSON version of a thread and converts it in a dictionary.""" try: - with urlopen(thread_url) as url: + req = Request(thread_url, headers={'User-Agent': USER_AGENT}) + with urlopen(req) as url: return loads(url.read().decode('utf-8')) except: raise ThreadNotFoundError diff --git a/scrapthechan/parsers/__init__.py b/scrapthechan/parsers/__init__.py index aaaa774..0fc99b4 100644 --- a/scrapthechan/parsers/__init__.py +++ b/scrapthechan/parsers/__init__.py @@ -9,7 +9,7 @@ __all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"] SUPPORTED_IMAGEBOARDS: List[str] 
= ["4chan.org", "lainchan.org", "2ch.hk", \ - "8kun.top"] + "8kun.top", "lolifox.cc"] def get_parser_by_url(url: str) -> Parser: @@ -33,5 +33,8 @@ def get_parser_by_site(site: str, board: str, thread: str) -> Parser: elif '8kun' in site: from .eightkun import EightKunParser return EightKunParser(board, thread) + elif 'lolifox' in site: + from .lolifox import LolifoxParser + return LolifoxParser(board, thread) else: raise NotImplementedError(f"Parser for {site} is not implemented") diff --git a/scrapthechan/parsers/lolifox.py b/scrapthechan/parsers/lolifox.py new file mode 100644 index 0000000..b0d6f24 --- /dev/null +++ b/scrapthechan/parsers/lolifox.py @@ -0,0 +1,65 @@ +from re import match +from typing import List, Optional + +from scrapthechan.parser import Parser +from scrapthechan.fileinfo import FileInfo + +__all__ = ["LolifoxParser"] + + +class LolifoxParser(Parser): + """JSON parser for lolifox.cc image board. + JSON structure is identical to lainchan.org. + """ + + __url_thread_json = "https://lolifox.cc/{board}/res/{thread}.json" + __url_file_link = "https://lolifox.cc/{board}/src/{filename}" + + def __init__(self, board: str, thread: str, + skip_posts: Optional[int] = None) -> None: + posts = self._get_json(self.__url_thread_json.format(board=board, \ + thread=thread))['posts'] + super(LolifoxParser, self).__init__(board, thread, posts, skip_posts) + + @property + def imageboard(self) -> str: + return "lolifox.cc" + + @property + def op(self) -> Optional[str]: + op = "" + if 'sub' in self._op_post: + op = f"{self._op_post['sub']}\n" + if 'com' in self._op_post: + op += self._op_post['com'] + return op if not op == "" else None + + def _parse_post(self, post) -> List[FileInfo]: + if not 'tim' in post: return None + + dlfname = f"{post['tim']}{post['ext']}" + + if "filename" in post: + if match(r"^image\.\w{1,4}$", post['filename']) is None: + filename = f"{post['filename']}{post['ext']}" + else: + filename = dlfname + + files = [] + 
files.append(FileInfo(filename, post['fsize'], + self.__url_file_link.format(board=self.board, filename=dlfname), + post['md5'], 'md5')) + + if "extra_files" in post: + for f in post["extra_files"]: + dlfname = f"{f['tim']}{f['ext']}" + if "filename" in f: + if match(r"^image\.\w+$", f['filename']) is None: + filename = f"{f['filename']}{f['ext']}" + else: + filename = dlfname + dlurl = self.__url_file_link.format(board=self.board, \ + filename=dlfname) + files.append(FileInfo(filename, f['fsize'], \ + dlurl, f['md5'], 'md5')) + return files diff --git a/scrapthechan/scraper.py b/scrapthechan/scraper.py index bacc8ff..2b93377 100644 --- a/scrapthechan/scraper.py +++ b/scrapthechan/scraper.py @@ -29,6 +29,7 @@ class Scraper: self._save_directory = save_directory self._files = files self._url_opener = URLopener() + self._url_opener.addheaders = [('User-Agent', USER_AGENT)] self._url_opener.version = USER_AGENT self._progress_callback = download_progress_callback diff --git a/scrapthechan/scrapers/basicscraper.py b/scrapthechan/scrapers/basicscraper.py deleted file mode 100644 index 6c1b430..0000000 --- a/scrapthechan/scrapers/basicscraper.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Implementation of basic sequential one-threaded scraper that downloads -files one by one.""" - -from scrapthechan.scraper import Scraper - -__all__ = ["BasicScraper"] - - -class BasicScraper(Scraper): - def run(self): - """Download files one by one.""" - for i, f in enumerate(self._files, start=1): - if not self._progress_callback is None: - self._progress_callback(i) - self._download_file(f) diff --git a/setup.cfg b/setup.cfg index 66a501d..a52aa5d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,6 +14,7 @@ keywords = 2ch.hk lainchan.org 8kun.top + lolifox.cc license = MIT license_file = COPYING classifiers =