
_is_file_ok is now _check_file and has been reworked to be more efficient. Also added a check for the case where files happen to share the same name and size but the imageboard (IB) reports a different hash.

Alexander Andreev 2020-11-18 23:47:26 +04:00
parent 8403fcf0f2
commit bb47b50c5f


@@ -5,7 +5,7 @@ from os import remove, stat
 from os.path import exists, join, getsize
 import re
 from typing import List, Callable
-from urllib.request import urlretrieve, URLopener
+from urllib.request import urlretrieve, URLopener, HTTPError
 import hashlib
 from scrapthechan import USER_AGENT
@@ -63,35 +63,46 @@ class Scraper:
         newname = f"{newname[:lbracket]}({int(num)+1})"
         return newname
 
-    def _hash_file(self, filename: str, hash_algo: str = "md5",
+    def _hash_file(self, filepath: str, hash_algorithm: str = "md5",
             blocksize: int = 1048576) -> (str, str):
         """Compute hash of a file."""
-        hash_func = hashlib.new(hash_algo)
-        with open(filename, 'rb') as f:
+        hash_func = hashlib.new(hash_algorithm)
+        with open(filepath, 'rb') as f:
             buf = f.read(blocksize)
             while len(buf) > 0:
                 hash_func.update(buf)
                 buf = f.read(blocksize)
-        return hash_func.hexdigest(), hash_func.digest()
+        return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()
 
-    def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
+    def _check_file(self, f: FileInfo, filepath: str) -> bool:
         """Check if a file exist and isn't broken."""
         if not exists(filepath):
             return False
         computed_size = getsize(filepath)
-        is_size_match = f.size == computed_size \
-            or f.size == round(computed_size / 1024)
-        hexdig, dig = self._hash_file(filepath, f.hash_algo)
-        is_hash_match = f.hash_value == hexdig \
-            or f.hash_value == b64encode(dig).decode()
-        return is_size_match and is_hash_match
+        if not (f.size == computed_size \
+            or f.size == round(computed_size / 1024)):
+            return False
+        hexdig, dig = self._hash_file(filepath, f.hash_algorithm)
+        return f.hash_value == hexdig or f.hash_value == dig
 
     def _download_file(self, f: FileInfo):
         """Download a single file."""
         is_same_filename = False
         filepath = join(self._save_directory, f.name)
-        if self._is_file_ok(f, filepath):
-            return True
+        orig_filepath = filepath
+        if self._check_file(f, filepath):
+            return
         elif exists(filepath):
             is_same_filename = True
             filepath = join(self._save_directory, \
                 self._same_filename(f.name, self._save_directory))
-        self._url_opener.retrieve(f.dlurl, filepath)
+        try:
+            self._url_opener.retrieve(f.download_url, filepath)
+            if is_same_filename:
+                f1_hexdig, f1_dig = self._hash_file(orig_filepath, f.hash_algorithm)
+                f2_hexdig, f2_dig = self._hash_file(filepath, f.hash_algorithm)
+                assert filepath != orig_filepath, 'Filepaths are matching!'
+                if f1_hexdig == f2_hexdig or f1_dig == f2_dig:
+                    remove(filepath)
+        except HTTPError as e:
+            print(e, f.download_url)
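
The duplicate handling added in this commit can be illustrated outside the Scraper class. Below is a minimal sketch (not part of the commit) of the same idea: hash the file that already exists under the name and the freshly downloaded copy with the board's algorithm, and delete the copy when the digests match. Comparing both the hex and the base64 form mirrors the code above, since some imageboard APIs (e.g. 4chan's) report MD5 digests base64-encoded while local tools usually print hex. The helper names hash_file, is_probably_same and drop_duplicate are hypothetical stand-ins for the _hash_file/_download_file logic, not names from the project.

# Minimal sketch of the hash-based duplicate check; assumes md5 as the
# default algorithm and that both files already exist on disk.
from base64 import b64encode
from os import remove
from os.path import exists
import hashlib


def hash_file(filepath: str, hash_algorithm: str = "md5",
        blocksize: int = 1048576) -> (str, str):
    """Return (hex digest, base64 digest) of a file, read block by block."""
    hash_func = hashlib.new(hash_algorithm)
    with open(filepath, 'rb') as f:
        buf = f.read(blocksize)
        while len(buf) > 0:
            hash_func.update(buf)
            buf = f.read(blocksize)
    return hash_func.hexdigest(), b64encode(hash_func.digest()).decode()


def is_probably_same(existing_path: str, candidate_path: str,
        hash_algorithm: str = "md5") -> bool:
    """True when the two files hash to the same digest."""
    e_hex, e_b64 = hash_file(existing_path, hash_algorithm)
    c_hex, c_b64 = hash_file(candidate_path, hash_algorithm)
    return e_hex == c_hex or e_b64 == c_b64


def drop_duplicate(original: str, new_copy: str,
        hash_algorithm: str = "md5") -> None:
    """Remove new_copy if it turned out to be byte-identical to original."""
    if exists(original) and exists(new_copy) and original != new_copy \
            and is_probably_same(original, new_copy, hash_algorithm):
        remove(new_copy)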