Refactored to retry when the random page offset overshoots pagination: look up the last page number, lower the page range, and try again

2023-10-19 21:22:07 -07:00 · 2023-10-19 21:22:07 -07:00 · 22c5a6e231
commit 22c5a6e231
parent 88e11f58bc
1 changed files with 125 additions and 106 deletions
--- a/src/konachan_backend.py
+++ b/src/konachan_backend.py
@ -22,146 +22,165 @@
 import requests
 import os
 import random
 import re
 import time
 def random_tag(*tags):
-	return len(tags) == 1 and tags[0].lower() == "random"
+    return len(tags) == 1 and tags[0].lower() == "random"
 def collect_tags(post):
-	return post["tags"].strip().lower().split()
+    return post["tags"].strip().lower().split()
 def is_banned(post, profile):
-	tag_response = collect_tags(post)
+    tag_response = collect_tags(post)
-	tag_banned = profile["banned_tags"]
+    tag_banned = profile["banned_tags"]
-	for tag in tag_banned:
+    for tag in tag_banned:
-		if tag in tag_response:
+        if tag in tag_response:
-			return tag
+            return tag
-	return None
+    return None
 def get_nsfw(post):
-	return post["rating"] in ("q", "e")
+    return post["rating"] in ("q", "e")
 def select_from_response(response, profile, max_size=None):
-	for post in response:
+    for post in response:
-		if is_banned(post, profile):
+        if is_banned(post, profile):
-			continue
+            continue
-		elif "file_url" not in post:
+        elif "file_url" not in post:
-			continue
+            continue
-		# Select only nsfw
+        # Select only nsfw
-		elif (	profile["force_nsfw"] is not None and
+        elif (  profile["force_nsfw"] is not None and
-				profile["force_nsfw"] != get_nsfw(post)
+                profile["force_nsfw"] != get_nsfw(post)
-		):
+        ):
-			continue
+            continue
-		# Make sure serverside size is not larger than max_size
+        # Make sure serverside size is not larger than max_size
-		elif (	max_size != None and
+        elif (  max_size != None and
-				"file_size" in post and
+                "file_size" in post and
-				post["file_size"] > max_size
+                post["file_size"] > max_size
-		):
+        ):
-			continue
+            continue
-		return post
+        return post
-	return None
+    return None
 class downloader:
-	username = None
+    def __init__(self, backend_credentials):
-	password = None
+        self.api_endpoint = "{}/post/index.json?limit={}&page={}"
-	max_size = None
+        self.html_endpoint = "{}/post/index?limit={}&page={}"
-	tmp = None
+        self.tag_url = "&tags={}"
-	url = ""
+        self.limit = 100
-	api_endpoint = "post.json?random=true&limit=100"
+        self.retry_limit = 3
 	api_tags = "&tags={}"
 	api_limit = "&limit={}"
 	api_offset = "&page={}"
 	limit = 100
 	max_depth = 200
-	def __init__(self, backend_credentials):
+        self.username = backend_credentials["username"]
-		self.username = backend_credentials["username"]
+        self.password = backend_credentials["password"]
-		self.password = backend_credentials["password"]
+        self.depth = backend_credentials["max_size"]
-		self.max_size = backend_credentials["max_size"]
+        self.tmp = backend_credentials["tmp_dir"]
-		self.tmp = backend_credentials["tmp_dir"]
+        self.url = backend_credentials["url"]
-		self.url = backend_credentials["url"]
+        self.max_depth = backend_credentials["max_depth"]
-		self.max_depth = backend_credentials["max_depth"]
+        random.seed(os.urandom(16))
 		random.seed(os.urandom(16))
-	def download_post(self, post):
+    def download_post(self, post):
-		file_url = post["file_url"]
+        file_url = post["file_url"]
-		full_path = post["full_path"]
+        full_path = post["full_path"]
-		remote_image = requests.get(file_url)
+        remote_image = requests.get(file_url)
-		if remote_image.status_code != 200:
+        if remote_image.status_code != 200:
-			print("Remote image request returned:", remote_image.status_code)
+            print("Remote image request returned:", remote_image.status_code)
-			return None
+            return None
-		with open(full_path, "wb") as f:
+        with open(full_path, "wb") as f:
-			f.write(remote_image.content)
+            f.write(remote_image.content)
-		return post
+        return post
-	def get_full_url(self, limit=100, offset=0, *tags):
+    def search(self, search_url):
-		search_url = "/".join((self.url, self.api_endpoint))
+        search_request = None
-		search_url += self.api_limit.format(str(limit))
+        if self.username and self.password:
-		search_url += self.api_offset.format(str(offset))
+            search_request = requests.get(search_url,
-		if tags and not random_tag(*tags):
+                    auth=(self.username, self.password)
-			search_tags = "+".join(tags)
+                    )
-			search_url += self.api_tags.format(search_tags)
+        else:
-		return search_url
+            search_request = requests.get(search_url)
-	
+        return search_request
-	def search(self, search_url):
+    # I suck at regex :(
-		search_request = None
+    def get_max_page(self, html):
-		if self.username and self.password:
+        match = re.findall('page=[0-9]*', html)
-			search_request = requests.get(search_url,
+        if match:
-					auth=(self.username, self.password)
+            last_group = match[len(match) - 1]
-					)
+            last_page = last_group.rsplit("=", 1)[1]
-		else:
+            return int(last_page)
-			search_request = requests.get(search_url)
+        else:
-		return search_request
+            return None
    def fetch_post(self, profile):
        tags = profile["tags"]
        selected = dict()
        max_depth = self.max_depth
        search_url_tags = "+".join(tags)
        search_url = ""
-	def fetch_post(self, profile):
+        for _ in range(0, self.retry_limit):
-		# Search ratings: s=safe, e=nsfw
+            page_offset = random.randint(0, max_depth)
-		tags = profile["tags"]
+            search_url = self.api_endpoint.format(self.url, self.limit, page_offset)
            search_url_html = self.html_endpoint.format(self.url, self.limit, page_offset)
            if search_url_tags:
                search_url += self.tag_url.format(search_url_tags)
                search_url_html += self.tag_url.format(search_url_tags)
-		page_offset = random.randint(0, self.max_depth)
+            search_request = self.search(search_url)
 		search_url = self.get_full_url(self.limit, page_offset, *tags)
 		search_request = self.search(search_url)
-		if search_request.status_code != 200:
+            if search_request.status_code != 200:
-			print("Search request returned:", search_request.status_code)
+                print("Search request returned:", search_request.status_code)
-			return None
+                continue
-		posts = search_request.json()
+            posts = search_request.json()
-		random.shuffle(posts)
+            random.shuffle(posts)
-		selected = select_from_response(posts, profile, self.max_size)
+            selected = select_from_response(posts, profile)
-		if selected is None:
+            if selected is None:
-			print("Could not select image based on criteria")
+                print("Could not select image based on criteria")
-			return None
+                time.sleep(2)
                search_request = self.search(search_url_html)
-		tag_response = collect_tags(selected)
+                if search_request.status_code == 200:
-		nsfw = get_nsfw(selected)
+                    new_max_depth = self.get_max_page(search_request.text)
-		file_url = selected["file_url"]
+                    if new_max_depth < max_depth:
                        max_depth = new_max_depth
                    else:
                        max_depth = max_depth // 2
                else:
                    max_depth = max_depth // 2
                continue
-		basename = "{}.{}".format(selected["md5"], file_url.rsplit(".", 1)[1])
+            break
 		full_path = os.path.join(self.tmp, basename)
-		r =	{
+        if not selected:
-			# Add profile to dictioanry
+            return None
 			"profile":			profile,
-			# Query results
+        tag_response = collect_tags(selected)
-			"search_url":		search_url,
+        nsfw = get_nsfw(selected)
-			"file_url":			file_url,
+        file_url = selected["file_url"]
 			"full_path":		full_path,
 			"tag_response":		tag_response,
 			"nsfw":				nsfw
 			}
-		return r
+        basename = "{}.{}".format(selected["md5"], file_url.rsplit(".", 1)[1])
        full_path = os.path.join(self.tmp, basename)
        r = {
            # Add profile to dictioanry
            "profile":          profile,
            # Query results
            "search_url":       search_url,
            "file_url":         file_url,
            "full_path":        full_path,
            "tag_response":     tag_response,
            "nsfw":             nsfw
            }
        return r