Refactored fetching to retry when the random page offset overshoots pagination: look up the last page and try again

This commit is contained in:
Anon 2023-10-19 21:22:07 -07:00
parent 88e11f58bc
commit 22c5a6e231

View File

@ -22,6 +22,8 @@
import requests
import os
import random
import re
import time
def random_tag(*tags):
    """Report whether the tag arguments are exactly one tag spelling "random".

    The comparison is case-insensitive. Zero tags, or more than one tag,
    always yields False.
    """
    if len(tags) != 1:
        return False
    (only_tag,) = tags
    return only_tag.lower() == "random"
@ -43,7 +45,6 @@ def is_banned(post, profile):
def get_nsfw(post):
    """Return True when the post's "rating" value is "q" or "e".

    Raises KeyError if *post* has no "rating" key.
    """
    rating = post["rating"]
    return any(rating == flagged for flagged in ("q", "e"))
def select_from_response(response, profile, max_size=None):
for post in response:
if is_banned(post, profile):
@ -66,22 +67,16 @@ def select_from_response(response, profile, max_size=None):
class downloader:
username = None
password = None
max_size = None
tmp = None
url = ""
api_endpoint = "post.json?random=true&limit=100"
api_tags = "&tags={}"
api_limit = "&limit={}"
api_offset = "&page={}"
limit = 100
max_depth = 200
def __init__(self, backend_credentials):
self.api_endpoint = "{}/post/index.json?limit={}&page={}"
self.html_endpoint = "{}/post/index?limit={}&page={}"
self.tag_url = "&tags={}"
self.limit = 100
self.retry_limit = 3
self.username = backend_credentials["username"]
self.password = backend_credentials["password"]
self.max_size = backend_credentials["max_size"]
self.depth = backend_credentials["max_size"]
self.tmp = backend_credentials["tmp_dir"]
self.url = backend_credentials["url"]
self.max_depth = backend_credentials["max_depth"]
@ -103,16 +98,6 @@ class downloader:
return post
def get_full_url(self, limit=100, offset=0, *tags):
    """Assemble the search URL for the given limit, page offset and tags.

    The base is ``self.url`` and ``self.api_endpoint`` joined with "/",
    followed by the formatted ``self.api_limit`` and ``self.api_offset``
    fragments. A tag fragment (tags joined with "+") is appended only when
    tags were supplied and ``random_tag`` does not report them as the
    single "random" tag.
    """
    pieces = [
        self.url + "/" + self.api_endpoint,
        self.api_limit.format(str(limit)),
        self.api_offset.format(str(offset)),
    ]
    wants_tag_filter = bool(tags) and not random_tag(*tags)
    if wants_tag_filter:
        pieces.append(self.api_tags.format("+".join(tags)))
    return "".join(pieces)
def search(self, search_url):
search_request = None
if self.username and self.password:
@ -123,26 +108,60 @@ class downloader:
search_request = requests.get(search_url)
return search_request
# I suck at regex :(
def get_max_page(self, html):
    """Return the number from the last ``page=<digits>`` fragment in *html*.

    Args:
        html: Text to scan for ``page=<number>`` fragments.

    Returns:
        The number of the last such fragment as an int, or None when *html*
        contains no ``page=`` followed by at least one digit.
    """
    # Require at least one digit: a bare "page=" used to match as well and
    # made int("") raise ValueError when it was the last occurrence.
    page_numbers = re.findall(r"page=([0-9]+)", html)
    if not page_numbers:
        return None
    # NOTE(review): presumably the last pagination link in the markup points
    # at the final page -- confirm against the backend's HTML.
    return int(page_numbers[-1])
def fetch_post(self, profile):
# Search ratings: s=safe, e=nsfw
tags = profile["tags"]
selected = dict()
max_depth = self.max_depth
search_url_tags = "+".join(tags)
search_url = ""
for _ in range(0, self.retry_limit):
page_offset = random.randint(0, max_depth)
search_url = self.api_endpoint.format(self.url, self.limit, page_offset)
search_url_html = self.html_endpoint.format(self.url, self.limit, page_offset)
if search_url_tags:
search_url += self.tag_url.format(search_url_tags)
search_url_html += self.tag_url.format(search_url_tags)
page_offset = random.randint(0, self.max_depth)
search_url = self.get_full_url(self.limit, page_offset, *tags)
search_request = self.search(search_url)
if search_request.status_code != 200:
print("Search request returned:", search_request.status_code)
return None
continue
posts = search_request.json()
random.shuffle(posts)
selected = select_from_response(posts, profile, self.max_size)
selected = select_from_response(posts, profile)
if selected is None:
print("Could not select image based on criteria")
time.sleep(2)
search_request = self.search(search_url_html)
if search_request.status_code == 200:
new_max_depth = self.get_max_page(search_request.text)
if new_max_depth < max_depth:
max_depth = new_max_depth
else:
max_depth = max_depth // 2
else:
max_depth = max_depth // 2
continue
break
if not selected:
return None
tag_response = collect_tags(selected)