Add source engine template for webcomics
parent 78c3f44735
commit 7a85aa3dac
@@ -0,0 +1,2 @@
from inquisitor.configs import CACHE_PATH
from inquisitor.templates import cache_image, LinearCrawler
@@ -0,0 +1,99 @@
"""
Engines and helpers for writing sources, such as a linear crawler for webcomics.
"""
# Standard library imports
from datetime import datetime
import logging
import os
import random
from time import sleep

# Third-party library imports
from bs4 import BeautifulSoup
import requests

# Module imports
from inquisitor import CACHE_PATH

logger = logging.getLogger('inquisitor.templates')

def cache_image(source, url, filename):
    # Define some paths
    path = os.path.join(CACHE_PATH, source)
    file_path = os.path.join(path, filename)
    cached_url = f'/cache/{source}/{filename}'
    # Ensure cache folder
    if not os.path.isdir(path):
        os.mkdir(path)
    # Fetch url
    logger.info(f'Caching {url} to {file_path}')
    response = requests.get(url)
    # Write file to disk
    with open(file_path, 'wb') as f:
        f.write(response.content)
    # Return the inquisitor path to the file
    return cached_url

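For context, a usage sketch of cache_image (not part of this commit; the source name, URL, and filename are made up for illustration): the file is written under CACHE_PATH/<source>/<filename> on disk, and the returned path is the /cache/... URL a source can embed in its item HTML.

from inquisitor.templates import cache_image

# Hypothetical call; 'examplecomic' and the URL are illustrative only.
cached_url = cache_image(
    'examplecomic',                              # cache subfolder under CACHE_PATH
    'https://comic.example.com/strips/042.png',  # remote image to download
    '042.png',                                   # filename on disk and in the URL
)
# cached_url == '/cache/examplecomic/042.png'
body_html = f'<img src="{cached_url}">'
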
class LinearCrawler:
    """
    An engine for generating items from web sources that link content
    together in a linear fashion, such as webcomics.
    """
    def fetch_new(self, state):
        items = []
        max_iter = self.max_iterations() - 1
        new = self.try_fetch(state)
        items.extend(new)
        for _ in range(max_iter):
            sleep(1)
            # If we've already gotten some items out of this fetch, we don't
            # want to lose them and have the state still be set to the next
            # page, so we wrap further calls in a try block and force return
            # if we hit an error.
            try:
                new = self.try_fetch(state)
            except Exception:
                new = []
            items.extend(new)
            # Cut out early if there was nothing returned
            if not new:
                break
        return items

    def try_fetch(self, state):
        # Check for whether a new page should be crawled
        if 'current_page' not in state:
            next_page = self.get_start_url()
        else:
            current = state['current_page']
            response = requests.get(current)
            soup = BeautifulSoup(response.text, features='html.parser')
            next_page = self.get_next_page_url(current, soup)
            if not next_page:
                return []  # nothing new

        # Download the new page
        logger.info('Fetching ' + next_page)
        response = requests.get(next_page)
        soup = BeautifulSoup(response.text, features='html.parser')

        # Create an item from the page
        item = self.make_item(next_page, soup)

        # Update the state and return the item
        state['current_page'] = next_page
        return [item]

    def max_iterations(self):
        return 3

    def get_start_url(self):
        raise NotImplementedError('get_start_url is required')

    def get_next_page_url(self, url, soup):
        raise NotImplementedError('get_next_page_url is required')

    def make_item(self, url, soup):
        raise NotImplementedError('make_item is required')
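To show how the new engine is meant to be used, here is a minimal sketch of a webcomic source built on LinearCrawler. Everything site-specific (the example.com URLs, the CSS selectors, and the fields of the returned item dict) is an assumption made up for illustration; only the overridden methods come from the template's contract above.

# Hypothetical source module; URLs, selectors, and item fields are illustrative.
from urllib.parse import urljoin

from inquisitor.templates import cache_image, LinearCrawler

SOURCE = 'examplecomic'  # assumed source name, also used as the cache subfolder


class ExampleComicCrawler(LinearCrawler):
    def get_start_url(self):
        # First page to crawl when the state has no 'current_page' yet.
        return 'https://comic.example.com/page/1'

    def get_next_page_url(self, url, soup):
        # Return a falsy value when there is no next page, so try_fetch
        # reports that nothing new was found.
        link = soup.select_one('a.next-page')  # assumed selector
        return urljoin(url, link['href']) if link else None

    def make_item(self, url, soup):
        # Cache the strip image locally and build an item from the page.
        img = soup.select_one('img#comic')  # assumed selector
        filename = img['src'].rsplit('/', 1)[-1]
        cached = cache_image(SOURCE, urljoin(url, img['src']), filename)
        # The item fields below are a guess at what a source returns;
        # adapt them to whatever the rest of Inquisitor expects.
        return {
            'source': SOURCE,
            'id': filename,
            'title': soup.title.string if soup.title else url,
            'link': url,
            'body': f'<img src="{cached}">',
        }


def fetch_new(state):
    # Entry point a source module might expose, delegating to the engine.
    return ExampleComicCrawler().fetch_new(state)
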
@@ -1,6 +1,15 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
Flask==1.1.2
idna==2.10
itsdangerous==1.1.0
Jinja2==2.11.2
MarkupSafe==1.1.1
pkg-resources==0.0.0
requests==2.24.0
soupsieve==2.0.1
urllib3==1.25.10
Werkzeug==1.0.1