Add source engine template for webcomics
parent 78c3f44735
commit 7a85aa3dac
@@ -0,0 +1,2 @@
from inquisitor.configs import CACHE_PATH
from inquisitor.templates import cache_image, LinearCrawler
@@ -0,0 +1,99 @@
"""
Template engines and helpers for writing Inquisitor sources.
"""
# Standard library imports
from datetime import datetime
import logging
import os
import random
from time import sleep

# Third-party library imports
from bs4 import BeautifulSoup
import requests

# Module imports
from inquisitor import CACHE_PATH

logger = logging.getLogger('inquisitor.templates')


def cache_image(source, url, filename):
    # Define some paths
    path = os.path.join(CACHE_PATH, source)
    file_path = os.path.join(path, filename)
    cached_url = f'/cache/{source}/{filename}'

    # Ensure cache folder
    if not os.path.isdir(path):
        os.mkdir(path)

    # Fetch url
    logger.info(f'Caching {url} to {file_path}')
    response = requests.get(url)

    # Write file to disk
    with open(file_path, 'wb') as f:
        f.write(response.content)

    # Return the inquisitor path to the file
    return cached_url


class LinearCrawler:
    """
    An engine for generating items from web sources that link content
    together in a linear fashion, such as webcomics.
    """
    def fetch_new(self, state):
        items = []
        max_iter = self.max_iterations() - 1
        new = self.try_fetch(state)
        items.extend(new)
        for _ in range(max_iter):
            sleep(1)
            # If we've already gotten some items out of this fetch, we don't
            # want to lose them and have the state still be set to the next
            # page, so we wrap further calls in a try block and force return
            # if we hit an error.
            try:
                new = self.try_fetch(state)
            except Exception:
                new = []
            items.extend(new)
            # Cut out early if there was nothing returned
            if not new:
                break
        return items

    def try_fetch(self, state):
        # Check for whether a new page should be crawled
        if 'current_page' not in state:
            next_page = self.get_start_url()
        else:
            current = state['current_page']
            response = requests.get(current)
            soup = BeautifulSoup(response.text, features='html.parser')
            next_page = self.get_next_page_url(current, soup)
            if not next_page:
                return []  # nothing new

        # Download the new page
        logger.info(f'Fetching {next_page}')
        response = requests.get(next_page)
        soup = BeautifulSoup(response.text, features='html.parser')

        # Create an item from the page
        item = self.make_item(next_page, soup)

        # Update the state and return the item
        state['current_page'] = next_page
        return [item]

    def max_iterations(self):
        return 3

    def get_start_url(self):
        raise NotImplementedError('get_start_url is required')

    def get_next_page_url(self, url, soup):
        raise NotImplementedError('get_next_page_url is required')

    def make_item(self, url, soup):
        raise NotImplementedError('make_item is required')
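For reference, a source built on this engine only has to supply the three abstract methods. The following is a minimal sketch of a hypothetical webcomic source; the site URL, the CSS selectors, and the item fields are illustrative assumptions, not part of this commit:

# Hypothetical usage sketch (not part of this commit). Assumes a comic at
# comic.example.com whose next-page link uses rel="next" and whose comic
# image is an <img id="comic"> tag; the item fields are illustrative guesses.
import os

from inquisitor.templates import cache_image, LinearCrawler


class ExampleComicCrawler(LinearCrawler):
    def get_start_url(self):
        # The first page to crawl when no 'current_page' is in the state
        return 'https://comic.example.com/page/1'

    def get_next_page_url(self, url, soup):
        # Returning a falsy value makes try_fetch report "nothing new"
        link = soup.find('a', rel='next')
        return link['href'] if link else None

    def make_item(self, url, soup):
        # Cache the comic image and point the item body at the cached copy
        img = soup.find('img', id='comic')
        filename = os.path.basename(img['src'])
        cached = cache_image('examplecomic', img['src'], filename)
        return {
            'source': 'examplecomic',
            'id': url,
            'title': soup.title.get_text() if soup.title else url,
            'body': f'<img src="{cached}">',
        }

Because fetch_new sleeps one second between pages and processes at most max_iterations() pages per run, a subclass can override max_iterations to tune how quickly it works through an archive.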
@@ -1,6 +1,15 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
Flask==1.1.2
idna==2.10
itsdangerous==1.1.0
Jinja2==2.11.2
MarkupSafe==1.1.1
pkg-resources==0.0.0
requests==2.24.0
soupsieve==2.0.1
urllib3==1.25.10
Werkzeug==1.0.1