Add source engine template for webcomics

Tim Van Baak 2020-08-11 23:53:21 -07:00
parent 78c3f44735
commit 7a85aa3dac
3 changed files with 110 additions and 0 deletions

@@ -0,0 +1,2 @@
from inquisitor.configs import CACHE_PATH
from inquisitor.templates import cache_image, LinearCrawler
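
The filename of this two-line hunk isn't shown, but the re-exports suggest it is the package __init__.py. Assuming so, a source module can pull both helpers straight from the package root; a minimal sketch:

# Hypothetical consumer module; assumes the hunk above is inquisitor/__init__.py
from inquisitor import cache_image, LinearCrawler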

inquisitor/templates.py (new file, 99 lines)

@@ -0,0 +1,99 @@
"""
Generates a dummy item.
"""
# Standard library imports
import logging
import os
from time import sleep

# Third-party library imports
from bs4 import BeautifulSoup
import requests

# Module imports
from inquisitor.configs import CACHE_PATH

logger = logging.getLogger('inquisitor.templates')

def cache_image(source, url, filename):
    # Define some paths
    path = os.path.join(CACHE_PATH, source)
    file_path = os.path.join(path, filename)
    cached_url = f'/cache/{source}/{filename}'
    # Ensure the cache folder for this source exists
    os.makedirs(path, exist_ok=True)
    # Fetch the url
    logger.info(f'Caching {url} to {file_path}')
    response = requests.get(url)
    response.raise_for_status()
    # Write the file to disk
    with open(file_path, 'wb') as f:
        f.write(response.content)
    # Return the inquisitor path to the file
    return cached_url

class LinearCrawler:
    """
    An engine for generating items from web sources that link content
    together in a linear fashion, such as webcomics.
    """
    def fetch_new(self, state):
        items = []
        max_iter = self.max_iterations() - 1
        new = self.try_fetch(state)
        items.extend(new)
        for _ in range(max_iter):
            sleep(1)
            # If we've already gotten some items out of this fetch, we don't
            # want to lose them and have the state still be set to the next
            # page, so we wrap further calls in a try block and force a
            # return if we hit an error.
            try:
                new = self.try_fetch(state)
            except Exception:
                logger.exception('Fetch failed; returning items so far')
                new = []
            items.extend(new)
            # Cut out early if nothing was returned
            if not new:
                break
        return items

    def try_fetch(self, state):
        # Check whether a new page should be crawled
        if 'current_page' not in state:
            next_page = self.get_start_url()
        else:
            current = state['current_page']
            response = requests.get(current)
            soup = BeautifulSoup(response.text, features='html.parser')
            next_page = self.get_next_page_url(current, soup)
            if not next_page:
                return []  # nothing new
        # Download the new page
        logger.info('Fetching ' + next_page)
        response = requests.get(next_page)
        soup = BeautifulSoup(response.text, features='html.parser')
        # Create an item from the page
        item = self.make_item(next_page, soup)
        # Update the state and return the item
        state['current_page'] = next_page
        return [item]

    def max_iterations(self):
        return 3

    def get_start_url(self):
        raise NotImplementedError('get_start_url is required')

    def get_next_page_url(self, url, soup):
        raise NotImplementedError('get_next_page_url is required')

    def make_item(self, url, soup):
        raise NotImplementedError('make_item is required')
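
A concrete source only has to supply the three hooks at the bottom of the class; fetch_new handles the iteration cap, rate limiting, and error recovery. A minimal sketch of a hypothetical webcomic source follows; the site URL, the selectors, and the item dict fields are illustrative assumptions, not part of this commit:

# example_comic.py - hypothetical source built on LinearCrawler
from inquisitor import cache_image, LinearCrawler

class ExampleComic(LinearCrawler):
    SOURCE = 'examplecomic'

    def get_start_url(self):
        # First page of the archive (made-up URL)
        return 'https://comic.example.com/page/1'

    def get_next_page_url(self, url, soup):
        # Follow the rel="next" link; returning None ends the crawl
        link = soup.find('a', rel='next')
        return link['href'] if link else None

    def make_item(self, url, soup):
        # Cache the comic image and point the item at the local copy
        img = soup.find('img', id='comic')
        filename = img['src'].split('/')[-1]
        cached = cache_image(self.SOURCE, img['src'], filename)
        return {
            'source': self.SOURCE,
            'id': url,
            'title': soup.title.string if soup.title else url,
            'body': f'<img src="{cached}">',
        }

Each fetch_new(state) call advances at most max_iterations() pages and records the last page in state['current_page'], so the next run resumes where this one stopped.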

requirements.txt

@@ -1,6 +1,15 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
Flask==1.1.2
idna==2.10
itsdangerous==1.1.0
Jinja2==2.11.2
MarkupSafe==1.1.1
pkg-resources==0.0.0
requests==2.24.0
soupsieve==2.0.1
urllib3==1.25.10
Werkzeug==1.0.1