Add source engine template for webcomics
parent 78c3f44735
commit 7a85aa3dac
@@ -0,0 +1,2 @@
from inquisitor.configs import CACHE_PATH
from inquisitor.templates import cache_image, LinearCrawler
@@ -0,0 +1,99 @@
"""
Template engines and helpers for writing Inquisitor sources.
"""
# Standard library imports
from datetime import datetime
import logging
import os
import random
from time import sleep

# Third-party library imports
from bs4 import BeautifulSoup
import requests

# Module imports
from inquisitor import CACHE_PATH

logger = logging.getLogger('inquisitor.templates')


def cache_image(source, url, filename):
    # Define some paths
    path = os.path.join(CACHE_PATH, source)
    file_path = os.path.join(path, filename)
    cached_url = f'/cache/{source}/{filename}'

    # Ensure cache folder
    if not os.path.isdir(path):
        os.mkdir(path)

    # Fetch url
    logger.info(f'Caching {url} to {file_path}')
    response = requests.get(url)

    # Write file to disk
    with open(file_path, 'wb') as f:
        f.write(response.content)

    # Return the inquisitor path to the file
    return cached_url


class LinearCrawler:
    """
    An engine for generating items from web sources that link content
    together in a linear fashion, such as webcomics.
    """
    def fetch_new(self, state):
        items = []
        max_iter = self.max_iterations() - 1
        new = self.try_fetch(state)
        items.extend(new)
        for _ in range(max_iter):
            sleep(1)
            # If we've already gotten some items out of this fetch, we don't
            # want to lose them and have the state still be set to the next
            # page, so we wrap further calls in a try block and force return
            # if we hit an error.
            try:
                new = self.try_fetch(state)
            except Exception:
                new = []
            items.extend(new)
            # Cut out early if there was nothing returned
            if not new:
                break
        return items

    def try_fetch(self, state):
        # Check for whether a new page should be crawled
        if 'current_page' not in state:
            next_page = self.get_start_url()
        else:
            current = state['current_page']
            response = requests.get(current)
            soup = BeautifulSoup(response.text, features='html.parser')
            next_page = self.get_next_page_url(current, soup)
            if not next_page:
                return []  # nothing new

        # Download the new page
        logger.info(f'Fetching {next_page}')
        response = requests.get(next_page)
        soup = BeautifulSoup(response.text, features='html.parser')

        # Create an item from the page
        item = self.make_item(next_page, soup)

        # Update the state and return the item
        state['current_page'] = next_page
        return [item]

    def max_iterations(self):
        return 3

    def get_start_url(self):
        raise NotImplementedError('get_start_url is required')

    def get_next_page_url(self, url, soup):
        raise NotImplementedError('get_next_page_url is required')

    def make_item(self, url, soup):
        raise NotImplementedError('make_item is required')
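For reference, a source built on this engine only has to supply the three abstract methods. The following is a minimal sketch of a hypothetical webcomic source; the site URL, the CSS selectors, and the item fields are illustrative assumptions, not part of this commit:

# Hypothetical usage sketch (not part of this commit). Assumes a comic at
# comic.example.com whose next-page link uses rel="next" and whose comic
# image is an <img id="comic"> tag; the item fields are illustrative guesses.
import os

from inquisitor.templates import cache_image, LinearCrawler


class ExampleComicCrawler(LinearCrawler):
    def get_start_url(self):
        # The first page to crawl when no 'current_page' is in the state
        return 'https://comic.example.com/page/1'

    def get_next_page_url(self, url, soup):
        # Returning a falsy value makes try_fetch report "nothing new"
        link = soup.find('a', rel='next')
        return link['href'] if link else None

    def make_item(self, url, soup):
        # Cache the comic image and point the item body at the cached copy
        img = soup.find('img', id='comic')
        filename = os.path.basename(img['src'])
        cached = cache_image('examplecomic', img['src'], filename)
        return {
            'source': 'examplecomic',
            'id': url,
            'title': soup.title.get_text() if soup.title else url,
            'body': f'<img src="{cached}">',
        }

Because fetch_new sleeps one second between pages and processes at most max_iterations() pages per run, a subclass can override max_iterations to tune how quickly it works through an archive.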
@@ -1,6 +1,15 @@
beautifulsoup4==4.9.1
bs4==0.0.1
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
Flask==1.1.2
idna==2.10
itsdangerous==1.1.0
Jinja2==2.11.2
MarkupSafe==1.1.1
pkg-resources==0.0.0
requests==2.24.0
soupsieve==2.0.1
urllib3==1.25.10
Werkzeug==1.0.1