diff --git a/inquisitor/__init__.py b/inquisitor/__init__.py
index e69de29..bca6ead 100644
--- a/inquisitor/__init__.py
+++ b/inquisitor/__init__.py
@@ -0,0 +1,2 @@
+from inquisitor.configs import CACHE_PATH
+from inquisitor.templates import cache_image, LinearCrawler
\ No newline at end of file
diff --git a/inquisitor/templates.py b/inquisitor/templates.py
new file mode 100644
index 0000000..b902cb8
--- /dev/null
+++ b/inquisitor/templates.py
@@ -0,0 +1,99 @@
+"""
+Reusable templates for inquisitor sources: image caching and a linear crawler base class.
+"""
+# Standard library imports
+from datetime import datetime
+import logging
+import os
+import random
+from time import sleep
+
+# Third-party library imports
+from bs4 import BeautifulSoup
+import requests
+
+# Module imports
+from inquisitor import CACHE_PATH
+
+logger = logging.getLogger('inquisitor.templates')
+
+
+def cache_image(source, url, filename):
+    # Define some paths
+    path = os.path.join(CACHE_PATH, source)
+    file_path = os.path.join(path, filename)
+    cached_url = f'/cache/{source}/{filename}'
+    # Ensure the cache folder for this source exists
+    if not os.path.isdir(path):
+        os.mkdir(path)
+    # Fetch the url
+    logger.info(f'Caching {url} to {file_path}')
+    response = requests.get(url)
+    # Write the file to disk
+    with open(file_path, 'wb') as f:
+        f.write(response.content)
+    # Return the inquisitor path to the file
+    return cached_url
+
+
+class LinearCrawler:
+    """
+    An engine for generating items from web sources that link content
+    together in a linear fashion, such as webcomics.
+    """
+    def fetch_new(self, state):
+        items = []
+        max_iter = self.max_iterations() - 1
+        new = self.try_fetch(state)
+        items.extend(new)
+        for _ in range(max_iter):
+            sleep(1)
+            # If we've already gotten some items out of this fetch, we don't
+            # want to lose them and have the state still be set to the next
+            # page, so we wrap further calls in a try block and bail out of
+            # the loop if we hit an error.
+            try:
+                new = self.try_fetch(state)
+            except Exception:
+                new = []
+            items.extend(new)
+            # Cut out early if there was nothing returned
+            if not new:
+                break
+        return items
+
+    def try_fetch(self, state):
+        # Check whether a new page should be crawled
+        if 'current_page' not in state:
+            next_page = self.get_start_url()
+        else:
+            current = state['current_page']
+            response = requests.get(current)
+            soup = BeautifulSoup(response.text, features='html.parser')
+            next_page = self.get_next_page_url(current, soup)
+            if not next_page:
+                return []  # nothing new
+
+        # Download the new page
+        logger.info('Fetching ' + next_page)
+        response = requests.get(next_page)
+        soup = BeautifulSoup(response.text, features='html.parser')
+
+        # Create an item from the page
+        item = self.make_item(next_page, soup)
+
+        # Update the state and return the item
+        state['current_page'] = next_page
+        return [item]
+
+    def max_iterations(self):
+        return 3
+
+    def get_start_url(self):
+        raise NotImplementedError('get_start_url is required')
+
+    def get_next_page_url(self, url, soup):
+        raise NotImplementedError('get_next_page_url is required')
+
+    def make_item(self, url, soup):
+        raise NotImplementedError('make_item is required')
diff --git a/requirements.txt b/requirements.txt
index 139affa..0aeefc7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,15 @@
+beautifulsoup4==4.9.1
+bs4==0.0.1
+certifi==2020.6.20
+chardet==3.0.4
 click==7.1.2
 Flask==1.1.2
+idna==2.10
 itsdangerous==1.1.0
 Jinja2==2.11.2
 MarkupSafe==1.1.1
+pkg-resources==0.0.0
+requests==2.24.0
+soupsieve==2.0.1
+urllib3==1.25.10
 Werkzeug==1.0.1
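
Usage sketch: a minimal, hypothetical subclass showing how the three LinearCrawler hooks and cache_image fit together. The comic name, URLs, tag selectors, and the shape of the returned item dict below are illustrative assumptions, not part of the inquisitor API; adapt them to the actual source and item schema.

    # Hypothetical LinearCrawler subclass for an example webcomic.
    from inquisitor.templates import LinearCrawler, cache_image

    class ExampleComicCrawler(LinearCrawler):
        def get_start_url(self):
            # Assumed first-page URL for the example comic.
            return 'https://example.com/comic/1'

        def get_next_page_url(self, url, soup):
            # Assumed "next" link; returning None stops the crawl.
            link = soup.find('a', rel='next')
            return link['href'] if link else None

        def make_item(self, url, soup):
            # Illustrative item shape only; cache the comic image and
            # embed the cached copy in the item body.
            img = soup.find('img', id='comic')
            cached = cache_image('examplecomic', img['src'], img['src'].split('/')[-1])
            return {
                'source': 'examplecomic',
                'id': url,
                'title': soup.title.string if soup.title else url,
                'link': url,
                'body': f'<img src="{cached}">',
            }

With the defaults above, fetch_new makes at most max_iterations() attempts per run, sleeping one second between pages and stopping as soon as try_fetch returns nothing new.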