diff --git a/sources/falseknees.py b/sources/falseknees.py
new file mode 100644
index 0000000..5566448
--- /dev/null
+++ b/sources/falseknees.py
@@ -0,0 +1,84 @@
+"""
+Crawls the False Knees webcomic page by page and caches each comic image.
+"""
+# Standard library imports
+from datetime import datetime
+import logging
+import os
+import random
+import subprocess
+from time import sleep
+
+from inquisitor import LinearCrawler, cache_image, CACHE_PATH
+
+logger = logging.getLogger('inquisitor.sources.falseknees')
+
+
+class FalseKnees(LinearCrawler):
+    """Linear crawler that builds one item per False Knees comic page."""
+
+    def get_start_url(self):
+        """Return the URL of the first page to crawl."""
+        return "https://falseknees.com/1.html"
+
+    def get_next_page_url(self, url, soup):
+        """Return the href of the page's 'comic-nav-next' link, or None."""
+        nav = soup.find('a', class_='comic-nav-next')
+        return nav.get('href') if nav else None
+
+    def make_item(self, url, soup):
+        """Build the item dict for one comic page and cache its images.
+
+        Every img element under the div with id 'comic' is passed to
+        cache_image, and its cache-relative path is appended to
+        item['callback']['cached'] so that callback() can ship and delete it.
+
+        Raises AttributeError if the expected page elements are missing
+        (soup.find returns None for them).
+        """
+        # Basic fields
+        item = {
+            # Was 'ksbd' (copied from the ksbd source); items of this module
+            # must carry this module's own name.
+            'source': 'falseknees',
+            'id': url.rstrip('/').rsplit('/', 1)[1],
+            'title': soup.find('h2', class_='post-title').text,
+            'link': url,
+            'tags': ['falseknees'],
+            'callback': {'cached': []}
+        }
+
+        body = ""
+        # Cache the imgs
+        imgs = soup.find('div', id='comic').find_all('img')
+        for img in imgs:
+            image_name = os.path.split(img['src'])[1]
+            chapter_num = soup.find('div', class_='comic-chapter').find('a').text
+            # Chapter text left-padded with '0' to width 2, then the file name
+            filename = f'{chapter_num:0>2}-{image_name}'
+            cached_url = cache_image(__name__, img['src'], filename)
+            item['callback']['cached'].append(
+                os.path.join(__name__, filename))
+            # NOTE(review): the three string literals below reach this review
+            # with what looks like stripped markup (only newlines remain, and
+            # cached_url is never used). They are kept exactly as received;
+            # restore them from the original file before merging.
+            body += f'

'
+            alt_text = img.get('alt', img.get('title'))
+            if alt_text:
+                body += f'

{alt_text}

'
+        body_text = soup.find('div', class_='entry')
+        if body_text:
+            body += '
' + str(body_text)
+        item['body'] = body
+
+        return item
+
+
+    def get_callback(self, url, soup):
+        """Rebuild the callback payload for a page from its img elements.
+
+        Returns a dict {'cached': [...]} of cache-relative file paths.
+        """
+        # Dead reckon the images from get_body
+        # NOTE(review): these names lack the chapter prefix that make_item
+        # puts on the files it caches -- confirm which naming is intended.
+        val = {'cached': []}
+        imgs = soup.find('div', id='comic').find_all('img')
+        for img in imgs:
+            filename = os.path.split(img['src'])[1]
+            cached_filename = os.path.join(__name__, filename)
+            val['cached'].append(cached_filename)
+        return val
+
+    def max_iterations(self):
+        """Return 2 (presumably the per-run page limit -- confirm against LinearCrawler)."""
+        return 2
+
+
+def fetch_new(state):
+    """Run the FalseKnees crawler against state and return its result."""
+    # Was ThisWebcomic(), a name that does not exist in this module.
+    return FalseKnees().fetch_new(state)
+
+
+def callback(state, item):
+    """Copy each cached image of item to the remote host, then delete it.
+
+    A local file is removed only after scp exits with status 0. On failure
+    the file is kept and the error is logged, so a removed file really does
+    signify success. state is accepted but not used.
+    """
+    for filename in item['callback']['cached']:
+        # Get the path to the image in the cache
+        path = os.path.join(CACHE_PATH, filename)
+        # scp the image to catacomb
+        remote_target = 'tvb@10.7.3.16:/nas/image/'
+        result = subprocess.run(['scp', path, remote_target])
+        if result.returncode != 0:
+            logger.error('scp of %s failed with exit code %s',
+                         path, result.returncode)
+            continue
+        # remove the image to signify success
+        os.remove(path)
diff --git a/sources/ksbd.py b/sources/ksbd.py
new file mode 100644
index 0000000..4a4730c
--- /dev/null
+++ b/sources/ksbd.py
@@ -0,0 +1,84 @@
+"""
+Crawls the Kill Six Billion Demons webcomic and caches each page's images.
+"""
+# Standard library imports
+from datetime import datetime
+import logging
+import os
+import random
+import subprocess
+from time import sleep
+
+from inquisitor import LinearCrawler, cache_image, CACHE_PATH
+
+logger = logging.getLogger('inquisitor.sources.ksbd')
+
+
+class ThisWebcomic(LinearCrawler):
+    """Linear crawler that builds one item per Kill Six Billion Demons page."""
+
+    def get_start_url(self):
+        """Return the URL of the first page to crawl."""
+        return "https://killsixbilliondemons.com/comic/king-of-swords-10-156/"
+
+    def get_next_page_url(self, url, soup):
+        """Return the href of the page's 'comic-nav-next' link, or None."""
+        nav = soup.find('a', class_='comic-nav-next')
+        return nav.get('href') if nav else None
+
+    def make_item(self, url, soup):
+        """Build the item dict for one comic page and cache its images.
+
+        Every img element under the div with id 'comic' is passed to
+        cache_image, and its cache-relative path is appended to
+        item['callback']['cached'] so that callback() can ship and delete it.
+
+        Raises AttributeError if the expected page elements are missing
+        (soup.find returns None for them).
+        """
+        # Basic fields
+        item = {
+            'source': 'ksbd',
+            'id': url.rstrip('/').rsplit('/', 1)[1],
+            'title': soup.find('h2', class_='post-title').text,
+            'link': url,
+            'tags': ['ksbd'],
+            'callback': {'cached': []}
+        }
+
+        body = ""
+        # Cache the imgs
+        imgs = soup.find('div', id='comic').find_all('img')
+        for img in imgs:
+            image_name = os.path.split(img['src'])[1]
+            chapter_num = soup.find('div', class_='comic-chapter').find('a').text
+            # Chapter text left-padded with '0' to width 2, then the file name
+            filename = f'{chapter_num:0>2}-{image_name}'
+            cached_url = cache_image(__name__, img['src'], filename)
+            item['callback']['cached'].append(
+                os.path.join(__name__, filename))
+            # NOTE(review): the three string literals below reach this review
+            # with what looks like stripped markup (only newlines remain, and
+            # cached_url is never used). They are kept exactly as received;
+            # restore them from the original file before merging.
+            body += f'

'
+            alt_text = img.get('alt', img.get('title'))
+            if alt_text:
+                body += f'

{alt_text}

'
+        body_text = soup.find('div', class_='entry')
+        if body_text:
+            body += '
' + str(body_text)
+        item['body'] = body
+
+        return item
+
+
+    def get_callback(self, url, soup):
+        """Rebuild the callback payload for a page from its img elements.
+
+        Returns a dict {'cached': [...]} of cache-relative file paths.
+        """
+        # Dead reckon the images from get_body
+        # NOTE(review): these names lack the chapter prefix that make_item
+        # puts on the files it caches -- confirm which naming is intended.
+        val = {'cached': []}
+        imgs = soup.find('div', id='comic').find_all('img')
+        for img in imgs:
+            filename = os.path.split(img['src'])[1]
+            cached_filename = os.path.join(__name__, filename)
+            val['cached'].append(cached_filename)
+        return val
+
+    def max_iterations(self):
+        """Return 2 (presumably the per-run page limit -- confirm against LinearCrawler)."""
+        return 2
+
+
+def fetch_new(state):
+    """Run the ThisWebcomic crawler against state and return its result."""
+    return ThisWebcomic().fetch_new(state)
+
+
+def callback(state, item):
+    """Copy each cached image of item to the remote host, then delete it.
+
+    A local file is removed only after scp exits with status 0. On failure
+    the file is kept and the error is logged, so a removed file really does
+    signify success. state is accepted but not used.
+    """
+    for filename in item['callback']['cached']:
+        # Get the path to the image in the cache
+        path = os.path.join(CACHE_PATH, filename)
+        # scp the image to catacomb
+        remote_target = 'tvb@10.7.3.16:/nas/image/'
+        result = subprocess.run(['scp', path, remote_target])
+        if result.returncode != 0:
+            logger.error('scp of %s failed with exit code %s',
+                         path, result.returncode)
+            continue
+        # remove the image to signify success
+        os.remove(path)
diff --git a/sources/redditExample.py b/sources/redditExample.py
new file mode 100644
index 0000000..355adf7
--- /dev/null
+++ b/sources/redditExample.py
@@ -0,0 +1,145 @@
+"""
+Example source built from RedditScraper subclasses, one per subreddit.
+"""
+
+# Standard library imports
+import datetime
+import hashlib
+import logging
+import time
+import traceback
+
+from inquisitor import RedditScraper
+
+# Third party imports
+import praw
+
+# Globals
+reddit = praw.Reddit("Inquisitor")
+logger = logging.getLogger("inquisitor.source.reddit")
+
+
+def hours_old(post):
+    """Return the age of post in hours, from its created_utc timestamp to now."""
+    return ((time.time() - post.created_utc) / 3600)
+
+
+# class rAnime(RedditScraper):
+#     subreddit_name = 'anime'
+#     def filter_post(self, post):
+#         dead_horse = any([
+#             'kimetsu no yaiba' in post.title.lower(),
+#             'demon slayer' in post.title.lower(),
+#         ])
+#         return post.score > 40 and not dead_horse
+#     def get_tags(self, post):
+#         return super().get_tags(post) + ['weebery']
+
+
+# class rEdh(RedditScraper):
+#     subreddit_name = 'EDH'
+#     def filter_post(self, post):
+#         return post.score > 30
+#     def get_tags(self, post):
+#         return super().get_tags(post) + ['mtg']
+
+
+# class rMagicTcg(RedditScraper):
+#     subreddit_name = 'magicTCG'
+#     def
filter_post(self, post):
+#         return post.score > 40 and post.link_flair_text not in [
+#             'Altered Cards', 'Arts and Crafts']
+#     def get_tags(self, post):
+#         return super().get_tags(post) + ['mtg']
+
+
+class rTf2(RedditScraper):
+    """Scraper for the 'tf2' subreddit: the top 25 posts of the past month."""
+    source = __name__
+    subreddit_name = 'tf2'
+    def subreddit_page(self, subreddit):
+        """Return the subreddit's top listing for the month, limited to 25."""
+        return subreddit.top(time_filter="month", limit=25)
+    def get_ttl(self, post):
+        """Return 32 days expressed in seconds (presumably the item TTL -- confirm units)."""
+        return 60 * 60 * 24 * 32
+
+
+# class rTouhou(RedditScraper):
+#     subreddit_name = 'touhou'
+#     def subreddit_page(self, subreddit):
+#         return subreddit.top(time_filter='month', limit=25)
+#     def get_ttl(self, post):
+#         return 60 * 60 * 24 * 32
+
+
+# class rHomestuck(RedditScraper):
+#     subreddit_name = 'homestuck'
+#     def subreddit_page(self, subreddit):
+#         return subreddit.top(time_filter='month', limit=25)
+#     def get_ttl(self, post):
+#         return 60 * 60 * 24 * 32
+
+
+# class rSubSimGpt2(RedditScraper):
+#     subreddit_name = 'SubSimulatorGPT2'
+#     def subreddit_page(self, subreddit):
+#         return subreddit.top(time_filter='month', limit=25)
+#     def filter_post(self, post):
+#         return not post.over_18
+#     def get_title(self, post):
+#         return f"/r/{post.author.name[:post.author.name.find('G')]}: {post.title}"
+#     def get_link(self, post):
+#         return None if post.is_self else super().get_link(post)
+
+
+# class rDebateReligion(RedditScraper):
+#     subreddit_name = 'DebateReligion'
+#     def subreddit_page(self, subreddit):
+#         return subreddit.new(limit=25)
+#     def filter_post(self, post):
+#         return post.author and post.author.name == 'AutoModerator' and hours_old(post) < 28
+#     def get_body(self, post):
+#         return None
+#     def get_ttd(self, post):
+#         return 60 * 60 * 28
+
+
+# class rDndGreentext(RedditScraper):
+#     subreddit_name = 'DnDGreentext'
+#     def filter_post(self, post):
+#         return post.score > 800
+#     def get_body(self, post):
+#         if post.selftext:
+#             preview_body = post.selftext.replace("\n", "
")
+#             return f'

{preview_body}

'
+#         return super().get_body(post)
+
+
+def fetch_new(state):
+    """Create a praw client for the 'Inquisitor' site and delegate to RedditScraper.fetch_new.
+
+    Passes this module's name as the source name.
+    """
+    # This local name shadows the module-level `reddit` client.
+    reddit = praw.Reddit('Inquisitor')
+    return RedditScraper.fetch_new(state, __name__, reddit)
+
+
+# Everything below is wrapped in a bare string literal, so it is never executed.
+"""
+
+
+def wp_filter(post):
+    forbidden = ['superpower', 'superhero', 'supervillain', 'immortal', 'alien', 'satan', 'galaxy', 'galactic']
+    title = post.title.lower()
+    return not any([keyword in title for keyword in forbidden])
+
+SUBREDDITS = {
+    'WritingPrompts': {
+        'lambda_get': lambda subreddit: subreddit.top(time_filter="day", limit=25),
+        'lambda_filter': wp_filter,
+        'title': lambda title, post: title + " [+{}]".format(post.score)
+    },
+    "madeinabyss": {
+        'lambda_get': lambda subreddit: subreddit.top(time_filter="week", limit=25),
+        'lambda_filter': lambda post: any([s in post.title.lower() for s in ['movie', 'theat', 'us', 'deep', 'soul', 'dawn', '3']]),
+        'tags': lambda tags, post: tags + ['weebery']
+    },
+    "magicthecirclejerking": {
+        'lambda_get': lambda subreddit: subreddit.top(time_filter="week", limit=50),
+        'lambda_filter': lambda post: True,
+        'tags': lambda tags, post: [t for t in tags + ['mtg', "magicTGCJ"] if t != "magicthecirclejerking"],
+    }
+}
+
+"""