Test sources in progress

2020-12-28 11:39:30 -08:00 · 2020-12-28 11:39:30 -08:00 · 377aabba55
commit 377aabba55
parent 933596d25c
3 changed files with 313 additions and 0 deletions
--- a/sources/falseknees.py
+++ b/sources/falseknees.py
@ -0,0 +1,84 @@
 """
 Crawls False Knees comic pages and builds one item per page, caching the page's comic images.
 """
 # Standard library imports
 from datetime import datetime
 import logging
 import os
 import random
 import subprocess
 from time import sleep
 from inquisitor import LinearCrawler, cache_image, CACHE_PATH
 # Module-level logger for the falseknees source
 logger = logging.getLogger('inquisitor.sources.falseknees')
 class FalseKnees(LinearCrawler):
 	"""
 	Linear crawler for the False Knees webcomic.
 	Starts at the first comic page, follows each page's 'comic-nav-next' link,
 	and builds one item per page with the page's comic images cached locally.
 	"""
 	def get_start_url(self):
 		"""Return the URL of the page the crawl starts from."""
 		return "https://falseknees.com/1.html"
 	def get_next_page_url(self, url, soup):
 		"""Return the href of the page's 'comic-nav-next' link, or None if there is no such link."""
 		nav = soup.find('a', class_='comic-nav-next')
 		return nav.get('href') if nav else None
 	def _chapter_num(self, soup):
 		"""Return the text of the link inside the page's 'comic-chapter' div."""
 		return soup.find('div', class_='comic-chapter').find('a').text
 	def _cache_filename(self, chapter_num, img):
 		"""Return the cache filename for an img tag: the chapter label padded to two characters, a dash, then the image's basename."""
 		image_name = os.path.split(img['src'])[1]
 		return f'{chapter_num:0>2}-{image_name}'
 	def make_item(self, url, soup):
 		"""
 		Build the item dict for one comic page.
 		Every <img> inside the 'comic' div is cached through cache_image and
 		embedded in the body, followed by its alt text (or its title text when
 		alt is missing or empty). The page's 'entry' div, if present, is
 		appended after a horizontal rule. The cache-relative path of every
 		cached image is recorded under item['callback']['cached'].
 		"""
 		# Basic fields
 		item = {
 			# This module is the falseknees source; 'ksbd' was a copy-paste
 			# leftover from the sibling ksbd module.
 			'source': 'falseknees',
 			'id': url.rstrip('/').rsplit('/', 1)[1],
 			'title': soup.find('h2', class_='post-title').text,
 			'link': url,
 			'tags': ['falseknees'],
 			'callback': {'cached': []}
 		}
 		parts = []
 		# Cache the imgs
 		imgs = soup.find('div', id='comic').find_all('img')
 		# The chapter label is the same for every image on the page, so it is
 		# looked up once. It is skipped when there are no images so that such a
 		# page does not need a 'comic-chapter' div to produce an item.
 		chapter_num = self._chapter_num(soup) if imgs else None
 		for img in imgs:
 			filename = self._cache_filename(chapter_num, img)
 			cached_url = cache_image(__name__, img['src'], filename)
 			item['callback']['cached'].append(
 				os.path.join(__name__, filename))
 			parts.append(f'<p><img src="{cached_url}"></p>')
 			# Fall back to the title when alt is absent OR present but empty
 			alt_text = img.get('alt') or img.get('title')
 			if alt_text:
 				parts.append(f'<p>{alt_text}</p>')
 		body_text = soup.find('div', class_='entry')
 		if body_text:
 			parts.append('<hr>' + str(body_text))
 		item['body'] = ''.join(parts)
 		return item
 	def get_callback(self, url, soup):
 		"""
 		Return the callback dict for a page without caching anything.
 		The paths are dead reckoned with the same filename scheme make_item
 		uses, so they match the files make_item actually caches.
 		"""
 		imgs = soup.find('div', id='comic').find_all('img')
 		if not imgs:
 			return {'cached': []}
 		chapter_num = self._chapter_num(soup)
 		return {'cached': [
 			os.path.join(__name__, self._cache_filename(chapter_num, img))
 			for img in imgs]}
 	def max_iterations(self):
 		"""Return the iteration limit for this crawler (2)."""
 		return 2
 def fetch_new(state):
 	"""Run the FalseKnees crawler against the given state and return what its fetch_new returns."""
 	# The crawler class in this module is FalseKnees; ThisWebcomic only exists
 	# in the ksbd module and would raise NameError here.
 	return FalseKnees().fetch_new(state)
 def callback(state, item):
 	"""
 	Copy the item's cached images to the remote host and delete the local copies.
 	Each entry of item['callback']['cached'] is resolved against CACHE_PATH and
 	sent with scp. A local file is removed only when scp exits with status 0;
 	on failure the file is kept and the failure is logged, so that removal
 	really does signify a successful transfer.
 	"""
 	remote_target = 'tvb@10.7.3.16:/nas/image/'
 	for filename in item['callback']['cached']:
 		# Get the path to the image in the cache
 		path = os.path.join(CACHE_PATH, filename)
 		# scp the image to catacomb
 		result = subprocess.run(['scp', path, remote_target])
 		if result.returncode != 0:
 			# Keep the local copy: deleting it here would lose the image
 			logger.error(
 				'scp of %s to %s failed with exit code %d',
 				path, remote_target, result.returncode)
 			continue
 		# remove the image to signify success
 		os.remove(path)
--- a/sources/ksbd.py
+++ b/sources/ksbd.py
@ -0,0 +1,84 @@
 """
 Crawls Kill Six Billion Demons comic pages and builds one item per page, caching the page's comic images.
 """
 # Standard library imports
 from datetime import datetime
 import logging
 import os
 import random
 import subprocess
 from time import sleep
 from inquisitor import LinearCrawler, cache_image, CACHE_PATH
 # Module-level logger for the ksbd source
 logger = logging.getLogger('inquisitor.sources.ksbd')
 class ThisWebcomic(LinearCrawler):
 	"""
 	Linear crawler for the Kill Six Billion Demons webcomic.
 	Starts at a fixed comic page, follows each page's 'comic-nav-next' link,
 	and builds one item per page with the page's comic images cached locally.
 	"""
 	def get_start_url(self):
 		"""Return the URL of the page the crawl starts from."""
 		return "https://killsixbilliondemons.com/comic/king-of-swords-10-156/"
 	def get_next_page_url(self, url, soup):
 		"""Return the href of the page's 'comic-nav-next' link, or None if there is no such link."""
 		nav = soup.find('a', class_='comic-nav-next')
 		return nav.get('href') if nav else None
 	def _chapter_num(self, soup):
 		"""Return the text of the link inside the page's 'comic-chapter' div."""
 		return soup.find('div', class_='comic-chapter').find('a').text
 	def _cache_filename(self, chapter_num, img):
 		"""Return the cache filename for an img tag: the chapter label padded to two characters, a dash, then the image's basename."""
 		image_name = os.path.split(img['src'])[1]
 		return f'{chapter_num:0>2}-{image_name}'
 	def make_item(self, url, soup):
 		"""
 		Build the item dict for one comic page.
 		Every <img> inside the 'comic' div is cached through cache_image and
 		embedded in the body, followed by its alt text (or its title text when
 		alt is missing or empty). The page's 'entry' div, if present, is
 		appended after a horizontal rule. The cache-relative path of every
 		cached image is recorded under item['callback']['cached'].
 		"""
 		# Basic fields
 		item = {
 			'source': 'ksbd',
 			'id': url.rstrip('/').rsplit('/', 1)[1],
 			'title': soup.find('h2', class_='post-title').text,
 			'link': url,
 			'tags': ['ksbd'],
 			'callback': {'cached': []}
 		}
 		parts = []
 		# Cache the imgs
 		imgs = soup.find('div', id='comic').find_all('img')
 		# The chapter label is the same for every image on the page, so it is
 		# looked up once. It is skipped when there are no images so that such a
 		# page does not need a 'comic-chapter' div to produce an item.
 		chapter_num = self._chapter_num(soup) if imgs else None
 		for img in imgs:
 			filename = self._cache_filename(chapter_num, img)
 			cached_url = cache_image(__name__, img['src'], filename)
 			item['callback']['cached'].append(
 				os.path.join(__name__, filename))
 			parts.append(f'<p><img src="{cached_url}"></p>')
 			# Fall back to the title when alt is absent OR present but empty
 			alt_text = img.get('alt') or img.get('title')
 			if alt_text:
 				parts.append(f'<p>{alt_text}</p>')
 		body_text = soup.find('div', class_='entry')
 		if body_text:
 			parts.append('<hr>' + str(body_text))
 		item['body'] = ''.join(parts)
 		return item
 	def get_callback(self, url, soup):
 		"""
 		Return the callback dict for a page without caching anything.
 		The paths are dead reckoned with the same filename scheme make_item
 		uses, so they match the files make_item actually caches.
 		"""
 		imgs = soup.find('div', id='comic').find_all('img')
 		if not imgs:
 			return {'cached': []}
 		chapter_num = self._chapter_num(soup)
 		return {'cached': [
 			os.path.join(__name__, self._cache_filename(chapter_num, img))
 			for img in imgs]}
 	def max_iterations(self):
 		"""Return the iteration limit for this crawler (2)."""
 		return 2
 def fetch_new(state):
 	"""Run a fresh ThisWebcomic crawler against the given state and return what its fetch_new returns."""
 	crawler = ThisWebcomic()
 	return crawler.fetch_new(state)
 def callback(state, item):
 	"""
 	Copy the item's cached images to the remote host and delete the local copies.
 	Each entry of item['callback']['cached'] is resolved against CACHE_PATH and
 	sent with scp. A local file is removed only when scp exits with status 0;
 	on failure the file is kept and the failure is logged, so that removal
 	really does signify a successful transfer.
 	"""
 	remote_target = 'tvb@10.7.3.16:/nas/image/'
 	for filename in item['callback']['cached']:
 		# Get the path to the image in the cache
 		path = os.path.join(CACHE_PATH, filename)
 		# scp the image to catacomb
 		result = subprocess.run(['scp', path, remote_target])
 		if result.returncode != 0:
 			# Keep the local copy: deleting it here would lose the image
 			logger.error(
 				'scp of %s to %s failed with exit code %d',
 				path, remote_target, result.returncode)
 			continue
 		# remove the image to signify success
 		os.remove(path)
--- a/sources/redditExample.py
+++ b/sources/redditExample.py
@ -0,0 +1,145 @@
 """
 """
 # Standard library imports
 import datetime
 import hashlib
 import logging
 import time
 import traceback
 from inquisitor import RedditScraper
 # Third party imports
 import praw
 # Globals
 # praw client built at import time for the "Inquisitor" site name.
 # NOTE(review): fetch_new() in this module builds its own praw.Reddit
 # instance, so this one appears unused in the visible code - confirm before removing.
 reddit = praw.Reddit("Inquisitor")
 # NOTE(review): the webcomic sources log under "inquisitor.sources.*" (plural);
 # confirm the singular "source" here is intended.
 logger = logging.getLogger("inquisitor.source.reddit")
 def hours_old(post):
 	"""Return how many hours have elapsed since post.created_utc, as a float."""
 	elapsed_seconds = time.time() - post.created_utc
 	return elapsed_seconds / 3600
 # class rAnime(RedditScraper):
 # 	subreddit_name = 'anime'
 # 	def filter_post(self, post):
 # 		dead_horse = any([
 # 			'kimetsu no yaiba' in post.title.lower(),
 # 			'demon slayer' in post.title.lower(),
 # 		])
 # 		return post.score > 40 and not dead_horse
 # 	def get_tags(self, post):
 # 		return super().get_tags(post) + ['weebery']
 # class rEdh(RedditScraper):
 # 	subreddit_name = 'EDH'
 # 	def filter_post(self, post):
 # 		return post.score > 30
 # 	def get_tags(self, post):
 # 		return super().get_tags(post) + ['mtg']
 # class rMagicTcg(RedditScraper):
 # 	subreddit_name = 'magicTCG'
 # 	def filter_post(self, post):
 # 		return post.score > 40 and post.link_flair_text not in [
 # 			'Altered Cards', 'Arts and Crafts']
 # 	def get_tags(self, post):
 # 		return super().get_tags(post) + ['mtg']
 class rTf2(RedditScraper):
 	"""Scraper for the 'tf2' subreddit that reads the month's top posts."""
 	source = __name__
 	subreddit_name = 'tf2'
 	def subreddit_page(self, subreddit):
 		"""Return the subreddit's top listing for the month, limited to 25 posts."""
 		listing = subreddit.top(time_filter="month", limit=25)
 		return listing
 	def get_ttl(self, post):
 		"""Return 32 days expressed in seconds, regardless of the post."""
 		seconds_per_day = 60 * 60 * 24
 		return seconds_per_day * 32
 # class rTouhou(RedditScraper):
 # 	subreddit_name = 'touhou'
 # 	def subreddit_page(self, subreddit):
 # 		return subreddit.top(time_filter='month', limit=25)
 # 	def get_ttl(self, post):
 # 		return 60 * 60 * 24 * 32
 # class rHomestuck(RedditScraper):
 # 	subreddit_name = 'homestuck'
 # 	def subreddit_page(self, subreddit):
 # 		return subreddit.top(time_filter='month', limit=25)
 # 	def get_ttl(self, post):
 # 		return 60 * 60 * 24 * 32
 # class rSubSimGpt2(RedditScraper):
 # 	subreddit_name = 'SubSimulatorGPT2'
 # 	def subreddit_page(self, subreddit):
 # 		return subreddit.top(time_filter='month', limit=25)
 # 	def filter_post(self, post):
 # 		return not post.over_18
 # 	def get_title(self, post):
 # 		return f"/r/{post.author.name[:post.author.name.find('G')]}: {post.title}"
 # 	def get_link(self, post):
 # 		return None if post.is_self else super().get_link(post)
 # class rDebateReligion(RedditScraper):
 # 	subreddit_name = 'DebateReligion'
 # 	def subreddit_page(self, subreddit):
 # 		return subreddit.new(limit=25)
 # 	def filter_post(self, post):
 # 		return post.author and post.author.name == 'AutoModerator' and hours_old(post) < 28
 # 	def get_body(self, post):
 # 		return None
 # 	def get_ttd(self, post):
 # 		return 60 * 60 * 28
 # class rDndGreentext(RedditScraper):
 # 	subreddit_name = 'DnDGreentext'
 # 	def filter_post(self, post):
 # 		return post.score > 800
 # 	def get_body(self, post):
 # 		if post.selftext:
 # 			preview_body = post.selftext.replace("\n", "<br>")
 # 			return f'<p>{preview_body}</p>'
 # 		return super().get_body(post)
 def fetch_new(state):
 	"""Create a praw client for the 'Inquisitor' site and delegate to RedditScraper.fetch_new for this module."""
 	client = praw.Reddit('Inquisitor')
 	return RedditScraper.fetch_new(state, __name__, client)
 """
 def wp_filter(post):
 	forbidden = ['superpower', 'superhero', 'supervillain', 'immortal', 'alien', 'satan', 'galaxy', 'galactic']
 	title = post.title.lower()
 	return not any([keyword in title for keyword in forbidden])
 SUBREDDITS = {
 	'WritingPrompts': {
 		'lambda_get': lambda subreddit: subreddit.top(time_filter="day", limit=25),
 		'lambda_filter': wp_filter,
 		'title': lambda title, post: title + " [+{}]".format(post.score)
 	},
 	"madeinabyss": {
 		'lambda_get': lambda subreddit: subreddit.top(time_filter="week", limit=25),
 		'lambda_filter': lambda post: any([s in post.title.lower() for s in ['movie', 'theat', 'us', 'deep', 'soul', 'dawn', '3']]),
 		'tags': lambda tags, post: tags + ['weebery']
 	},
 	"magicthecirclejerking": {
 		'lambda_get': lambda subreddit: subreddit.top(time_filter="week", limit=50),
 		'lambda_filter': lambda post: True,
 		'tags': lambda tags, post: [t for t in tags + ['mtg', "magicTGCJ"] if t != "magicthecirclejerking"],
 	}
 }
 """