Test sources in progress

2020-12-28 11:39:30 -08:00 · 2020-12-28 11:39:30 -08:00 · 377aabba55
parent 933596d25c
commit 377aabba55
3 changed files with 313 additions and 0 deletions
--- a/sources/falseknees.py
+++ b/sources/falseknees.py
@ -0,0 +1,84 @@
+"""
+Crawls the False Knees webcomic (https://falseknees.com) and caches its images.
+"""
+# Standard library imports
+from datetime import datetime
+import logging
+import os
+import random
+import subprocess
+from time import sleep
+
+from inquisitor import LinearCrawler, cache_image, CACHE_PATH
+
+logger = logging.getLogger('inquisitor.sources.falseknees')
+
+
+class FalseKnees(LinearCrawler):
+	"""
+	LinearCrawler hooks for the False Knees webcomic.
+
+	Supplies the start URL, the next-page lookup and the item construction
+	used by the LinearCrawler base class.
+
+	NOTE(review): the CSS selectors below are identical to the ones in the
+	ksbd source; confirm they match the markup served by falseknees.com.
+	"""
+
+	def get_start_url(self):
+		"""Return the URL of the first page to crawl."""
+		return "https://falseknees.com/1.html"
+
+	def get_next_page_url(self, url, soup):
+		"""Return the href of the 'comic-nav-next' link, or None if absent."""
+		nav = soup.find('a', class_='comic-nav-next')
+		return nav.get('href') if nav else None
+
+	def make_item(self, url, soup):
+		"""
+		Build the item dict for one comic page.
+
+		Each image inside the 'comic' div is cached through cache_image, and
+		its cache-relative path is recorded under item['callback']['cached'].
+		The body embeds every cached image, followed by its alt/title text
+		when present and, if the page has one, the 'entry' div.
+
+		url: address of the page; its last path segment becomes the item id.
+		soup: parsed page (find/find_all interface).
+		Returns the item dict.
+		"""
+		# Basic fields
+		item = {
+			# This module is the falseknees source, not ksbd.
+			'source': 'falseknees',
+			'id': url.rstrip('/').rsplit('/', 1)[1],
+			'title': soup.find('h2', class_='post-title').text,
+			'link': url,
+			'tags': ['falseknees'],
+			'callback': {'cached': []}
+		}
+
+		# Cache the imgs
+		imgs = soup.find('div', id='comic').find_all('img')
+		# Every image on a page shares one chapter number, so look it up once,
+		# and only when there is an image that needs it.
+		chapter_num = (
+			soup.find('div', class_='comic-chapter').find('a').text
+			if imgs else None)
+		parts = []
+		for img in imgs:
+			image_name = os.path.split(img['src'])[1]
+			filename = f'{chapter_num:0>2}-{image_name}'
+			cached_url = cache_image(__name__, img['src'], filename)
+			item['callback']['cached'].append(
+				os.path.join(__name__, filename))
+			parts.append(f'<p><img src="{cached_url}"></p>')
+			# `or` also falls back to the title when alt exists but is empty
+			alt_text = img.get('alt') or img.get('title')
+			if alt_text:
+				parts.append(f'<p>{alt_text}</p>')
+		body_text = soup.find('div', class_='entry')
+		if body_text:
+			parts.append('<hr>' + str(body_text))
+		item['body'] = ''.join(parts)
+
+		return item
+
+	def get_callback(self, url, soup):
+		"""
+		Rebuild the callback payload from a page without caching anything.
+
+		NOTE(review): these names omit the chapter prefix that make_item adds
+		to the cached filenames, so the two lists will not match; confirm
+		which naming is intended.
+		"""
+		imgs = soup.find('div', id='comic').find_all('img')
+		return {
+			'cached': [
+				os.path.join(__name__, os.path.split(img['src'])[1])
+				for img in imgs
+			]
+		}
+
+	def max_iterations(self):
+		"""Return 2; presumably the per-run page limit used by LinearCrawler."""
+		return 2
+
+
+def fetch_new(state):
+	"""Run the FalseKnees crawler against `state` and return its result."""
+	# FalseKnees is the crawler class defined in this module; the name
+	# ThisWebcomic does not exist here and raised NameError.
+	return FalseKnees().fetch_new(state)
+
+
+def callback(state, item):
+	"""
+	Copy each cached image of `item` to the remote host, then delete it locally.
+
+	The local copy is removed only when scp exits with status 0, so a failed
+	transfer leaves the file in the cache instead of discarding it.
+
+	state: supplied by the caller; not used here.
+	item: item dict whose item['callback']['cached'] lists paths relative to
+		CACHE_PATH.
+	"""
+	remote_target = 'tvb@10.7.3.16:/nas/image/'
+	for filename in item['callback']['cached']:
+		# Get the path to the image in the cache
+		path = os.path.join(CACHE_PATH, filename)
+		# scp the image to catacomb
+		result = subprocess.run(['scp', path, remote_target])
+		if result.returncode != 0:
+			logger.error(
+				'scp of %s failed with exit code %d; keeping local copy',
+				path, result.returncode)
+			continue
+		# remove the image to signify success
+		os.remove(path)
--- a/sources/ksbd.py
+++ b/sources/ksbd.py
@ -0,0 +1,84 @@
+"""
+Crawls the Kill Six Billion Demons webcomic and caches its images.
+"""
+# Standard library imports
+from datetime import datetime
+import logging
+import os
+import random
+import subprocess
+from time import sleep
+
+from inquisitor import LinearCrawler, cache_image, CACHE_PATH
+
+logger = logging.getLogger('inquisitor.sources.ksbd')
+
+
+class ThisWebcomic(LinearCrawler):
+	"""
+	LinearCrawler hooks for the Kill Six Billion Demons webcomic.
+
+	Supplies the start URL, the next-page lookup and the item construction
+	used by the LinearCrawler base class.
+	"""
+
+	def get_start_url(self):
+		"""Return the URL of the page the crawl starts from."""
+		return "https://killsixbilliondemons.com/comic/king-of-swords-10-156/"
+
+	def get_next_page_url(self, url, soup):
+		"""Return the href of the 'comic-nav-next' link, or None if absent."""
+		nav = soup.find('a', class_='comic-nav-next')
+		return nav.get('href') if nav else None
+
+	def make_item(self, url, soup):
+		"""
+		Build the item dict for one comic page.
+
+		Each image inside the 'comic' div is cached through cache_image, and
+		its cache-relative path is recorded under item['callback']['cached'].
+		The body embeds every cached image, followed by its alt/title text
+		when present and, if the page has one, the 'entry' div.
+
+		url: address of the page; its last path segment becomes the item id.
+		soup: parsed page (find/find_all interface).
+		Returns the item dict.
+		"""
+		# Basic fields
+		item = {
+			'source': 'ksbd',
+			'id': url.rstrip('/').rsplit('/', 1)[1],
+			'title': soup.find('h2', class_='post-title').text,
+			'link': url,
+			'tags': ['ksbd'],
+			'callback': {'cached': []}
+		}
+
+		# Cache the imgs
+		imgs = soup.find('div', id='comic').find_all('img')
+		# Every image on a page shares one chapter number, so look it up once,
+		# and only when there is an image that needs it.
+		chapter_num = (
+			soup.find('div', class_='comic-chapter').find('a').text
+			if imgs else None)
+		parts = []
+		for img in imgs:
+			image_name = os.path.split(img['src'])[1]
+			filename = f'{chapter_num:0>2}-{image_name}'
+			cached_url = cache_image(__name__, img['src'], filename)
+			item['callback']['cached'].append(
+				os.path.join(__name__, filename))
+			parts.append(f'<p><img src="{cached_url}"></p>')
+			# `or` also falls back to the title when alt exists but is empty
+			alt_text = img.get('alt') or img.get('title')
+			if alt_text:
+				parts.append(f'<p>{alt_text}</p>')
+		body_text = soup.find('div', class_='entry')
+		if body_text:
+			parts.append('<hr>' + str(body_text))
+		item['body'] = ''.join(parts)
+
+		return item
+
+	def get_callback(self, url, soup):
+		"""
+		Rebuild the callback payload from a page without caching anything.
+
+		NOTE(review): these names omit the chapter prefix that make_item adds
+		to the cached filenames, so the two lists will not match; confirm
+		which naming is intended.
+		"""
+		imgs = soup.find('div', id='comic').find_all('img')
+		return {
+			'cached': [
+				os.path.join(__name__, os.path.split(img['src'])[1])
+				for img in imgs
+			]
+		}
+
+	def max_iterations(self):
+		"""Return 2; presumably the per-run page limit used by LinearCrawler."""
+		return 2
+
+
+def fetch_new(state):
+	"""Run the ThisWebcomic crawler against `state` and return its result."""
+	crawler = ThisWebcomic()
+	return crawler.fetch_new(state)
+
+
+def callback(state, item):
+	"""
+	Copy each cached image of `item` to the remote host, then delete it locally.
+
+	The local copy is removed only when scp exits with status 0, so a failed
+	transfer leaves the file in the cache instead of discarding it.
+
+	state: supplied by the caller; not used here.
+	item: item dict whose item['callback']['cached'] lists paths relative to
+		CACHE_PATH.
+	"""
+	remote_target = 'tvb@10.7.3.16:/nas/image/'
+	for filename in item['callback']['cached']:
+		# Get the path to the image in the cache
+		path = os.path.join(CACHE_PATH, filename)
+		# scp the image to catacomb
+		result = subprocess.run(['scp', path, remote_target])
+		if result.returncode != 0:
+			logger.error(
+				'scp of %s failed with exit code %d; keeping local copy',
+				path, result.returncode)
+			continue
+		# remove the image to signify success
+		os.remove(path)
--- a/sources/redditExample.py
+++ b/sources/redditExample.py
@ -0,0 +1,145 @@
+"""
+"""
+
+# Standard library imports
+import datetime
+import hashlib
+import logging
+import time
+import traceback
+
+from inquisitor import RedditScraper
+
+# Third party imports
+import praw
+
+# Globals
+reddit = praw.Reddit("Inquisitor")
+logger = logging.getLogger("inquisitor.source.reddit")
+
+
+def hours_old(post):
+	"""Return how many hours have passed since `post.created_utc`."""
+	elapsed_seconds = time.time() - post.created_utc
+	return elapsed_seconds / 3600
+
+
+# class rAnime(RedditScraper):
+# 	subreddit_name = 'anime'
+# 	def filter_post(self, post):
+# 		dead_horse = any([
+# 			'kimetsu no yaiba' in post.title.lower(),
+# 			'demon slayer' in post.title.lower(),
+# 		])
+# 		return post.score > 40 and not dead_horse
+# 	def get_tags(self, post):
+# 		return super().get_tags(post) + ['weebery']
+
+
+# class rEdh(RedditScraper):
+# 	subreddit_name = 'EDH'
+# 	def filter_post(self, post):
+# 		return post.score > 30
+# 	def get_tags(self, post):
+# 		return super().get_tags(post) + ['mtg']
+
+
+# class rMagicTcg(RedditScraper):
+# 	subreddit_name = 'magicTCG'
+# 	def filter_post(self, post):
+# 		return post.score > 40 and post.link_flair_text not in [
+# 			'Altered Cards', 'Arts and Crafts']
+# 	def get_tags(self, post):
+# 		return super().get_tags(post) + ['mtg']
+
+
+class rTf2(RedditScraper):
+	"""Scraper for the 'tf2' subreddit, reading its top posts of the month."""
+	source = __name__
+	subreddit_name = 'tf2'
+
+	def subreddit_page(self, subreddit):
+		"""Return the listing to scrape: top of the month, limited to 25."""
+		listing = subreddit.top(time_filter="month", limit=25)
+		return listing
+
+	def get_ttl(self, post):
+		"""Return the TTL for `post`: 32 days' worth of seconds (2764800)."""
+		seconds_per_day = 24 * 60 * 60
+		return 32 * seconds_per_day
+
+
+# class rTouhou(RedditScraper):
+# 	subreddit_name = 'touhou'
+# 	def subreddit_page(self, subreddit):
+# 		return subreddit.top(time_filter='month', limit=25)
+# 	def get_ttl(self, post):
+# 		return 60 * 60 * 24 * 32
+
+
+# class rHomestuck(RedditScraper):
+# 	subreddit_name = 'homestuck'
+# 	def subreddit_page(self, subreddit):
+# 		return subreddit.top(time_filter='month', limit=25)
+# 	def get_ttl(self, post):
+# 		return 60 * 60 * 24 * 32
+
+
+# class rSubSimGpt2(RedditScraper):
+# 	subreddit_name = 'SubSimulatorGPT2'
+# 	def subreddit_page(self, subreddit):
+# 		return subreddit.top(time_filter='month', limit=25)
+# 	def filter_post(self, post):
+# 		return not post.over_18
+# 	def get_title(self, post):
+# 		return f"/r/{post.author.name[:post.author.name.find('G')]}: {post.title}"
+# 	def get_link(self, post):
+# 		return None if post.is_self else super().get_link(post)
+
+
+# class rDebateReligion(RedditScraper):
+# 	subreddit_name = 'DebateReligion'
+# 	def subreddit_page(self, subreddit):
+# 		return subreddit.new(limit=25)
+# 	def filter_post(self, post):
+# 		return post.author and post.author.name == 'AutoModerator' and hours_old(post) < 28
+# 	def get_body(self, post):
+# 		return None
+# 	def get_ttd(self, post):
+# 		return 60 * 60 * 28
+
+
+# class rDndGreentext(RedditScraper):
+# 	subreddit_name = 'DnDGreentext'
+# 	def filter_post(self, post):
+# 		return post.score > 800
+# 	def get_body(self, post):
+# 		if post.selftext:
+# 			preview_body = post.selftext.replace("\n", "<br>")
+# 			return f'<p>{preview_body}</p>'
+# 		return super().get_body(post)
+
+
+def fetch_new(state):
+	"""Build a praw client named 'Inquisitor' and fetch new items with it."""
+	client = praw.Reddit('Inquisitor')
+	return RedditScraper.fetch_new(state, __name__, client)
+
+
+"""
+
+
+def wp_filter(post):
+	forbidden = ['superpower', 'superhero', 'supervillain', 'immortal', 'alien', 'satan', 'galaxy', 'galactic']
+	title = post.title.lower()
+	return not any([keyword in title for keyword in forbidden])
+
+SUBREDDITS = {
+	'WritingPrompts': {
+		'lambda_get': lambda subreddit: subreddit.top(time_filter="day", limit=25),
+		'lambda_filter': wp_filter,
+		'title': lambda title, post: title + " [+{}]".format(post.score)
+	},
+	"madeinabyss": {
+		'lambda_get': lambda subreddit: subreddit.top(time_filter="week", limit=25),
+		'lambda_filter': lambda post: any([s in post.title.lower() for s in ['movie', 'theat', 'us', 'deep', 'soul', 'dawn', '3']]),
+		'tags': lambda tags, post: tags + ['weebery']
+	},
+	"magicthecirclejerking": {
+		'lambda_get': lambda subreddit: subreddit.top(time_filter="week", limit=50),
+		'lambda_filter': lambda post: True,
+		'tags': lambda tags, post: [t for t in tags + ['mtg', "magicTGCJ"] if t != "magicthecirclejerking"],
+	}
+}
+
+"""