# Reconstructed from a whitespace-collapsed copy of git patch
# 9e31bd74b135dd269fb0148db2e2690d3f16f6d7 ("Add reddit scraper to source
# templates", Tim Van Baak, Fri, 14 Aug 2020 11:12:49 -0700). The patch adds
# RedditScraper to inquisitor/templates.py and re-exports it from
# inquisitor/__init__.py alongside cache_image and LinearCrawler.

# Standard library imports added by the patch
import inspect
import sys


class RedditScraper:
    """
    An engine for generating items from subreddits.

    Subclasses must define a ``subreddit_name`` attribute (read by
    ``get_items``) and may override any of the ``get_*`` hooks,
    ``subreddit_page`` or ``filter_post`` to customise the generated items.

    NOTE(review): ``get_items`` relies on a module-level ``logger`` that is
    not part of this patch; it is presumably defined near the top of
    inquisitor/templates.py.
    """

    @staticmethod
    def fetch_new(state, name, reddit):
        """
        Collect items from every RedditScraper subclass found in a module.

        state  -- unused here; kept so the signature stays unchanged.
        name   -- key into ``sys.modules`` of the module to inspect.
        reddit -- client object handed to each scraper's constructor.

        Returns the concatenated ``get_items()`` results of all subclasses.
        Raises KeyError if ``name`` is not an imported module.
        """
        items = []
        # The loop variable is deliberately not called ``name`` so that it
        # does not shadow the module-name parameter.
        for _member_name, member in inspect.getmembers(sys.modules[name]):
            if (
                inspect.isclass(member)
                and issubclass(member, RedditScraper)
                and member is not RedditScraper
            ):
                items.extend(member(reddit).get_items())
        return items

    def __init__(self, reddit):
        """Store the reddit client used to look up subreddits."""
        self.reddit = reddit

    def get_items(self):
        """
        Fetch this scraper's subreddit page and convert the posts to items.

        Posts for which ``filter_post`` returns a falsy value are skipped.
        """
        sub_name = self.subreddit_name
        logger.info('Fetching posts from r/%s', sub_name)
        subreddit = self.reddit.subreddit(sub_name)
        return [
            self.item_from_post(post)
            for post in self.subreddit_page(subreddit)
            if self.filter_post(post)
        ]

    def item_from_post(self, post):
        """
        Build the item dict for a single post.

        The 'ttl', 'ttd', 'tts' and 'callback' keys are only present when
        the corresponding hook returns something other than None.
        """
        author = post.author.name if post.author else "[deleted]"
        item = {
            'source': 'reddit',
            'id': post.id,
            'title': self.get_title(post),
            'link': self.get_link(post),
            'time': post.created_utc,
            'author': '/u/' + author,
            'body': self.get_body(post),
            'tags': self.get_tags(post),
        }
        # Each hook is called exactly once; None means "omit this key".
        optional = {
            'ttl': self.get_ttl(post),
            'ttd': self.get_ttd(post),
            'tts': self.get_tts(post),
            'callback': self.get_callback(post),
        }
        item.update({k: v for k, v in optional.items() if v is not None})
        return item

    def subreddit_page(self, subreddit):
        """Return the posts to scrape; by default the 25 hottest."""
        return subreddit.hot(limit=25)

    def filter_post(self, post):
        """Return True to keep a post; the default keeps every post."""
        return True

    def get_title(self, post):
        """Return the item title, prefixed with spoiler/NSFW markers."""
        s = '[S] ' if post.spoiler else ''
        nsfw = '[NSFW] ' if post.over_18 else ''
        return f'{s}{nsfw}/{post.subreddit_name_prefixed}: {post.title}'

    def get_link(self, post):
        """Return the absolute reddit.com URL of the post."""
        return f'https://reddit.com{post.permalink}'

    def get_body(self, post):
        """
        Return the item body assembled from the post.

        The body consists of a link-post marker, a preview entry and up to
        roughly 1024 characters of self text (cut at the next space and
        suffixed with '[...]' when truncated).

        NOTE(review): the string literals in this method look like their
        markup was stripped in this copy of the patch (the preview literal
        is empty and the separators are bare newlines). They are reproduced
        as found; confirm against the upstream file.
        """
        parts = []
        if not post.is_self:
            parts.append('link post')
        if hasattr(post, 'preview'):
            # Best effort: a malformed preview structure is simply skipped.
            try:
                resolutions = post.preview['images'][0]['resolutions']
                candidates = [p for p in resolutions if p['width'] < 800]
            except (KeyError, IndexError, TypeError):
                candidates = []
            if candidates:
                # Widest preview that is still narrower than 800.
                # NOTE(review): ``preview`` is unused because the literal
                # below is empty in this copy; it presumably fed the markup.
                preview = max(candidates, key=lambda p: p['width'])  # noqa: F841
                parts.append('')
        if post.selftext:
            # Cut at the first space at or after index 1024. Without such a
            # space, fall back to a hard cut at 1024, since str.find's -1
            # would otherwise silently drop a character.
            cut = post.selftext.find(' ', 1024)
            if cut == -1:
                cut = 1024
            preview_body = post.selftext[:cut]
            if len(preview_body) < len(post.selftext):
                preview_body += '[...]'
            parts.append(f'\n\n{preview_body}\n\n')
        return '\n\n'.join(parts)

    def get_tags(self, post):
        """Return the item tags: 'reddit', the subreddit name, and 'nsfw'."""
        # [2:] drops the first two characters of the prefixed name
        # (presumably the 'r/' prefix).
        tags = ['reddit', post.subreddit_name_prefixed[2:]]
        if post.over_18:
            tags.append('nsfw')
        return tags

    def get_ttl(self, post):
        """Return the item's 'ttl' value; by default one week in seconds."""
        return 60 * 60 * 24 * 7  # 1 week

    def get_ttd(self, post):
        """Return the item's 'ttd' value, or None to omit it."""
        return None

    def get_tts(self, post):
        """Return the item's 'tts' value, or None to omit it."""
        return None

    def get_callback(self, post):
        """Return the item's 'callback' value, or None to omit it."""
        return None

    def callback(self, state, item):
        """Hook for subclasses; the base implementation always raises."""
        raise NotImplementedError('callback')

    def on_create(self, state, item):
        """Hook for subclasses; the base implementation always raises."""
        raise NotImplementedError('on_create')

    def on_delete(self, state, item):
        """Hook for subclasses; the base implementation always raises."""
        raise NotImplementedError('on_delete')