Add reddit scraper to source templates

This commit is contained in:
Tim Van Baak 2020-08-14 11:12:49 -07:00
parent 7a85aa3dac
commit 9e31bd74b1
2 changed files with 117 additions and 1 deletion

View File

@ -1,2 +1,2 @@
from inquisitor.configs import CACHE_PATH from inquisitor.configs import CACHE_PATH
from inquisitor.templates import cache_image, LinearCrawler from inquisitor.templates import cache_image, LinearCrawler, RedditScraper

View File

@ -3,10 +3,12 @@ Generates a dummy item.
""" """
# Standard library imports # Standard library imports
from datetime import datetime from datetime import datetime
import inspect
import logging import logging
import os import os
import random import random
from time import sleep from time import sleep
import sys
# Third-party library imports # Third-party library imports
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -97,3 +99,117 @@ class LinearCrawler:
def make_item(self, url, soup): def make_item(self, url, soup):
raise NotImplementedError('make_item is required') raise NotImplementedError('make_item is required')
class RedditScraper:
	"""
	An engine for generating items from subreddits.

	Subclasses must define `subreddit_name` and may override the
	`subreddit_page`, `filter_post`, and `get_*` hooks to customize how
	posts become items.
	"""
	@staticmethod
	def fetch_new(state, name, reddit):
		"""
		Instantiate every RedditScraper subclass defined in the module
		registered under `name` and collect the items each produces.
		"""
		items = []
		# NOTE: the loop variable must not reuse the `name` parameter;
		# the original shadowed it, which only worked because
		# sys.modules[name] is evaluated before the loop starts.
		for member_name, obj in inspect.getmembers(sys.modules[name]):
			if (inspect.isclass(obj)
				and issubclass(obj, RedditScraper)
				and obj is not RedditScraper
			):
				sub_items = obj(reddit).get_items()
				items.extend(sub_items)
		return items

	def __init__(self, reddit):
		# `reddit` is a praw.Reddit-style client used to fetch subreddits.
		self.reddit = reddit

	def get_items(self):
		"""Fetch the subreddit listing and convert matching posts to items."""
		sub_name = self.subreddit_name
		logger.info(f'Fetching posts from r/{sub_name}')
		subreddit = self.reddit.subreddit(sub_name)
		posts = self.subreddit_page(subreddit)
		return [
			self.item_from_post(post)
			for post in posts
			if self.filter_post(post)
		]

	def item_from_post(self, post):
		"""
		Build an item dict from a post. Optional fields (ttl, ttd, tts,
		callback) are included only when their hook returns a value.
		"""
		item = {
			'source': 'reddit',
			'id': post.id,
			'title': self.get_title(post),
			'link': self.get_link(post),
			'time': post.created_utc,
			'author': '/u/' + (post.author.name if post.author else "[deleted]"),
			'body': self.get_body(post),
			'tags': self.get_tags(post),
		}
		# BUG FIX: the original also set 'ttl' unconditionally in the
		# literal above, calling get_ttl twice and defeating the
		# None-guard below.
		ttl = self.get_ttl(post)
		if ttl is not None: item['ttl'] = ttl
		ttd = self.get_ttd(post)
		if ttd is not None: item['ttd'] = ttd
		tts = self.get_tts(post)
		if tts is not None: item['tts'] = tts
		callback = self.get_callback(post)
		if callback is not None: item['callback'] = callback
		return item

	def subreddit_page(self, subreddit):
		# Default listing: top 25 hot posts. Override for other listings.
		return subreddit.hot(limit=25)

	def filter_post(self, post):
		# Default: accept every post. Override to filter.
		return True

	def get_title(self, post):
		# Prefix spoiler/NSFW markers onto the post title.
		s = '[S] ' if post.spoiler else ''
		nsfw = '[NSFW] ' if post.over_18 else ''
		return f'{s}{nsfw}/{post.subreddit_name_prefixed}: {post.title}'

	def get_link(self, post):
		return f'https://reddit.com{post.permalink}'

	def get_body(self, post):
		"""
		Render an HTML body: the target link (for link posts), a small
		preview image when available, and a truncated selftext preview.
		"""
		parts = []
		if not post.is_self:
			parts.append(f'<a href="{post.url}">link post</a>')
		if hasattr(post, 'preview'):
			# Preview metadata is best-effort; a malformed structure is
			# skipped rather than failing the whole item.
			try:
				previews = post.preview['images'][0]['resolutions']
				small_previews = [p for p in previews if p['width'] < 800]
				preview = sorted(small_previews, key=lambda p: -p['width'])[0]
				parts.append(f'<img src="{preview["url"]}">')
			except (KeyError, IndexError, TypeError):
				pass
		if post.selftext:
			# Truncate at the first space after 1024 characters.
			# BUG FIX: the original used find() on a slice; when no space
			# existed after position 1024 the -1 result silently dropped a
			# character via selftext[:1023]. Fall back to a hard cut at
			# 1024 instead.
			cut = post.selftext.find(' ', 1024)
			preview_body = post.selftext[:1024] if cut == -1 else post.selftext[:cut]
			if len(preview_body) < len(post.selftext):
				preview_body += '[...]'
			parts.append(f'<p>{preview_body}</p>')
		return '<br><hr>'.join(parts)

	def get_tags(self, post):
		# Strip the 'r/' prefix to tag with the bare subreddit name.
		tags = ['reddit', post.subreddit_name_prefixed[2:]]
		if post.over_18:
			tags.append('nsfw')
		return tags

	def get_ttl(self, post):
		# BUG FIX: the original had a trailing comma here, so it returned
		# the 1-tuple (604800,) instead of an int.
		return 60 * 60 * 24 * 7  # 1 week

	def get_ttd(self, post):
		return None

	def get_tts(self, post):
		return None

	def get_callback(self, post):
		return None

	def callback(self, state, item):
		raise NotImplementedError('callback')

	def on_create(self, state, item):
		raise NotImplementedError('on_create')

	def on_delete(self, state, item):
		raise NotImplementedError('on_delete')