Add reddit scraper to source templates
This commit is contained in:
parent
7a85aa3dac
commit
9e31bd74b1
|
@ -1,2 +1,2 @@
|
|||
from inquisitor.configs import CACHE_PATH
|
||||
from inquisitor.templates import cache_image, LinearCrawler
|
||||
from inquisitor.templates import cache_image, LinearCrawler, RedditScraper
|
|
@ -3,10 +3,12 @@ Generates a dummy item.
|
|||
"""
|
||||
# Standard library imports
|
||||
from datetime import datetime
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from time import sleep
|
||||
import sys
|
||||
|
||||
# Third-party library imports
|
||||
from bs4 import BeautifulSoup
|
||||
|
@ -97,3 +99,117 @@ class LinearCrawler:
|
|||
|
||||
def make_item(self, url, soup):
|
||||
raise NotImplementedError('make_item is required')
|
||||
|
||||
|
||||
class RedditScraper:
|
||||
"""
|
||||
An engine for generating items from subreddits.
|
||||
"""
|
||||
@staticmethod
|
||||
def fetch_new(state, name, reddit):
|
||||
items = []
|
||||
for name, obj in inspect.getmembers(sys.modules[name]):
|
||||
if (inspect.isclass(obj)
|
||||
and issubclass(obj, RedditScraper)
|
||||
and obj is not RedditScraper
|
||||
):
|
||||
sub_items = obj(reddit).get_items()
|
||||
items.extend(sub_items)
|
||||
return items
|
||||
|
||||
def __init__(self, reddit):
|
||||
self.reddit = reddit
|
||||
|
||||
def get_items(self):
|
||||
sub_name = self.subreddit_name
|
||||
logger.info(f'Fetching posts from r/{sub_name}')
|
||||
subreddit = self.reddit.subreddit(sub_name)
|
||||
posts = self.subreddit_page(subreddit)
|
||||
items = []
|
||||
for post in posts:
|
||||
if self.filter_post(post):
|
||||
items.append(self.item_from_post(post))
|
||||
return items
|
||||
|
||||
def item_from_post(self, post):
|
||||
item = {
|
||||
'source': 'reddit',
|
||||
'id': post.id,
|
||||
'title': self.get_title(post),
|
||||
'link': self.get_link(post),
|
||||
'time': post.created_utc,
|
||||
'author': '/u/' + (post.author.name if post.author else "[deleted]"),
|
||||
'body': self.get_body(post),
|
||||
'tags': self.get_tags(post),
|
||||
'ttl': self.get_ttl(post),
|
||||
}
|
||||
ttl = self.get_ttl(post)
|
||||
if ttl is not None: item['ttl'] = ttl
|
||||
ttd = self.get_ttd(post)
|
||||
if ttd is not None: item['ttd'] = ttd
|
||||
tts = self.get_tts(post)
|
||||
if tts is not None: item['tts'] = tts
|
||||
callback = self.get_callback(post)
|
||||
if callback is not None: item['callback'] = callback
|
||||
return item
|
||||
|
||||
def subreddit_page(self, subreddit):
|
||||
return subreddit.hot(limit=25)
|
||||
|
||||
def filter_post(self, post):
|
||||
return True
|
||||
|
||||
def get_title(self, post):
|
||||
s = '[S] ' if post.spoiler else ''
|
||||
nsfw = '[NSFW] ' if post.over_18 else ''
|
||||
return f'{s}{nsfw}/{post.subreddit_name_prefixed}: {post.title}'
|
||||
|
||||
def get_link(self, post):
|
||||
return f'https://reddit.com{post.permalink}'
|
||||
|
||||
def get_body(self, post):
|
||||
parts = []
|
||||
if not post.is_self:
|
||||
parts.append(f'<a href="{post.url}">link post</a>')
|
||||
if hasattr(post, 'preview'):
|
||||
try:
|
||||
previews = post.preview['images'][0]['resolutions']
|
||||
small_previews = [p for p in previews if p['width'] < 800]
|
||||
preview = sorted(small_previews, key=lambda p:-p['width'])[0]
|
||||
parts.append(f'<img src="{preview["url"]}">')
|
||||
except:
|
||||
pass
|
||||
if post.selftext:
|
||||
limit = post.selftext[1024:].find(' ')
|
||||
preview_body = post.selftext[:1024 + limit]
|
||||
if len(preview_body) < len(post.selftext):
|
||||
preview_body += '[...]'
|
||||
parts.append(f'<p>{preview_body}</p>')
|
||||
return '<br><hr>'.join(parts)
|
||||
|
||||
def get_tags(self, post):
|
||||
tags = ['reddit', post.subreddit_name_prefixed[2:]]
|
||||
if post.over_18:
|
||||
tags.append('nsfw')
|
||||
return tags
|
||||
|
||||
def get_ttl(self, post):
|
||||
return 60 * 60 * 24 * 7, # 1 week
|
||||
|
||||
def get_ttd(self, post):
|
||||
return None
|
||||
|
||||
def get_tts(self, post):
|
||||
return None
|
||||
|
||||
def get_callback(self, post):
|
||||
return None
|
||||
|
||||
def callback(self, state, item):
|
||||
raise NotImplementedError('callback')
|
||||
|
||||
def on_create(self, state, item):
|
||||
raise NotImplementedError('on_create')
|
||||
|
||||
def on_delete(self, state, item):
|
||||
raise NotImplementedError('on_delete')
|
||||
|
|
Loading…
Reference in New Issue