intake-sources/intake-rss/intake_rss/core.py


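"""Read an RSS/Atom feed and emit one JSON item per entry on stdout.

Configuration is read from environment variables:

    FEED_URL      (required) URL of the feed to fetch.
    FEED_TITLE    (optional) overrides the feed's own title as the item title prefix.
    FILTER_REGEX  (optional) entries whose titles match this regex are skipped.

Diagnostics are written to stderr so they never mix with the JSON output.
"""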
import hashlib
import json
import os
import re
import sys
import time

import feedparser


def stderr(*args):
    print(*args, file=sys.stderr)
def main():
    feed_url = os.environ.get("FEED_URL")
    if not feed_url:
        stderr("No FEED_URL defined")
        return 1

    feed = feedparser.parse(feed_url)
    if feed.bozo:
        stderr("Failed to parse feed", feed_url)
        return 1

    feed_title = os.environ.get("FEED_TITLE") or feed.feed.get("title")
    filter_regex = os.environ.get("FILTER_REGEX")

    for entry in feed.entries:
        item = {}

        # Stable id: hash the most specific identifier available for the entry.
        entry_link = entry.get("link")
        id_basis = entry_link or entry.get("id") or str(entry)
        item["id"] = hashlib.md5(id_basis.encode("utf8")).hexdigest()

        if filter_regex and re.search(filter_regex, entry.get("title", "")):
            stderr("Item matched filter regex, skipping")
            continue

        entry_title = entry.get("title", "(No title)")
        if feed_title:
            item["title"] = f"{feed_title}: {entry_title}"
        else:
            item["title"] = entry_title

        if entry_pubparsed := entry.get("published_parsed"):
            item["time"] = int(time.mktime(entry_pubparsed))

        if entry_desc := entry.get("summary"):
            item["body"] = entry_desc

        # Full content, if present, is appended after the summary behind an <hr>.
        if entry_content := entry.get("content"):
            content = " ".join([c.value for c in entry_content])
            if "body" in item:
                item["body"] += "<hr>" + content
            else:
                item["body"] = content

        if entry_link:
            item["link"] = entry_link

        # One JSON object per line on stdout.
        print(json.dumps(item))


if __name__ == "__main__":
    sys.exit(main())