intake-sources/intake-rss/intake_rss/core.py

import hashlib
import json
import os
import re
import sys
import time
import feedparser
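
# Configuration is read from environment variables (see main() below):
#   FEED_URL      required; URL of the RSS/Atom feed to fetch and parse
#   FEED_TITLE    optional; prefix for item titles, falling back to the feed's own title
#   FILTER_REGEX  optional; entries whose title matches this regex are skipped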


def stderr(*args):
    print(*args, file=sys.stderr)


def main():
    feed_url = os.environ.get("FEED_URL")
    if not feed_url:
        stderr("No FEED_URL defined")
        return 1

    feed = feedparser.parse(feed_url)
    if feed.bozo:
        stderr("Failed to parse feed", feed_url)
        return 1

    feed_title = os.environ.get("FEED_TITLE") or feed.feed.get("title")

    filter_regex = os.environ.get("FILTER_REGEX")

    for entry in feed.entries:
        item = {}

        # Derive a stable id from the entry link, falling back to its guid or full repr.
        entry_link = entry.get("link")
        id_basis = entry_link or entry.get("id") or str(entry)
        item["id"] = hashlib.md5(id_basis.encode("utf8")).hexdigest()

        if filter_regex and re.search(filter_regex, entry.get("title", "")):
            stderr("Item matched filter regex, skipping")
            continue

        entry_title = entry.get("title", "(No title)")
        if feed_title:
            item["title"] = f"{feed_title}: {entry_title}"
        else:
            item["title"] = entry_title

        if entry_pubparsed := entry.get("published_parsed"):
            item["time"] = int(time.mktime(entry_pubparsed))

        if entry_desc := entry.get("summary"):
            item["body"] = entry_desc

        if entry_content := entry.get("content"):
            content = " ".join([c.value for c in entry_content])
            if "body" in item:
                # Append the full content after the summary rather than replacing it.
                item["body"] += "<hr>" + content
            else:
                item["body"] = content

        if entry_link:
            item["link"] = entry_link

        print(json.dumps(item))
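
# Illustrative output only (the values below are made up): each feed entry is
# emitted as a single JSON object on its own line, e.g.
#   {"id": "9e107d9d372bb6826bd81d3542a419d6", "title": "Example Feed: Hello",
#    "time": 1686456088, "body": "<p>Summary...</p>", "link": "https://example.com/hello"}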