From 222f9d9387c329f73be5f37be5601ddd838e0d47 Mon Sep 17 00:00:00 2001
From: Tim Van Baak
Date: Sun, 11 Jun 2023 19:57:00 -0700
Subject: [PATCH] Add reddit fetching source
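
The source reads its configuration from environment variables:
SUBREDDIT_NAME (required), SUBREDDIT_PAGE (hot, new, rising,
controversial, or top, with an optional time window as in "top_week"),
FILTER_NSFW, TAG_NSFW, FILTER_SPOILER, TAG_SPOILER, MIN_SCORE, TAGS,
AUTHOR_BLOCKLIST, and REQUEST_RETRY. Each post that passes the filters
is printed to stdout as a JSON item on its own line. A sketch of an
invocation (the subreddit and values are only examples):

    SUBREDDIT_NAME=nixos SUBREDDIT_PAGE=top_week MIN_SCORE=50 \
        python -m intake_reddit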

---
 intake-rss/.gitignore => .gitignore     |   0
 flake.nix                               |   7 +
 intake-reddit/intake_reddit/__init__.py |   0
 intake-reddit/intake_reddit/__main__.py |   3 +
 intake-reddit/intake_reddit/core.py     | 188 ++++++++++++++++++++++++
 intake-reddit/pyproject.toml            |  10 ++
 6 files changed, 208 insertions(+)
 rename intake-rss/.gitignore => .gitignore (100%)
 create mode 100644 intake-reddit/intake_reddit/__init__.py
 create mode 100644 intake-reddit/intake_reddit/__main__.py
 create mode 100644 intake-reddit/intake_reddit/core.py
 create mode 100644 intake-reddit/pyproject.toml

diff --git a/intake-rss/.gitignore b/.gitignore
similarity index 100%
rename from intake-rss/.gitignore
rename to .gitignore
diff --git a/flake.nix b/flake.nix
index f233c61..9a187d3 100644
--- a/flake.nix
+++ b/flake.nix
@@ -15,6 +15,13 @@
         format = "pyproject";
         propagatedBuildInputs = with pkgs.python38Packages; [ feedparser setuptools ];
       };
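+      # Packaged the same way as intake-rss above, minus the feedparser dependency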
+      intake-reddit = pkgs.python38Packages.buildPythonPackage {
+        name = "intake-reddit";
+        src = builtins.path { path = ./intake-reddit; name = "intake-reddit"; };
+        format = "pyproject";
+        propagatedBuildInputs = with pkgs.python38Packages; [ setuptools ];
+      };
     };
 
     devShells.${system} = {
diff --git a/intake-reddit/intake_reddit/__init__.py b/intake-reddit/intake_reddit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/intake-reddit/intake_reddit/__main__.py b/intake-reddit/intake_reddit/__main__.py
new file mode 100644
index 0000000..06684aa
--- /dev/null
+++ b/intake-reddit/intake_reddit/__main__.py
@@ -0,0 +1,3 @@
+import sys
+from .core import main
+sys.exit(main())
diff --git a/intake-reddit/intake_reddit/core.py b/intake-reddit/intake_reddit/core.py
new file mode 100644
index 0000000..2a74404
--- /dev/null
+++ b/intake-reddit/intake_reddit/core.py
@@ -0,0 +1,188 @@
+import json
+import os
+import sys
+import time
+import urllib.request
+
+
+PAGE_SUFFIX = {
+    "hot": "",
+    "new": "new/",
+    "rising": "rising/",
+    "controversial": "controversial/",
+    "top": "top/"
+}
+
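+# Item TTL in seconds, keyed by the time window of a top/controversial
+# sort; month and year use upper bounds (31 and 366 days)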
+SORT_TTL = {
+    "hour": 60 * 60,
+    "day": 60 * 60 * 24,
+    "week": 60 * 60 * 24 * 7,
+    "month": 60 * 60 * 24 * 31,
+    "year": 60 * 60 * 24 * 366,
+    "all": 60 * 60 * 24 * 366,
+}
+
+
+def stderr(*args):
+    print(*args, file=sys.stderr)
+
+
+def urlopen(url):
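+    # Try the request up to REQUEST_RETRY times (default 6), waiting
+    # 20, 40, 80, ... seconds between attempts; returns None if all fail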
+    attempts = int(os.environ.get("REQUEST_RETRY", "6"))
+    backoff = 20
+    for attempt in range(attempts):
+        try:
+            return urllib.request.urlopen(url)
+        except Exception as ex:
+            stderr(f"[{attempt + 1}/{attempts}] Error fetching", url)
+            stderr(ex)
+            if attempt < attempts - 1:
+                stderr("Retrying in", backoff, "seconds")
+                time.sleep(backoff)
+                backoff *= 2
+            else:
+                stderr("Failed to fetch in", attempts, "tries")
+    return None
+
+
+def main():
+    # Get the subreddit name
+    sub_name = os.environ.get("SUBREDDIT_NAME")
+    if not sub_name:
+        stderr("No SUBREDDIT_NAME defined")
+        return 1
+
+    # Read the subreddit page and sort type, e.g. SUBREDDIT_PAGE=top_week
+    sub_page, sub_sort = os.environ.get("SUBREDDIT_PAGE", "hot"), ""
+    if "_" in sub_page:
+        sub_page, sub_sort = sub_page.split("_", 1)
+    if sub_page not in PAGE_SUFFIX:
+        stderr("Unknown subreddit page:", sub_page)
+        return 1
+
+    # Assemble and fetch the subreddit listing url
+    sort_query = "?t=" + sub_sort if sub_sort else ""
+    url = f"https://www.reddit.com/r/{sub_name}/{PAGE_SUFFIX[sub_page]}.json{sort_query}"
+    stderr("Fetching", url)
+    response = urlopen(url)
+    if not response:
+        stderr("Could not reach", url)
+        return 1
+
+    # Parse the reddit API data
+    resp_data = response.read().decode("utf8")
+    info = json.loads(resp_data)
+
+    # Pull item configuration options from the environment
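+    # (environment values are strings, so any non-empty value counts as true)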
+    filter_nsfw = os.environ.get("FILTER_NSFW", False)
+    tag_nsfw = os.environ.get("TAG_NSFW", True)
+    filter_spoiler = os.environ.get("FILTER_SPOILER", False)
+    tag_spoiler = os.environ.get("TAG_SPOILER", True)
+    min_score = int(os.environ.get("MIN_SCORE", 0))
+    tags = [tag for tag in os.environ.get("TAGS", "").split(",") if tag]
+    author_blocklist = [author for author in os.environ.get("AUTHOR_BLOCKLIST", "").split(",") if author]
+    stderr("filter nsfw =", bool(filter_nsfw))
+    stderr("tag nsfw =", bool(tag_nsfw))
+    stderr("filter spoiler =", bool(filter_spoiler))
+    stderr("tag spoiler =", bool(tag_spoiler))
+    stderr("min score =", min_score)
+    stderr("tags =", ", ".join(tags))
+    stderr("author blocklist =", ", ".join(author_blocklist))
+
+    for post in info["data"]["children"]:
+        post_data = post["data"]
+
+        # id and tags
+        item = {}
+        item["id"] = post_data["id"]
+        item["tags"] = list(tags)
+
+        # NSFW filter
+        is_nsfw = post_data.get("over_18", False)
+        if is_nsfw:
+            if filter_nsfw:
+                continue
+            if tag_nsfw:
+                item["tags"].append("nsfw")
+
+        # Spoiler filter
+        is_spoiler = post_data.get("spoiler", False)
+        if is_spoiler:
+            if filter_spoiler:
+                continue
+            if tag_spoiler:
+                item["tags"].append("spoiler")
+
+        # Score filter
+        post_score = post_data.get("score", 0)
+        if min_score and post_score < min_score:
+            continue
+
+        # Author filter
+        post_author = post_data.get("author")
+        if post_author in author_blocklist:
+            continue
+
+        # Title
+        if post_title := post_data.get("title"):
+            sub_prefixed = post_data.get("subreddit_name_prefixed") or f"r/{sub_name}"
+            spoiler_part = "[S] " if is_spoiler else ""
+            nsfw_part = "[NSFW] " if is_nsfw else ""
+            item["title"] = f"{spoiler_part}{nsfw_part}/{sub_prefixed}: {post_title}"
+
+        # Author
+        if post_author:
+            item["author"] = f"/u/{post_author}"
+
+        # Time
+        if post_created := post_data.get("created_utc"):
+            item["time"] = int(post_created)
+
+        # Body
+        parts = []
+        if not post_data.get("is_self"):
+            parts.append(f'link: {post_data.get("url", "(no url)")}')
+        if preview := post_data.get("preview"):
+            try:
+                previews = preview["images"][0]["resolutions"]
+                # Embed the largest preview under 800px wide
+                small_previews = [p for p in previews if p["width"] < 800]
+                preview = sorted(small_previews, key=lambda p: -p["width"])[0]
+                parts.append(f'<img src="{preview["url"]}">')
+            except Exception:
+                pass
+        if post_data.get("is_gallery", False):
+            try:
+                for gallery_item in post_data["gallery_data"]["items"]:
+                    media_id = gallery_item["media_id"]
+                    metadata = post_data["media_metadata"][media_id]
+                    small_previews = [p for p in metadata["p"] if p["x"] < 800]
+                    preview = sorted(small_previews, key=lambda p: -p["x"])[0]
+                    parts.append(
+                        f'link: {metadata["s"]["u"]}'
+                    )
+                    parts.append(f'<img src="{preview["u"]}">')
+            except Exception:
+                pass
+        if post_selftext := post_data.get("selftext"):
+            # Truncate the preview at the first space after 1024 characters
+            space = post_selftext.find(" ", 1024)
+            preview_body = post_selftext[:space] if space > -1 else post_selftext[:1024]
+            if len(preview_body) < len(post_selftext):
+                preview_body += "[...]"
+            parts.append(f"<p>{preview_body}</p>")
+        item["body"] = "<br>".join(parts)
+
+        # Link
+        if post_link := post_data.get("permalink"):
+            item["link"] = f"https://reddit.com{post_link}"
+
+        # TTL
+        item["ttl"] = SORT_TTL.get(sub_sort, 60 * 60 * 24 * 8)
+
+        print(json.dumps(item))
diff --git a/intake-reddit/pyproject.toml b/intake-reddit/pyproject.toml
new file mode 100644
index 0000000..297da41
--- /dev/null
+++ b/intake-reddit/pyproject.toml
@@ -0,0 +1,10 @@
+[project]
+name = "intake-reddit"
+version = "0.1.0"
+
+[project.scripts]
+intake-reddit = "intake_reddit.core:main"
+
+[tool.setuptools]
+packages = ["intake_reddit"]
+