# intake/intake/source.py

from datetime import timedelta
from pathlib import Path
from subprocess import Popen, PIPE, TimeoutExpired
from threading import Thread
from time import time as current_time
from typing import Generator, Optional
import json
import os
import os.path
import sys
from intake.types import InvalidConfigException, SourceUpdateException
def intake_data_dir() -> Path:
if intake_data := os.environ.get("INTAKE_DATA"):
return Path(intake_data)
if xdg_data_home := os.environ.get("XDG_DATA_HOME"):
return Path(xdg_data_home) / "intake"
if home := os.environ.get("HOME"):
return Path(home) / ".local" / "share" / "intake"
raise Exception("No intake data directory defined")
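
# Resolution order, illustrated (paths are examples only): INTAKE_DATA wins,
# then XDG_DATA_HOME, then HOME.
#
#     INTAKE_DATA=/srv/intake          -> /srv/intake
#     XDG_DATA_HOME=/home/u/.data      -> /home/u/.data/intake
#     HOME=/home/u                     -> /home/u/.local/share/intake
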
class Item:
"""
A wrapper for an item object.
"""
def __init__(self, source: "LocalSource", item: dict):
self.source = source
self._item = item
# Methods to allow Item as a drop-in replacement for the item dict itself
def __contains__(self, key):
return self._item.__contains__(key)
    def __iter__(self):
        return iter(self._item)
def __getitem__(self, key):
return self._item.__getitem__(key)
def __setitem__(self, key, value):
return self._item.__setitem__(key, value)
def get(self, key, default=None):
return self._item.get(key, default)
@staticmethod
def create(source: "LocalSource", **fields) -> "Item":
if "id" not in fields:
raise KeyError("id")
item = {
"id": fields["id"],
"created": int(current_time()),
"active": True,
}
for field_name in (
"title",
"author",
"body",
"link",
"time",
"tags",
"tts",
"ttl",
"ttd",
"action",
):
if val := fields.get(field_name):
item[field_name] = val
return Item(source, item)
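
    # Illustrative usage of create (source and field values are hypothetical):
    #
    #     item = Item.create(source, id="abc123", title="Hello", ttl=3600)
    #     item["id"]          # "abc123"
    #     item.display_title  # "Hello"
    #     item["created"]     # unix timestamp set at creation
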
@property
def display_title(self):
return self._item.get("title", self._item["id"])
@property
def can_remove(self):
        # The time-to-live field protects an item from removal until expiry.
        # This is mainly used to avoid old items resurfacing when their source
        # cannot guarantee monotonicity.
if "ttl" in self._item:
ttl_date = self._item["created"] + self._item["ttl"]
if ttl_date > current_time():
return False
# The time-to-die field puts a maximum lifespan on an item, removing it
# even if it is active.
if "ttd" in self._item:
ttd_date = self._item["created"] + self._item["ttd"]
if ttd_date < current_time():
return True
return not self._item["active"]
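
    # A sketch of the ttl/ttd semantics (values are illustrative):
    #
    #     item = Item.create(source, id="x", ttl=600)  # protected for 600s
    #     item.can_remove  # False until created + 600, then not item["active"]
    #
    #     item = Item.create(source, id="y", ttd=600)  # force-expires in 600s
    #     item.can_remove  # True once created + 600 has passed, even if active
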
@property
def before_tts(self):
return (
"tts" in self._item
and current_time() < self._item["created"] + self._item["tts"]
)
@property
def is_hidden(self):
return not self._item["active"] or self.before_tts
@property
def sort_key(self):
        item_date = self._item.get("time", self._item.get("created"))
return (item_date, self._item["id"])
def serialize(self, indent=True):
return json.dumps(self._item, indent=2 if indent else None)
def update_from(self, updated: "Item") -> None:
for field in (
"title",
"author",
"body",
"link",
"time",
"tags",
"tts",
"ttl",
"ttd",
):
            if field in updated and self.get(field) != updated[field]:
self[field] = updated[field]
# Actions are not updated since the available actions and associated
# content is left to the action executor to manage.
class LocalSource:
"""
An intake source backed by a filesystem directory.
"""
def __init__(self, data_path: Path, source_name: str):
self.data_path: Path = data_path
self.source_name = source_name
self.source_path: Path = data_path / source_name
def __str__(self) -> str:
return self.source_name
def get_config(self) -> dict:
config_path = self.source_path / "intake.json"
with open(config_path, "r", encoding="utf8") as config_file:
return json.load(config_file)
def save_config(self, config: dict) -> None:
config_path = self.source_path / "intake.json"
tmp_path = config_path.with_name(f"{config_path.name}.tmp")
        with tmp_path.open("w", encoding="utf8") as f:
f.write(json.dumps(config, indent=2))
os.rename(tmp_path, config_path)
def get_state_path(self) -> Path:
return (self.source_path / "state").absolute()
    def get_item_path(self, item_id: str) -> Path:
return self.source_path / f"{item_id}.item"
def get_item_ids(self) -> list[str]:
return [
filepath.name[:-5]
for filepath in self.source_path.iterdir()
if filepath.name.endswith(".item")
]
def item_exists(self, item_id) -> bool:
return self.get_item_path(item_id).exists()
def get_item(self, item_id: str) -> Item:
with self.get_item_path(item_id).open() as f:
return Item(self, json.load(f))
def save_item(self, item: Item) -> None:
# Write to a tempfile first to avoid losing the item on write failure
item_path = self.get_item_path(item["id"])
tmp_path = item_path.with_name(f"{item_path.name}.tmp")
        with tmp_path.open("w", encoding="utf8") as f:
f.write(item.serialize())
os.rename(tmp_path, item_path)
def delete_item(self, item_id) -> None:
os.remove(self.get_item_path(item_id))
def get_all_items(self) -> Generator[Item, None, None]:
for filepath in self.source_path.iterdir():
if filepath.name.endswith(".item"):
yield Item(self, json.loads(filepath.read_text(encoding="utf8")))
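
# On disk, a LocalSource is a directory named after the source. A sketch of
# the layout (file names other than intake.json and state are item ids):
#
#     <data_path>/<source_name>/
#         intake.json       # source config: actions, env, etc.
#         state             # state file handed to actions as STATE_PATH
#         <item_id>.item    # one JSON file per item
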
def _read_stdout(process: Popen, output: list) -> None:
"""
Read the subprocess's stdout into memory.
This prevents the process from blocking when the pipe fills up.
"""
while True:
data = process.stdout.readline()
if data:
print(f"[stdout] {data.rstrip()}", file=sys.stderr)
output.append(data)
if process.poll() is not None:
break
def _read_stderr(process: Popen) -> None:
"""
    Read the subprocess's stderr stream and echo it to this process's stderr.
This prevents the process from blocking when the pipe fills up.
"""
while True:
data = process.stderr.readline()
if data:
print(f"[stderr] {data.rstrip()}", file=sys.stderr)
if process.poll() is not None:
break
def _execute_source_action(
    source: LocalSource, action: str, input: Optional[str], timeout: timedelta
) -> list[str]:
    """
    Execute the named action from a given source. If input is provided, pass
    it to the process on stdin. Returns lines from stdout.
    """
# Gather the information necessary to launch the process
config = source.get_config()
action_cfg = config.get("action", {}).get(action)
if not action_cfg:
raise InvalidConfigException(f"No such action {action}")
exe = [action_cfg["exe"]] if "exe" in action_cfg else []
command = exe + action_cfg.get("args", [])
if not command:
raise InvalidConfigException(f"No exe for action {action}")
config_env = {key: str(value) for key, value in config.get("env", {}).items()}
env = {
**os.environ.copy(),
**config_env,
"STATE_PATH": str(source.get_state_path()),
}
# Launch the process
try:
process = Popen(
command,
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
cwd=source.source_path,
env=env,
encoding="utf8",
)
except PermissionError:
raise SourceUpdateException(f"Command not executable: {''.join(command)}")
# Kick off monitoring threads
output = []
t_stdout: Thread = Thread(target=_read_stdout, args=(process, output), daemon=True)
t_stdout.start()
t_stderr: Thread = Thread(target=_read_stderr, args=(process,), daemon=True)
t_stderr.start()
    # Send input to the process, if provided, then close stdin so that
    # actions reading stdin to EOF do not block forever
    if input:
        process.stdin.write(input)
        if not input.endswith("\n"):
            process.stdin.write("\n")
        process.stdin.flush()
    process.stdin.close()
try:
process.wait(timeout=timeout.total_seconds())
except TimeoutExpired:
process.kill()
t_stdout.join(timeout=1)
t_stderr.join(timeout=1)
if process.poll():
raise SourceUpdateException(
f"{source.source_name} {action} failed with code {process.returncode}"
)
return output
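
# A sketch of the config shape _execute_source_action expects in intake.json
# (names and values are illustrative, not a complete schema):
#
#     {
#       "action": {
#         "fetch": {"exe": "./fetch.sh", "args": ["--limit", "10"]},
#         "on_create": {"exe": "./notify.sh"}
#       },
#       "env": {"FEED_URL": "https://example.com/feed"}
#     }
#
# Each action runs in the source directory with the merged environment plus
# STATE_PATH pointing at the source's state file.
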
def fetch_items(source: LocalSource, timeout: int = 60) -> list[Item]:
"""
Execute the feed source and return the current feed items.
Returns a list of feed items on success.
Throws SourceUpdateException if the feed source update failed.
"""
items: list[Item] = []
    output = _execute_source_action(
        source, "fetch", None, timedelta(seconds=timeout)
    )
for line in output:
try:
item = Item.create(source, **json.loads(line))
items.append(item)
except json.JSONDecodeError:
raise SourceUpdateException("invalid json")
return items
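
# The fetch contract, illustrated (output lines are hypothetical): the action
# prints one JSON object per line on stdout, each with at least an "id".
#
#     {"id": "post-1", "title": "First post", "link": "https://example.com/1"}
#     {"id": "post-2", "title": "Second post"}
#
#     items = fetch_items(source)  # -> two Item objects
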
def execute_action(
    source: LocalSource, item_id: str, action: str, timeout: int = 60
) -> Item:
    """
    Execute an action on an item from a feed source, saving and returning the
    updated item.
    """
item: Item = source.get_item(item_id)
output = _execute_source_action(
        source, action, item.serialize(indent=False), timedelta(seconds=timeout)
)
if not output:
raise SourceUpdateException("no item")
try:
item = Item(source, json.loads(output[0]))
source.save_item(item)
return item
except json.JSONDecodeError:
raise SourceUpdateException("invalid json")
def update_items(source: LocalSource, fetched_items: list[Item]):
"""
Update the source with a batch of new items, doing creations, updates, and
deletions as necessary.
"""
config = source.get_config()
# Get a list of item ids that already existed for this source.
prior_ids = source.get_item_ids()
print(f"Found {len(prior_ids)} prior items", file=sys.stderr)
# Determine which items are new and which are updates.
new_items: list[Item] = []
upd_items: list[Item] = []
for item in fetched_items:
if source.item_exists(item["id"]):
upd_items.append(item)
else:
new_items.append(item)
# Write all the new items to the source directory.
for item in new_items:
source.save_item(item)
# If the source has an on-create trigger, run it for new items
if "on_create" in config.get("action", {}):
for item in new_items:
try:
execute_action(source, item["id"], "on_create")
except Exception as ex:
print(
f"on_create failed for {source.source_name}/{item['id']}:\n{ex}",
file=sys.stderr,
)
# Update the other items using the fetched items' values.
for upd_item in upd_items:
old_item = source.get_item(upd_item["id"])
old_item.update_from(upd_item)
source.save_item(old_item)
# Items are removed when they are old (not in the latest fetch) and
# inactive. Some item fields change this basic behavior.
del_count = 0
    upd_ids = {item["id"] for item in upd_items}
old_item_ids = [item_id for item_id in prior_ids if item_id not in upd_ids]
for item_id in old_item_ids:
if source.get_item(item_id).can_remove:
source.delete_item(item_id)
del_count += 1
    print(f"{len(new_items)} new, {del_count} deleted", file=sys.stderr)
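
# End-to-end sketch of an update pass (source name is illustrative):
#
#     source = LocalSource(intake_data_dir(), "example-feed")
#     fetched = fetch_items(source, timeout=60)
#     update_items(source, fetched)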