from pathlib import Path
from subprocess import Popen, PIPE, TimeoutExpired
from threading import Thread
from typing import List
import json
import os
import time

from .types import InvalidConfigException, SourceUpdateException


class LocalSource:
    """
    An intake source backed by a filesystem directory. The source directory
    contains an intake.json config file, a state file for the source's fetch
    script, and one <item_id>.item file per feed item.
    """

    def __init__(self, data_path: Path, source_name: str):
        self.data_path: Path = data_path
        self.source_name = source_name
        self.source_path: Path = data_path / source_name

    def get_config(self) -> dict:
        config_path = self.source_path / "intake.json"
        with open(config_path, "r", encoding="utf8") as config_file:
            return json.load(config_file)

    def get_state_path(self) -> Path:
        return (self.source_path / "state").absolute()

    def get_item_path(self, item_id: str) -> Path:
        return self.source_path / f"{item_id}.item"

    def get_item_ids(self) -> List[str]:
        # Strip the ".item" extension to recover each item id
        return [
            filepath.name[:-5]
            for filepath in self.source_path.iterdir()
            if filepath.name.endswith(".item")
        ]

    def item_exists(self, item_id: str) -> bool:
        return self.get_item_path(item_id).exists()

    def new_item(self, item: dict) -> dict:
        # Ensure required fields
        if "id" not in item:
            raise KeyError("id")
        item["source"] = self.source_name
        item["active"] = True
        item["created"] = int(time.time())
        item["title"] = item.get("title", item["id"])
        item["tags"] = item.get("tags", [self.source_name])
        # All other fields are optional
        self.save_item(item)
        return item

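    # For reference, a newly created item carries at least the fields set
    # above. A minimal saved item might look like this (values are
    # illustrative only):
    #
    #   {
    #     "id": "item-123",
    #     "source": "example",
    #     "active": true,
    #     "created": 1685404702,
    #     "title": "item-123",
    #     "tags": ["example"]
    #   }
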
    def get_item(self, item_id: str) -> dict:
        with self.get_item_path(item_id).open() as f:
            return json.load(f)

    def save_item(self, item: dict) -> None:
        # Write to a tempfile first to avoid losing the item on write failure
        tmp_path = self.source_path / f"{item['id']}.item.tmp"
        with tmp_path.open("w") as f:
            f.write(json.dumps(item, indent=2))
        # os.replace is atomic and overwrites any existing item file
        os.replace(tmp_path, self.get_item_path(item["id"]))

    def delete_item(self, item_id: str) -> None:
        os.remove(self.get_item_path(item_id))


def read_stdout(process: Popen, outs: list):
    """
    Read the subprocess's stdout into memory.
    This prevents the process from blocking when the pipe fills up.
    """
    while True:
        data = process.stdout.readline()
        if data:
            print(f"[stdout] <{repr(data)}>")
            outs.append(data)
        # Only stop once the pipe is drained and the process has exited,
        # so trailing output is not lost.
        elif process.poll() is not None:
            break


def read_stderr(process: Popen):
    """
    Read the subprocess's stderr stream and pass it to logging.
    This prevents the process from blocking when the pipe fills up.
    """
    while True:
        data = process.stderr.readline()
        if data:
            print(f"[stderr] <{repr(data)}>")
        # As with stdout, drain the pipe fully before exiting.
        elif process.poll() is not None:
            break


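# For illustration, a minimal intake.json consistent with the checks in
# fetch_items below might look like this (the exe, args, and env values are
# hypothetical):
#
#   {
#     "fetch": {
#       "exe": "./fetch.sh",
#       "args": ["--limit", "10"]
#     },
#     "env": {"FETCH_TOKEN": "..."}
#   }
#
# The fetch command runs in the source directory with STATE_PATH in its
# environment and is expected to print one JSON item per line on stdout.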
def fetch_items(source: LocalSource, update_timeout=60) -> List[dict]:
    """
    Execute the feed source and return the current feed items.
    Returns a list of feed items on success.
    Raises SourceUpdateException if the feed source update failed.
    """
    # Load the source's config to get its update command
    config = source.get_config()

    if "fetch" not in config:
        raise InvalidConfigException("Missing fetch")

    exe_name = config["fetch"]["exe"]
    exe_args = config["fetch"].get("args", [])

    # Overlay the current env with the config env and intake-provided values
    exe_env = {
        **os.environ.copy(),
        **config.get("env", {}),
        "STATE_PATH": str(source.get_state_path()),
    }

    # Launch the update command
    try:
        process = Popen(
            [exe_name, *exe_args],
            stdout=PIPE,
            stderr=PIPE,
            cwd=source.source_path,
            env=exe_env,
            encoding="utf8",
        )
    except PermissionError:
        raise SourceUpdateException("command not executable")
    except FileNotFoundError:
        raise SourceUpdateException("command not found")

    # While the update command is executing, watch its output
    t_stderr = Thread(target=read_stderr, args=(process,), daemon=True)
    t_stderr.start()

    outs = []
    t_stdout = Thread(target=read_stdout, args=(process, outs), daemon=True)
    t_stdout.start()

    # Time out the process if it takes too long
    try:
        process.wait(timeout=update_timeout)
    except TimeoutExpired:
        process.kill()
    t_stdout.join(timeout=1)
    t_stderr.join(timeout=1)

    # A nonzero (or signal) exit status indicates a failed update
    if process.poll():
        raise SourceUpdateException("return code")

    items = []
    for line in outs:
        try:
            item = json.loads(line)
            items.append(item)
        except json.JSONDecodeError:
            raise SourceUpdateException("invalid json")

    return items


def update_items(source: LocalSource, fetched_items: List[dict]):
    """
    Update the source with a batch of new items, doing creations, updates, and
    deletions as necessary.
    """
    # Get a list of item ids that already existed for this source.
    prior_ids = source.get_item_ids()
    print(f"Found {len(prior_ids)} prior items")

    # Determine which items are new and which are updates.
    new_items = []
    upd_items = []
    for item in fetched_items:
        if source.item_exists(item["id"]):
            upd_items.append(item)
        else:
            new_items.append(item)

    # Write all the new items to the source directory.
    for item in new_items:
        # TODO: support on-create trigger
        source.new_item(item)

    # Update the other items using the fetched items' values.
    for upd_item in upd_items:
        old_item = source.get_item(upd_item["id"])
        for field in (
            "title",
            "tags",
            "link",
            "time",
            "author",
            "body",
            "ttl",
            "ttd",
            "tts",
        ):
            # Use .get() so fields absent from the old item can be set
            if field in upd_item and old_item.get(field) != upd_item[field]:
                old_item[field] = upd_item[field]
        if "callback" in upd_item:
            # Because of the way this update happens, any fields that are set
            # in the callback when the item is new will keep their original
            # values, as those values reappear in new_item on subsequent
            # updates.
            old_item["callback"] = {
                **old_item.get("callback", {}),
                **upd_item["callback"],
            }
        # Persist the merged item back to disk
        source.save_item(old_item)

    # Items are removed when they are old (not in the latest fetch) and
    # inactive. Some item fields change this basic behavior.
    del_count = 0
    now = int(time.time())
    upd_ids = [item["id"] for item in upd_items]
    old_item_ids = [item_id for item_id in prior_ids if item_id not in upd_ids]

    for item_id in old_item_ids:
        item = source.get_item(item_id)
        remove = not item["active"]

        # The time-to-live field protects an item from removal until expiry,
        # e.g. created=1000 and ttl=600 protect the item until t=1600.
        # This is mainly used to avoid old items resurfacing when their source
        # cannot guarantee monotonicity.
        if "ttl" in item:
            ttl_date = item["created"] + item["ttl"]
            if ttl_date > now:
                continue

        # The time-to-die field puts a maximum lifespan on an item, removing
        # it even if it is active.
        if "ttd" in item:
            ttd_date = item["created"] + item["ttd"]
            if ttd_date < now:
                remove = True

        # Items to be removed are deleted.
        if remove:
            source.delete_item(item["id"])
            del_count += 1

    print(f"{len(new_items)} new, {del_count} deleted")
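

# A minimal end-to-end usage sketch (the "data" directory and "example"
# source name are hypothetical; the source directory must already contain
# an intake.json):
#
#   source = LocalSource(Path("data"), "example")
#   items = fetch_items(source)
#   update_items(source, items)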