Add source-level item batching
This commit is contained in:
parent
d0780a9fd1
commit
ec33495c56
|
@ -40,7 +40,8 @@ intake
|
|||
"env": {
|
||||
"...": "..."
|
||||
},
|
||||
"cron": "* * * * *"
|
||||
"cron": "* * * * *",
|
||||
"batch": "<number>"
|
||||
}
|
||||
```
|
||||
|
||||
|
@ -50,6 +51,10 @@ Each key under `env` defines an environment variable that will be set when `fetc
|
|||
|
||||
If `cron` is present, it must define a crontab schedule. Intake will automatically create crontab entries to update each source according to its cron schedule.
|
||||
|
||||
`batch` may be a number or a string containing a number. If it is present, items created by the source will be batched via `tts` so that all items created in a single 24-hour window become visible at the same time. Items created with a `tts` longer than the batch `tts` will keep their own `tts`.
|
||||
|
||||
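The batch window is computed from midnight to midnight UTC, offset by the value of `batch` (in seconds). For example, a `batch` of `3600` makes each window run from 01:00 UTC to 01:00 UTC the following day.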
|
||||
|
||||
## Interface for source programs
|
||||
|
||||
Intake interacts with sources by executing the actions defined in the source's `intake.json`. The `fetch` action is required and used to check for new feed items when `intake update` is executed.
|
||||
|
|
|
@ -333,6 +333,8 @@ def _parse_source_config(config_str: str):
|
|||
config["env"] = parsed["env"]
|
||||
if "cron" in parsed:
|
||||
config["cron"] = parsed["cron"]
|
||||
if "batch" in parsed:
|
||||
config["batch"] = parsed["batch"]
|
||||
return (None, config)
|
||||
|
||||
|
||||
|
|
|
@ -81,26 +81,31 @@ class Item:
|
|||
# The time-to-live field protects an item from removal until expiry.
|
||||
# This is mainly used to avoid old items resurfacing when their source
|
||||
# cannot guarantee monotonicity.
|
||||
if "ttl" in self._item:
|
||||
ttl_date = self._item["created"] + self._item["ttl"]
|
||||
if ttl_date > current_time():
|
||||
return False
|
||||
if "ttl" in self._item and self.ttl_at > current_time():
|
||||
return False
|
||||
|
||||
# The time-to-die field puts a maximum lifespan on an item, removing it
|
||||
# even if it is active.
|
||||
if "ttd" in self._item:
|
||||
ttd_date = self._item["created"] + self._item["ttd"]
|
||||
if ttd_date < current_time():
|
||||
return True
|
||||
if "ttd" in self._item and self.ttd_at < current_time():
|
||||
return True
|
||||
|
||||
return not self._item["active"]
|
||||
|
||||
@property
def tts_at(self):
    """Return the timestamp obtained by adding the item's ``tts`` to ``created``.

    An item without a ``tts`` field is treated as having a ``tts`` of 0, so
    the result is simply its creation time.
    """
    created = self._item["created"]
    offset = self._item.get("tts", 0)
    return created + offset
|
||||
|
||||
@property
def ttl_at(self):
    """Return the timestamp at which the item's time-to-live runs out.

    Computed as ``created`` plus ``ttl``; an item without a ``ttl`` field is
    treated as having a ``ttl`` of 0.
    """
    created = self._item["created"]
    offset = self._item.get("ttl", 0)
    return created + offset
|
||||
|
||||
@property
def ttd_at(self):
    """Return the timestamp at which the item's time-to-die is reached.

    Computed as ``created`` plus ``ttd``; an item without a ``ttd`` field is
    treated as having a ``ttd`` of 0.
    """
    created = self._item["created"]
    offset = self._item.get("ttd", 0)
    return created + offset
|
||||
|
||||
@property
def before_tts(self):
    """Return whether the item has a ``tts`` that has not yet elapsed.

    True only when the item carries a ``tts`` field and the current time is
    strictly earlier than ``tts_at`` (``created`` + ``tts``). Items without a
    ``tts`` field always yield False.
    """
    # Single return: the earlier inline ``created + tts`` form computed the
    # same value as ``tts_at`` and left a second, unreachable return behind.
    return "tts" in self._item and current_time() < self.tts_at
|
||||
|
||||
@property
|
||||
def is_hidden(self):
|
||||
|
@ -358,6 +363,19 @@ def update_items(source: LocalSource, fetched_items: list[Item]):
|
|||
else:
|
||||
new_items.append(item)
|
||||
|
||||
# If the source is batched, set the tts on new items to at least the batch tts
|
||||
# Batched sources hold new items back until the end of the current batch
# window, so everything created within one window becomes visible together.
if "batch" in config:
    try:
        # The configured value may be a number or a numeric string; int()
        # accepts either.
        batch_adj = int(config["batch"])
        # Shift the clock by the offset so that windows can be computed as
        # plain midnight-to-midnight UTC days, then shift the end back.
        shifted_now = current_time() - batch_adj
        batch_start = shifted_now - (shifted_now % 86400)
        batch_end = batch_start + 86400 + batch_adj
        for item in new_items:
            # Smallest tts that keeps the item hidden until the window ends.
            min_tts = batch_end - item["created"]
            # Use max, not min: the batch tts is a lower bound, so an item
            # that already asked for a longer tts must keep it.
            item["tts"] = max(min_tts, item.get("tts", min_tts))
    except (KeyError, TypeError, ValueError):
        # Batching is best-effort: a malformed "batch" value or an item
        # without usable timestamps leaves the items unbatched rather than
        # failing the update. Unlike a bare except, this does not swallow
        # KeyboardInterrupt or SystemExit.
        pass
|
||||
|
||||
# Write all the new items to the source directory.
|
||||
for item in new_items:
|
||||
source.save_item(item)
|
||||
|
|
|
@ -178,7 +178,9 @@ var doAction = function (source, itemid, action) {
|
|||
{% if item.source %}{{item.source}}{% endif %}
|
||||
{% if item.id %}{{item.id}}{% endif %}
|
||||
{% if item.created %}{{item.created|datetimeformat}}{% endif %}
|
||||
{% if item.ttl %}L{% endif %}{% if item.ttd %}D{% endif %}{% if item.tts %}S{% endif %}
|
||||
{% if item.ttl %}<span title="TTL {{item.ttl_at|datetimeformat}}">[L]</span>{% endif %}
|
||||
{% if item.ttd %}<span title="TTD {{item.ttd_at|datetimeformat}}">[D]</span>{% endif %}
|
||||
{% if item.tts %}<span title="TTS {{item.tts_at|datetimeformat}}">[S]</span>{% endif %}
|
||||
</span>
|
||||
{% endif %}
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from pathlib import Path
|
||||
from typing import List, Callable
|
||||
from typing import Generator, Callable
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -14,9 +14,9 @@ def clean_source(source_path: Path):
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def using_source() -> Callable:
|
||||
def using_source() -> Generator[Callable, None, LocalSource]:
|
||||
test_data = Path(__file__).parent
|
||||
sources: List[Path] = []
|
||||
sources: list[Path] = []
|
||||
|
||||
def _using_source(name: str):
|
||||
source_path = test_data / name
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import json
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
|
||||
from intake.source import fetch_items, update_items, LocalSource
|
||||
|
||||
|
@ -62,3 +64,44 @@ def test_basic_lifecycle(using_source):
|
|||
items = list(source.get_all_items())
|
||||
assert len(items) == 1
|
||||
assert items[0]["id"] == "second"
|
||||
|
||||
|
||||
def test_batch():
    """Check that a batched source assigns a batch-aligned tts to new items.

    A throwaway source is created whose fetch action emits one item with a
    random id per run. The item fetched with ``batch`` = 0 and the item
    fetched with ``batch`` = 3600 must both receive a ``tts``, and their
    ``tts_at`` values must differ by the 3600-second offset modulo one day.
    """
    with tempfile.TemporaryDirectory() as data_dir:
        root = Path(data_dir)
        source_dir = root / "batching"
        source_dir.mkdir()
        config_file = source_dir / "intake.json"
        # Each invocation prints a single JSON item with a random 16-hex-digit
        # id, so consecutive fetches produce distinct new items.
        sh_args = [
            "python",
            "-c",
            "import random; print(f'{{\"id\":\"{random.randrange(16**16):016x}\"}}')",
        ]
        batch_config = {
            "action": {
                "fetch": {
                    "args": sh_args,
                },
            },
            "batch": 0,
        }
        config_file.write_text(json.dumps(batch_config))
        source = LocalSource(root, source_dir.name)

        # batch sets the tts
        fetch1 = fetch_items(source)
        assert len(fetch1) == 1
        update_items(source, fetch1)
        item1 = source.get_item(fetch1[0]["id"])
        assert "tts" in item1._item

        # Re-run with a one-hour offset on the batch window.
        batch_config["batch"] = 3600
        config_file.write_text(json.dumps(batch_config))

        fetch2 = fetch_items(source)
        assert len(fetch2) == 1
        update_items(source, fetch2)
        item2 = source.get_item(fetch2[0]["id"])
        assert "tts" in item2._item
        assert item1["id"] != item2["id"]
        # Compare modulo one day rather than exactly: during the first hour
        # after midnight UTC the offset window ends a day earlier than the
        # unshifted one, so the raw difference is 3600 - 86400 instead of
        # 3600. The window ends are always congruent to the offset mod 86400.
        assert (item2.tts_at - item1.tts_at) % 86400 == 3600
|
||||
|
|
Loading…
Reference in New Issue