Clean up source update code

This commit is contained in:
Tim Van Baak 2020-08-06 23:12:59 -07:00
parent 024d81336d
commit c116476487
3 changed files with 117 additions and 88 deletions

View File

@ -107,23 +107,17 @@ def command_add(args):
source = args.source or 'inquisitor'
cell_path = os.path.join(DUNGEON_PATH, source)
if not os.path.isdir(cell_path):
if args.create:
os.mkdir(cell_path)
state_path = os.path.join(cell_path, "state")
with open(state_path, 'w', encoding='utf8') as f:
f.write(json.dumps({}))
else:
from inquisitor.sources import ensure_cell
ensure_cell(source)
elif not os.path.isdir(cell_path):
logger.error("Source '{}' does not exist".format(source))
return -1
from inquisitor.sources import populate_new
item = {
'id': '{:x}'.format(random.getrandbits(16 * 4)),
'source': 'inquisitor'
'id': args.id or '{:x}'.format(random.getrandbits(16 * 4)),
'source': source,
}
if args.id: item['id'] = str(args.id)
if args.source: item['source'] = str(args.source)
if args.title: item['title'] = str(args.title)
if args.link: item['link'] = str(args.link)
if args.time: item['time'] = int(args.time)
@ -133,13 +127,10 @@ def command_add(args):
if args.ttl: item['ttl'] = int(args.ttl)
if args.ttd: item['ttd'] = int(args.ttd)
if args.tts: item['tts'] = int(args.tts)
populate_new(item['source'], item)
s = json.dumps(item, indent=2)
path = os.path.join(DUNGEON_PATH, item['source'], item['id'] + '.item')
with open(path, 'w', encoding='utf8') as f:
f.write(s)
logger.info(item)
from inquisitor.loader import new_item
saved_item = new_item(source, item)
logger.info(saved_item)
def command_feed(args):

View File

@ -45,11 +45,6 @@ class WritethroughDict():
self.item[key] = value
self.flush()
def set(self, dict):
for key, value in dict.items():
self.item[key] = value
self.flush()
def __contains__(self, key):
return key in self.item
@ -76,6 +71,60 @@ def load_item(source_name, item_id):
item_path = os.path.join(DUNGEON_PATH, source_name, f'{item_id}.item')
return WritethroughDict.load(item_path)
def item_exists(source_name, item_id):
"""
Checks for the existence of an item.
"""
item_path = os.path.join(DUNGEON_PATH, source_name, f'{item_id}.item')
return os.path.isfile(item_path)
def get_item_ids(cell_name):
"""
Returns a list of item ids in the given cell.
"""
cell_path = os.path.join(DUNGEON_PATH, cell_name)
return [
filename[:-5]
for filename in os.listdir(cell_path)
if filename.endswith('.item')
]
def new_item(source_name, item):
"""
Creates a new item with the fields in the provided dictionary.
Initializes other fields to their default values.
"""
# id is required
if 'id' not in item:
raise Exception(f'Cannot create item with no id. Value = {item}')
# source must be filled in, so if it is absent it is auto-populated with
# source_name. Note: this allows sources to fill in a different source.
if 'source' not in item:
item['source'] = source_name
# active is forced to True for new items
item['active'] = True
# created is forced to the current timestamp
item['created'] = timestamp.now()
# title is auto-populated with the id if missing
if 'title' not in item:
item['title'] = item['id']
# tags is auto-populated if missing (not if empty!)
if 'tags' not in item:
item['tags'] = [source_name]
# All other fields are optional.
item_path = os.path.join(DUNGEON_PATH, item['source'], f'{item["id"]}.item')
return WritethroughDict.create(item_path, item)
def load_items(source_name):
"""
Returns a map of ids to items and a list of unreadable files.

View File

@ -9,6 +9,19 @@ from inquisitor import loader, timestamp, error
from inquisitor.configs import SOURCES_PATH, DUNGEON_PATH, logger
USE_NEWEST = (
'title',
'tags',
'link',
'time'
'author',
'body',
'ttl',
'ttd',
'tts',
)
def ensure_cell(name):
"""
Creates a cell in the dungeon. Idempotent.
@ -92,54 +105,59 @@ def update_source(source_name, fetch_new):
"""
Attempts to update the given source. Raises an exception if the source does.
"""
# Get the existing items from the source's cell.
prior_items, errors = loader.load_items(source_name)
if any(errors):
raise Exception(f'Can\'t update source "{source_name}", some items are corrupt')
logger.debug("Found {} prior items".format(len(prior_items)))
# Get a list of item ids that already existed in this source's cell.
prior_ids = loader.get_item_ids(source_name)
logger.debug(f'Found {len(prior_ids)} prior items')
# Get the feed items from the source's fetch method.
state = loader.load_state(source_name)
fetched = fetch_new(state)
fetched_items = {item['id']: item for item in fetched}
state.flush()
logger.debug(f'Fetched {len(fetched)} items')
fetched_items = {item['id']: item for item in fetched}
# Populate all the fetched items with required or auto-generated fields.
# This also provides an opportunity to throw if the source isn't returning
# valid items.
for item in fetched_items.values():
populate_new(source_name, item)
logger.debug("Fetched {} items".format(len(fetched_items)))
# Determine which items are new and which are updates.
# We query the file system here instead of checking against this source's
# item ids from above because sources are allowed to generate in other
# sources' cells.
new_items = []
updated_items = []
for item in fetched:
item_source = item.get('source', source_name)
if loader.item_exists(item_source, item['id']):
updated_items.append(item)
else:
new_items.append(item)
# Write all the new fetched items to the source's cell.
new_items = [
item for item in fetched_items.values()
if item['id'] not in prior_items]
# Write all the new items to the source's cell.
for item in new_items:
s = json.dumps(item)
path = os.path.join(DUNGEON_PATH, item['source'], item['id'] + ".item")
with open(path, 'w', encoding='utf8') as f:
f.write(s)
item_source = item.get('source', source_name)
loader.new_item(item_source, item)
# Update the extant items using the fetched item's values.
extant_items = [
item for item in fetched_items.values()
if item['id'] in prior_items]
for item in extant_items:
# The items in prior_items are writethrough dicts.
prior_item = prior_items[item['id']]
# Only bother updating active items.
if prior_item['active']:
populate_old(prior_item, item)
# Update the other items using the fetched items' values.
for new_item in updated_items:
old_item = loader.load_item(new_item['source'], new_item['id'])
for field in USE_NEWEST:
if field in new_item and old_item[field] != new_item[field]:
old_item[field] = new_item[field]
if 'callback' in new_item:
old_callback = old_item.get('callback', {})
# Because of the way this update happens, any fields that are set
# in the callback when the item is new will keep their original
# values, as those values reappear in new_item on subsequent
# updates.
old_item['callback'] = {**old_item['callback'], **new_item['callback']}
# In general, items are removed when they are old (not found in the last
# fetch) and inactive. Some item fields can change this basic behavior.
del_count = 0
now = timestamp.now()
old_items = [
item for item in prior_items.values()
if item['id'] not in fetched_items]
for item in old_items:
fetched_ids = [item['id'] for item in updated_items]
old_item_ids = [
item_id for item_id in prior_ids
if item_id not in fetched_ids]
for item_id in old_item_ids:
item = loader.load_item(source_name, item_id)
remove = not item['active']
# The time-to-live field protects an item from removal until expiry.
# This is mainly used to avoid old items resurfacing when their source
@ -170,35 +188,6 @@ def update_source(source_name, fetch_new):
len(new_items), "s" if len(new_items) != 1 else "",
del_count, "s" if del_count != 1 else ""))
def populate_new(source_name, item):
# id is required
if 'id' not in item:
raise Exception(f'Source "{source_name}" returned an item with no id')
# source is auto-populated with the source name if missing
# Note: this allows sources to create items in other cells!
if 'source' not in item: item['source'] = source_name
# active is forced to True for new items
item['active'] = True
# created is forced to the current timestamp
item['created'] = timestamp.now()
# title is auto-populated with the id if missing
if 'title' not in item: item['title'] = item['id']
# tags is auto-populated if missing (not if empty!)
if 'tags' not in item: item['tags'] = [source_name]
# link, time, author, body, ttl, ttd, tts, callback are optional
def populate_old(prior, new):
# Not updated: id, source, active, created
if 'title' in new: prior['title'] = new['title']
if 'tags' in new: prior['tags'] = new['tags']
if 'link' in new: prior['link'] = new['link']
if 'time' in new: prior['time'] = new['time']
if 'author' in new: prior['author'] = new['author']
if 'body' in new: prior['body'] = new['body']
if 'ttl' in new: prior['ttl'] = new['ttl']
if 'ttd' in new: prior['ttd'] = new['ttd']
if 'tts' in new: prior['tts'] = new['tts']
if 'callback' in new: prior['callback'] = new['callback']
def item_callback(source_name, itemid):
try: