239 lines
9.6 KiB
Python
239 lines
9.6 KiB
Python
"""Deterministic dedup/reconcile for the recurring "Events" calendar automation.

Pure stdlib. Given a list of candidate events (from the curation step) and the
events already on the target Google Calendar (from a calendar list call), split
the candidates into: new events to insert, duplicates to skip, candidates dropped
as past-dated, and candidates dropped because they exceed the per-run cap.

Identity model
--------------
An event's identity is ``(normalized_title, start_date)``. Every event we insert
is stamped with ``autoKey = sha1(normalized_title + "|" + start_date)``. But
matching does NOT depend on a stored key: the existing seed events were
hand-created and carry no autoKey, so we always also match on the ``(title,
date)`` identity directly. This is the fix for the most likely failure mode —
re-inserting events that are already on the calendar.

Storage note: the claude.ai Google Calendar connector's ``create_event`` cannot
write ``extendedProperties``, so autoKey cannot actually be persisted under that
backend. autoKey is still computed (a stable identity, forward-compatible with a
direct Calendar API / service-account backend), but with the connector the
load-bearing dedup path is the ``(title, date)`` match against the events already
on the calendar — which is why the existing keyless seed events must (and do)
dedup correctly.

Title matching is deliberately fuzzy (strip a trailing "(...)" tag, strip a
trailing "— Venue" segment, then compare with token-subset / Jaccard overlap)
because the same event is routinely reported with slightly different titles
across runs: with or without the venue, with or without a "(tickets req'd)" tag.

Determinism guarantees (do not rely on model judgment for these):
  - intra-run dedup: two candidate variants of the same event in one batch collapse
    to a single insert;
  - past-date floor: with ``today`` set, any candidate starting before today is
    dropped (so the "no past events" rule has a deterministic source);
  - cap: with ``max_new`` set, inserts beyond the cap overflow into a bucket rather
    than being silently truncated by call order.

This module is intentionally dependency-free and side-effect-free so it can be
unit-tested in isolation and dropped verbatim into the routine.
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import hashlib
|
||
|
|
import re
|
||
|
|
|
||
|
|
# Any run of whitespace; collapsed to one space when a title is normalized.
_WS = re.compile(r"\s+")
# Any character that is neither a word character nor whitespace; replaced by
# a space before a title is split into tokens.
_PUNCT = re.compile(r"[^\w\s]")
# One trailing parenthetical tag, appended by convention: "(tickets req'd)".
_TRAILING_PAREN = re.compile(r"\s*\([^()]*\)\s*$")
# The separator in "Event Title — Venue": a whitespace-padded em dash
# (U+2014), en dash (U+2013), or plain hyphen.
_VENUE_SEP = re.compile(r"\s+[—–-]\s+")

# Minimum Jaccard token overlap at which two titles count as the same event.
JACCARD_THRESHOLD = 0.6
|
||
|
|
|
||
|
|
|
||
|
|
def normalize_title(title: str) -> str:
    """Return the canonical comparison form of an event title.

    The title is trimmed and lowercased, a trailing "(...)" tag such as
    "(tickets req'd)" is removed, and every run of whitespace becomes a
    single space.
    """
    lowered = title.strip().lower()
    untagged = _TRAILING_PAREN.sub("", lowered)
    return _WS.sub(" ", untagged).strip()
|
||
|
|
|
||
|
|
|
||
|
|
def strip_venue(normalized: str) -> str:
    """Return the text before the first padded-dash " — Venue" separator.

    A title without such a separator is returned whole (stripped).
    """
    head, *_ = _VENUE_SEP.split(normalized, 1)
    return head.strip()
|
||
|
|
|
||
|
|
|
||
|
|
def _tokens(s: str) -> set[str]:
    """Return the set of words in ``s`` once punctuation is turned into spaces."""
    depunctuated = _PUNCT.sub(" ", s)
    # str.split() with no separator never yields empty strings.
    return set(depunctuated.split())
|
||
|
|
|
||
|
|
|
||
|
|
def _jaccard(a: set[str], b: set[str]) -> float:
    """Return the Jaccard index ``|a & b| / |a | b|``; 0.0 if either set is empty."""
    if a and b:
        return len(a & b) / len(a | b)
    return 0.0
|
||
|
|
|
||
|
|
|
||
|
|
def titles_match(a: str, b: str) -> bool:
    """Decide whether two raw titles name the same event.

    Checks run from strictest to loosest and the first hit wins:

    1. the titles are equal after normalization;
    2. they are equal after also dropping a " — Venue" suffix;
    3. both venue-less token sets are non-empty and one contains the other;
    4. the Jaccard overlap of those token sets reaches ``JACCARD_THRESHOLD``.
    """
    norm_a = normalize_title(a)
    norm_b = normalize_title(b)
    if norm_a == norm_b:
        return True

    core_a = strip_venue(norm_a)
    core_b = strip_venue(norm_b)
    if core_a == core_b:
        return True

    words_a = _tokens(core_a)
    words_b = _tokens(core_b)
    nested = words_a.issubset(words_b) or words_a.issuperset(words_b)
    if words_a and words_b and nested:
        return True
    return _jaccard(words_a, words_b) >= JACCARD_THRESHOLD
|
||
|
|
|
||
|
|
|
||
|
|
def start_date(event: dict) -> str:
    """Return the ``YYYY-MM-DD`` start day of a candidate or a Google event.

    Two shapes are accepted:

    - Google event: ``{"start": {"dateTime": ...}}`` or
      ``{"start": {"date": ...}}`` (``dateTime`` wins when both are set);
    - candidate: ``{"start": "2026-06-12T19:00:00"}`` or the all-day form
      ``{"start": "2026-06-12"}``.

    Returns ``""`` when the start is missing, null, empty, or not a string,
    so a malformed entry reads as "date unknown" instead of raising
    ``TypeError`` (or leaking a non-string value) from the slice below.

    The day is simply the first ten characters of the stored value; no
    timezone conversion is applied.
    """
    start = event.get("start")
    if isinstance(start, dict):
        start = start.get("dateTime") or start.get("date")
    if not isinstance(start, str):
        return ""
    return start[:10]
|
||
|
|
|
||
|
|
|
||
|
|
def title_of(event: dict) -> str:
    """Return the event's ``summary`` if truthy, else its ``title``, else ''."""
    for field in ("summary", "title"):
        value = event.get(field)
        if value:
            return value
    return ""
|
||
|
|
|
||
|
|
|
||
|
|
def auto_key(title: str, date: str) -> str:
    """Return the hex SHA-1 of ``"<normalized title>|<date>"``, UTF-8 encoded."""
    identity = "{}|{}".format(normalize_title(title), date)
    return hashlib.sha1(identity.encode("utf-8")).hexdigest()
|
||
|
|
|
||
|
|
|
||
|
|
def _existing_autokey(event: dict) -> str | None:
    """Return ``event["extendedProperties"]["private"]["autoKey"]``, or None.

    Any missing or null level along that path yields None.
    """
    node = event
    for field in ("extendedProperties", "private"):
        # Google may return null (not just omit the key) for these on
        # hand-added events; `or {}` covers both cases.
        node = node.get(field) or {}
    return node.get("autoKey")
|
||
|
|
|
||
|
|
|
||
|
|
def is_duplicate(candidate: dict, existing: dict) -> bool:
    """Report whether ``candidate`` is the same event as ``existing``.

    Three routes lead to True:

    - ``existing`` carries a stored autoKey equal to the candidate's computed
      key;
    - the candidate has a truthy ``recurrence`` and the titles fuzzy-match,
      whatever the dates;
    - both start on the same known calendar day and the titles fuzzy-match.
    """
    cand_title = title_of(candidate)

    stored_key = _existing_autokey(existing)
    if stored_key and stored_key == auto_key(cand_title, start_date(candidate)):
        return True

    exist_title = title_of(existing)
    if candidate.get("recurrence"):
        # A recurring candidate covers the whole horizon, so a fuzzy title
        # match alone decides it. The existing copy need not be flagged
        # recurring: the connector may return an expanded instance.
        return titles_match(cand_title, exist_title)

    # One-off candidate: the calendar day must be known and identical.
    cand_day = start_date(candidate)
    if not cand_day or cand_day != start_date(existing):
        return False
    return titles_match(cand_title, exist_title)
|
||
|
|
|
||
|
|
|
||
|
|
def reconcile(candidates: list[dict], existing: list[dict],
              today: str | None = None, max_new: int | None = None) -> dict:
    """Split candidates into insert / skip / dropped_past / dropped_overflow.

    Args:
        candidates: candidate events, in upstream priority order.
        existing: events already on the calendar.
        today: optional ``YYYY-MM-DD`` floor. A candidate whose start date is
            known and sorts before it goes to ``dropped_past``.
        max_new: optional cap on inserts. Accepted candidates beyond the cap
            move to ``dropped_overflow`` in input order, so upstream priority
            ordering is preserved. A negative cap is treated as 0; without
            the clamp, negative slicing would keep all but the last
            ``-max_new`` inserts.

    Returns:
        A dict with four lists:

        - ``insert``: shallow copies of accepted candidates, each with a
          computed ``autoKey`` added;
        - ``skip``: ``{"candidate", "matched", "reason"}`` entries, where
          ``matched`` is the title of the event the candidate duplicates;
        - ``dropped_past``: ``{"candidate", "reason"}`` entries;
        - ``dropped_overflow``: the stamped copies cut off by ``max_new``.

    Duplicates are detected against ``existing`` AND against candidates
    already accepted this run, so two variants of the same event collapse to
    a single insert.
    """
    inserts: list[dict] = []
    skips: list[dict] = []
    dropped_past: list[dict] = []

    for c in candidates:
        cd = start_date(c)
        # Plain string comparison; assumes both values are zero-padded
        # YYYY-MM-DD, which sort chronologically as text.
        if today and cd and cd < today:
            dropped_past.append({"candidate": c, "reason": f"starts {cd}, before {today}"})
            continue

        # Look on the calendar first, then among this run's accepted inserts.
        match = next((e for e in existing if is_duplicate(c, e)), None)
        if match is None:
            match = next((p for p in inserts if is_duplicate(c, p)), None)
        if match is not None:
            skips.append({"candidate": c, "matched": title_of(match),
                          "reason": "already present"})
            continue

        # Stamp a copy so the caller's candidate dict is left untouched.
        stamped = dict(c)
        stamped["autoKey"] = auto_key(title_of(c), cd)
        inserts.append(stamped)

    dropped_overflow: list[dict] = []
    if max_new is not None:
        cap = max(max_new, 0)
        dropped_overflow = inserts[cap:]
        inserts = inserts[:cap]

    return {"insert": inserts, "skip": skips,
            "dropped_past": dropped_past, "dropped_overflow": dropped_overflow}
|
||
|
|
|
||
|
|
|
||
|
|
def as_event_list(obj) -> list[dict]:
    """Return the event list from a bare list or a calendar-tool wrapper dict.

    A list is returned as-is. For a dict, the first of ``"events"`` /
    ``"items"`` whose value is a list is returned. Anything else gives ``[]``.
    """
    if isinstance(obj, list):
        return obj
    if not isinstance(obj, dict):
        return []
    wrapped = (obj.get(key) for key in ("events", "items"))
    return next((value for value in wrapped if isinstance(value, list)), [])
|
||
|
|
|
||
|
|
|
||
|
|
def _main(argv: list[str]) -> int:
    """Command-line entry point: reconcile two JSON files and print the outcome.

    ``argv`` holds the arguments without the program name. The result is
    printed as indented JSON, or as a human-readable summary with
    ``--explain``. Returns 0 once the output has been printed.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(
        description="Reconcile candidate events against events already on the calendar.")
    parser.add_argument("candidates", help="JSON file: list of candidate events")
    parser.add_argument(
        "existing",
        help="JSON file: calendar events (a list, or {\"events\": [...]})")
    parser.add_argument(
        "--today", metavar="YYYY-MM-DD",
        help="drop candidates that start before this date")
    parser.add_argument(
        "--max", type=int, dest="max_new", metavar="N",
        help="cap inserts at N; the rest go to dropped_overflow (input order)")
    parser.add_argument(
        "--explain", action="store_true",
        help="print a human-readable summary instead of machine JSON")
    opts = parser.parse_args(argv)

    def load_events(path):
        # Both inputs may be a bare list or a wrapper object.
        with open(path, encoding="utf-8") as handle:
            return as_event_list(json.load(handle))

    candidates = load_events(opts.candidates)
    existing = load_events(opts.existing)

    result = reconcile(candidates, existing, today=opts.today, max_new=opts.max_new)

    if not opts.explain:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0

    accepted = result["insert"]
    skipped = result["skip"]
    past = result["dropped_past"]
    overflow = result["dropped_overflow"]

    # The header ends in "\n", so print() leaves a blank line after it.
    print(f"{len(accepted)} to insert, "
          f"{len(skipped)} skipped (already present), "
          f"{len(past)} dropped (past), "
          f"{len(overflow)} dropped (over cap):\n")
    for event in accepted:
        print(f" + {title_of(event)} [{start_date(event)}]")
    for entry in skipped:
        print(f' = {title_of(entry["candidate"])} -> matches "{entry["matched"]}"')
    for entry in past:
        print(f' x {title_of(entry["candidate"])} [{start_date(entry["candidate"])}] (past)')
    for event in overflow:
        print(f" ~ {title_of(event)} [{start_date(event)}] (over cap)")
    return 0
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: hand the command-line arguments (minus the program
# name) to _main and exit with the status it returns.
if __name__ == "__main__":
    import sys

    sys.exit(_main(sys.argv[1:]))
|