"""Deterministic dedup/reconcile for the recurring "Events" calendar automation. Pure stdlib. Given a list of candidate events (from the curation step) and the events already on the target Google Calendar (from a calendar list call), split the candidates into: new events to insert, duplicates to skip, candidates dropped as past-dated, and candidates dropped because they exceed the per-run cap. Identity model -------------- An event's identity is ``(normalized_title, start_date)``. Every event we insert is stamped with ``autoKey = sha1(normalized_title + "|" + start_date)``. But matching does NOT depend on a stored key: the existing seed events were hand-created and carry no autoKey, so we always also match on the ``(title, date)`` identity directly. This is the fix for the most likely failure mode — re-inserting events that are already on the calendar. Storage note: the claude.ai Google Calendar connector's ``create_event`` cannot write ``extendedProperties``, so autoKey cannot actually be persisted under that backend. autoKey is still computed (a stable identity, forward-compatible with a direct Calendar API / service-account backend), but with the connector the load-bearing dedup path is the ``(title, date)`` match against the events already on the calendar — which is why the existing keyless seed events must (and do) dedup correctly. Title matching is deliberately fuzzy (strip a trailing "(...)" tag, strip a trailing "— Venue" segment, then compare with token-subset / Jaccard overlap) because the same event is routinely reported with slightly different titles across runs: with or without the venue, with or without a "(tickets req'd)" tag. Determinism guarantees (do not rely on model judgment for these): - intra-run dedup: two candidate variants of the same event in one batch collapse to a single insert; - past-date floor: with ``today`` set, any candidate starting before today is dropped (so the "no past events" rule has a deterministic source); - cap: with ``max_new`` set, inserts beyond the cap overflow into a bucket rather than being silently truncated by call order. This module is intentionally dependency-free and side-effect-free so it can be unit-tested in isolation and dropped verbatim into the routine. """ from __future__ import annotations import hashlib import re # A trailing parenthetical tag we append by convention, e.g. "(tickets req'd)". _TRAILING_PAREN = re.compile(r"\s*\([^()]*\)\s*$") # "Event Title — Venue": em dash (U+2014), en dash (U+2013), or hyphen, padded. _VENUE_SEP = re.compile(r"\s+[—–-]\s+") _WS = re.compile(r"\s+") _PUNCT = re.compile(r"[^\w\s]") JACCARD_THRESHOLD = 0.6 def normalize_title(title: str) -> str: """Lowercase, drop a trailing "(...)" tag, collapse whitespace.""" t = title.strip().lower() t = _TRAILING_PAREN.sub("", t) t = _WS.sub(" ", t).strip() return t def strip_venue(normalized: str) -> str: """Keep only the part before the first " — Venue" separator.""" return _VENUE_SEP.split(normalized, 1)[0].strip() def _tokens(s: str) -> set[str]: return {w for w in _PUNCT.sub(" ", s).split() if w} def _jaccard(a: set[str], b: set[str]) -> float: if not a or not b: return 0.0 return len(a & b) / len(a | b) def titles_match(a: str, b: str) -> bool: na, nb = normalize_title(a), normalize_title(b) if na == nb: return True sa, sb = strip_venue(na), strip_venue(nb) if sa == sb: return True ta, tb = _tokens(sa), _tokens(sb) if ta and tb and (ta <= tb or tb <= ta): # one is a subset of the other return True return _jaccard(ta, tb) >= JACCARD_THRESHOLD def start_date(event: dict) -> str: """Return ``YYYY-MM-DD`` for a candidate or a Google event ('' if unknown). Handles a Google event ({"start": {"dateTime"|"date": ...}}) and a candidate ({"start": "2026-06-12T19:00:00"} or all-day {"start": "2026-06-12"}). """ start = event.get("start") if isinstance(start, dict): val = start.get("dateTime") or start.get("date") or "" else: val = start or "" return val[:10] def title_of(event: dict) -> str: return event.get("summary") or event.get("title") or "" def auto_key(title: str, date: str) -> str: basis = f"{normalize_title(title)}|{date}" return hashlib.sha1(basis.encode("utf-8")).hexdigest() def _existing_autokey(event: dict) -> str | None: # Google may return null (not just absent) for these on hand-added events; # `or {}` guards both the missing-key and present-but-null cases. ep = event.get("extendedProperties") or {} priv = ep.get("private") or {} return priv.get("autoKey") def is_duplicate(candidate: dict, existing: dict) -> bool: """Is ``candidate`` the same event as the already-present ``existing``?""" ek = _existing_autokey(existing) if ek and ek == auto_key(title_of(candidate), start_date(candidate)): return True ct, et = title_of(candidate), title_of(existing) # A recurring candidate covers the whole horizon, so it is a duplicate if ANY # existing event shares its (fuzzy) title — regardless of date, and whether or # not the existing copy is flagged recurring (the connector may return an # expanded instance). if candidate.get("recurrence") and titles_match(ct, et): return True # Otherwise require the same calendar day and a fuzzy title match. cd = start_date(candidate) if cd and cd == start_date(existing) and titles_match(ct, et): return True return False def reconcile(candidates: list[dict], existing: list[dict], today: str | None = None, max_new: int | None = None) -> dict: """Split candidates into insert / skip / dropped_past / dropped_overflow. - ``today`` (``YYYY-MM-DD``): candidates starting before it are dropped_past. - ``max_new``: inserts beyond the cap overflow into dropped_overflow, in input order — so upstream priority ordering is preserved, not truncated by authoring accident. Duplicates are detected against ``existing`` AND against candidates already accepted this run (so two variants of the same event collapse to one insert). Each insert is the candidate dict plus a computed ``autoKey``. """ inserts: list[dict] = [] skips: list[dict] = [] dropped_past: list[dict] = [] for c in candidates: cd = start_date(c) if today and cd and cd < today: dropped_past.append({"candidate": c, "reason": f"starts {cd}, before {today}"}) continue match = next((e for e in existing if is_duplicate(c, e)), None) if match is None: # collapse same-run duplicates too match = next((p for p in inserts if is_duplicate(c, p)), None) if match is not None: skips.append({"candidate": c, "matched": title_of(match), "reason": "already present"}) continue stamped = dict(c) stamped["autoKey"] = auto_key(title_of(c), cd) inserts.append(stamped) dropped_overflow: list[dict] = [] if max_new is not None and len(inserts) > max_new: dropped_overflow = inserts[max_new:] inserts = inserts[:max_new] return {"insert": inserts, "skip": skips, "dropped_past": dropped_past, "dropped_overflow": dropped_overflow} def as_event_list(obj) -> list[dict]: """Accept a bare list, or the wrapper objects the calendar tools return.""" if isinstance(obj, list): return obj if isinstance(obj, dict): for key in ("events", "items"): if isinstance(obj.get(key), list): return obj[key] return [] def _main(argv: list[str]) -> int: import argparse import json p = argparse.ArgumentParser( description="Reconcile candidate events against events already on the calendar.") p.add_argument("candidates", help="JSON file: list of candidate events") p.add_argument("existing", help="JSON file: calendar events (a list, or {\"events\": [...]})") p.add_argument("--today", metavar="YYYY-MM-DD", help="drop candidates that start before this date") p.add_argument("--max", type=int, dest="max_new", metavar="N", help="cap inserts at N; the rest go to dropped_overflow (input order)") p.add_argument("--explain", action="store_true", help="print a human-readable summary instead of machine JSON") args = p.parse_args(argv) with open(args.candidates, encoding="utf-8") as f: candidates = as_event_list(json.load(f)) with open(args.existing, encoding="utf-8") as f: existing = as_event_list(json.load(f)) result = reconcile(candidates, existing, today=args.today, max_new=args.max_new) if args.explain: print(f"{len(result['insert'])} to insert, " f"{len(result['skip'])} skipped (already present), " f"{len(result['dropped_past'])} dropped (past), " f"{len(result['dropped_overflow'])} dropped (over cap):\n") for i in result["insert"]: print(f" + {title_of(i)} [{start_date(i)}]") for s in result["skip"]: print(f' = {title_of(s["candidate"])} -> matches "{s["matched"]}"') for d in result["dropped_past"]: print(f' x {title_of(d["candidate"])} [{start_date(d["candidate"])}] (past)') for d in result["dropped_overflow"]: print(f" ~ {title_of(d)} [{start_date(d)}] (over cap)") else: print(json.dumps(result, ensure_ascii=False, indent=2)) return 0 if __name__ == "__main__": import sys raise SystemExit(_main(sys.argv[1:]))