# event_curator/reconcile.py

"""Deterministic dedup/reconcile for the recurring "Events" calendar automation.

Pure stdlib. Given a list of candidate events (from the curation step) and the
events already on the target Google Calendar (from a calendar list call), split
the candidates into: new events to insert, duplicates to skip, candidates dropped
as past-dated, and candidates dropped because they exceed the per-run cap.

Identity model
--------------
An event's identity is ``(normalized_title, start_date)``. Every event we insert
is stamped with ``autoKey = sha1(normalized_title + "|" + start_date)``. But
matching does NOT depend on a stored key: the existing seed events were
hand-created and carry no autoKey, so we always also match on the ``(title,
date)`` identity directly. This is the fix for the most likely failure mode —
re-inserting events that are already on the calendar.

Storage note: the claude.ai Google Calendar connector's ``create_event`` cannot
write ``extendedProperties``, so autoKey cannot actually be persisted under that
backend. autoKey is still computed (a stable identity, forward-compatible with a
direct Calendar API / service-account backend), but with the connector the
load-bearing dedup path is the ``(title, date)`` match against the events already
on the calendar — which is why the existing keyless seed events must (and do)
dedup correctly.

Title matching is deliberately fuzzy (strip a trailing "(...)" tag, strip a
trailing "— Venue" segment, then compare with token-subset / Jaccard overlap)
because the same event is routinely reported with slightly different titles
across runs: with or without the venue, with or without a "(tickets req'd)" tag.

Determinism guarantees (do not rely on model judgment for these):
- intra-run dedup: two candidate variants of the same event in one batch collapse
  to a single insert;
- past-date floor: with ``today`` set, any candidate starting before today is
  dropped (so the "no past events" rule has a deterministic source);
- cap: with ``max_new`` set, inserts beyond the cap overflow into a bucket rather
  than being silently truncated by call order.

This module is intentionally dependency-free and side-effect-free so it can be
unit-tested in isolation and dropped verbatim into the routine.
"""
from __future__ import annotations

import hashlib
import re

# One trailing parenthetical tag at the very end of a title, together with any
# whitespace around it, e.g. " (tickets req'd)". `[^()]*` excludes nested
# parentheses, so only a flat, final "(...)" group is matched.
_TRAILING_PAREN = re.compile(r"\s*\([^()]*\)\s*$")
# Separator in "Event Title — Venue": em dash (U+2014), en dash (U+2013), or
# hyphen, with whitespace required on both sides. The mandatory padding means a
# hyphen inside a word (no surrounding spaces) is not a separator.
_VENUE_SEP = re.compile(r"\s+[—–-]\s+")
# Any run of whitespace; normalize_title replaces each run with a single space.
_WS = re.compile(r"\s+")
# Any character that is neither a word character nor whitespace; _tokens turns
# each one into a space before splitting a title into words.
_PUNCT = re.compile(r"[^\w\s]")

# Minimum Jaccard overlap of title tokens at which titles_match reports a match.
JACCARD_THRESHOLD = 0.6


def normalize_title(title: str) -> str:
    """Return the canonical comparison form of an event title.

    The title is trimmed and lowercased, one trailing "(...)" tag is removed,
    and every run of whitespace is collapsed to a single space.
    """
    without_tag = _TRAILING_PAREN.sub("", title.strip().lower())
    return _WS.sub(" ", without_tag).strip()


def strip_venue(normalized: str) -> str:
    """Return the text that precedes the first padded dash separator.

    When the title contains no " — " / " – " / " - " separator it is returned
    whole (trimmed).
    """
    separator = _VENUE_SEP.search(normalized)
    if separator is None:
        return normalized.strip()
    return normalized[:separator.start()].strip()


def _tokens(s: str) -> set[str]:
    """Return the set of words in ``s``, treating punctuation as spacing."""
    spaced = _PUNCT.sub(" ", s)
    # str.split() with no argument never yields empty strings.
    return set(spaced.split())


def _jaccard(a: set[str], b: set[str]) -> float:
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)


def titles_match(a: str, b: str) -> bool:
    """Decide whether two raw titles name the same event.

    Checks run from strictest to loosest: equal normalized titles, equal
    titles once the venue segment is removed, one token set contained in the
    other, and finally Jaccard overlap of at least ``JACCARD_THRESHOLD``.
    """
    left, right = normalize_title(a), normalize_title(b)
    if left == right:
        return True

    core_left, core_right = strip_venue(left), strip_venue(right)
    if core_left == core_right:
        return True

    words_left, words_right = _tokens(core_left), _tokens(core_right)
    # Containment only counts when both sides actually have words; an empty set
    # is a subset of everything and would otherwise match any title.
    contained = bool(words_left and words_right) and (
        words_left.issubset(words_right) or words_right.issubset(words_left))
    return contained or _jaccard(words_left, words_right) >= JACCARD_THRESHOLD


# Shape of a calendar day at the front of an ISO-8601 / RFC 3339 value.
# re.ASCII keeps \d to 0-9 so look-alike Unicode digits are rejected.
_ISO_DAY = re.compile(r"\d{4}-\d{2}-\d{2}", re.ASCII)


def start_date(event: dict) -> str:
    """Return ``YYYY-MM-DD`` for a candidate or a Google event ('' if unknown).

    Handles a Google event ({"start": {"dateTime"|"date": ...}}) and a candidate
    ({"start": "2026-06-12T19:00:00"} or all-day {"start": "2026-06-12"}).

    The result is either a string shaped exactly like ``YYYY-MM-DD`` or ''.
    A missing start, a non-string start (e.g. a numeric timestamp), or a string
    that does not begin with an ISO day (e.g. "12/06/2026") all yield ''.
    Callers compare the result lexicographically against ``today``, so letting
    an arbitrary 10-character prefix through could misclassify an event as
    past-dated; returning '' keeps such events out of date comparisons.

    Only the shape is checked; the day is not validated as a real calendar
    date, and no timezone conversion is applied to ``dateTime`` values.
    """
    start = event.get("start")
    if isinstance(start, dict):
        # Google sets exactly one of dateTime/date; the other may be null.
        val = start.get("dateTime") or start.get("date")
    else:
        val = start
    if not isinstance(val, str):
        return ""
    day = val.strip()[:10]
    return day if _ISO_DAY.fullmatch(day) else ""


def title_of(event: dict) -> str:
    """Return the event's title: ``summary`` if set, else ``title``, else ''."""
    for field in ("summary", "title"):
        value = event.get(field)
        if value:
            return value
    return ""


def auto_key(title: str, date: str) -> str:
    """Return the SHA-1 hex digest of "<normalized title>|<date>".

    This is the stable identity stamped on inserted events as ``autoKey``.
    """
    identity = "{}|{}".format(normalize_title(title), date)
    digest = hashlib.sha1()
    digest.update(identity.encode("utf-8"))
    return digest.hexdigest()


def _existing_autokey(event: dict) -> str | None:
    # Google may return null (not just absent) for these on hand-added events;
    # `or {}` guards both the missing-key and present-but-null cases.
    ep = event.get("extendedProperties") or {}
    priv = ep.get("private") or {}
    return priv.get("autoKey")


def is_duplicate(candidate: dict, existing: dict) -> bool:
    """Report whether ``candidate`` is the same event as ``existing``.

    Three checks are tried in order; the first that succeeds decides:

    1. ``existing`` carries a stored autoKey equal to the candidate's key.
    2. The candidate is recurring and the titles fuzzily match (date ignored).
    3. Both start on the same known calendar day and the titles fuzzily match.
    """
    stored_key = _existing_autokey(existing)
    if stored_key:
        if stored_key == auto_key(title_of(candidate), start_date(candidate)):
            return True

    cand_title = title_of(candidate)
    prior_title = title_of(existing)

    # A recurring candidate spans the whole horizon, so any existing event with
    # a matching title counts — whatever its date, and whether or not the
    # existing copy is itself flagged recurring (the connector may return an
    # expanded instance).
    if candidate.get("recurrence"):
        if titles_match(cand_title, prior_title):
            return True

    # One-off events: an unknown or different day rules out a match before the
    # titles are even compared.
    cand_day = start_date(candidate)
    if not cand_day or cand_day != start_date(existing):
        return False
    return titles_match(cand_title, prior_title)


def reconcile(candidates: list[dict], existing: list[dict],
              today: str | None = None, max_new: int | None = None) -> dict:
    """Split candidates into insert / skip / dropped_past / dropped_overflow.

    Args:
        candidates: candidate events, in upstream priority order.
        existing: events already on the calendar.
        today: optional ``YYYY-MM-DD`` floor. A candidate whose start date is
            known and sorts before it goes to ``dropped_past``. A candidate
            with no start date is not dropped by this rule.
        max_new: optional cap on inserts. Accepted candidates beyond the cap
            move to ``dropped_overflow`` in input order, so upstream priority
            ordering is preserved. A negative cap is treated as 0 (insert
            nothing); it is never used as a from-the-end slice index.

    Returns:
        A dict with four lists:

        - ``insert``: shallow copies of accepted candidates, each with a
          computed ``autoKey`` added.
        - ``skip``: ``{"candidate", "matched", "reason"}`` for every candidate
          that duplicates an existing event or one accepted earlier this run
          (so two variants of the same event collapse to one insert).
        - ``dropped_past``: ``{"candidate", "reason"}`` for past-dated ones.
        - ``dropped_overflow``: stamped candidates that exceeded ``max_new``.

    The input lists and the candidate dicts themselves are not modified.
    """
    inserts: list[dict] = []
    skips: list[dict] = []
    dropped_past: list[dict] = []
    for c in candidates:
        cd = start_date(c)
        # ISO dates sort chronologically as plain strings.
        if today and cd and cd < today:
            dropped_past.append({"candidate": c, "reason": f"starts {cd}, before {today}"})
            continue
        match = next((e for e in existing if is_duplicate(c, e)), None)
        if match is None:  # collapse same-run duplicates too
            match = next((p for p in inserts if is_duplicate(c, p)), None)
        if match is not None:
            skips.append({"candidate": c, "matched": title_of(match),
                          "reason": "already present"})
            continue
        stamped = dict(c)  # shallow copy: never mutate the caller's candidate
        stamped["autoKey"] = auto_key(title_of(c), cd)
        inserts.append(stamped)

    dropped_overflow: list[dict] = []
    if max_new is not None:
        # Clamp so a negative cap cannot act as a negative slice index, which
        # would drop only the tail and insert everything else.
        cap = max(max_new, 0)
        inserts, dropped_overflow = inserts[:cap], inserts[cap:]

    return {"insert": inserts, "skip": skips,
            "dropped_past": dropped_past, "dropped_overflow": dropped_overflow}


def as_event_list(obj) -> list[dict]:
    """Unwrap a calendar payload into a list of events.

    A list is returned as-is. For a dict, the first of ``"events"`` /
    ``"items"`` that holds a list is returned. Anything else yields ``[]``.
    """
    if isinstance(obj, list):
        return obj
    if not isinstance(obj, dict):
        return []
    for wrapper_key in ("events", "items"):
        payload = obj.get(wrapper_key)
        if isinstance(payload, list):
            return payload
    return []


def _main(argv: list[str]) -> int:
    """Command-line entry point; returns the process exit status (always 0).

    Reads the two JSON files named on the command line, reconciles them, and
    prints either the raw result as JSON or, with ``--explain``, a
    human-readable summary.
    """
    import argparse
    import json

    def read_events(path: str) -> list[dict]:
        # Either file may hold a bare list or a wrapper object.
        with open(path, encoding="utf-8") as handle:
            return as_event_list(json.load(handle))

    parser = argparse.ArgumentParser(
        description="Reconcile candidate events against events already on the calendar.")
    parser.add_argument("candidates", help="JSON file: list of candidate events")
    parser.add_argument("existing",
                        help="JSON file: calendar events (a list, or {\"events\": [...]})")
    parser.add_argument("--today", metavar="YYYY-MM-DD",
                        help="drop candidates that start before this date")
    parser.add_argument("--max", type=int, dest="max_new", metavar="N",
                        help="cap inserts at N; the rest go to dropped_overflow (input order)")
    parser.add_argument("--explain", action="store_true",
                        help="print a human-readable summary instead of machine JSON")
    opts = parser.parse_args(argv)

    candidates = read_events(opts.candidates)
    existing = read_events(opts.existing)
    outcome = reconcile(candidates, existing, today=opts.today, max_new=opts.max_new)

    if not opts.explain:
        print(json.dumps(outcome, ensure_ascii=False, indent=2))
        return 0

    to_insert = outcome["insert"]
    skipped = outcome["skip"]
    past = outcome["dropped_past"]
    over_cap = outcome["dropped_overflow"]

    # Header line, followed by a blank line (the trailing "\n").
    print(f"{len(to_insert)} to insert, "
          f"{len(skipped)} skipped (already present), "
          f"{len(past)} dropped (past), "
          f"{len(over_cap)} dropped (over cap):\n")
    for event in to_insert:
        print(f"  + {title_of(event)}  [{start_date(event)}]")
    for entry in skipped:
        print(f'  = {title_of(entry["candidate"])}  ->  matches "{entry["matched"]}"')
    for entry in past:
        dropped = entry["candidate"]
        print(f'  x {title_of(dropped)}  [{start_date(dropped)}]  (past)')
    for event in over_cap:
        print(f"  ~ {title_of(event)}  [{start_date(event)}]  (over cap)")
    return 0


if __name__ == "__main__":
    # Script entry: run the CLI and use its return value as the exit status.
    import sys

    sys.exit(_main(sys.argv[1:]))
Weekly Events-calendar curation routine and deterministic deduper 2026-06-08 02:55:05 +00:00			`"""Deterministic dedup/reconcile for the recurring "Events" calendar automation.`

			`Pure stdlib. Given a list of candidate events (from the curation step) and the`
			`events already on the target Google Calendar (from a calendar list call), split`
			`the candidates into: new events to insert, duplicates to skip, candidates dropped`
			`as past-dated, and candidates dropped because they exceed the per-run cap.`

			`Identity model`
			`--------------`
			An event's identity is ``(normalized_title, start_date)``. Every event we insert
			is stamped with ``autoKey = sha1(normalized_title + "\|" + start_date)``. But
			`matching does NOT depend on a stored key: the existing seed events were`
			hand-created and carry no autoKey, so we always also match on the ``(title,
			date)`` identity directly. This is the fix for the most likely failure mode —
			`re-inserting events that are already on the calendar.`

			Storage note: the claude.ai Google Calendar connector's ``create_event`` cannot
			write ``extendedProperties``, so autoKey cannot actually be persisted under that
			`backend. autoKey is still computed (a stable identity, forward-compatible with a`
			`direct Calendar API / service-account backend), but with the connector the`
			load-bearing dedup path is the ``(title, date)`` match against the events already
			`on the calendar — which is why the existing keyless seed events must (and do)`
			`dedup correctly.`

			`Title matching is deliberately fuzzy (strip a trailing "(...)" tag, strip a`
			`trailing "— Venue" segment, then compare with token-subset / Jaccard overlap)`
			`because the same event is routinely reported with slightly different titles`
			`across runs: with or without the venue, with or without a "(tickets req'd)" tag.`

			`Determinism guarantees (do not rely on model judgment for these):`
			`- intra-run dedup: two candidate variants of the same event in one batch collapse`
			`to a single insert;`
			- past-date floor: with ``today`` set, any candidate starting before today is
			`dropped (so the "no past events" rule has a deterministic source);`
			- cap: with ``max_new`` set, inserts beyond the cap overflow into a bucket rather
			`than being silently truncated by call order.`

			`This module is intentionally dependency-free and side-effect-free so it can be`
			`unit-tested in isolation and dropped verbatim into the routine.`
			`"""`
			`from __future__ import annotations`

			`import hashlib`
			`import re`

			`# A trailing parenthetical tag we append by convention, e.g. "(tickets req'd)".`
			`_TRAILING_PAREN = re.compile(r"\s*\([^()]*\)\s*$")`
			`# "Event Title — Venue": em dash (U+2014), en dash (U+2013), or hyphen, padded.`
			`_VENUE_SEP = re.compile(r"\s+[—–-]\s+")`
			`_WS = re.compile(r"\s+")`
			`_PUNCT = re.compile(r"[^\w\s]")`

			`JACCARD_THRESHOLD = 0.6`


			`def normalize_title(title: str) -> str:`
			`"""Lowercase, drop a trailing "(...)" tag, collapse whitespace."""`
			`t = title.strip().lower()`
			`t = _TRAILING_PAREN.sub("", t)`
			`t = _WS.sub(" ", t).strip()`
			`return t`


			`def strip_venue(normalized: str) -> str:`
			`"""Keep only the part before the first " — Venue" separator."""`
			`return _VENUE_SEP.split(normalized, 1)[0].strip()`


			`def _tokens(s: str) -> set[str]:`
			`return {w for w in _PUNCT.sub(" ", s).split() if w}`


			`def _jaccard(a: set[str], b: set[str]) -> float:`
			`if not a or not b:`
			`return 0.0`
			`return len(a & b) / len(a \| b)`


			`def titles_match(a: str, b: str) -> bool:`
			`na, nb = normalize_title(a), normalize_title(b)`
			`if na == nb:`
			`return True`
			`sa, sb = strip_venue(na), strip_venue(nb)`
			`if sa == sb:`
			`return True`
			`ta, tb = _tokens(sa), _tokens(sb)`
			`if ta and tb and (ta <= tb or tb <= ta): # one is a subset of the other`
			`return True`
			`return _jaccard(ta, tb) >= JACCARD_THRESHOLD`


			`def start_date(event: dict) -> str:`
			"""Return ``YYYY-MM-DD`` for a candidate or a Google event ('' if unknown).

			`Handles a Google event ({"start": {"dateTime"\|"date": ...}}) and a candidate`
			`({"start": "2026-06-12T19:00:00"} or all-day {"start": "2026-06-12"}).`
			`"""`
			`start = event.get("start")`
			`if isinstance(start, dict):`
			`val = start.get("dateTime") or start.get("date") or ""`
			`else:`
			`val = start or ""`
			`return val[:10]`


			`def title_of(event: dict) -> str:`
			`return event.get("summary") or event.get("title") or ""`


			`def auto_key(title: str, date: str) -> str:`
			`basis = f"{normalize_title(title)}\|{date}"`
			`return hashlib.sha1(basis.encode("utf-8")).hexdigest()`


			`def _existing_autokey(event: dict) -> str \| None:`
			`# Google may return null (not just absent) for these on hand-added events;`
			# `or {}` guards both the missing-key and present-but-null cases.
			`ep = event.get("extendedProperties") or {}`
			`priv = ep.get("private") or {}`
			`return priv.get("autoKey")`


			`def is_duplicate(candidate: dict, existing: dict) -> bool:`
			"""Is ``candidate`` the same event as the already-present ``existing``?"""
			`ek = _existing_autokey(existing)`
			`if ek and ek == auto_key(title_of(candidate), start_date(candidate)):`
			`return True`

			`ct, et = title_of(candidate), title_of(existing)`
			`# A recurring candidate covers the whole horizon, so it is a duplicate if ANY`
			`# existing event shares its (fuzzy) title — regardless of date, and whether or`
			`# not the existing copy is flagged recurring (the connector may return an`
			`# expanded instance).`
			`if candidate.get("recurrence") and titles_match(ct, et):`
			`return True`
			`# Otherwise require the same calendar day and a fuzzy title match.`
			`cd = start_date(candidate)`
			`if cd and cd == start_date(existing) and titles_match(ct, et):`
			`return True`
			`return False`


			`def reconcile(candidates: list[dict], existing: list[dict],`
			`today: str \| None = None, max_new: int \| None = None) -> dict:`
			`"""Split candidates into insert / skip / dropped_past / dropped_overflow.`

			- ``today`` (``YYYY-MM-DD``): candidates starting before it are dropped_past.
			- ``max_new``: inserts beyond the cap overflow into dropped_overflow, in
			`input order — so upstream priority ordering is preserved, not truncated by`
			`authoring accident.`
			Duplicates are detected against ``existing`` AND against candidates already
			`accepted this run (so two variants of the same event collapse to one insert).`
			Each insert is the candidate dict plus a computed ``autoKey``.
			`"""`
			`inserts: list[dict] = []`
			`skips: list[dict] = []`
			`dropped_past: list[dict] = []`
			`for c in candidates:`
			`cd = start_date(c)`
			`if today and cd and cd < today:`
			`dropped_past.append({"candidate": c, "reason": f"starts {cd}, before {today}"})`
			`continue`
			`match = next((e for e in existing if is_duplicate(c, e)), None)`
			`if match is None: # collapse same-run duplicates too`
			`match = next((p for p in inserts if is_duplicate(c, p)), None)`
			`if match is not None:`
			`skips.append({"candidate": c, "matched": title_of(match),`
			`"reason": "already present"})`
			`continue`
			`stamped = dict(c)`
			`stamped["autoKey"] = auto_key(title_of(c), cd)`
			`inserts.append(stamped)`

			`dropped_overflow: list[dict] = []`
			`if max_new is not None and len(inserts) > max_new:`
			`dropped_overflow = inserts[max_new:]`
			`inserts = inserts[:max_new]`

			`return {"insert": inserts, "skip": skips,`
			`"dropped_past": dropped_past, "dropped_overflow": dropped_overflow}`


			`def as_event_list(obj) -> list[dict]:`
			`"""Accept a bare list, or the wrapper objects the calendar tools return."""`
			`if isinstance(obj, list):`
			`return obj`
			`if isinstance(obj, dict):`
			`for key in ("events", "items"):`
			`if isinstance(obj.get(key), list):`
			`return obj[key]`
			`return []`


			`def _main(argv: list[str]) -> int:`
			`import argparse`
			`import json`

			`p = argparse.ArgumentParser(`
			`description="Reconcile candidate events against events already on the calendar.")`
			`p.add_argument("candidates", help="JSON file: list of candidate events")`
			`p.add_argument("existing",`
			`help="JSON file: calendar events (a list, or {\"events\": [...]})")`
			`p.add_argument("--today", metavar="YYYY-MM-DD",`
			`help="drop candidates that start before this date")`
			`p.add_argument("--max", type=int, dest="max_new", metavar="N",`
			`help="cap inserts at N; the rest go to dropped_overflow (input order)")`
			`p.add_argument("--explain", action="store_true",`
			`help="print a human-readable summary instead of machine JSON")`
			`args = p.parse_args(argv)`

			`with open(args.candidates, encoding="utf-8") as f:`
			`candidates = as_event_list(json.load(f))`
			`with open(args.existing, encoding="utf-8") as f:`
			`existing = as_event_list(json.load(f))`

			`result = reconcile(candidates, existing, today=args.today, max_new=args.max_new)`
			`if args.explain:`
			`print(f"{len(result['insert'])} to insert, "`
			`f"{len(result['skip'])} skipped (already present), "`
			`f"{len(result['dropped_past'])} dropped (past), "`
			`f"{len(result['dropped_overflow'])} dropped (over cap):\n")`
			`for i in result["insert"]:`
			`print(f" + {title_of(i)} [{start_date(i)}]")`
			`for s in result["skip"]:`
			`print(f' = {title_of(s["candidate"])} -> matches "{s["matched"]}"')`
			`for d in result["dropped_past"]:`
			`print(f' x {title_of(d["candidate"])} [{start_date(d["candidate"])}] (past)')`
			`for d in result["dropped_overflow"]:`
			`print(f" ~ {title_of(d)} [{start_date(d)}] (over cap)")`
			`else:`
			`print(json.dumps(result, ensure_ascii=False, indent=2))`
			`return 0`


			`if __name__ == "__main__":`
			`import sys`

			`raise SystemExit(_main(sys.argv[1:]))`