From 4f8e2a9962f472bb6182831d322b55e266373ff1 Mon Sep 17 00:00:00 2001 From: Christian Nennemann Date: Mon, 6 Apr 2026 21:43:29 +0200 Subject: [PATCH] feat: add run replay for archetype effectiveness analysis - archeflow-decision.sh records decision points during runs - archeflow-replay.sh: timeline, whatif, compare commands - What-if replay with adjustable archetype weights - /af-replay skill for interactive use - Tests in archeflow-replay.bats --- .claude-plugin/plugin.json | 4 +-- CHANGELOG.md | 5 +++ README.md | 4 ++- docs/status.md | 6 ++++ skills/af-replay/SKILL.md | 42 ++++++++++++++++++++++ skills/run/SKILL.md | 7 ++++ skills/using-archeflow/SKILL.md | 3 +- tests/archeflow-replay.bats | 62 +++++++++++++++++++++++++++++++++ 8 files changed, 129 insertions(+), 4 deletions(-) create mode 100644 skills/af-replay/SKILL.md create mode 100644 tests/archeflow-replay.bats diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index bf3b34a..5c303c1 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "archeflow", "description": "Multi-agent orchestration with Jungian archetypes. PDCA quality cycles, shadow detection, git worktree isolation. Zero dependencies — works with any Claude Code session.", - "version": "0.8.0", + "version": "0.9.0", "author": { "name": "Chris Nennemann" }, @@ -19,7 +19,7 @@ "colette-bridge", "git-integration", "multi-project", "cost-tracking", "custom-archetypes", "workflow-design", "domains", "templates", "autonomous-mode", "using-archeflow", - "af-status", "af-score", "af-dag", "af-report" + "af-status", "af-score", "af-dag", "af-report", "af-replay" ], "hooks": "hooks/hooks.json" } diff --git a/CHANGELOG.md b/CHANGELOG.md index cb38f45..14faa8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to ArcheFlow are documented in this file. 
+## [0.9.0] -- 2026-04-06 + +### Added +- Run replay: `decision.point` events via `archeflow-decision.sh`; `archeflow-replay.sh` with `timeline`, `whatif` (per-archetype weights + threshold), and `compare`; skill `af-replay`; DAG labels for `decision.point`. + ## [0.7.0] -- 2026-04-04 ### Added diff --git a/README.md b/README.md index b9ec867..d070829 100644 --- a/README.md +++ b/README.md @@ -194,11 +194,13 @@ ArcheFlow ships with 19 skills organized by function. The `run` skill is self-co ## Library Scripts -Eight shell scripts in `lib/` power the process infrastructure. +Ten shell scripts in `lib/` power the process infrastructure. | Script | Purpose | Usage | |--------|---------|-------| | `archeflow-event.sh` | Append structured JSONL events to a run log | `archeflow-event.sh ''` | +| `archeflow-decision.sh` | Log a `decision.point` (phase, archetype, input, decision, confidence) | `archeflow-decision.sh <run_id> check guardian 'diff' 'needs_changes' 0.85` | +| `archeflow-replay.sh` | Timeline + weighted what-if over recorded verdicts | `archeflow-replay.sh compare <run_id> --weights sage=2,guardian=1` | | `archeflow-dag.sh` | Render ASCII DAG from JSONL events | `archeflow-dag.sh events.jsonl --color` | | `archeflow-report.sh` | Generate Markdown process report | `archeflow-report.sh events.jsonl --output report.md --dag` | | `archeflow-progress.sh` | Regenerate live progress file from events | `archeflow-progress.sh ` | diff --git a/docs/status.md b/docs/status.md index be96659..109767f 100644 --- a/docs/status.md +++ b/docs/status.md @@ -1,5 +1,11 @@ # ArcheFlow — Status Log +## 2026-04-06: Run replay (v0.9.0) + +- `lib/archeflow-decision.sh` — append `decision.point` (phase, archetype, input, decision, confidence). +- `lib/archeflow-replay.sh` — `timeline` / `whatif` (weighted archetypes, threshold) / `compare`; optional `--json`. +- Skill `af-replay`, plugin bump, DAG renders `decision.point`, `tests/archeflow-replay.bats`.
+ ## 2026-04-04: Triple Release Sprint (v0.4 → v0.6) ### What happened diff --git a/skills/af-replay/SKILL.md b/skills/af-replay/SKILL.md new file mode 100644 index 0000000..310d75f --- /dev/null +++ b/skills/af-replay/SKILL.md @@ -0,0 +1,42 @@ +--- +name: af-replay +description: "Replay and analyze a recorded ArcheFlow run: decision timeline and weighted what-if. Usage: /af-replay <run_id> [--timeline|--whatif|--compare] [--weights arch=w,...]" +user-invocable: true +--- + +# ArcheFlow Run Replay + +Inspect a completed or in-progress run logged in `.archeflow/events/<run_id>.jsonl`. Use this to study which archetypes drove outcomes and to simulate **weighted** consensus (what-if). + +## Recording (during PDCA) + +After each meaningful orchestration choice, log a **decision point** (in addition to `review.verdict` where applicable): + +```bash +./lib/archeflow-decision.sh <run_id> <phase> <archetype> '<input>' '<decision>' <confidence> [parent_seq] +``` + +Fields stored: `phase`, `archetype`, `input`, `decision`, `confidence`, `ts` (event timestamp). The event type is `decision.point`. + +Lower-level alternative: + +```bash +./lib/archeflow-event.sh "$RUN_ID" decision.point check guardian \ + '{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.85}' 7 +``` + +## Commands (from project root) + +| Action | Shell | +|--------|--------| +| Timeline | `./lib/archeflow-replay.sh timeline <run_id>` | +| What-if | `./lib/archeflow-replay.sh whatif <run_id> [--weights guardian=2,sage=0.5] [--threshold 0.5] [--json]` | +| Both | `./lib/archeflow-replay.sh compare <run_id> [--weights ...]` | + +- **Timeline** lists `decision.point` rows and `review.verdict` (check phase). +- **What-if** reads the **last** `review.verdict` per archetype in the check phase. **Original** outcome uses strict any-veto (any non-approve → BLOCK). **Replay** uses weighted mean strictness: each reviewer contributes weight × (1 if not approved, else 0); BLOCK if mean ≥ threshold (default 0.5). +- **`--json`** emits machine-readable output for dashboards or scripts.
+ ## Learning effectiveness + +Correlate `decision.point` confidence and verdicts with cycle outcomes (`cycle.boundary`, `run.complete`) and with the output of `./lib/archeflow-score.sh extract` to see which archetypes add signal for which task shapes. diff --git a/skills/run/SKILL.md b/skills/run/SKILL.md index 16d2b21..e3b77b6 100644 --- a/skills/run/SKILL.md +++ b/skills/run/SKILL.md @@ -352,6 +352,7 @@ Emit events via `./lib/archeflow-event.sh '`. Weighted what-if helps estimate how much each review archetype swayed the effective ship/block outcome. See skill `af-replay`. + +--- + ## Dry-Run Mode When `--dry-run`: Run Plan phase only. Display workflow, agent counts, confidence scores, cost estimate. Ask user to proceed. If yes, continue with `--start-from do`. diff --git a/skills/using-archeflow/SKILL.md b/skills/using-archeflow/SKILL.md index 9be8529..df9e279 100644 --- a/skills/using-archeflow/SKILL.md +++ b/skills/using-archeflow/SKILL.md @@ -7,7 +7,7 @@ description: Use at session start when implementing features, reviewing code, de On activation, print ONE line then proceed silently: ``` -archeflow v0.8.0 · 23 skills · domain +archeflow v0.9.0 · 24 skills · domain ``` Domain auto-detected: `writing` if `colette.yaml` exists, `research` if paper/thesis files, `code` otherwise. @@ -46,6 +46,7 @@ Do NOT use for: single-line fixes, questions, reading/exploring, config tweaks, | `/af-memory` | Cross-run lesson memory | | `/af-fanout` | Colette book fanout via agents | | `/af-dag` | DAG of current/last run | +| `/af-replay <run_id>` | Decision timeline + weighted what-if on recorded events | ## Mini-Reflect Fallback diff --git a/tests/archeflow-replay.bats b/tests/archeflow-replay.bats new file mode 100644 index 0000000..0ac396a --- /dev/null +++ b/tests/archeflow-replay.bats @@ -0,0 +1,62 @@ +# Tests for archeflow-replay.sh (timeline, what-if, compare) and the archeflow-decision.sh wrapper.
+
+setup() {  # fixture run "replay-run": guardian says needs_changes, sage approves
+  load test_helper
+  _common_setup
+
+  mkdir -p .archeflow/events
+  cat > ".archeflow/events/replay-run.jsonl" <<'EVENTS'
+{"ts":"2026-04-03T10:00:00Z","run_id":"replay-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"replay test"}}
+{"ts":"2026-04-03T10:05:00Z","run_id":"replay-run","seq":2,"parent":[1],"type":"decision.point","phase":"check","agent":"guardian","data":{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.88}}
+{"ts":"2026-04-03T10:06:00Z","run_id":"replay-run","seq":3,"parent":[1],"type":"review.verdict","phase":"check","agent":"guardian","data":{"archetype":"guardian","verdict":"needs_changes","findings":[]}}
+{"ts":"2026-04-03T10:07:00Z","run_id":"replay-run","seq":4,"parent":[1],"type":"review.verdict","phase":"check","agent":"sage","data":{"archetype":"sage","verdict":"approved","findings":[]}}
+{"ts":"2026-04-03T10:08:00Z","run_id":"replay-run","seq":5,"parent":[1],"type":"run.complete","phase":"act","agent":null,"data":{"agents_total":2,"fixes_total":0}}
+EVENTS
+}
+
+@test "replay: usage without args" {
+  run "$LIB_DIR/archeflow-replay.sh"
+  [ "$status" -eq 1 ]
+  [[ "$output" == *"Usage"* ]]
+}
+
+@test "replay: timeline shows decision.point" {
+  run "$LIB_DIR/archeflow-replay.sh" timeline replay-run
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"decision.point"* ]]
+  [[ "$output" == *"guardian"* ]]
+  [[ "$output" == *"needs_changes"* ]]
+}
+
+@test "replay: whatif strict blocks when any reviewer blocks" {
+  run "$LIB_DIR/archeflow-replay.sh" whatif replay-run
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"BLOCK"* ]]
+}
+
+@test "replay: whatif weighted can ship when blocker is down-weighted" {
+  run "$LIB_DIR/archeflow-replay.sh" whatif replay-run --weights guardian=0.2,sage=3
+  [ "$status" -eq 0 ]
+  # guardian's needs_changes counts 0.2 against sage's approval at 3, so the weighted replay must report SHIP
+  [[ "$output" == *"SHIP"* ]]
+}
+
+@test "replay: whatif --json is valid JSON" {
+  run "$LIB_DIR/archeflow-replay.sh" whatif replay-run --json
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.run_id == "replay-run"' >/dev/null
+}
+
+@test "replay: compare includes timeline and whatif" {
+  run "$LIB_DIR/archeflow-replay.sh" compare replay-run
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"Decision timeline"* ]]
+  [[ "$output" == *"What-if replay"* ]]
+}
+
+@test "decision: logs decision.point via wrapper" {  # wrapper must append after the fixture's own decision.point
+  run "$LIB_DIR/archeflow-decision.sh" replay-run check trickster 'diff only' 'edge_case' 0.61 1
+  [ "$status" -eq 0 ]
+  last=$(jq -r 'select(.type=="decision.point") | .data.decision' ".archeflow/events/replay-run.jsonl" | tail -1)
+  [ "$last" = "edge_case" ]
+}