Compare commits
12 Commits
14d70689ce
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 3ef956485f | |||
| 1e96d87f49 | |||
| d99f449083 | |||
| 58315ac982 | |||
| 24ea632207 | |||
| 55dde5f07a | |||
| 4f8e2a9962 | |||
| 506143d613 | |||
| 607a53f1bf | |||
| 6a49c21bbe | |||
| 6bae80b874 | |||
| 43a147676e |
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "archeflow",
|
||||
"description": "Multi-agent orchestration with Jungian archetypes. PDCA quality cycles, shadow detection, git worktree isolation. Zero dependencies — works with any Claude Code session.",
|
||||
"version": "0.8.0",
|
||||
"version": "0.9.0",
|
||||
"author": {
|
||||
"name": "Chris Nennemann"
|
||||
},
|
||||
@@ -18,7 +18,8 @@
|
||||
"shadow-detection", "memory", "progress", "presence",
|
||||
"colette-bridge", "git-integration", "multi-project", "cost-tracking",
|
||||
"custom-archetypes", "workflow-design", "domains",
|
||||
"templates", "autonomous-mode", "using-archeflow"
|
||||
"templates", "autonomous-mode", "using-archeflow",
|
||||
"af-status", "af-score", "af-dag", "af-report", "af-replay"
|
||||
],
|
||||
"hooks": "hooks/hooks.json"
|
||||
}
|
||||
|
||||
8
.gitignore
vendored
8
.gitignore
vendored
@@ -8,3 +8,11 @@ Thumbs.db
|
||||
# Editor
|
||||
*.swp
|
||||
*~
|
||||
# Paper build artifacts
|
||||
paper/*.aux
|
||||
paper/*.bbl
|
||||
paper/*.blg
|
||||
paper/*.log
|
||||
paper/*.out
|
||||
paper/*.pdf
|
||||
paper/*.toc
|
||||
|
||||
@@ -2,6 +2,11 @@
|
||||
|
||||
All notable changes to ArcheFlow are documented in this file.
|
||||
|
||||
## [0.9.0] -- 2026-04-06
|
||||
|
||||
### Added
|
||||
- Run replay: `decision.point` events via `archeflow-decision.sh`; `archeflow-replay.sh` with `timeline`, `whatif` (weighted archetype weights + threshold), and `compare`; skill `af-replay`; DAG labels for `decision.point`.
|
||||
|
||||
## [0.7.0] -- 2026-04-04
|
||||
|
||||
### Added
|
||||
|
||||
@@ -55,7 +55,7 @@ Explorer (research), Creator (design), Maker (implement), Guardian (security), S
|
||||
Three layers, one escalation protocol:
|
||||
- **Archetype shadows** — individual agent dysfunction
|
||||
- **System shadows** — orchestration-level issues (echo chamber, tunnel vision, scope creep)
|
||||
- **Policy boundaries** — operational limits (checkpoints, budgets, circuit breakers)
|
||||
- **Policy boundaries** — operational limits (checkpoints, budgets, Wiggum Breaks)
|
||||
|
||||
### Workflows
|
||||
| Risk Level | Workflow | Agents |
|
||||
|
||||
@@ -194,11 +194,13 @@ ArcheFlow ships with 19 skills organized by function. The `run` skill is self-co
|
||||
|
||||
## Library Scripts
|
||||
|
||||
Eight shell scripts in `lib/` power the process infrastructure.
|
||||
Ten shell scripts in `lib/` power the process infrastructure.
|
||||
|
||||
| Script | Purpose | Usage |
|
||||
|--------|---------|-------|
|
||||
| `archeflow-event.sh` | Append structured JSONL events to a run log | `archeflow-event.sh <run_id> <type> <phase> <agent> '<json>'` |
|
||||
| `archeflow-decision.sh` | Log a `decision.point` (phase, archetype, input, decision, confidence) | `archeflow-decision.sh <run_id> check guardian 'diff' 'needs_changes' 0.85` |
|
||||
| `archeflow-replay.sh` | Timeline + weighted what-if over recorded verdicts | `archeflow-replay.sh compare <run_id> --weights sage=2,guardian=1` |
|
||||
| `archeflow-dag.sh` | Render ASCII DAG from JSONL events | `archeflow-dag.sh events.jsonl --color` |
|
||||
| `archeflow-report.sh` | Generate Markdown process report | `archeflow-report.sh events.jsonl --output report.md --dag` |
|
||||
| `archeflow-progress.sh` | Regenerate live progress file from events | `archeflow-progress.sh <run_id>` |
|
||||
|
||||
235
docs/plans/archeflow-roadmap-v1.md
Normal file
235
docs/plans/archeflow-roadmap-v1.md
Normal file
@@ -0,0 +1,235 @@
|
||||
# ArcheFlow Roadmap — From Framework to Tool
|
||||
|
||||
Status: Planning (2026-04-06)
|
||||
Context: v0.8.0 shipped — consolidated skills, corrective action framework, 110 tests. The scaffolding is solid. Now make it genuinely useful.
|
||||
|
||||
## Guiding Principle
|
||||
|
||||
Every feature must close a feedback loop or remove friction. No features that add complexity without measurable improvement in either speed, cost, or quality.
|
||||
|
||||
---
|
||||
|
||||
## Tier 1: Make the Sprint Runner Smart (highest impact)
|
||||
|
||||
### 1.1 Queue from Git Issues
|
||||
|
||||
**Problem:** Manual `queue.json` is the biggest friction point. Nobody wants to maintain a JSON file by hand.
|
||||
|
||||
**Solution:** `./scripts/ws sync-issues` that:
|
||||
- Reads Gitea/GitHub issues via API (`gh issue list` or Gitea REST)
|
||||
- Maps labels to priority: `P0`=critical/blocker, `P1`=high, `P2`=medium, `P3`=low/enhancement
|
||||
- Maps labels to estimate: `size/S`, `size/M`, `size/L`, `size/XL` (default: M)
|
||||
- Extracts `depends_on` from "blocks #N" / "depends on #N" in issue body
|
||||
- Upserts into `queue.json` (doesn't overwrite manual edits, merges by issue ID)
|
||||
- Skips issues with `wontfix`, `duplicate`, `question` labels
|
||||
|
||||
**Scope:** One script in `scripts/`, ~100 lines. Gitea API + GitHub API (detect from remote URL). Needs API token in env var `GITEA_TOKEN` or `GITHUB_TOKEN`.
|
||||
|
||||
**Test:** bats tests with mock API responses (curl fixture files).
|
||||
|
||||
### 1.2 Cost Estimation
|
||||
|
||||
**Problem:** Users don't know what a sprint will cost before running it.
|
||||
|
||||
**Solution:** `/af-sprint --dry-run` shows estimated cost:
|
||||
```
|
||||
Sprint estimate: 7 tasks, ~18 agents, est. $1.20-$2.40, ~12 minutes
|
||||
P1: writing.colette fanout (L) — est. $0.50, 4 agents
|
||||
P1: tool.archeflow review (M) — est. $0.15, 2 agents
|
||||
...
|
||||
Proceed? [y/n]
|
||||
```
|
||||
|
||||
**How:** Track actual token counts per task size (S/M/L/XL) in `.archeflow/memory/cost-history.jsonl`. After 5+ tasks per size bucket, use median. Before that, use defaults: S=$0.05, M=$0.15, L=$0.50, XL=$1.50.
|
||||
|
||||
**Scope:** Update `sprint` skill with estimation section. Add cost logging to `archeflow-event.sh` (include `tokens_used` in `agent.complete` data). New script `lib/archeflow-cost.sh` for estimation.
|
||||
|
||||
### 1.3 Smart Workflow Selection
|
||||
|
||||
**Problem:** Current auto-selection uses keyword matching ("fix" -> pipeline). This is crude.
|
||||
|
||||
**Solution:** Analyze the actual task + codebase signals:
|
||||
|
||||
| Signal | Source | Workflow |
|
||||
|--------|--------|----------|
|
||||
| Files matching `auth|crypto|secret|token|session` | task description + file paths | -> thorough |
|
||||
| Public API changes (OpenAPI spec modified, exported functions changed) | git diff | -> thorough |
|
||||
| <3 files changed, all in same dir | git diff | -> fast/pipeline |
|
||||
| Test files only | git diff | -> pipeline |
|
||||
| Historical: this project's last 3 runs needed 0 cycles | memory | -> fast |
|
||||
| Historical: this project's last run had 2+ CRITICALs | memory | -> thorough |
|
||||
|
||||
**Scope:** Add to the `run` skill's Strategy Selection section. Read git diff stats + memory lessons before choosing. ~20 lines of logic replacing the current keyword table.
|
||||
|
||||
---
|
||||
|
||||
## Tier 2: Close the Learning Loop
|
||||
|
||||
### 2.1 Confidence Calibration
|
||||
|
||||
**Problem:** Creator's confidence scores (0.0-1.0) are self-reported and uncalibrated. A Creator that always says 0.8 but gets rejected 40% of the time is not useful.
|
||||
|
||||
**Solution:** After each `run.complete`, log calibration data:
|
||||
```jsonl
|
||||
{"run_id":"...","creator_confidence":{"task":0.8,"solution":0.7,"risk":0.6},"actual_outcome":"rejected","cycles":2,"criticals":1}
|
||||
```
|
||||
|
||||
At run start, inject calibration context into Creator prompt:
|
||||
```
|
||||
Your historical calibration: You rate task understanding at 0.8 avg,
|
||||
but 35% of runs with that score needed cycle-back. Consider scoring
|
||||
more conservatively.
|
||||
```
|
||||
|
||||
**Scope:** New field in `archeflow-memory.sh` calibration store. ~30 lines in `run` skill to log + inject. Needs 5+ runs before meaningful.
|
||||
|
||||
### 2.2 Archetype Auto-Tuning
|
||||
|
||||
**Problem:** The effectiveness scoring system exists (`archeflow-score.sh`) but nothing acts on it.
|
||||
|
||||
**Solution:** After 10+ runs, auto-generate recommendations:
|
||||
```
|
||||
Archetype Recommendations (based on 15 runs):
|
||||
Guardian: essential (caught real issues in 80% of runs)
|
||||
Sage: keep (useful findings in 60% of runs)
|
||||
Skeptic: demote to thorough-only (useful in 20%, mostly INFO)
|
||||
Trickster: keep for thorough (caught 2 bugs Guardian missed)
|
||||
```
|
||||
|
||||
Add to `/af-score` output. Store recommendation in config as `reviewers.recommended`:
|
||||
```yaml
|
||||
reviewers:
|
||||
recommended:
|
||||
always: [guardian]
|
||||
default: [sage]
|
||||
thorough_only: [skeptic, trickster]
|
||||
# Auto-generated 2026-04-06 from 15 runs. Override with explicit config.
|
||||
```
|
||||
|
||||
**Scope:** Update `archeflow-score.sh` with recommendation logic. Update `run` skill to read recommended config. Add to `af-score` skill display.
|
||||
|
||||
### 2.3 Campaign Memory
|
||||
|
||||
**Problem:** Related runs (e.g., "harden all API endpoints") don't share context.
|
||||
|
||||
**Solution:** Optional `--campaign <id>` flag on `/af-run`:
|
||||
- Links runs under a campaign ID
|
||||
- Cross-run context: "In Run 1, we found the auth pattern uses middleware X. In Run 2, the same pattern applies."
|
||||
- Campaign-level progress: "3/8 endpoints hardened, 2 CRITICALs remaining"
|
||||
- Campaign memory injected into Explorer/Creator prompts
|
||||
|
||||
**Scope:** New field in event schema. Campaign index in `.archeflow/campaigns/`. Update memory injection to filter by campaign. ~50 lines in `run` skill.
|
||||
|
||||
---
|
||||
|
||||
## Tier 3: Integrate with Real Workflow
|
||||
|
||||
### 3.1 Findings as PR Comments
|
||||
|
||||
**Problem:** Review findings live in `.archeflow/artifacts/`. Nobody reads artifact files — they read PR comments.
|
||||
|
||||
**Solution:** After Check phase, if a PR exists for the branch:
|
||||
```bash
|
||||
# Post each CRITICAL/WARNING as a PR review comment
|
||||
gh api repos/{owner}/{repo}/pulls/{pr}/comments \
|
||||
--field body="🛡️ **Guardian** [CRITICAL/security]\n\n${description}\n\nSuggested fix: ${fix}" \
|
||||
--field path="${file}" --field line="${line}"
|
||||
```
|
||||
|
||||
**Scope:** New `--pr <number>` flag on `/af-run` and `/af-review`. Script `lib/archeflow-pr.sh` for posting comments. Falls back gracefully if no PR or no API token.
|
||||
|
||||
### 3.2 CI Hook Mode
|
||||
|
||||
**Problem:** ArcheFlow runs manually. It should run automatically on PRs.
|
||||
|
||||
**Solution:** Lightweight CI integration:
|
||||
```yaml
|
||||
# .github/workflows/archeflow-review.yml (or Gitea equivalent)
|
||||
on: pull_request
|
||||
jobs:
|
||||
review:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- run: claude --plugin-dir ./archeflow -p "/af-review --branch ${{ github.head_ref }} --pr ${{ github.event.number }}"
|
||||
```
|
||||
|
||||
Only runs Guardian (fast, cheap). Posts findings as PR comments. No PDCA overhead.
|
||||
|
||||
**Scope:** Template workflow file in `examples/ci/`. Update `review` skill to support `--pr` flag. Documentation.
|
||||
|
||||
### 3.3 Watch Mode
|
||||
|
||||
**Problem:** You have to remember to run `/af-review` after pushing.
|
||||
|
||||
**Solution:** `/af-watch` — background process that monitors a branch:
|
||||
- Uses `git log --since` polling (every 60s)
|
||||
- On new commits: auto-run `/af-review` on the diff
|
||||
- Posts findings as PR comments if PR exists
|
||||
- Respects budget gate from corrective action framework
|
||||
|
||||
**Scope:** New skill `af-watch/SKILL.md` (~30 lines). Uses the `loop` skill infrastructure. Low priority — CI hook mode covers most use cases.
|
||||
|
||||
---
|
||||
|
||||
## Tier 4: Replay and Analysis
|
||||
|
||||
### 4.1 Decision Journal
|
||||
|
||||
**Problem:** No visibility into why ArcheFlow made specific choices during a run.
|
||||
|
||||
**Solution:** Already started with `archeflow-decision.sh` and `archeflow-replay.sh`. Extend:
|
||||
- Log every decision point: workflow selection, A1/A2/A3 triggers, fix routing, shadow detections
|
||||
- `/af-replay <run_id> --timeline` shows the decision chain
|
||||
- `/af-replay <run_id> --whatif --workflow thorough` simulates: "What would thorough have found?"
|
||||
|
||||
**Scope:** Mostly built. Needs integration into the `run` skill (emit `decision.point` events at each choice). The replay script needs the what-if simulation logic.
|
||||
|
||||
### 4.2 Run Comparison
|
||||
|
||||
**Problem:** No way to evaluate whether workflow X is better than workflow Y for a project.
|
||||
|
||||
**Solution:** `/af-replay compare <run_a> <run_b>`:
|
||||
```
|
||||
Run A (standard, 4m30s, $0.80): 5 findings, 4 resolved, 1 INFO remaining
|
||||
Run B (thorough, 12m, $2.10): 7 findings, 6 resolved, 1 INFO remaining
|
||||
Delta: +2 findings (both INFO), +163% cost, +167% time
|
||||
Verdict: Standard was sufficient for this task.
|
||||
```
|
||||
|
||||
**Scope:** Update `archeflow-replay.sh` with comparison mode. Needs at least 2 runs on similar tasks.
|
||||
|
||||
---
|
||||
|
||||
## Implementation Order
|
||||
|
||||
```
|
||||
v0.9.0 — Sprint Intelligence
|
||||
1.1 Queue from issues
|
||||
1.2 Cost estimation
|
||||
1.3 Smart workflow selection
|
||||
|
||||
v0.10.0 — Learning Loop
|
||||
2.1 Confidence calibration
|
||||
2.2 Archetype auto-tuning
|
||||
2.3 Campaign memory
|
||||
|
||||
v0.11.0 — Integration
|
||||
3.1 Findings as PR comments
|
||||
3.2 CI hook mode
|
||||
3.3 Watch mode (stretch)
|
||||
|
||||
v0.12.0 — Analysis
|
||||
4.1 Decision journal (mostly done)
|
||||
4.2 Run comparison
|
||||
```
|
||||
|
||||
Each version is independently shippable. No version depends on a later one.
|
||||
|
||||
## What NOT to Build
|
||||
|
||||
- **Web dashboard** — Terminal is the interface. Don't add a server.
|
||||
- **Embedding-based memory** — Keyword matching works. Don't add vector DBs.
|
||||
- **Agent marketplace** — Focus on the 7 built-in archetypes being excellent.
|
||||
- **Multi-user collaboration** — ArcheFlow is a single-user tool. Git is the collaboration layer.
|
||||
- **Plugin system for plugins** — ArcheFlow IS a plugin. Don't go meta.
|
||||
@@ -1,5 +1,11 @@
|
||||
# ArcheFlow — Status Log
|
||||
|
||||
## 2026-04-06: Run replay (v0.9.0)
|
||||
|
||||
- `lib/archeflow-decision.sh` — append `decision.point` (phase, archetype, input, decision, confidence).
|
||||
- `lib/archeflow-replay.sh` — `timeline` / `whatif` (weighted archetypes, threshold) / `compare`; optional `--json`.
|
||||
- Skill `af-replay`, plugin bump, DAG renders `decision.point`, `tests/archeflow-replay.bats`.
|
||||
|
||||
## 2026-04-04: Triple Release Sprint (v0.4 → v0.6)
|
||||
|
||||
### What happened
|
||||
|
||||
@@ -7,7 +7,7 @@ const path = require("path");
|
||||
|
||||
try {
|
||||
const pluginRoot = path.resolve(__dirname, "..");
|
||||
const skillFile = path.join(pluginRoot, "skills", "using-archeflow", "SKILL.md");
|
||||
const skillFile = path.join(pluginRoot, "skills", "using-archeflow", "ACTIVATION.md");
|
||||
|
||||
if (!fs.existsSync(skillFile)) {
|
||||
console.log("{}");
|
||||
|
||||
@@ -87,6 +87,9 @@ EVENTS_PARSED=$(jq -r '
|
||||
elif .type == "agent.complete" then
|
||||
(.data.archetype // .agent // "unknown") + " (" + .phase + ")" +
|
||||
(if (.data.tokens // 0) > 0 then " [" + (.data.tokens | tostring) + " tok]" else "" end)
|
||||
elif .type == "decision.point" then
|
||||
(.data.archetype // .agent // "?") + " → " + (.data.decision // "?") +
|
||||
" (conf " + ((.data.confidence // 0) | tostring) + ")"
|
||||
elif .type == "decision" then
|
||||
"decision: " + (.data.what // "unknown") + " → " + (.data.chosen // "unknown")
|
||||
elif .type == "phase.transition" then
|
||||
@@ -209,7 +212,7 @@ render_node() {
|
||||
local colored_label
|
||||
case "$type" in
|
||||
phase.transition) colored_label="${C_TRANS}${label}${C_RESET}" ;;
|
||||
decision) colored_label="${C_DECISION}${label}${C_RESET}" ;;
|
||||
decision|decision.point) colored_label="${C_DECISION}${label}${C_RESET}" ;;
|
||||
review.verdict) colored_label="${C_VERDICT}${label}${C_RESET}" ;;
|
||||
*) colored_label="${pc}${label}${C_RESET}" ;;
|
||||
esac
|
||||
|
||||
48
lib/archeflow-decision.sh
Executable file
48
lib/archeflow-decision.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
# archeflow-decision.sh — Log a PDCA decision point for run replay / effectiveness analysis.
#
# Appends a decision.point event to .archeflow/events/<run_id>.jsonl with:
#   phase, archetype (agent + data.archetype), input, decision, confidence, ts (via event layer)
#
# Usage:
#   ./lib/archeflow-decision.sh <run_id> <phase> <archetype> '<input>' '<decision>' <confidence> [parent_seq]
#
# Examples:
#   ./lib/archeflow-decision.sh 2026-04-06-auth check guardian \
#     'diff + proposal risks' 'needs_changes' 0.82 7
#   ./lib/archeflow-decision.sh 2026-04-06-auth act "" 'route findings' 'send_to_maker' 0.9
#
# confidence: 0.0–1.0 (orchestrator-estimated certainty in the recorded choice)
#
# Requires: jq (via archeflow-event.sh)

set -euo pipefail

LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

if [[ $# -lt 6 ]]; then
  echo "Usage: $0 <run_id> <phase> <archetype> '<input>' '<decision>' <confidence> [parent_seq]" >&2
  exit 1
fi

RUN_ID="$1"
PHASE="$2"
ARCH="$3"
INPUT="$4"
DECISION="$5"
CONF_RAW="$6"
PARENT="${7:-}"

# Must look like a non-negative decimal number (regex also rejects negatives).
if ! [[ "$CONF_RAW" =~ ^[0-9]*\.?[0-9]+$ ]]; then
  echo "Error: confidence must be a number (e.g. 0.85)" >&2
  exit 1
fi

# Enforce the documented 0.0–1.0 range; the regex alone would accept e.g. 7.5
# and silently record an out-of-contract confidence value. bash cannot compare
# floats, so delegate the comparison to awk.
if ! awk -v c="$CONF_RAW" 'BEGIN { exit !(c >= 0 && c <= 1) }'; then
  echo "Error: confidence must be within 0.0-1.0 (got: $CONF_RAW)" >&2
  exit 1
fi

# Build the event payload; --argjson keeps confidence numeric in the JSON.
DATA=$(jq -cn \
  --arg a "$ARCH" \
  --arg i "$INPUT" \
  --arg d "$DECISION" \
  --argjson c "$CONF_RAW" \
  '{archetype:$a, input:$i, decision:$d, confidence:$c}')

# Delegate the append (seq numbering, ts, locking) to the shared event layer.
exec "$LIB_DIR/archeflow-event.sh" "$RUN_ID" decision.point "$PHASE" "$ARCH" "$DATA" "$PARENT"
|
||||
@@ -8,6 +8,9 @@
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster agent.complete plan creator '{"duration_ms":167522}' 2
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster phase.transition do "" '{"from":"plan","to":"do"}' 3,4
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster fix.applied act "" '{"source":"guardian"}' 8
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster decision.point check guardian \
|
||||
# '{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.85}' 7
|
||||
# # Or use: ./lib/archeflow-decision.sh <run_id> <phase> <arch> '<input>' '<decision>' <confidence> [parent]
|
||||
#
|
||||
# Parent seqs: comma-separated seq numbers of causal parent events (DAG).
|
||||
# "2" → single parent [2]
|
||||
|
||||
228
lib/archeflow-replay.sh
Executable file
228
lib/archeflow-replay.sh
Executable file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env bash
# archeflow-replay.sh — Inspect recorded runs: decision timeline and weighted what-if replay.
#
# Usage:
#   archeflow-replay.sh timeline <run_id>
#   archeflow-replay.sh whatif <run_id> [--weights arch=w,arch2=w2] [--threshold 0.5] [--json]
#   archeflow-replay.sh compare <run_id> [--weights ...] [--threshold ...] [--json]
#
# Events file: .archeflow/events/<run_id>.jsonl (relative to current working directory)
#
# whatif / compare:
#   - Loads check-phase review.verdict events (last verdict per archetype).
#   - Original gate (strict): BLOCK if any reviewer is not approved.
#   - Replay gate (weighted): BLOCK if sum(weight * strict) / sum(weight) >= threshold,
#     where strict=1 for non-approved verdicts, else 0. Default weight per archetype is 1.0.
#
# Requires: jq

set -euo pipefail

# Print usage to stderr and exit non-zero.
usage() {
  cat >&2 <<USAGE
Usage: $0 {timeline|whatif|compare} <run_id> [options]

  timeline <run_id>   Decision timeline (decision.point + review.verdict)
  whatif <run_id> [--weights k=v,...] [--threshold 0.5] [--json]
  compare <run_id>    (timeline + whatif summary)
USAGE
  exit 1
}

[[ $# -lt 2 ]] && usage

COMMAND="$1"
RUN_ID="$2"
shift 2

# All sub-commands lean on jq; fail fast with a clear message if it is absent.
command -v jq &>/dev/null || {
  echo "Error: jq is required." >&2
  exit 1
}

EVENT_FILE=".archeflow/events/${RUN_ID}.jsonl"

# Abort unless the run's event log exists on disk.
resolve_event_file() {
  [[ -f "$EVENT_FILE" ]] && return 0
  echo "Error: event file not found: $EVENT_FILE" >&2
  exit 1
}
|
||||
|
||||
# cmd_timeline — print the run's decision chain as Markdown:
# all decision.point events, then the check-phase review.verdict events.
# Reads $EVENT_FILE / $RUN_ID set by the prologue.
cmd_timeline() {
  resolve_event_file
  printf '%s\n\n' "## Decision timeline — run_id=${RUN_ID}"

  local decision_count
  decision_count=$(jq -s '[.[] | select(.type == "decision.point")] | length' "$EVENT_FILE")
  if [[ "$decision_count" -eq 0 ]]; then
    printf '%s\n' "### decision.point"
    printf '%s\n' "(none — emit with ./lib/archeflow-decision.sh during the run)"
    printf '\n'
  else
    printf '%s\n' "### decision.point (${decision_count})"
    jq -r 'select(.type == "decision.point")
      | "- \(.ts) [\(.phase)] \(.data.archetype // .agent // "?") \(.data.decision) conf=\(.data.confidence // "n/a") input=\(.data.input // "")"' \
      "$EVENT_FILE"
    printf '\n'
  fi

  printf '%s\n' "### review.verdict (check phase)"
  if ! jq -e -s '[.[] | select(.type == "review.verdict" and .phase == "check")] | length > 0' "$EVENT_FILE" >/dev/null 2>&1; then
    printf '%s\n' "(none)"
  else
    jq -r 'select(.type == "review.verdict" and .phase == "check")
      | "- \(.ts) \(.data.archetype // .agent // "?") verdict=\(.data.verdict) findings=\((.data.findings // []) | length)"' \
      "$EVENT_FILE"
  fi
  printf '\n'
}
|
||||
|
||||
# parse_weights_to_json — turn "sage=2,guardian=1.5" into '{"sage":2,"guardian":1.5}'.
# Keys are trimmed and lowercased; values must be non-negative numbers.
# Empty input yields '{}'. Exits 1 with a message on any malformed entry.
parse_weights_to_json() {
  local raw="${1:-}"
  if [[ -z "$raw" ]]; then
    echo '{}'
    return
  fi
  local obj='{}'
  local -a pairs
  IFS=',' read -ra pairs <<< "$raw"
  local pair k v
  for pair in "${pairs[@]}"; do
    [[ -z "$pair" ]] && continue
    # Reject entries without '=' BEFORE normalizing. The original compared
    # the lowercased key against the raw pair, so a mixed-case entry like
    # "Guardian" slipped through and later crashed jq's --argjson parse.
    if [[ "$pair" != *=* ]]; then
      echo "Error: invalid weight entry (use arch=1.5): $pair" >&2
      exit 1
    fi
    k="${pair%%=*}"
    v="${pair#*=}"
    k=$(echo "$k" | tr '[:upper:]' '[:lower:]' | xargs)   # trim + lowercase
    v=$(echo "$v" | xargs)                                # trim
    if [[ -z "$k" ]]; then
      echo "Error: invalid weight entry (use arch=1.5): $pair" >&2
      exit 1
    fi
    # Validate the weight is numeric before handing it to --argjson, which
    # would otherwise fail with an opaque "Invalid JSON text" error (or
    # silently accept non-numbers like "true").
    if ! [[ "$v" =~ ^[0-9]*\.?[0-9]+$ ]]; then
      echo "Error: weight must be a number (e.g. sage=1.5): $pair" >&2
      exit 1
    fi
    obj=$(jq -c --arg k "$k" --argjson v "$v" '. + {($k): $v}' <<< "$obj")
  done
  echo "$obj"
}
|
||||
|
||||
# cmd_whatif — weighted replay of the check-phase review gate.
#
# Options:
#   --weights k=v,...   per-archetype weights (default 1.0 each)
#   --threshold N       weighted-strictness cutoff for BLOCK (default 0.5)
#   --json              emit the raw result object instead of Markdown
#
# Loads the last review.verdict per archetype from $EVENT_FILE and compares
# the original strict any-veto gate against the weighted replay gate.
cmd_whatif() {
  local weights_str=""
  local threshold="0.5"
  local json_out="false"
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --weights)
        # Guard the value: under `set -u` a bare `--weights` would die with
        # an unhelpful "unbound variable" instead of a usage error.
        [[ $# -ge 2 ]] || { echo "Error: --weights requires a value" >&2; exit 1; }
        weights_str="$2"
        shift 2
        ;;
      --threshold)
        [[ $# -ge 2 ]] || { echo "Error: --threshold requires a value" >&2; exit 1; }
        threshold="$2"
        shift 2
        ;;
      --json)
        json_out="true"
        shift
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Validate before handing the value to jq --argjson, which would otherwise
  # fail with an opaque "Invalid JSON text passed to --argjson" error.
  if ! [[ "$threshold" =~ ^[0-9]*\.?[0-9]+$ ]]; then
    echo "Error: --threshold must be a number (e.g. 0.5), got: $threshold" >&2
    exit 1
  fi

  resolve_event_file
  local weights_json
  weights_json="$(parse_weights_to_json "$weights_str")"

  # Slurp the whole event log and compute both gates in one jq pass.
  local result
  result=$(jq -s --argjson weights "$weights_json" --argjson thr "$threshold" --arg run_id "$RUN_ID" '
    def strict($v):
      if $v == null then 1
      else ($v | ascii_downcase) as $lv
      | if ($lv == "approved" or $lv == "approve") then 0 else 1 end
      end;

    def norm_key: ascii_downcase;

    ([.[] | select(.type == "review.verdict" and .phase == "check")]
     | sort_by(.seq)
     | reduce .[] as $e ({}; . + { (($e.data.archetype // $e.agent // "unknown") | norm_key): $e })
    ) as $last |

    ($last | keys) as $keys |
    if ($keys | length) == 0 then
      {
        run_id: $run_id,
        error: "no check-phase review.verdict events; nothing to simulate"
      }
    else
      [ $keys[] as $k | $last[$k] as $ev |
        ($weights[($k | norm_key)] // 1.0) as $w
        | strict($ev.data.verdict) as $s
        | {
            archetype: ($ev.data.archetype // $ev.agent // $k),
            verdict: ($ev.data.verdict // "unknown"),
            weight: $w,
            strict: $s,
            weighted_contrib: ($w * $s)
          }
      ] as $rows |
      ($rows | map(.weighted_contrib) | add) as $num |
      ($rows | map(.weight) | add) as $den |
      (if $den > 0 then ($num / $den) else 0 end) as $ratio |
      (if ($rows | map(.strict) | max) == 1 then "BLOCK" else "SHIP" end) as $strict_out |
      (if $ratio >= $thr then "BLOCK" else "SHIP" end) as $replay_out |
      {
        run_id: $run_id,
        threshold: $thr,
        weights_used: $weights,
        strict_any_veto: {
          outcome: $strict_out,
          description: "BLOCK if any reviewer verdict is not approved"
        },
        weighted_replay: {
          weighted_strictness: ($ratio * 1000 | round / 1000),
          outcome: $replay_out,
          description: ("BLOCK if weighted strictness >= " + ($thr | tostring))
        },
        reviewers: $rows
      }
    end
  ' "$EVENT_FILE")

  if [[ "$json_out" == "true" ]]; then
    echo "$result"
  else
    # Render the result object as a Markdown report.
    echo "$result" | jq -r '
      if .error then "Error: \(.error)" else
      "# What-if replay — run_id=\(.run_id)\n",
      "",
      "## Outcomes",
      "| Model | Result |",
      "|-------|--------|",
      "| Original (any non-approve → BLOCK) | \(.strict_any_veto.outcome) |",
      "| Weighted replay (threshold=\(.threshold)) | \(.weighted_replay.outcome) |",
      "",
      "## Weighted strictness",
      "\(.weighted_replay.weighted_strictness) (0 = all approved, 1 = all blocking)",
      "",
      "## Per reviewer",
      "| Archetype | Verdict | Weight | Strict | w×strict |",
      "|-----------|---------|--------|--------|----------|",
      (.reviewers[] | "| \(.archetype) | \(.verdict) | \(.weight) | \(.strict) | \(.weighted_contrib) |"),
      "",
      (if (.weights_used | length) > 0 then
        "## Custom weights applied\n" + (.weights_used | to_entries | map("- \(.key): \(.value)") | join("\n")) + "\n"
      else empty end)
      end
    '
  fi
}
|
||||
|
||||
# cmd_compare — the decision timeline followed by the weighted what-if
# summary, so both views of a run land in one report.
cmd_compare() {
  cmd_timeline
  printf '\n'
  cmd_whatif "$@"
}

# Dispatch on the sub-command parsed in the prologue; remaining args are
# forwarded to the handler.
case "$COMMAND" in
  timeline)
    cmd_timeline
    ;;
  whatif)
    cmd_whatif "$@"
    ;;
  compare)
    cmd_compare "$@"
    ;;
  *)
    echo "Unknown command: $COMMAND" >&2
    exit 1
    ;;
esac
|
||||
18
paper/Makefile
Normal file
18
paper/Makefile
Normal file
@@ -0,0 +1,18 @@
|
||||
# Build the ArcheFlow paper
# Usage: make        (build PDF)
#        make clean  (remove build artifacts)

MAIN = archeflow

# Everything the pdflatex/bibtex passes leave behind (plus the PDF itself).
ARTIFACT_EXT = aux bbl blg log out pdf toc lof lot nav snm vrb

.PHONY: all clean

all: $(MAIN).pdf

# Classic pdflatex -> bibtex -> pdflatex x2 dance to resolve citations
# and cross-references.
$(MAIN).pdf: $(MAIN).tex references.bib
	pdflatex $(MAIN)
	bibtex $(MAIN)
	pdflatex $(MAIN)
	pdflatex $(MAIN)

# NOTE: the previous `rm -f $(MAIN).{aux,...}` relied on brace expansion,
# a bashism that make's default /bin/sh does not perform — the recipe
# silently removed nothing. Expand the file list in make instead.
clean:
	rm -f $(addprefix $(MAIN).,$(ARTIFACT_EXT))
|
||||
880
paper/archeflow.tex
Normal file
880
paper/archeflow.tex
Normal file
@@ -0,0 +1,880 @@
|
||||
\documentclass[11pt,a4paper]{article}
|
||||
|
||||
% ---- Packages ----
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{amsmath,amssymb}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{listings}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{tikz}
|
||||
\usetikzlibrary{shapes,arrows.meta,positioning,fit,calc}
|
||||
\usepackage[numbers]{natbib}
|
||||
\usepackage{geometry}
|
||||
\geometry{margin=1in}
|
||||
|
||||
% ---- Listings style ----
|
||||
\lstset{
|
||||
basicstyle=\ttfamily\small,
|
||||
breaklines=true,
|
||||
frame=single,
|
||||
framesep=3pt,
|
||||
columns=flexible,
|
||||
keepspaces=true,
|
||||
showstringspaces=false,
|
||||
commentstyle=\color{gray},
|
||||
keywordstyle=\color{blue!70!black},
|
||||
}
|
||||
|
||||
% ---- Title ----
|
||||
\title{%
|
||||
ArcheFlow: Multi-Agent Orchestration with\\
|
||||
Archetypal Roles and PDCA Quality Cycles%
|
||||
}
|
||||
|
||||
\author{
|
||||
Christian Nennemann\\
|
||||
Independent Researcher\\
|
||||
\texttt{chris@nennemann.de}\\
|
||||
\texttt{https://github.com/XORwell/archeflow}
|
||||
}
|
||||
|
||||
\date{April 2026}
|
||||
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
% ============================================================
|
||||
\begin{abstract}
|
||||
We present \textsc{ArcheFlow}, an open-source orchestration framework for
|
||||
multi-agent software engineering that assigns \emph{archetypal roles}---derived
|
||||
from Jungian analytical psychology---to LLM agents and coordinates them through
|
||||
\emph{Plan--Do--Check--Act} (PDCA) quality cycles. Each of seven archetypes
|
||||
(Explorer, Creator, Maker, Guardian, Skeptic, Trickster, Sage) carries a defined
|
||||
cognitive virtue and a quantitatively detected \emph{shadow}---a failure mode
|
||||
triggered when the virtue becomes excessive. The framework implements a
|
||||
three-layer corrective action system (archetype shadows, system shadows, policy
|
||||
boundaries) that detects and mitigates agent dysfunction during autonomous
|
||||
operation. We describe ArcheFlow's architecture as a zero-dependency plugin for
|
||||
Claude Code, detail its attention filtering, feedback routing, convergence
|
||||
detection, and effectiveness scoring mechanisms, and discuss connections to
|
||||
recent work on persona stability in language models
|
||||
\citep{lu2026assistant}. ArcheFlow demonstrates that structured persona
|
||||
assignment with shadow detection can maintain productive agent behavior across
|
||||
extended autonomous sessions spanning multiple projects and quality domains
|
||||
(code, prose, research). The system is publicly available under the MIT license.
|
||||
\end{abstract}
|
||||
|
||||
% ============================================================
|
||||
\section{Introduction}
|
||||
\label{sec:introduction}
|
||||
|
||||
The rise of agentic coding assistants---tools that autonomously write, test,
|
||||
review, and commit code---has created a new class of software engineering
|
||||
challenges. While individual LLM agents can produce competent code, the quality
|
||||
of autonomous output degrades under conditions that are well-known from human
|
||||
software teams: reviewers who rubber-stamp, architects who over-engineer,
|
||||
implementers who ignore specifications, and testers who optimize for coverage
|
||||
metrics rather than real defects.
|
||||
|
||||
These failure modes are not merely analogies. \citet{lu2026assistant}
|
||||
demonstrate that language models occupy a measurable \emph{persona space} and
|
||||
can drift from their trained Assistant identity during extended conversations,
|
||||
particularly under emotional or philosophical pressure. Their ``Assistant
|
||||
Axis''---a dominant directional component in activation space---predicts when
|
||||
models will exhibit uncharacteristic behavior. If a single model drifts, a
|
||||
multi-agent system where each agent maintains a distinct persona faces
|
||||
compounded persona management challenges.
|
||||
|
||||
ArcheFlow addresses this problem by drawing on two established frameworks:
|
||||
\begin{enumerate}
|
||||
\item \textbf{Jungian archetypal psychology} \citep{jung1968archetypes}, which
|
||||
provides a taxonomy of cognitive orientations---each with a productive
|
||||
\emph{virtue} and a destructive \emph{shadow}---that map naturally onto
|
||||
software engineering roles.
|
||||
\item \textbf{PDCA quality cycles} \citep{deming1986out}, which provide a
|
||||
convergence mechanism for iterative refinement with measurable exit criteria.
|
||||
\end{enumerate}
|
||||
|
||||
The contribution of this paper is threefold:
|
||||
\begin{itemize}
|
||||
\item We present a \emph{shadow detection framework} that quantitatively
|
||||
identifies agent dysfunction---not through sentiment analysis or output
|
||||
classification, but through structural metrics (output length, finding ratios,
|
||||
scope violations) specific to each archetype's failure mode (Section~\ref{sec:shadows}).
|
||||
\item We describe \emph{attention filters} and \emph{feedback routing} mechanisms
|
||||
that constrain what each agent sees and where its output flows, preventing the
|
||||
information overload and echo chamber effects that plague na\"ive multi-agent
|
||||
systems (Section~\ref{sec:attention}).
|
||||
\item We demonstrate that PDCA convergence detection---including oscillation
|
||||
analysis and divergence scoring---provides principled stopping criteria for
|
||||
iterative review cycles (Section~\ref{sec:convergence}).
|
||||
\end{itemize}
|
||||
|
||||
ArcheFlow is implemented as a zero-dependency plugin (Bash + Markdown) for
|
||||
Claude Code\footnote{\url{https://claude.ai/claude-code}}, Anthropic's CLI
|
||||
coding assistant. It has been used in production across a portfolio of 10--30
|
||||
repositories spanning code, creative writing, and academic research.
|
||||
|
||||
% ============================================================
|
||||
\section{Related Work}
|
||||
\label{sec:related}
|
||||
|
||||
\subsection{Multi-Agent Software Engineering}
|
||||
|
||||
Multi-agent systems for software engineering have proliferated since 2024.
|
||||
\citet{hong2024metagpt} propose MetaGPT, which assigns human-like roles
|
||||
(product manager, architect, engineer) to LLM agents and enforces structured
|
||||
communication through Standardized Operating Procedures (SOPs). ChatDev
|
||||
\citep{qian2024chatdev} simulates a virtual software company with role-playing
|
||||
agents communicating through natural language chat. SWE-Agent
|
||||
\citep{yang2024sweagent} focuses on single-agent benchmark performance on
|
||||
GitHub issues, demonstrating that tool-augmented agents can resolve real-world
|
||||
bugs.
|
||||
|
||||
These systems share a common limitation: roles are defined by \emph{job
|
||||
descriptions} rather than \emph{cognitive orientations}. A ``product manager''
|
||||
agent may behave identically to a ``tech lead'' agent when both receive the same
|
||||
context, because the role boundary is semantic rather than structural. ArcheFlow
|
||||
addresses this through attention filters (Section~\ref{sec:attention}) that
|
||||
physically restrict what each agent perceives, ensuring that role differences
|
||||
manifest in behavior rather than merely in prompts.
|
||||
|
||||
\subsection{Persona Stability in Language Models}
|
||||
|
||||
\citet{lu2026assistant} identify the ``Assistant Axis'' in LLM activation
|
||||
space---a linear direction capturing the degree to which a model operates in its
|
||||
default helpful mode versus an alternative persona. Their key findings are
|
||||
directly relevant to multi-agent orchestration:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Persona space is low-dimensional}: only 4--19 principal
|
||||
components explain 70\% of persona variance across 275 character archetypes.
|
||||
\item \textbf{Drift is predictable}: user message embeddings predict response
|
||||
position along the Assistant Axis ($R^2 = 0.53$--$0.77$).
|
||||
\item \textbf{Drift correlates with harm}: models are more liable to produce
|
||||
harmful outputs when drifted from the Assistant identity ($r = 0.39$--$0.52$).
|
||||
\end{enumerate}
|
||||
|
||||
ArcheFlow's shadow detection (Section~\ref{sec:shadows}) can be understood as an
|
||||
\emph{application-level} analog to activation capping: where \citet{lu2026assistant}
|
||||
constrain neural activations to maintain persona stability, ArcheFlow constrains
|
||||
\emph{behavioral outputs} through quantitative triggers and corrective prompts.
|
||||
Both approaches recognize that productive personas require active stabilization,
|
||||
not merely initial assignment.
|
||||
|
||||
\subsection{Quality Cycles in Software Engineering}
|
||||
|
||||
The Plan--Do--Check--Act (PDCA) cycle, formalized by \citet{deming1986out} and
|
||||
rooted in Shewhart's statistical process control \citep{shewhart1939statistical},
|
||||
is the dominant quality improvement framework in manufacturing and has been
|
||||
applied to software engineering through agile retrospectives and continuous
|
||||
improvement. To our knowledge, ArcheFlow is the first system to apply PDCA
|
||||
cycles to multi-agent LLM orchestration with formal convergence detection and
|
||||
oscillation analysis.
|
||||
|
||||
\subsection{Jungian Archetypes in Computing}
|
||||
|
||||
While Jungian archetypes have been applied in user experience design
|
||||
\citep{hartson2012ux}, brand strategy, and game design, their application to
|
||||
AI agent systems is novel. The closest related work is in computational
|
||||
creativity, where archetypal narratives have been used to structure story
|
||||
generation \citep{winston2011strong}. ArcheFlow extends this to software
|
||||
engineering by mapping archetypal virtues and shadows to measurable engineering
|
||||
outcomes.
|
||||
|
||||
% ============================================================
|
||||
\section{Architecture}
|
||||
\label{sec:architecture}
|
||||
|
||||
ArcheFlow is a plugin for Claude Code that operates entirely through prompt
|
||||
engineering, shell scripts, and file-based communication. It has zero runtime
|
||||
dependencies beyond Bash and a compatible LLM backend.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\begin{tikzpicture}[
|
||||
node distance=1.2cm and 2cm,
|
||||
phase/.style={draw, rounded corners, minimum width=2.5cm, minimum height=0.8cm, font=\small\bfseries},
|
||||
agent/.style={draw, rounded corners, minimum width=2cm, minimum height=0.6cm, font=\small, fill=blue!5},
|
||||
arrow/.style={-{Stealth[length=3mm]}, thick},
|
||||
label/.style={font=\scriptsize, text=gray},
|
||||
]
|
||||
|
||||
% PDCA Cycle
|
||||
\node[phase, fill=yellow!20] (plan) {Plan};
|
||||
\node[phase, fill=green!20, right=of plan] (do) {Do};
|
||||
\node[phase, fill=orange!20, right=of do] (check) {Check};
|
||||
\node[phase, fill=red!15, right=of check] (act) {Act};
|
||||
|
||||
% Plan agents
|
||||
\node[agent, below left=0.8cm and 0.3cm of plan] (explorer) {Explorer};
|
||||
\node[agent, below right=0.8cm and 0.3cm of plan] (creator) {Creator};
|
||||
|
||||
% Do agent
|
||||
\node[agent, below=0.8cm of do] (maker) {Maker};
|
||||
|
||||
% Check agents
|
||||
\node[agent, below left=0.8cm and -0.2cm of check] (guardian) {Guardian};
|
||||
\node[agent, below=0.8cm of check] (skeptic) {Skeptic};
|
||||
\node[agent, below right=0.8cm and -0.2cm of check] (sage) {Sage};
|
||||
|
||||
% Arrows
|
||||
\draw[arrow] (plan) -- (do);
|
||||
\draw[arrow] (do) -- (check);
|
||||
\draw[arrow] (check) -- (act);
|
||||
\draw[arrow, dashed] (act.south) -- ++(0,-0.5) -| node[label, below, pos=0.25] {cycle back} (plan.south);
|
||||
|
||||
% Agent connections
|
||||
\draw[-] (plan.south) -- (explorer.north);
|
||||
\draw[-] (plan.south) -- (creator.north);
|
||||
\draw[-] (do.south) -- (maker.north);
|
||||
\draw[-] (check.south) -- (guardian.north);
|
||||
\draw[-] (check.south) -- (skeptic.north);
|
||||
\draw[-] (check.south) -- (sage.north);
|
||||
|
||||
\end{tikzpicture}
|
||||
\caption{ArcheFlow PDCA cycle with archetypal agent assignments. The dashed arrow represents cycle-back when reviewers find issues. A Trickster agent (not shown) joins the Check phase in \texttt{thorough} workflows.}
|
||||
\label{fig:pdca}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Components}
|
||||
|
||||
The system comprises four component types:
|
||||
|
||||
\begin{description}
|
||||
\item[Agent personas] (\texttt{agents/*.md}): Behavioral protocols for each
|
||||
archetype, defining the agent's cognitive lens, output format, and quality
|
||||
criteria. Each persona is a Markdown file loaded as a system prompt.
|
||||
|
||||
\item[Skills] (\texttt{skills/*/SKILL.md}): Operational instructions that
|
||||
Claude Code follows to orchestrate the PDCA cycle. The core \texttt{run} skill
|
||||
(466 lines) is self-contained---it encodes the complete orchestration protocol
|
||||
including workflow selection, agent spawning, attention filtering, convergence
|
||||
checking, and exit decisions.
|
||||
|
||||
\item[Library scripts] (\texttt{lib/*.sh}): Ten Bash scripts handling
|
||||
infrastructure concerns: JSONL event logging, git operations (per-phase
|
||||
commits, branch management, rollback), cross-run memory, progress tracking,
|
||||
effectiveness scoring, and run replay.
|
||||
|
||||
\item[Hooks] (\texttt{hooks/}): Session-start hook that auto-activates
|
||||
ArcheFlow and injects the domain detection logic.
|
||||
\end{description}
|
||||
|
||||
\subsection{Execution Modes}
|
||||
|
||||
ArcheFlow provides three execution modes optimized for different use cases:
|
||||
|
||||
\begin{description}
|
||||
\item[Sprint] (\texttt{/af-sprint}): Queue-driven parallel dispatch. Reads a
|
||||
priority-ordered task queue, spawns 3--5 agents across different projects
|
||||
simultaneously, collects results, commits, and starts the next batch. Designed
|
||||
for throughput over ceremony.
|
||||
|
||||
\item[Review] (\texttt{/af-review}): Guardian-led post-implementation review
|
||||
on existing diffs, branches, or commit ranges. No planning or implementation
|
||||
orchestration---pure quality analysis.
|
||||
|
||||
\item[Run] (\texttt{/af-run}): Full PDCA orchestration for complex tasks
|
||||
requiring structured exploration, design, implementation, and multi-perspective
|
||||
review.
|
||||
\end{description}
|
||||
|
||||
\subsection{Domain Adaptation}
|
||||
|
||||
ArcheFlow adapts its terminology and quality criteria based on domain detection:
|
||||
\texttt{code} (diffs, tests, security), \texttt{writing} (voice consistency,
|
||||
dialect authenticity, narrative structure), and \texttt{research} (source quality,
|
||||
argument coherence, citation accuracy). Domain is auto-detected from project
|
||||
contents or specified in configuration.
|
||||
|
||||
% ============================================================
|
||||
\section{The Seven Archetypes}
|
||||
\label{sec:archetypes}
|
||||
|
||||
Each archetype embodies a cognitive orientation with a defined virtue (productive
|
||||
mode) and shadow (destructive mode). Table~\ref{tab:archetypes} summarizes the
|
||||
complete taxonomy.
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\caption{The seven ArcheFlow archetypes with their PDCA phase assignments,
|
||||
cognitive virtues, and shadow failure modes.}
|
||||
\label{tab:archetypes}
|
||||
\begin{tabular}{@{}lllll@{}}
|
||||
\toprule
|
||||
\textbf{Archetype} & \textbf{Phase} & \textbf{Virtue} & \textbf{Shadow} & \textbf{Model Tier} \\
|
||||
\midrule
|
||||
Explorer & Plan & Contextual Clarity & Rabbit Hole & Haiku \\
|
||||
Creator & Plan & Decisive Framing & Over-Architect & Sonnet \\
|
||||
Maker & Do & Execution Discipline & Rogue & Sonnet \\
|
||||
Guardian & Check & Threat Intuition & Paranoid & Sonnet \\
|
||||
Skeptic & Check & Assumption Surfacing & Paralytic & Haiku \\
|
||||
Trickster & Check & Adversarial Creativity & False Alarm & Haiku \\
|
||||
Sage & Check & Maintainability Judgment & Bureaucrat & Haiku \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The archetype--shadow pairing is not metaphorical; it is the core mechanism
|
||||
for maintaining agent quality. The virtue describes \emph{what} the archetype
|
||||
contributes; the shadow describes what happens when that contribution becomes
|
||||
excessive. An Explorer who never stops researching (Rabbit Hole) delays the
|
||||
entire pipeline. A Guardian who rejects everything (Paranoid) prevents any
|
||||
code from shipping.
|
||||
|
||||
\subsection{Cost-Aware Model Assignment}
|
||||
|
||||
Not all archetypes require the same model capability. Analytical tasks
|
||||
(exploration, assumption checking, code quality review) can be performed by
|
||||
cheaper models (Haiku), while creative tasks (architecture design,
|
||||
implementation, security analysis) benefit from more capable models (Sonnet).
|
||||
This tiered assignment reduces per-run costs by 40--60\% compared to using the
|
||||
most capable model for all agents, with no observed quality degradation in
|
||||
analytical roles.
|
||||
|
||||
% ============================================================
|
||||
\section{Shadow Detection and Corrective Action}
|
||||
\label{sec:shadows}
|
||||
|
||||
\subsection{Archetype Shadows}
|
||||
|
||||
Shadow detection is \emph{quantitative, not sentiment-based}. Each archetype has
|
||||
a specific trigger condition derived from structural properties of its output:
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Shadow detection triggers. Each trigger is evaluated automatically
|
||||
after the agent completes.}
|
||||
\label{tab:shadows}
|
||||
\begin{tabular}{@{}lll@{}}
|
||||
\toprule
|
||||
\textbf{Archetype} & \textbf{Shadow} & \textbf{Trigger} \\
|
||||
\midrule
|
||||
Explorer & Rabbit Hole & Output $> 2000$ words without Recommendation section \\
|
||||
Creator & Over-Architect & $> 2$ new abstractions for a single feature \\
|
||||
Maker & Rogue & No tests in changeset, or files outside proposal scope \\
|
||||
Guardian & Paranoid & CRITICAL:WARNING ratio $> 2{:}1$, or zero approvals \\
|
||||
Skeptic & Paralytic & $> 7$ challenges with $< 50\%$ having alternatives \\
|
||||
Trickster & False Alarm & Findings in untouched code, or $> 10$ total findings \\
|
||||
Sage & Bureaucrat & Review length $> 2\times$ code change length \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The escalation protocol follows a three-strike pattern:
|
||||
\begin{enumerate}
|
||||
\item \textbf{First detection}: Inject a correction prompt that names the
|
||||
shadow and redirects the agent toward its virtue.
|
||||
\item \textbf{Second detection} (same shadow, same run): Replace the agent
|
||||
with a fresh instance.
|
||||
\item \textbf{Third detection}: Escalate to the user for manual intervention.
|
||||
\end{enumerate}
|
||||
|
||||
\subsection{System Shadows}
|
||||
|
||||
Beyond individual archetype dysfunction, ArcheFlow monitors for
|
||||
\emph{system-level} failure modes:
|
||||
|
||||
\begin{description}
|
||||
\item[Echo Chamber]: Multiple reviewers produce identical findings, suggesting
|
||||
they are confirming each other rather than applying independent judgment.
|
||||
Detected when $> 60\%$ of findings across reviewers share the same
|
||||
file-and-category tuple.
|
||||
|
||||
\item[Tunnel Vision]: All findings cluster in a single file or module while
|
||||
the changeset spans multiple. Detected when $> 80\%$ of findings target
|
||||
$< 20\%$ of changed files.
|
||||
|
||||
\item[Scope Creep]: Maker modifies files not mentioned in the Creator's
|
||||
proposal. Detected by comparing \texttt{do-maker-files.txt} against the
|
||||
proposal's file list.
|
||||
\end{description}
|
||||
|
||||
\subsection{Policy Boundaries and the Wiggum Break}
|
||||
|
||||
The third layer enforces operational limits through budget gates, cycle
|
||||
limits, and checkpoint policies. When limits are exceeded, the system
|
||||
triggers a \emph{Wiggum Break}\footnote{Named after Chief Wiggum from
|
||||
\emph{The Simpsons}---a nod to both ``policy enforcement'' and the
|
||||
Ralph Loop plugin for Claude Code.}---a circuit breaker that halts
|
||||
execution, saves state, and reports to the user.
|
||||
|
||||
Wiggum Breaks are classified as \emph{hard} (halt immediately) or
|
||||
\emph{soft} (finish current task, then halt):
|
||||
|
||||
\begin{description}
|
||||
\item[Hard breaks]: 3 consecutive agent failures, 3 consecutive shadow
|
||||
detections in one run, test suite broken after merge, 2+ oscillating
|
||||
findings.
|
||||
\item[Soft breaks]: convergence score $< 0.5$ for 2 consecutive cycles,
|
||||
findings unchanged between cycles, budget $> 95\%$ spent.
|
||||
\end{description}
|
||||
|
||||
Each Wiggum Break emits a \texttt{wiggum.break} event capturing the
|
||||
trigger, run state, and unresolved findings for post-run analysis.
|
||||
|
||||
\subsection{Connection to the Assistant Axis}
|
||||
|
||||
The shadow detection framework addresses the same fundamental problem identified
|
||||
by \citet{lu2026assistant}: models drift from productive personas during
|
||||
extended operation. Where their work identifies drift in activation space and
|
||||
proposes activation capping as a mitigation, ArcheFlow operates at the
|
||||
\emph{behavioral} level---detecting drift through output structure rather than
|
||||
internal representations, and correcting through prompt injection rather than
|
||||
activation manipulation.
|
||||
|
||||
This application-level approach has a practical advantage: it requires no access
|
||||
to model internals and works with any LLM backend, including API-only models
|
||||
where activation-level interventions are impossible. The tradeoff is that
|
||||
behavioral detection is necessarily coarser than activation-level measurement
|
||||
and can only detect drift after it manifests in output, not before.
|
||||
|
||||
% ============================================================
|
||||
\section{Attention Filters and Information Flow}
|
||||
\label{sec:attention}
|
||||
|
||||
A key design principle is that each agent receives \emph{only the information
|
||||
relevant to its role}. This is implemented through \emph{attention filters}---rules
|
||||
governing which artifacts from prior phases are injected into each agent's
|
||||
context.
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Attention filter matrix. Each agent receives only the artifacts marked
|
||||
with \checkmark.}
|
||||
\label{tab:attention}
|
||||
\begin{tabular}{@{}lccccc@{}}
|
||||
\toprule
|
||||
\textbf{Agent} & \textbf{Task} & \textbf{Explorer} & \textbf{Creator} & \textbf{Diff} & \textbf{Reviews} \\
|
||||
\midrule
|
||||
Explorer & \checkmark & & & & \\
|
||||
Creator & \checkmark & \checkmark & & & \\
|
||||
Maker & \checkmark & & \checkmark & & \\
|
||||
Guardian & & & (risks) & \checkmark & \\
|
||||
Skeptic & & & \checkmark & & \\
|
||||
Sage & & & \checkmark & \checkmark & \\
|
||||
Trickster & & & & \checkmark & \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The rationale for attention filtering is twofold:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Independence}: Reviewers who see each other's findings tend to
|
||||
converge on a shared narrative rather than applying independent judgment. By
|
||||
isolating reviewer inputs, ArcheFlow ensures that each reviewer contributes a
|
||||
genuinely distinct perspective.
|
||||
|
||||
\item \textbf{Focus}: An agent given everything tends to address everything,
|
||||
producing diluted analysis. The Trickster, for example, receives \emph{only}
|
||||
the diff---no design rationale, no risk analysis---forcing it to evaluate the
|
||||
code purely on its own terms.
|
||||
\end{enumerate}
|
||||
|
||||
In PDCA cycle 2+, the feedback from the Act phase is routed selectively:
|
||||
Creator-routed issues go to the Creator, Maker-routed issues go to the Maker.
|
||||
Neither sees the other's feedback, preventing defensive responses to criticism
|
||||
that was directed elsewhere.
|
||||
|
||||
% ============================================================
|
||||
\section{Feedback Routing}
|
||||
\label{sec:routing}
|
||||
|
||||
When the Check phase identifies issues, the Act phase must decide where to route
|
||||
each finding for the next cycle. ArcheFlow uses a deterministic routing table
|
||||
based on the source archetype and finding category:
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Feedback routing table. Findings are routed to the agent best equipped
|
||||
to address them, preventing cross-contamination.}
|
||||
\label{tab:routing}
|
||||
\begin{tabular}{@{}llll@{}}
|
||||
\toprule
|
||||
\textbf{Source} & \textbf{Category} & \textbf{Routes To} & \textbf{Rationale} \\
|
||||
\midrule
|
||||
Guardian & security, breaking-change & Creator & Design must change \\
|
||||
Guardian & reliability, dependency & Creator & Architectural decision \\
|
||||
Skeptic & design, scalability & Creator & Assumptions need revision \\
|
||||
Sage & quality, consistency & Maker & Implementation refinement \\
|
||||
Sage & testing & Maker & Test gap, not design flaw \\
|
||||
Trickster & reliability (design flaw) & Creator & Needs redesign \\
|
||||
Trickster & reliability (test gap) & Maker & Needs more tests \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The disambiguation principle: if fixing the issue requires changing the
|
||||
\emph{approach}, route to Creator. If it requires changing the \emph{code within
|
||||
the existing approach}, route to Maker. Findings that persist across two
|
||||
consecutive cycles are escalated to the user rather than cycled indefinitely.
|
||||
|
||||
% ============================================================
|
||||
\section{Convergence Detection}
|
||||
\label{sec:convergence}
|
||||
|
||||
\subsection{Convergence Score}
|
||||
|
||||
In PDCA cycle 2+, ArcheFlow compares current findings against the previous cycle
|
||||
and classifies each as \textsc{New}, \textsc{Resolved}, \textsc{Persistent}, or
|
||||
\textsc{Regressed}. The convergence score is:
|
||||
|
||||
\begin{equation}
|
||||
C = \frac{|\textsc{Resolved}|}{|\textsc{Resolved}| + |\textsc{New}| + |\textsc{Regressed}|}
|
||||
\label{eq:convergence}
|
||||
\end{equation}
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Convergence score interpretation and corresponding actions.}
|
||||
\label{tab:convergence}
|
||||
\begin{tabular}{@{}lll@{}}
|
||||
\toprule
|
||||
\textbf{Score Range} & \textbf{Status} & \textbf{Action} \\
|
||||
\midrule
|
||||
$C > 0.8$ & Converging & Continue if cycles remain \\
|
||||
$0.5 \leq C \leq 0.8$ & Stalling & Continue with caution \\
|
||||
$C < 0.5$ & Diverging & Stop if 2 consecutive diverging cycles \\
|
||||
$C = 0$ & Stuck & Stop immediately \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Oscillation Detection}
|
||||
|
||||
A finding is \emph{oscillating} if it was present in cycle $n-2$, absent in
|
||||
cycle $n-1$, and present again in cycle $n$. Two or more oscillating findings
|
||||
trigger an immediate stop with escalation to the user, as oscillation indicates
|
||||
a fundamental tension in the review criteria that automated cycles cannot
|
||||
resolve.
|
||||
|
||||
\subsection{Adaptive Workflow Escalation}
|
||||
|
||||
Convergence detection interacts with workflow selection through Rule A1: if a
|
||||
\texttt{fast} workflow and Guardian finds $\geq 2$ CRITICAL findings, the next
|
||||
cycle escalates to \texttt{standard} (adding Skeptic and Sage reviewers). Once
|
||||
escalated, the workflow remains escalated for the duration of the run.
|
||||
|
||||
Conversely, Rule A2 provides a \emph{fast-path}: if Guardian finds zero CRITICAL
|
||||
and zero WARNING findings, remaining reviewers are skipped entirely, and the
|
||||
system proceeds directly to Act. This optimization reduces the cost of runs
|
||||
where the Maker's implementation is clean.
|
||||
|
||||
% ============================================================
|
||||
\section{Evidence Validation}
|
||||
\label{sec:evidence}
|
||||
|
||||
Reviewer findings are subject to evidence validation before they influence
|
||||
routing decisions. A CRITICAL or WARNING finding is downgraded to INFO if:
|
||||
|
||||
\begin{itemize}
|
||||
\item It uses \emph{banned hedging phrases} without supporting evidence:
|
||||
``might be'', ``could potentially'', ``appears to'', ``seems like'', ``may not''.
|
||||
\item It contains \emph{no evidence}: no command output, code citation, line
|
||||
reference, or reproduction steps.
|
||||
\end{itemize}
|
||||
|
||||
This mechanism addresses a well-known failure mode of LLM reviewers: generating
|
||||
plausible-sounding but unsupported concerns. By requiring evidence for
|
||||
high-severity findings, ArcheFlow forces reviewers to ground their analysis in
|
||||
the actual changeset rather than speculation.
|
||||
|
||||
Downgrades are tracked in the event log but do \emph{not} modify the original
|
||||
artifact files, preserving the complete reviewer output for post-run analysis.
|
||||
|
||||
% ============================================================
|
||||
\section{Effectiveness Scoring}
|
||||
\label{sec:effectiveness}
|
||||
|
||||
After each completed run, ArcheFlow scores review archetypes across five
|
||||
dimensions:
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Effectiveness scoring dimensions and their weights.}
|
||||
\label{tab:effectiveness}
|
||||
\begin{tabular}{@{}lp{7cm}r@{}}
|
||||
\toprule
|
||||
\textbf{Dimension} & \textbf{Description} & \textbf{Weight} \\
|
||||
\midrule
|
||||
Signal-to-noise & Ratio of useful findings to total findings & 0.30 \\
|
||||
Fix rate & Fraction of findings that led to applied fixes & 0.25 \\
|
||||
Cost efficiency & Useful findings per dollar of model inference cost & 0.20 \\
|
||||
Accuracy & Fraction not contradicted by other reviewers & 0.15 \\
|
||||
Cycle impact & Whether findings contributed to cycle exit decision & 0.10 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
Scores accumulate in a cross-run memory file
|
||||
(\texttt{.archeflow/memory/effectiveness.jsonl}). After 10+ completed runs,
|
||||
the system recommends model tier changes (e.g., promoting a Haiku-tier reviewer
|
||||
to Sonnet if its signal-to-noise is consistently high) and, in extreme cases,
|
||||
archetype removal for persistently low-scoring reviewers.
|
||||
|
||||
% ============================================================
|
||||
\section{Cross-Run Memory}
|
||||
\label{sec:memory}
|
||||
|
||||
ArcheFlow maintains a lesson-learning system that persists across runs. When
|
||||
recurring findings are detected---the same category of issue appearing in
|
||||
multiple runs---the system stores a lesson and injects it into future agents
|
||||
as additional context.
|
||||
|
||||
Lessons decay over time: each lesson has a relevance counter that increments on
|
||||
reuse and decrements on irrelevance. Lessons that fall below a threshold are
|
||||
archived rather than injected, preventing the accumulation of stale guidance.
|
||||
|
||||
The memory system also performs regression detection: if a previously resolved
|
||||
issue reappears, it is flagged as a regression with higher priority than a
|
||||
fresh finding.
|
||||
|
||||
% ============================================================
|
||||
\section{Implementation}
|
||||
\label{sec:implementation}
|
||||
|
||||
ArcheFlow is implemented in approximately 6,700 lines across three layers:
|
||||
|
||||
\begin{itemize}
|
||||
\item \textbf{Skills} (19 Markdown files, $\sim$2,500 lines): Operational
|
||||
instructions for Claude Code, written as imperative protocols. The core
|
||||
\texttt{run} skill encodes the complete PDCA orchestration in 466 lines.
|
||||
|
||||
\item \textbf{Agent personas} (7 Markdown files, $\sim$700 lines): Behavioral
|
||||
protocols defining each archetype's cognitive lens, output format, and
|
||||
self-review checklist.
|
||||
|
||||
\item \textbf{Library scripts} (10 Bash scripts, $\sim$3,500 lines): Event
|
||||
logging, git operations, memory management, progress tracking, effectiveness
|
||||
scoring, and run replay.
|
||||
\end{itemize}
|
||||
|
||||
The system uses no database, no API server, and no runtime dependencies beyond
|
||||
Bash 4+ and a Claude Code installation. All state is stored in JSONL event logs
|
||||
and Markdown artifact files. This zero-dependency architecture was a deliberate
|
||||
design choice: orchestration infrastructure that itself requires complex setup
|
||||
and maintenance undermines the autonomy it is supposed to enable.
|
||||
|
||||
\subsection{Git Integration}
|
||||
|
||||
ArcheFlow creates per-phase commits, enabling fine-grained rollback. The Maker
|
||||
operates in a git worktree---an isolated working copy---so its changes do not
|
||||
affect the main branch until explicitly merged. If post-merge tests fail, the
|
||||
system auto-reverts the merge and cycles back with ``integration test failure''
|
||||
feedback.
|
||||
|
||||
\subsection{Run Replay}
|
||||
|
||||
All orchestration decisions are logged as \texttt{decision.point} events,
|
||||
enabling post-hoc analysis. The replay system provides:
|
||||
\begin{itemize}
|
||||
\item \textbf{Timeline view}: chronological sequence of all decisions with
|
||||
confidence scores.
|
||||
\item \textbf{Weighted what-if}: re-evaluation of the ship/block outcome
|
||||
using different reviewer weights, answering questions like ``would the outcome
|
||||
have changed if we weighted Guardian 2x and Sage 0.5x?''
|
||||
\item \textbf{Cross-run comparison}: side-by-side analysis of decision
|
||||
patterns across runs.
|
||||
\end{itemize}
|
||||
|
||||
% ============================================================
|
||||
\section{Multi-Domain Application}
|
||||
\label{sec:domains}
|
||||
|
||||
ArcheFlow's archetype system extends beyond code. The framework has been
|
||||
deployed across three domains:
|
||||
|
||||
\subsection{Software Engineering}
|
||||
|
||||
The primary domain. Archetypes map to standard engineering roles: Explorer
|
||||
performs codebase research, Creator designs architecture, Maker writes code,
|
||||
and the Check-phase archetypes review for security (Guardian), design flaws
|
||||
(Skeptic), edge cases (Trickster), and overall quality (Sage).
|
||||
|
||||
\subsection{Creative Writing}
|
||||
|
||||
In writing mode, the same archetype structure applies with adapted quality
|
||||
criteria. Custom archetypes (story-explorer, story-sage) replace or augment
|
||||
the defaults. The framework integrates with Colette, a voice profiling system
|
||||
that maintains consistent authorial voice across chapters. Quality gates check
|
||||
for voice consistency, dialect authenticity, and narrative structure rather
|
||||
than test coverage and security.
|
||||
|
||||
\subsection{Academic Research}
|
||||
|
||||
In research mode, quality criteria shift to source quality, argument coherence,
|
||||
citation accuracy, and methodological rigor. The Guardian reviews for logical
|
||||
fallacies and unsupported claims rather than security vulnerabilities.
|
||||
|
||||
% ============================================================
|
||||
\section{Discussion}
|
||||
\label{sec:discussion}
|
||||
|
||||
\subsection{Archetypes vs. Role Descriptions}
|
||||
|
||||
The key distinction between ArcheFlow's approach and prior multi-agent systems
|
||||
is the \emph{shadow} mechanism. A role description tells an agent what to do;
|
||||
an archetype tells an agent what to do \emph{and what doing too much of it
|
||||
looks like}. This bidirectional specification creates a bounded operating
|
||||
range for each agent, preventing the unbounded optimization that leads to
|
||||
dysfunction.
|
||||
|
||||
The connection to \citet{lu2026assistant}'s persona axis is instructive.
|
||||
They show that model personas exist on a continuum, with the Assistant identity
|
||||
at one extreme and theatrical/mystical identities at the other. ArcheFlow's
|
||||
archetypes deliberately position agents \emph{away} from the default Assistant
|
||||
toward specific cognitive orientations---but the shadow mechanism prevents them
|
||||
from drifting too far, maintaining a productive operating range analogous to
|
||||
what \citeauthor{lu2026assistant} achieve through activation capping.
|
||||
|
||||
\subsection{Wiggum Breaks as Human-in-the-Loop Boundaries}
|
||||
|
||||
A central question in autonomous agent systems is: \emph{when should the
|
||||
system stop acting and ask a human?} Most frameworks treat this as an
|
||||
implementation detail---a timeout, a retry limit, an exception handler.
|
||||
ArcheFlow treats it as a first-class architectural concept through the
|
||||
\emph{Wiggum Break}.
|
||||
|
||||
The Wiggum Break defines the \textbf{formal boundary between autonomous and
|
||||
human-supervised operation}. It is not a failure mode; it is the system's
|
||||
\emph{designed} response to situations where autonomous resolution is
|
||||
provably unproductive:
|
||||
|
||||
\begin{itemize}
|
||||
\item \textbf{Oscillation} (finding present $\to$ absent $\to$ present)
|
||||
indicates a genuine tension in the review criteria that no amount of
|
||||
cycling will resolve---only human judgment about which criterion takes
|
||||
priority.
|
||||
|
||||
\item \textbf{Divergence} (convergence score $< 0.5$ for two consecutive
|
||||
cycles) indicates that the implementation is getting worse with each
|
||||
iteration---the agents lack the context or capability to solve the
|
||||
problem, and continuing wastes resources.
|
||||
|
||||
\item \textbf{Repeated shadow detection} (same dysfunction three times)
|
||||
indicates that the corrective action framework has exhausted its
|
||||
options---the task structure is incompatible with the assigned archetype,
|
||||
and a human must re-scope.
|
||||
\end{itemize}
|
||||
|
||||
This framing inverts the typical HITL paradigm. Rather than asking
|
||||
``how much autonomy should the system have?'' and pre-defining approval
|
||||
gates, ArcheFlow asks ``under what conditions is autonomy
|
||||
\emph{provably unproductive}?'' and derives the HITL boundary from
|
||||
convergence theory. The system runs autonomously by default and escalates
|
||||
only when it can demonstrate---through quantitative metrics, not
|
||||
heuristics---that continued autonomous operation will not improve the
|
||||
outcome.
|
||||
|
||||
This approach has three advantages over pre-defined approval gates:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Adaptive autonomy}: Simple tasks never trigger a Wiggum
|
||||
Break; complex tasks trigger one quickly. The HITL boundary adapts to
|
||||
task difficulty without manual configuration.
|
||||
|
||||
\item \textbf{Auditable escalation}: Every Wiggum Break emits a
|
||||
\texttt{wiggum.break} event with the trigger condition, run state, and
|
||||
unresolved findings. The human receives not just a request for help,
|
||||
but a structured summary of \emph{why} autonomous resolution failed
|
||||
and what specifically needs their judgment.
|
||||
|
||||
\item \textbf{Minimal interruption}: Pre-defined gates (``approve every
|
||||
PR'', ``review every design'') interrupt the human on tasks the system
|
||||
could have handled autonomously. Convergence-derived breaks interrupt
|
||||
only when the system has evidence that it cannot proceed productively.
|
||||
\end{enumerate}
|
||||
|
||||
The Wiggum Break thus operationalizes a principle from resilience
|
||||
engineering: the system should be \emph{autonomy-seeking} (preferring to
|
||||
resolve issues itself) but \emph{escalation-ready} (able to produce a
|
||||
useful handoff when self-resolution fails). The quality of the handoff---not
|
||||
just the fact of escalation---is what makes HITL effective.
|
||||
|
||||
\subsection{Limitations}
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{No activation-level control}: ArcheFlow operates purely at the
|
||||
prompt level. It cannot detect persona drift before it manifests in output,
|
||||
unlike activation-level approaches \citep{lu2026assistant}.
|
||||
|
||||
\item \textbf{Single LLM backend}: The current implementation targets Claude
|
||||
Code. While the architectural principles are model-agnostic, the skill and
|
||||
hook system is specific to Claude Code's plugin API.
|
||||
|
||||
\item \textbf{Evaluation methodology}: We have not conducted controlled
|
||||
experiments comparing ArcheFlow's output quality against baselines (single-agent,
|
||||
role-based multi-agent without shadows, PDCA without archetypes). The system
|
||||
has been evaluated through production use across real projects, which
|
||||
demonstrates practical utility but not causal attribution.
|
||||
|
||||
\item \textbf{Shadow trigger thresholds}: The quantitative thresholds
|
||||
(e.g., 2000 words for Rabbit Hole, ratio $> 2{:}1$ for Paranoid) were
|
||||
determined empirically through iterative use and may not generalize across
|
||||
all codebases and domains.
|
||||
\end{enumerate}
|
||||
|
||||
\subsection{Future Work}
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Activation-level integration}: Combining behavioral shadow
|
||||
detection with the Assistant Axis measurement from \citet{lu2026assistant}
|
||||
could provide earlier and more reliable drift detection, particularly for
|
||||
open-weight models where activations are accessible.
|
||||
|
||||
\item \textbf{Controlled evaluation}: A systematic comparison across standard
|
||||
benchmarks (SWE-bench, HumanEval) would establish whether the archetype +
|
||||
PDCA approach provides measurable quality improvements over simpler
|
||||
orchestration strategies.
|
||||
|
||||
\item \textbf{Archetype discovery}: Rather than hand-designing archetypes,
|
||||
the persona space analysis from \citet{lu2026assistant} could be used to
|
||||
identify \emph{natural} cognitive orientations that models adopt, potentially
|
||||
revealing useful archetypes that human intuition would not suggest.
|
||||
|
||||
\item \textbf{Cross-model persona stability}: Investigating whether shadow
|
||||
triggers calibrated for one model family transfer to others, or whether
|
||||
per-model calibration is necessary.
|
||||
\end{enumerate}
|
||||
|
||||
% ============================================================
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
|
||||
ArcheFlow demonstrates that multi-agent LLM orchestration benefits from
|
||||
structured persona management---not just telling agents \emph{what to do},
|
||||
but actively monitoring and correcting \emph{how they do it}. The combination
|
||||
of Jungian archetypes (providing a principled taxonomy of cognitive virtues and
|
||||
their failure modes) with PDCA quality cycles (providing convergence guarantees
|
||||
and principled stopping criteria) produces an orchestration framework that
|
||||
maintains productive agent behavior across extended autonomous sessions.
|
||||
|
||||
The shadow detection mechanism---quantitative triggers for archetype-specific
|
||||
dysfunction---addresses the same persona stability challenge identified by
|
||||
\citet{lu2026assistant} at the application level, requiring no access to model
|
||||
internals and working with any LLM backend. While coarser than activation-level
|
||||
approaches, behavioral shadow detection is practical, interpretable, and
|
||||
immediately deployable.
|
||||
|
||||
ArcheFlow is open-source under the MIT license and available at
|
||||
\url{https://github.com/XORwell/archeflow}.
|
||||
|
||||
% ============================================================
|
||||
\section*{Acknowledgments}
|
||||
|
||||
The author thanks the Claude Code team at Anthropic for building the plugin
|
||||
infrastructure that made ArcheFlow possible, and the authors of
|
||||
\citet{lu2026assistant} for the Assistant Axis framework that informed the
|
||||
theoretical grounding of shadow detection.
|
||||
|
||||
% ============================================================
|
||||
\bibliographystyle{plainnat}
|
||||
\bibliography{references}
|
||||
|
||||
\end{document}
|
||||
89
paper/references.bib
Normal file
89
paper/references.bib
Normal file
@@ -0,0 +1,89 @@
|
||||
@article{lu2026assistant,
|
||||
title={The Assistant Axis: Situating and Stabilizing the Default Persona of Language Models},
|
||||
author={Lu, Christina and Gallagher, Jack and Michala, Jonathan and Fish, Kyle and Lindsey, Jack},
|
||||
journal={arXiv preprint arXiv:2601.10387},
|
||||
year={2026},
|
||||
url={https://arxiv.org/abs/2601.10387}
|
||||
}
|
||||
|
||||
@book{jung1968archetypes,
|
||||
title={The Archetypes and the Collective Unconscious},
|
||||
author={Jung, Carl Gustav},
|
||||
year={1968},
|
||||
publisher={Princeton University Press},
|
||||
edition={2nd},
|
||||
series={Collected Works of C.G. Jung},
|
||||
volume={9}
|
||||
}
|
||||
|
||||
@book{deming1986out,
|
||||
title={Out of the Crisis},
|
||||
author={Deming, W. Edwards},
|
||||
year={1986},
|
||||
publisher={MIT Press},
|
||||
address={Cambridge, MA}
|
||||
}
|
||||
|
||||
@book{shewhart1939statistical,
|
||||
title={Statistical Method from the Viewpoint of Quality Control},
|
||||
author={Shewhart, Walter Andrew},
|
||||
year={1939},
|
||||
publisher={Graduate School of the Department of Agriculture},
|
||||
address={Washington, DC}
|
||||
}
|
||||
|
||||
@article{hong2024metagpt,
|
||||
title={MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework},
|
||||
author={Hong, Sirui and Zhuge, Mingchen and Chen, Jonathan and Zheng, Xiawu and Cheng, Yuheng and Zhang, Ceyao and Wang, Jinlin and Wang, Zili and Yau, Steven Ka Shing and Lin, Zijuan and Zhou, Liyang and Ran, Chenyu and Xiao, Lingfeng and Wu, Chenglin and Schmidhuber, J{\"u}rgen},
|
||||
journal={arXiv preprint arXiv:2308.00352},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2308.00352}
|
||||
}
|
||||
|
||||
@article{qian2024chatdev,
|
||||
title={ChatDev: Communicative Agents for Software Development},
|
||||
author={Qian, Chen and Liu, Wei and Liu, Hongzhang and Chen, Nuo and Dang, Yufan and Li, Jiahao and Yang, Cheng and Chen, Weize and Su, Yusheng and Cong, Xin and Xu, Juyuan and Li, Dahai and Liu, Zhiyuan and Sun, Maosong},
|
||||
journal={arXiv preprint arXiv:2307.07924},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2307.07924}
|
||||
}
|
||||
|
||||
@article{yang2024sweagent,
|
||||
title={SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering},
|
||||
  author={Yang, John and Jimenez, Carlos E and Wettig, Alexander and Lieret, Kilian and Narasimhan, Karthik and Press, Ofir},
|
||||
journal={arXiv preprint arXiv:2405.15793},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2405.15793}
|
||||
}
|
||||
|
||||
@article{chen2025persona,
|
||||
  title={Persona Vectors: Monitoring and Controlling Character Traits in Language Models},
|
||||
author={Chen, Yiwei and others},
|
||||
journal={arXiv preprint arXiv:2507.21509},
|
||||
year={2025},
|
||||
url={https://arxiv.org/abs/2507.21509}
|
||||
}
|
||||
|
||||
@article{bai2022constitutional,
|
||||
title={Constitutional AI: Harmlessness from AI Feedback},
|
||||
author={Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
|
||||
journal={arXiv preprint arXiv:2212.08073},
|
||||
year={2022},
|
||||
url={https://arxiv.org/abs/2212.08073}
|
||||
}
|
||||
|
||||
@book{hartson2012ux,
|
||||
title={The UX Book: Process and Guidelines for Ensuring a Quality User Experience},
|
||||
author={Hartson, Rex and Pyla, Pardha S.},
|
||||
year={2012},
|
||||
publisher={Morgan Kaufmann},
|
||||
address={Burlington, MA}
|
||||
}
|
||||
|
||||
@inproceedings{winston2011strong,
|
||||
title={The Strong Story Hypothesis and the Directed Perception Hypothesis},
|
||||
author={Winston, Patrick Henry},
|
||||
booktitle={AAAI Fall Symposium: Advances in Cognitive Systems},
|
||||
year={2011},
|
||||
pages={345--352}
|
||||
}
|
||||
194
paper/taxonomy-refs.bib
Normal file
194
paper/taxonomy-refs.bib
Normal file
@@ -0,0 +1,194 @@
|
||||
% ---- Agent Frameworks ----
|
||||
|
||||
@article{hong2024metagpt,
|
||||
title={MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework},
|
||||
author={Hong, Sirui and Zhuge, Mingchen and Chen, Jonathan and Zheng, Xiawu and Cheng, Yuheng and Zhang, Ceyao and Wang, Jinlin and Wang, Zili and Yau, Steven Ka Shing and Lin, Zijuan and Zhou, Liyang and Ran, Chenyu and Xiao, Lingfeng and Wu, Chenglin and Schmidhuber, J{\"u}rgen},
|
||||
journal={arXiv preprint arXiv:2308.00352},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2308.00352}
|
||||
}
|
||||
|
||||
@article{qian2024chatdev,
|
||||
title={ChatDev: Communicative Agents for Software Development},
|
||||
author={Qian, Chen and Liu, Wei and Liu, Hongzhang and Chen, Nuo and Dang, Yufan and Li, Jiahao and Yang, Cheng and Chen, Weize and Su, Yusheng and Cong, Xin and Xu, Juyuan and Li, Dahai and Liu, Zhiyuan and Sun, Maosong},
|
||||
journal={arXiv preprint arXiv:2307.07924},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2307.07924}
|
||||
}
|
||||
|
||||
@article{wu2023autogen,
|
||||
title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation},
|
||||
author={Wu, Qingyun and Bansal, Gagan and Zhang, Jieyu and Wu, Yiran and Li, Beibin and Zhu, Erkang and Jiang, Li and Zhang, Xiaoyun and Zhang, Shaokun and Liu, Jiale and Awadallah, Ahmed Hassan and White, Ryen W. and Burger, Doug and Wang, Chi},
|
||||
journal={arXiv preprint arXiv:2308.08155},
|
||||
year={2023},
|
||||
url={https://arxiv.org/abs/2308.08155}
|
||||
}
|
||||
|
||||
@article{yang2024sweagent,
|
||||
title={SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering},
|
||||
  author={Yang, John and Jimenez, Carlos E and Wettig, Alexander and Lieret, Kilian and Narasimhan, Karthik and Press, Ofir},
|
||||
journal={arXiv preprint arXiv:2405.15793},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2405.15793}
|
||||
}
|
||||
|
||||
@article{nennemann2026archeflow,
|
||||
title={ArcheFlow: Multi-Agent Orchestration with Archetypal Roles and PDCA Quality Cycles},
|
||||
author={Nennemann, Christian},
|
||||
journal={arXiv preprint},
|
||||
year={2026},
|
||||
url={https://github.com/XORwell/archeflow}
|
||||
}
|
||||
|
||||
@article{nguyen2024agilecoder,
|
||||
title={AgileCoder: Dynamic Collaborative Agents for Software Development based on Agile Methodology},
|
||||
  author={Nguyen, Minh Huynh and Chau, Thang Phan and Nguyen, Phong X. and Bui, Nghi D. Q.},
|
||||
journal={arXiv preprint arXiv:2406.11912},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2406.11912}
|
||||
}
|
||||
|
||||
@article{patel2026sixsigma,
|
||||
title={The Six Sigma Agent: Achieving Enterprise-Grade Reliability in LLM Systems Through Consensus-Driven Decomposed Execution},
|
||||
author={Patel, Rushi and Surendira, Bala and George, Allen and Kapale, Kiran},
|
||||
journal={arXiv preprint arXiv:2601.22290},
|
||||
year={2026},
|
||||
url={https://arxiv.org/abs/2601.22290}
|
||||
}
|
||||
|
||||
@article{shinn2023reflexion,
|
||||
title={Reflexion: Language Agents with Verbal Reinforcement Learning},
|
||||
author={Shinn, Noah and Cassano, Federico and Gopinath, Ashwin and Narasimhan, Karthik and Yao, Shunyu},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={36},
|
||||
year={2023},
|
||||
url={https://arxiv.org/abs/2303.11366}
|
||||
}
|
||||
|
||||
@article{xia2024eddops,
|
||||
title={Evaluation-Driven Development and Operations of LLM Agents: A Process Model and Reference Architecture},
|
||||
author={Xia, Boming and Lu, Qinghua and Zhu, Liming and Xing, Zhenchang and Zhao, Dehai and Zhang, Hao},
|
||||
journal={arXiv preprint arXiv:2411.13768},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2411.13768}
|
||||
}
|
||||
|
||||
@article{rasheed2024survey,
|
||||
title={LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead},
|
||||
author={Rasheed, Zeeshan and others},
|
||||
journal={ACM Transactions on Software Engineering and Methodology},
|
||||
year={2025},
|
||||
url={https://arxiv.org/abs/2404.04834}
|
||||
}
|
||||
|
||||
@article{li2023camel,
|
||||
title={CAMEL: Communicative Agents for ``Mind'' Exploration of Large Language Model Society},
|
||||
author={Li, Guohao and Hammoud, Hasan Abed Al Kader and Itani, Hani and Khizbullin, Dmitrii and Ghanem, Bernard},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={36},
|
||||
year={2023},
|
||||
url={https://arxiv.org/abs/2303.17760}
|
||||
}
|
||||
|
||||
% ---- Persona Stability ----
|
||||
|
||||
@article{lu2026assistant,
|
||||
title={The Assistant Axis: Situating and Stabilizing the Default Persona of Language Models},
|
||||
author={Lu, Christina and Gallagher, Jack and Michala, Jonathan and Fish, Kyle and Lindsey, Jack},
|
||||
journal={arXiv preprint arXiv:2601.10387},
|
||||
year={2026},
|
||||
url={https://arxiv.org/abs/2601.10387}
|
||||
}
|
||||
|
||||
% ---- PM/OM Foundations ----
|
||||
|
||||
@book{deming1986out,
|
||||
title={Out of the Crisis},
|
||||
author={Deming, W. Edwards},
|
||||
year={1986},
|
||||
publisher={MIT Press},
|
||||
address={Cambridge, MA}
|
||||
}
|
||||
|
||||
@book{shewhart1939statistical,
|
||||
title={Statistical Method from the Viewpoint of Quality Control},
|
||||
author={Shewhart, Walter Andrew},
|
||||
year={1939},
|
||||
publisher={Graduate School of the Department of Agriculture},
|
||||
address={Washington, DC}
|
||||
}
|
||||
|
||||
@book{goldratt1984goal,
|
||||
title={The Goal: A Process of Ongoing Improvement},
|
||||
author={Goldratt, Eliyahu M. and Cox, Jeff},
|
||||
year={1984},
|
||||
publisher={North River Press},
|
||||
address={Great Barrington, MA}
|
||||
}
|
||||
|
||||
@book{ohno1988toyota,
|
||||
title={Toyota Production System: Beyond Large-Scale Production},
|
||||
author={Ohno, Taiichi},
|
||||
year={1988},
|
||||
publisher={Productivity Press},
|
||||
address={Portland, OR}
|
||||
}
|
||||
|
||||
@book{womack1996lean,
|
||||
title={Lean Thinking: Banish Waste and Create Wealth in Your Corporation},
|
||||
author={Womack, James P. and Jones, Daniel T.},
|
||||
year={1996},
|
||||
publisher={Simon \& Schuster},
|
||||
address={New York}
|
||||
}
|
||||
|
||||
@article{cooper1990stagegate,
|
||||
title={Stage-Gate Systems: A New Tool for Managing New Products},
|
||||
author={Cooper, Robert G.},
|
||||
journal={Business Horizons},
|
||||
volume={33},
|
||||
number={3},
|
||||
pages={44--54},
|
||||
year={1990},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
|
||||
@article{snowden2007cynefin,
|
||||
title={A Leader's Framework for Decision Making},
|
||||
author={Snowden, David J. and Boone, Mary E.},
|
||||
journal={Harvard Business Review},
|
||||
volume={85},
|
||||
number={11},
|
||||
pages={68--76},
|
||||
year={2007}
|
||||
}
|
||||
|
||||
@book{altshuller1999innovation,
|
||||
title={The Innovation Algorithm: TRIZ, Systematic Innovation and Technical Creativity},
|
||||
author={Altshuller, Genrich},
|
||||
year={1999},
|
||||
publisher={Technical Innovation Center},
|
||||
address={Worcester, MA}
|
||||
}
|
||||
|
||||
@article{boyd1976destruction,
|
||||
title={Destruction and Creation},
|
||||
author={Boyd, John R.},
|
||||
year={1976},
|
||||
note={Unpublished manuscript, widely circulated}
|
||||
}
|
||||
|
||||
@book{schwaber2020scrum,
|
||||
title={The Scrum Guide},
|
||||
author={Schwaber, Ken and Sutherland, Jeff},
|
||||
year={2020},
|
||||
publisher={Scrum.org},
|
||||
note={Available at \url{https://scrumguides.org}}
|
||||
}
|
||||
|
||||
@techreport{mil1949fmea,
|
||||
title={MIL-P-1629: Procedures for Performing a Failure Mode, Effects and Criticality Analysis},
|
||||
institution={United States Department of Defense},
|
||||
year={1949},
|
||||
note={Revised as MIL-STD-1629A, 1980}
|
||||
}
|
||||
805
paper/taxonomy.tex
Normal file
805
paper/taxonomy.tex
Normal file
@@ -0,0 +1,805 @@
|
||||
\documentclass[11pt,a4paper]{article}
|
||||
|
||||
% ---- Packages ----
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{amsmath,amssymb}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{listings}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{tikz}
|
||||
\usetikzlibrary{shapes,arrows.meta,positioning,fit,calc,matrix}
|
||||
\usepackage[numbers]{natbib}
|
||||
\usepackage{geometry}
|
||||
\usepackage{enumitem}
|
||||
\geometry{margin=1in}
|
||||
|
||||
% ---- Colors ----
|
||||
\definecolor{highfit}{HTML}{2E7D32}
|
||||
\definecolor{medfit}{HTML}{F57F17}
|
||||
\definecolor{lowfit}{HTML}{C62828}
|
||||
\definecolor{neutral}{HTML}{546E7A}
|
||||
|
||||
% ---- Title ----
|
||||
\title{%
|
||||
From Factory Floor to Token Stream:\\
|
||||
A Taxonomy of Operations Management Methods\\
|
||||
for LLM Agent Orchestration%
|
||||
}
|
||||
|
||||
\author{
|
||||
Christian Nennemann\\
|
||||
Independent Researcher\\
|
||||
\texttt{chris@nennemann.de}
|
||||
}
|
||||
|
||||
\date{April 2026}
|
||||
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
% ============================================================
|
||||
\begin{abstract}
|
||||
Multi-agent systems built on large language models (LLMs) increasingly adopt
|
||||
metaphors from human project management---sprints, standups, code review---yet
|
||||
draw from a remarkably narrow slice of the operations management literature.
|
||||
This paper presents a systematic taxonomy of twelve established PM/OM methods,
|
||||
evaluates their structural compatibility with LLM agent constraints (stateless
|
||||
invocation, cheap cloning, deterministic dysfunction, absence of human
|
||||
psychology), and identifies which methods are underexploited, which are
|
||||
inapplicable, and which require fundamental adaptation. We find that methods
|
||||
designed for \emph{flow optimization} (Kanban, Theory of Constraints) and
|
||||
\emph{rapid decision-making} (OODA Loop) are structurally well-suited to
|
||||
agent orchestration but remain largely unexplored, while methods centered on
|
||||
\emph{human psychology} (Scrum ceremonies, Design Thinking empathy phases)
|
||||
transfer poorly without significant reformulation. We propose a decision
|
||||
framework for selecting orchestration methods based on task complexity, agent
|
||||
count, and quality requirements, and identify five open research directions
|
||||
at the intersection of operations management and agentic AI.
|
||||
\end{abstract}
|
||||
|
||||
% ============================================================
|
||||
\section{Introduction}
|
||||
\label{sec:intro}
|
||||
|
||||
The dominant paradigm for multi-agent LLM systems borrows from agile software
|
||||
development: agents are organized into ``teams'' with role-based
|
||||
specialization, tasks are decomposed into work items, and results are reviewed
|
||||
before merging \citep{hong2024metagpt, qian2024chatdev}. This borrowing is
|
||||
natural---the humans building these systems are software engineers familiar
|
||||
with agile methods---but it is also narrow. The operations management
|
||||
literature contains dozens of methods developed over a century of industrial
|
||||
practice, each encoding different assumptions about workflow structure, quality
|
||||
assurance, failure modes, and coordination costs.
|
||||
|
||||
Not all of these methods are equally applicable to LLM agents. Agents differ
|
||||
from human workers in five structurally important ways:
|
||||
|
||||
\begin{enumerate}[label=\textbf{C\arabic*}]
|
||||
\item \label{c:stateless} \textbf{Stateless invocation}: Agents do not
|
||||
retain memory between invocations unless explicitly persisted. Human team
|
||||
members accumulate institutional knowledge automatically.
|
||||
|
||||
\item \label{c:cloning} \textbf{Cheap to clone, expensive to coordinate}:
|
||||
Spawning a new agent costs milliseconds and cents; coordinating two agents
|
||||
costs tokens and latency. For human teams, the inverse holds---hiring is
|
||||
expensive, coordination is (comparatively) cheap.
|
||||
|
||||
\item \label{c:dysfunction} \textbf{Deterministic dysfunction}: LLM agents
|
||||
fail in predictable, repeatable patterns---verbosity, scope creep, false
|
||||
positives---rather than the varied, context-dependent failures of human
|
||||
cognition \citep{nennemann2026archeflow}.
|
||||
|
||||
\item \label{c:psychology} \textbf{No psychology}: Agents have no morale,
|
||||
fatigue, ego, or office politics. Methods designed to manage human
|
||||
psychology (retrospectives, team-building, conflict resolution) have no
|
||||
direct function.
|
||||
|
||||
\item \label{c:speed} \textbf{Cycle speed}: Agents complete tasks in
|
||||
seconds to minutes, enabling iteration frequencies that would be
|
||||
impractical for human teams. Methods that assume week-long or month-long
|
||||
cycles can be compressed.
|
||||
\end{enumerate}
|
||||
|
||||
These constraints define a \emph{fitness landscape}: some PM/OM methods gain
|
||||
effectiveness when applied to agents (because agents remove friction those
|
||||
methods were designed to manage), while others lose their raison d'\^etre
|
||||
(because they solve human problems agents don't have).
|
||||
|
||||
This paper contributes:
|
||||
\begin{itemize}
|
||||
\item A systematic taxonomy of thirteen PM/OM methods evaluated against the
|
||||
five agent constraints (\ref{c:stateless}--\ref{c:speed}).
|
||||
\item A compatibility matrix scoring each method's structural fit for
|
||||
agent orchestration (\S\ref{sec:matrix}).
|
||||
\item A decision framework for practitioners selecting orchestration
|
||||
strategies (\S\ref{sec:decision}).
|
||||
\item Five open research directions at the intersection of operations
|
||||
management theory and agentic AI (\S\ref{sec:future}).
|
||||
\end{itemize}
|
||||
|
||||
% ============================================================
|
||||
\section{Background: Current Agent Orchestration Landscape}
|
||||
\label{sec:background}
|
||||
|
||||
\subsection{Frameworks and Their Implicit PM Models}
|
||||
|
||||
The current generation of multi-agent LLM frameworks implicitly adopts
|
||||
project management concepts, though rarely with explicit attribution to
|
||||
PM/OM theory.
|
||||
|
||||
\textbf{MetaGPT} \citep{hong2024metagpt} assigns human job titles (product
|
||||
manager, architect, engineer) and enforces communication through Standardized
|
||||
Operating Procedures (SOPs)---an implicit adoption of \emph{waterfall}
|
||||
phase gates with role-based access control.
|
||||
|
||||
\textbf{ChatDev} \citep{qian2024chatdev} simulates a software company with
|
||||
sequential phases (design, coding, testing, documentation). Despite the
|
||||
``company'' framing, the execution model is a \emph{linear pipeline} with
|
||||
pair-programming-style chat between adjacent roles.
|
||||
|
||||
\textbf{AgileCoder} \citep{nguyen2024agilecoder} is the first framework to
|
||||
explicitly adopt sprint-based iteration, assigning Scrum Master and Product
|
||||
Manager roles to LLM agents with a Dynamic Code Graph Generator tracking
|
||||
inter-file dependencies between sprints.
|
||||
|
||||
\textbf{CrewAI} organizes agents into ``crews'' with a ``manager'' agent
|
||||
orchestrating task delegation---an implicit \emph{hierarchical management}
|
||||
model with single-point-of-failure coordination.
|
||||
|
||||
\textbf{AutoGen} \citep{wu2023autogen} provides a conversation-based
|
||||
framework where agents negotiate through multi-turn dialogue. The implicit
|
||||
model is \emph{committee decision-making}---all agents see all messages,
|
||||
consensus emerges through discussion.
|
||||
|
||||
\textbf{The Six Sigma Agent} \citep{patel2026sixsigma} decomposes tasks
|
||||
into atomic dependency trees, executes each node $n$ times with independent
|
||||
LLM samples, and uses consensus voting to achieve defect rates scaling as
|
||||
$O(p^{\lceil n/2 \rceil})$---reaching 3.4 DPMO (the Six Sigma threshold)
|
||||
at $n=13$.
|
||||
|
||||
\textbf{Reflexion} \citep{shinn2023reflexion} implements a de facto PDCA
|
||||
loop through verbal reinforcement: Plan $\to$ Act $\to$ Evaluate (Check)
|
||||
$\to$ Reflect (Act), though it does not name this structure explicitly.
|
||||
|
||||
\textbf{ArcheFlow} \citep{nennemann2026archeflow} explicitly applies PDCA
|
||||
quality cycles with Jungian archetypal roles, representing the first
|
||||
framework to deliberately adopt a named PM/OM methodology with formal
|
||||
convergence criteria.
|
||||
|
||||
\subsection{The Gap}
|
||||
|
||||
Despite the variety of frameworks, the PM/OM methods actually employed
|
||||
cluster tightly around four approaches: (1) waterfall-style sequential
|
||||
phases (MetaGPT, ChatDev), (2) role-based team simulation (CAMEL
|
||||
\citep{li2023camel}, CrewAI), (3) informal ``manager'' delegation
|
||||
(AutoGen), and (4) agile sprints (AgileCoder). The Six Sigma Agent
|
||||
\citep{patel2026sixsigma} is a notable exception---the only framework to
|
||||
explicitly name a PM/OM method as its primary architectural contribution.
|
||||
|
||||
Methods from lean manufacturing, constraint theory, military
|
||||
decision-making, innovation management, and failure analysis remain
|
||||
unexplored in the peer-reviewed agent orchestration literature, despite
|
||||
strong structural compatibility with agent constraints.
|
||||
|
||||
% ============================================================
|
||||
\section{Taxonomy of PM/OM Methods}
|
||||
\label{sec:taxonomy}
|
||||
|
||||
We evaluate thirteen methods spanning five categories: iterative improvement,
|
||||
flow optimization, decision-making, innovation management, and quality
|
||||
engineering. For each method, we describe the core mechanism, evaluate
|
||||
structural compatibility with agent constraints \ref{c:stateless}--\ref{c:speed},
|
||||
identify the primary adaptation required, and assess overall fitness.
|
||||
|
||||
% ---- 3.1 Iterative Improvement ----
|
||||
\subsection{Iterative Improvement Methods}
|
||||
|
||||
\subsubsection{PDCA (Plan--Do--Check--Act)}
|
||||
\label{sec:pdca}
|
||||
|
||||
\textbf{Origin}: Shewhart \citep{shewhart1939statistical}, popularized by
|
||||
Deming \citep{deming1986out}.
|
||||
|
||||
\textbf{Mechanism}: Four-phase cycle repeated until quality targets are met.
|
||||
Each cycle narrows the gap between current and desired state through
|
||||
structured feedback.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. PDCA's phase structure maps directly
|
||||
to agent orchestration: Plan (research + design agents), Do (implementation
|
||||
agent), Check (review agents), Act (routing + merge decisions). The cycle
|
||||
abstraction handles the core challenge of ``when to stop iterating'' through
|
||||
convergence metrics. Demonstrated in ArcheFlow \citep{nennemann2026archeflow}.
|
||||
|
||||
\textbf{Key adaptation}: Convergence detection must be automated (human PDCA
|
||||
relies on subjective judgment). ArcheFlow addresses this with a convergence
|
||||
score based on finding classification (new, resolved, persistent, regressed)
|
||||
and oscillation detection.
|
||||
|
||||
\textbf{Constraint fit}: Stateless (\ref{c:stateless})---artifacts persist
|
||||
state between cycles. Cloning (\ref{c:cloning})---fresh agents per cycle
|
||||
avoid accumulated bias. Speed (\ref{c:speed})---cycles complete in minutes,
|
||||
enabling 2--3 cycles where humans would manage one.
|
||||
|
||||
\subsubsection{Scrum}
|
||||
\label{sec:scrum}
|
||||
|
||||
\textbf{Origin}: Schwaber \& Sutherland, 1995.
|
||||
|
||||
\textbf{Mechanism}: Time-boxed sprints with defined roles (Product Owner,
|
||||
Scrum Master, Development Team), ceremonies (planning, daily standup,
|
||||
review, retrospective), and artifacts (backlog, sprint board, burndown).
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Low--Medium}. Scrum's ceremony-heavy
|
||||
structure exists primarily to manage human coordination challenges: standups
|
||||
maintain shared awareness (agents can share a filesystem), retrospectives
|
||||
address interpersonal friction (agents have none), sprint planning negotiates
|
||||
capacity (agents have deterministic throughput). The useful kernel---time-boxed
|
||||
work with a prioritized backlog---is trivially implementable without Scrum's
|
||||
overhead.
|
||||
|
||||
\textbf{Key adaptation}: Strip ceremonies, keep the backlog + sprint
|
||||
structure. ``Daily standups'' become status file reads. ``Retrospectives''
|
||||
become cross-run memory extraction. The Scrum Master role is pure overhead
|
||||
for agents.
|
||||
|
||||
\textbf{Constraint fit}: Psychology (\ref{c:psychology})---most Scrum
|
||||
ceremonies solve human problems. Speed (\ref{c:speed})---sprint length
|
||||
compresses from weeks to minutes. Cloning (\ref{c:cloning})---team
|
||||
stability (a Scrum value) is irrelevant when agents are stateless.
|
||||
|
||||
\subsubsection{DMAIC (Six Sigma)}
|
||||
\label{sec:dmaic}
|
||||
|
||||
\textbf{Origin}: Motorola, 1986; systematized by General Electric.
|
||||
|
||||
\textbf{Mechanism}: Define--Measure--Analyze--Improve--Control. Unlike PDCA,
|
||||
DMAIC emphasizes \emph{statistical measurement} of process capability and
|
||||
explicitly separates analysis (understanding the problem) from improvement
|
||||
(fixing it).
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. The Define--Measure--Analyze
|
||||
front-loading is valuable for agents: it forces explicit quality metrics
|
||||
\emph{before} implementation, preventing the common failure mode of agents
|
||||
optimizing for the wrong objective. The Control phase---establishing
|
||||
monitoring to prevent regression---maps to cross-run memory systems.
|
||||
|
||||
\textbf{Key adaptation}: Agents can compute statistical process control
|
||||
metrics (defect rates, cycle times, sigma levels) automatically from event
|
||||
logs. The ``Measure'' phase, which is expensive and tedious for humans,
|
||||
becomes a strength: agents can instrument everything.
|
||||
|
||||
\textbf{Constraint fit}: Speed (\ref{c:speed})---full DMAIC in minutes.
|
||||
Dysfunction (\ref{c:dysfunction})---agent failure modes have measurable
|
||||
baselines, making sigma calculations meaningful. Stateless
|
||||
(\ref{c:stateless})---Control phase requires persistent monitoring, which
|
||||
must be explicitly built.
|
||||
|
||||
% ---- 3.2 Flow Optimization ----
|
||||
\subsection{Flow Optimization Methods}
|
||||
|
||||
\subsubsection{Kanban}
|
||||
\label{sec:kanban}
|
||||
|
||||
\textbf{Origin}: Toyota Production System, Taiichi Ohno, 1950s.
|
||||
|
||||
\textbf{Mechanism}: Pull-based workflow with explicit work-in-progress (WIP)
|
||||
limits. Work items flow through columns (stages); new work is pulled only
|
||||
when capacity is available. No iterations---continuous flow.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. Kanban's WIP limits directly address
|
||||
a critical agent challenge: \emph{coordination cost scaling}. Without WIP
|
||||
limits, spawning more agents increases throughput initially but eventually
|
||||
degrades quality due to coordination overhead (conflicting changes, merge
|
||||
conflicts, context fragmentation). Kanban provides a principled mechanism for
|
||||
determining optimal concurrency.
|
||||
|
||||
\textbf{Key adaptation}: WIP limits should be \emph{dynamic}, adjusting
|
||||
based on observed coordination costs (merge conflicts, finding duplications)
|
||||
rather than fixed. The pull mechanism maps naturally: agents poll a task
|
||||
queue and pull the highest-priority item they can handle.
|
||||
|
||||
\textbf{Constraint fit}: Cloning (\ref{c:cloning})---WIP limits are
|
||||
\emph{exactly} the missing constraint for cheap-to-clone agents. Speed
|
||||
(\ref{c:speed})---flow metrics (lead time, cycle time, throughput) update
|
||||
in real-time. Psychology (\ref{c:psychology})---no ``swarming'' or
|
||||
``blocked item'' social dynamics to manage.
|
||||
|
||||
\subsubsection{Theory of Constraints (TOC)}
|
||||
\label{sec:toc}
|
||||
|
||||
\textbf{Origin}: Goldratt, \emph{The Goal}, 1984.
|
||||
|
||||
\textbf{Mechanism}: Identify the system's constraint (bottleneck), exploit
|
||||
it (maximize its throughput), subordinate everything else to it, elevate it
|
||||
(invest to remove it), repeat. The Five Focusing Steps.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. In multi-agent pipelines, the
|
||||
bottleneck is typically the most capable (and expensive) agent: the
|
||||
implementation agent that must run on a powerful model, or the security
|
||||
reviewer that requires deep context. TOC provides a framework for
|
||||
organizing the entire pipeline around this constraint.
|
||||
|
||||
\textbf{Key adaptation}: ``Exploit the constraint'' means ensuring the
|
||||
bottleneck agent never waits for input. Pre-compute its context, batch
|
||||
its inputs, and schedule cheaper agents (research, formatting, validation)
|
||||
to run during its processing time. ``Subordinate'' means cheaper agents
|
||||
should produce output in the format the bottleneck needs, not in whatever
|
||||
format is easiest for them.
|
||||
|
||||
\textbf{Constraint fit}: Cloning (\ref{c:cloning})---non-bottleneck agents
|
||||
are cheap to overprovision. Speed (\ref{c:speed})---constraint shifts can
|
||||
be detected and responded to within a single run. Dysfunction
|
||||
(\ref{c:dysfunction})---bottleneck agent's failure mode has outsized impact,
|
||||
justifying targeted shadow detection.
|
||||
|
||||
\subsubsection{Lean / Toyota Production System}
|
||||
\label{sec:lean}
|
||||
|
||||
\textbf{Origin}: Ohno, 1988; Womack \& Jones, 1996.
|
||||
|
||||
\textbf{Mechanism}: Eliminate waste (\emph{muda}), reduce variability
|
||||
(\emph{mura}), avoid overburden (\emph{muri}). Seven wastes: overproduction,
|
||||
waiting, transport, overprocessing, inventory, motion, defects.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. The seven wastes map
|
||||
surprisingly well to agent systems:
|
||||
|
||||
\begin{itemize}[nosep]
|
||||
\item \textbf{Overproduction}: Agents generating output nobody reads
|
||||
(verbose research reports, unused alternative proposals).
|
||||
\item \textbf{Waiting}: Agents idle while waiting for predecessor output
|
||||
(sequential pipeline where parallel would work).
|
||||
\item \textbf{Transport}: Redundant context passing (sending full codebase
|
||||
to agents that need only a diff).
|
||||
\item \textbf{Overprocessing}: Running thorough review on trivial changes.
|
||||
\item \textbf{Inventory}: Accumulated artifacts from prior cycles that
|
||||
are never referenced.
|
||||
\item \textbf{Motion}: Agents reading files they don't need, exploring
|
||||
irrelevant code paths.
|
||||
\item \textbf{Defects}: Findings that are false positives, requiring
|
||||
rework to dismiss.
|
||||
\end{itemize}
|
||||
|
||||
\textbf{Key adaptation}: Lean's ``respect for people'' pillar has no direct
|
||||
analog. The technical pillar (continuous improvement, waste elimination)
|
||||
transfers fully.
|
||||
|
||||
% ---- 3.3 Decision-Making ----
|
||||
\subsection{Decision-Making Methods}
|
||||
|
||||
\subsubsection{OODA Loop (Observe--Orient--Decide--Act)}
|
||||
\label{sec:ooda}
|
||||
|
||||
\textbf{Origin}: John Boyd, 1976. Military strategy for air combat; later
|
||||
generalized to competitive decision-making.
|
||||
|
||||
\textbf{Mechanism}: Continuous loop of Observe (gather data), Orient (analyze
|
||||
context, update mental models), Decide (select course of action), Act
|
||||
(execute). The key insight is that the \emph{speed} of the loop---not any
|
||||
individual decision's quality---determines competitive advantage. ``Getting
|
||||
inside the opponent's OODA loop'' means acting faster than the adversary can
|
||||
react.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. OODA is structurally similar to PDCA
|
||||
but optimized for speed over thoroughness. For agent systems, this maps to
|
||||
scenarios requiring rapid adaptation: adversarial testing, incident response,
|
||||
market-reactive coding, or any context where the problem space changes
|
||||
during execution.
|
||||
|
||||
\textbf{Key adaptation}: Boyd's ``Orient'' phase---updating mental models
|
||||
based on new information---is the hardest to implement for stateless agents.
|
||||
It requires either persistent state (a world model that updates across
|
||||
iterations) or a ``fast reorientation'' agent that rapidly synthesizes new
|
||||
information into an updated context.
|
||||
|
||||
\textbf{Constraint fit}: Speed (\ref{c:speed})---agents can OODA at
|
||||
superhuman frequency. Stateless (\ref{c:stateless})---the Orient phase
|
||||
needs explicit state management. Psychology (\ref{c:psychology})---Boyd's
|
||||
concept of ``mental agility'' translates to model selection: smaller, faster
|
||||
models for rapid OODA; larger models for deep Orient phases.
|
||||
|
||||
\subsubsection{Cynefin Framework}
|
||||
\label{sec:cynefin}
|
||||
|
||||
\textbf{Origin}: Snowden \& Boone, 2007.
|
||||
|
||||
\textbf{Mechanism}: Classify problems into five domains---\textsc{Clear}
|
||||
(obvious cause-effect), \textsc{Complicated} (expert analysis needed),
|
||||
\textsc{Complex} (emergent, probe-sense-respond), \textsc{Chaotic}
|
||||
(act first, then sense), \textsc{Confused} (unknown domain)---and apply
|
||||
domain-appropriate strategies.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. Cynefin provides a
|
||||
\emph{meta-framework}: instead of choosing one orchestration method for all
|
||||
tasks, classify the task first, then select the appropriate method:
|
||||
|
||||
\begin{itemize}[nosep]
|
||||
\item \textsc{Clear}: Single agent, no review (``fix this typo'').
|
||||
\item \textsc{Complicated}: Expert agent with review (PDCA fast workflow).
|
||||
\item \textsc{Complex}: Multiple competing proposals, let results emerge
|
||||
(PDCA standard/thorough with parallel alternatives).
|
||||
\item \textsc{Chaotic}: Act immediately, stabilize, then analyze (OODA
|
||||
with hotfix agent, then PDCA for proper fix).
|
||||
\end{itemize}
|
||||
|
||||
\textbf{Key adaptation}: Task classification must be automated. Proxies:
|
||||
number of files affected, cross-module dependencies, security sensitivity,
|
||||
test coverage of affected area.
|
||||
|
||||
% ---- 3.4 Innovation Management ----
|
||||
\subsection{Innovation Management Methods}
|
||||
|
||||
\subsubsection{Stage-Gate}
|
||||
\label{sec:stagegate}
|
||||
|
||||
\textbf{Origin}: Cooper, 1990.
|
||||
|
||||
\textbf{Mechanism}: Innovation projects pass through stages (scoping,
|
||||
business case, development, testing, launch), separated by gates where a
|
||||
cross-functional team decides: Go, Kill, Hold, or Recycle. The gate
|
||||
decision is binary---no ``continue with reservations.''
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium}. The gate mechanism maps well to
|
||||
agent confidence checks: a Creator agent's proposal either meets the
|
||||
confidence threshold (Go) or doesn't (Kill/Recycle). However, Stage-Gate
|
||||
assumes expensive stages (weeks/months of human work), making Kill decisions
|
||||
high-stakes. For agents, stages are cheap (minutes), reducing the value of
|
||||
formal gate decisions.
|
||||
|
||||
\textbf{Key adaptation}: Gates become lightweight confidence checks rather
|
||||
than committee reviews. The ``Kill'' decision---rare and painful in human
|
||||
innovation---should be common and cheap for agents. Explore multiple
|
||||
proposals in parallel, gate aggressively, continue only the best.
|
||||
|
||||
\subsubsection{Design Thinking}
|
||||
\label{sec:designthinking}
|
||||
|
||||
\textbf{Origin}: IDEO / Stanford d.school, 2000s.
|
||||
|
||||
\textbf{Mechanism}: Five phases: Empathize (understand the user),
|
||||
Define (frame the problem), Ideate (generate solutions), Prototype (build
|
||||
quickly), Test (get feedback). Emphasis on user empathy and divergent
|
||||
thinking.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Low}. Design Thinking's core value
|
||||
proposition---\emph{empathy with users}---is precisely what LLM agents
|
||||
cannot genuinely do. Agents can simulate empathy (generate persona-based
|
||||
scenarios), but the insight that comes from observing real users in context
|
||||
has no agent equivalent. The Ideate phase (divergent brainstorming) is
|
||||
feasible but produces quantity over quality without the ``empathy filter''
|
||||
that makes Design Thinking effective.
|
||||
|
||||
\textbf{Key adaptation}: If used, the Empathize phase must be replaced
|
||||
with explicit user research artifacts (personas, journey maps, interview
|
||||
transcripts) provided as input. This transforms Design Thinking from a
|
||||
discovery method into a synthesis method---fundamentally changing its nature.
|
||||
|
||||
\subsubsection{TRIZ}
|
||||
\label{sec:triz}
|
||||
|
||||
\textbf{Origin}: Altshuller, 1946--1985. Theory of Inventive Problem
|
||||
Solving.
|
||||
|
||||
\textbf{Mechanism}: Problems contain contradictions (improving one parameter
|
||||
worsens another). TRIZ provides a contradiction matrix mapping 39 engineering
|
||||
parameters to 40 inventive principles. Instead of compromise, TRIZ seeks
|
||||
solutions that resolve the contradiction.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium}. TRIZ's structured problem-solving
|
||||
is well-suited to agents: the contradiction matrix is a lookup table, and
|
||||
agents can systematically apply inventive principles. However, TRIZ requires
|
||||
\emph{reformulating the problem as a contradiction}---a creative step that
|
||||
is itself challenging for agents.
|
||||
|
||||
\textbf{Key adaptation}: Provide the contradiction matrix as context. Train
|
||||
agents to identify the ``improving parameter'' and ``worsening parameter''
|
||||
in engineering tasks (e.g., ``improving security worsens performance'').
|
||||
Use TRIZ principles as a structured brainstorming prompt for the Creator
|
||||
archetype.
|
||||
|
||||
% ---- 3.5 Quality Engineering ----
|
||||
\subsection{Quality Engineering Methods}
|
||||
|
||||
\subsubsection{FMEA (Failure Mode and Effects Analysis)}
|
||||
\label{sec:fmea}
|
||||
|
||||
\textbf{Origin}: US Military, 1949; adopted by automotive (AIAG) and
|
||||
aerospace.
|
||||
|
||||
\textbf{Mechanism}: For each component/process step, systematically
|
||||
enumerate: (1) potential failure modes, (2) effects of each failure,
|
||||
(3) causes, (4) current controls, (5) risk priority number
|
||||
(severity $\times$ occurrence $\times$ detection). Address highest-RPN
|
||||
items first.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. FMEA's systematic enumeration is
|
||||
exactly what LLM agents excel at: given a design, enumerate everything that
|
||||
could go wrong, assess severity, and propose mitigations. The Risk Priority
|
||||
Number provides a quantitative framework for prioritizing review effort---more
|
||||
principled than the common ``CRITICAL/WARNING/INFO'' severity classification.
|
||||
|
||||
\textbf{Key adaptation}: Use FMEA \emph{before} implementation (as part of
|
||||
the Plan phase) rather than only during review. An FMEA agent analyzes the
|
||||
Creator's proposal and generates a failure mode table; the Maker then
|
||||
implements with awareness of high-RPN failure modes; the Guardian validates
|
||||
that mitigations are in place.
|
||||
|
||||
\textbf{Constraint fit}: Dysfunction (\ref{c:dysfunction})---agents' own
|
||||
failure modes can be pre-enumerated via FMEA, creating a meta-level
|
||||
quality system. Cloning (\ref{c:cloning})---FMEA agents are cheap
|
||||
(analytical, not creative), enabling systematic coverage.
|
||||
|
||||
\subsubsection{Statistical Process Control (SPC)}
|
||||
\label{sec:spc}
|
||||
|
||||
\textbf{Origin}: Shewhart, 1920s.
|
||||
|
||||
\textbf{Mechanism}: Monitor process outputs over time using control charts.
|
||||
Distinguish \emph{common cause} variation (inherent to the process) from
|
||||
\emph{special cause} variation (attributable to specific events). React only
|
||||
to special causes; reduce common cause variation through process improvement.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. SPC requires historical data,
|
||||
which agent orchestration systems naturally generate (event logs, finding
|
||||
counts, cycle times, token usage). Control charts over agent effectiveness
|
||||
scores can distinguish between normal variation (``Guardian found 2 issues
|
||||
this run vs. 1 last run'') and genuine degradation (``Guardian's false
|
||||
positive rate spiked after a model update'').
|
||||
|
||||
\textbf{Key adaptation}: Sufficient run history is needed to establish
|
||||
control limits. Early runs operate without SPC; after 10--20 runs,
|
||||
control limits become meaningful. Model updates reset control limits
|
||||
(new process = new baseline).
|
||||
|
||||
% ============================================================
|
||||
\section{Compatibility Matrix}
|
||||
\label{sec:matrix}
|
||||
|
||||
Table~\ref{tab:matrix} scores each method against the five agent constraints,
|
||||
producing an overall fitness assessment.
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\caption{Compatibility matrix: PM/OM methods scored against agent constraints.
|
||||
\textcolor{highfit}{\textbf{+}} = method benefits from this constraint;
|
||||
\textcolor{lowfit}{\textbf{--}} = method is undermined;
|
||||
\textcolor{neutral}{\textbf{0}} = neutral.
|
||||
Overall fitness: H = High, M = Medium, L = Low.}
|
||||
\label{tab:matrix}
|
||||
\begin{tabular}{@{}l*{5}{c}c@{}}
|
||||
\toprule
|
||||
\textbf{Method} &
|
||||
\textbf{C1} &
|
||||
\textbf{C2} &
|
||||
\textbf{C3} &
|
||||
\textbf{C4} &
|
||||
\textbf{C5} &
|
||||
\textbf{Fit} \\
|
||||
\midrule
|
||||
PDCA & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
Scrum & \textcolor{lowfit}{--} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textbf{L--M} \\
|
||||
DMAIC & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textbf{M--H} \\
|
||||
Kanban & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
TOC & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
Lean & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textbf{M--H} \\
|
||||
OODA & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
Cynefin & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textbf{M--H} \\
|
||||
Stage-Gate & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{lowfit}{--} & \textbf{M} \\
|
||||
Design Think. & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{lowfit}{--} & \textcolor{neutral}{0} & \textbf{L} \\
|
||||
TRIZ & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textbf{M} \\
|
||||
FMEA & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
SPC & \textcolor{lowfit}{--} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{M--H} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Analysis}
|
||||
|
||||
Several patterns emerge from the compatibility matrix:
|
||||
|
||||
\textbf{High-fitness methods share three properties}: they are
|
||||
\emph{mechanistic} (decisions follow rules, not judgment), \emph{flow-oriented}
|
||||
(optimize throughput, not team dynamics), and \emph{metric-driven} (quality
|
||||
is quantified, not discussed). PDCA, Kanban, TOC, OODA, and FMEA all share
|
||||
this profile.
|
||||
|
||||
\textbf{Low-fitness methods are psychology-dependent}: Scrum and Design
|
||||
Thinking derive their primary value from managing human cognitive and social
|
||||
limitations. Without those limitations, the methods become overhead.
|
||||
|
||||
\textbf{The ``Cheap Clone'' constraint is universally beneficial}: every
|
||||
method either benefits from or is neutral to the ability to spawn agents
|
||||
cheaply. This suggests that agent orchestration should generally favor
|
||||
\emph{parallelism}---run multiple approaches simultaneously, then
|
||||
select the best result.
|
||||
|
||||
\textbf{``Stateless'' is the most disruptive constraint}: methods that
|
||||
assume accumulated knowledge (Scrum's team velocity, SPC's control charts,
|
||||
DMAIC's baseline measurements) require explicit persistence mechanisms that
|
||||
agents don't provide natively.
|
||||
|
||||
% ============================================================
|
||||
\section{Hybrid Approaches and Method Composition}
|
||||
\label{sec:hybrid}
|
||||
|
||||
The methods in our taxonomy are not mutually exclusive. Effective agent
|
||||
orchestration likely requires combining methods at different levels:
|
||||
|
||||
\subsection{Proposed Three-Layer Architecture}
|
||||
|
||||
\begin{description}
|
||||
\item[Strategic layer (Cynefin)]: Classify the task and select the
|
||||
appropriate orchestration method. Simple tasks get a single agent;
|
||||
complicated tasks get PDCA; complex tasks get parallel competing
|
||||
approaches; chaotic tasks get OODA.
|
||||
|
||||
\item[Operational layer (PDCA/OODA + Kanban)]: Execute the selected
|
||||
method with flow control. Kanban WIP limits prevent coordination
|
||||
overload. PDCA provides quality convergence for standard tasks; OODA
|
||||
provides rapid adaptation for time-sensitive tasks.
|
||||
|
||||
\item[Quality layer (FMEA + SPC + TOC)]: Monitor execution quality.
|
||||
FMEA front-loads failure analysis in the Plan phase. SPC monitors
|
||||
long-term agent effectiveness trends. TOC identifies and optimizes
|
||||
around bottleneck agents.
|
||||
\end{description}
|
||||
|
||||
\subsection{ArcheFlow as a Case Study}
|
||||
|
||||
ArcheFlow \citep{nennemann2026archeflow} already implements elements of
|
||||
this three-layer architecture, though without explicitly naming all methods:
|
||||
|
||||
\begin{itemize}[nosep]
|
||||
\item \textbf{Strategic}: Workflow selection (fast/standard/thorough)
|
||||
functions as a simplified Cynefin classification.
|
||||
\item \textbf{Operational}: PDCA cycles with convergence detection;
|
||||
sprint mode with WIP-limited parallel dispatch (implicit Kanban).
|
||||
\item \textbf{Quality}: Shadow detection (behavioral FMEA for agent
|
||||
failure modes); effectiveness scoring (rudimentary SPC); Guardian
|
||||
fast-path (TOC---don't waste the bottleneck on clean code); ``Wiggum
|
||||
Break'' circuit breakers (hard/soft halt conditions with event logging).
|
||||
\end{itemize}
|
||||
|
||||
The gap is in explicit TOC application (identifying and optimizing around
|
||||
the most expensive agent) and in OODA integration for time-sensitive tasks.
|
||||
|
||||
% ============================================================
|
||||
\section{Decision Framework}
|
||||
\label{sec:decision}
|
||||
|
||||
We propose a practitioner-oriented decision framework for selecting
|
||||
orchestration methods based on three dimensions:
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\begin{tikzpicture}[
|
||||
box/.style={draw, rounded corners, minimum width=3.5cm, minimum height=0.7cm, font=\small, fill=#1},
|
||||
arrow/.style={-{Stealth[length=3mm]}, thick},
|
||||
]
|
||||
|
||||
% Decision tree
|
||||
\node[box=yellow!20] (start) {Task arrives};
|
||||
\node[box=orange!15, below=0.8cm of start] (cynefin) {Classify (Cynefin)};
|
||||
|
||||
\node[box=green!15, below left=1cm and 2cm of cynefin] (clear) {Clear};
|
||||
\node[box=green!15, below left=1cm and 0cm of cynefin] (complicated) {Complicated};
|
||||
\node[box=blue!10, below right=1cm and 0cm of cynefin] (complex) {Complex};
|
||||
\node[box=red!10, below right=1cm and 2cm of cynefin] (chaotic) {Chaotic};
|
||||
|
||||
\node[box=white, below=0.7cm of clear, text width=2.5cm, align=center, font=\scriptsize] (m1) {Single agent\\No review};
|
||||
\node[box=white, below=0.7cm of complicated, text width=2.5cm, align=center, font=\scriptsize] (m2) {PDCA fast\\+ FMEA};
|
||||
\node[box=white, below=0.7cm of complex, text width=2.5cm, align=center, font=\scriptsize] (m3) {PDCA thorough\\+ parallel proposals};
|
||||
\node[box=white, below=0.7cm of chaotic, text width=2.5cm, align=center, font=\scriptsize] (m4) {OODA\\then PDCA};
|
||||
|
||||
\draw[arrow] (start) -- (cynefin);
|
||||
\draw[arrow] (cynefin) -- (clear);
|
||||
\draw[arrow] (cynefin) -- (complicated);
|
||||
\draw[arrow] (cynefin) -- (complex);
|
||||
\draw[arrow] (cynefin) -- (chaotic);
|
||||
\draw[arrow] (clear) -- (m1);
|
||||
\draw[arrow] (complicated) -- (m2);
|
||||
\draw[arrow] (complex) -- (m3);
|
||||
\draw[arrow] (chaotic) -- (m4);
|
||||
|
||||
\end{tikzpicture}
|
||||
\caption{Decision framework for selecting agent orchestration method
|
||||
based on Cynefin task classification.}
|
||||
\label{fig:decision}
|
||||
\end{figure}
|
||||
|
||||
\textbf{Cross-cutting concerns} apply regardless of classification:
|
||||
\begin{itemize}[nosep]
|
||||
\item \textbf{Kanban WIP limits}: Always. Prevents coordination overload.
|
||||
\item \textbf{TOC awareness}: Identify the costliest agent; schedule
|
||||
others around it.
|
||||
\item \textbf{SPC monitoring}: After 10+ runs, establish control limits
|
||||
for agent effectiveness.
|
||||
\item \textbf{Lean waste audit}: Periodically review token usage patterns
|
||||
for waste (unused artifacts, redundant context, overprocessing).
|
||||
\end{itemize}
|
||||
|
||||
% ============================================================
|
||||
\section{Open Research Directions}
|
||||
\label{sec:future}
|
||||
|
||||
\subsection{Adaptive Method Selection}
|
||||
|
||||
Current frameworks use a fixed orchestration method. An adaptive system
|
||||
would classify each incoming task (Cynefin), select the appropriate method,
|
||||
and switch methods mid-execution if the task's nature changes (e.g.,
|
||||
a ``complicated'' task reveals unexpected complexity during exploration).
|
||||
This requires a \emph{method-aware orchestrator} that understands the
|
||||
assumptions and exit criteria of each method.
|
||||
|
||||
\subsection{Kanban for Agent Swarms}
|
||||
|
||||
As agent counts increase beyond 5--10, coordination costs dominate.
|
||||
Kanban's WIP limits and flow metrics provide a theoretical basis for
|
||||
determining optimal agent concurrency, but empirical studies are needed
|
||||
to establish how coordination cost scales with agent count across
|
||||
different task types and model capabilities.
|
||||
|
||||
\subsection{OODA for Adversarial Agent Scenarios}
|
||||
|
||||
Boyd's OODA loop was designed for competitive environments where speed of
|
||||
decision-making determines the winner. Applications include adversarial
|
||||
testing (red team agents vs. blue team agents), competitive code generation
|
||||
(multiple agents racing to solve a problem), and incident response
|
||||
(rapid diagnosis and mitigation under time pressure).
|
||||
|
||||
\subsection{Cross-Method Quality Metrics}
|
||||
|
||||
Each PM/OM method defines quality differently: PDCA uses convergence scores,
|
||||
Six Sigma uses sigma levels, Lean uses waste ratios, SPC uses control
|
||||
limits. A unified quality metric for agent orchestration---one that allows
|
||||
meaningful comparison across methods---does not yet exist.
|
||||
|
||||
\subsection{FMEA for Agent Failure Modes}
|
||||
|
||||
Agent failure modes (hallucination, scope creep, false positive reviews,
|
||||
persona drift \citep{lu2026assistant}) can be systematically enumerated
|
||||
using FMEA methodology. A comprehensive FMEA catalog for LLM agents---with
|
||||
severity, occurrence, and detection ratings calibrated from empirical
|
||||
data---would provide a foundation for designing more robust orchestration
|
||||
systems.
|
||||
|
||||
% ============================================================
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
|
||||
The operations management literature offers a rich toolkit for agent
|
||||
orchestration that extends far beyond the agile methods currently dominant
|
||||
in the field. Our taxonomy reveals that the highest-fitness methods---PDCA,
|
||||
Kanban, TOC, OODA, and FMEA---share a common profile: mechanistic,
|
||||
flow-oriented, and metric-driven. Methods centered on human psychology
|
||||
(Scrum, Design Thinking) transfer poorly without fundamental reformulation.
|
||||
|
||||
The key insight is that LLM agents are not ``fast humans.'' They have
|
||||
fundamentally different constraint profiles---cheap to clone, expensive to
|
||||
coordinate, stateless, psychologically inert---and these differences make
|
||||
some PM/OM methods \emph{more} effective (OODA loops at superhuman speed,
|
||||
FMEA with exhaustive enumeration) while rendering others irrelevant
|
||||
(standups without psychology, retrospectives without learning).
|
||||
|
||||
We encourage the agent orchestration community to look beyond agile sprints
|
||||
and role-playing frameworks toward the broader operations management
|
||||
tradition. A century of industrial practice has much to teach us about
|
||||
orchestrating intelligent agents---if we take the time to translate.
|
||||
|
||||
% ============================================================
|
||||
\section*{Acknowledgments}
|
||||
|
||||
The author thanks the operations management and quality engineering
|
||||
communities whose work, developed over decades for human organizations,
|
||||
provides the theoretical foundation for this analysis.
|
||||
|
||||
% ============================================================
|
||||
\bibliographystyle{plainnat}
|
||||
\bibliography{taxonomy-refs}
|
||||
|
||||
\end{document}
|
||||
34
scripts/run-tests.sh
Executable file
34
scripts/run-tests.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env bash
# run-tests.sh — Run all ArcheFlow bats tests.
#
# Usage: ./scripts/run-tests.sh [bats-args...]
# Examples:
#   ./scripts/run-tests.sh                    # Run all tests
#   ./scripts/run-tests.sh --filter "event"   # Run only event tests
#   ./scripts/run-tests.sh -t                 # TAP output

set -euo pipefail

# Resolve project layout relative to this script so it works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
TESTS_DIR="$PROJECT_DIR/tests"

# Find bats binary: explicit $BATS env var wins, then PATH, then ~/.local/bin.
BATS="${BATS:-}"
if [[ -z "$BATS" ]]; then
  if command -v bats &>/dev/null; then
    BATS="bats"
  elif [[ -x "$HOME/.local/bin/bats" ]]; then
    BATS="$HOME/.local/bin/bats"
  else
    echo "ERROR: bats not found. Install bats-core or set BATS env var." >&2
    exit 1
  fi
fi

# Collect test files via nullglob so an empty tests dir fails with a clear
# message instead of passing the literal pattern '*.bats' to bats.
shopt -s nullglob
test_files=("$TESTS_DIR"/*.bats)
shopt -u nullglob
if (( ${#test_files[@]} == 0 )); then
  echo "ERROR: no .bats files found in $TESTS_DIR" >&2
  exit 1
fi

echo "Running ArcheFlow tests..."
echo "  bats: $("$BATS" --version)"
echo "  tests: $TESTS_DIR"
echo ""

# exec replaces this shell so bats' exit status is the script's exit status.
exec "$BATS" "$@" "${test_files[@]}"
|
||||
34
skills/af-dag/SKILL.md
Normal file
34
skills/af-dag/SKILL.md
Normal file
@@ -0,0 +1,34 @@
|
||||
---
|
||||
name: af-dag
|
||||
description: |
|
||||
Show the DAG of the current or last ArcheFlow run.
|
||||
<example>User: "/af-dag"</example>
|
||||
<example>User: "/af-dag 2026-04-06-jwt-auth"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Run DAG
|
||||
|
||||
1. Parse `run_id` from args. If none provided, read the latest run_id from `.archeflow/events/index.jsonl`.
|
||||
2. Run `./lib/archeflow-dag.sh .archeflow/events/<run_id>.jsonl` if the script exists. Display its output.
|
||||
3. If the script does not exist, read `.archeflow/events/<run_id>.jsonl` and render a text DAG:
|
||||
- Each node is an event (phase transitions, agent starts/completes, findings).
|
||||
- Show parent relationships via indentation.
|
||||
- Mark completed events with `[done]`, active with `[running]`, failed with `[FAIL]`.
|
||||
|
||||
Example output:
|
||||
```
|
||||
run.start 2026-04-06-jwt-auth
|
||||
plan.start
|
||||
agent.complete explorer (42s)
|
||||
agent.complete creator (68s)
|
||||
do.start
|
||||
agent.complete maker (180s)
|
||||
check.start
|
||||
agent.complete guardian (55s) -- 3 findings
|
||||
agent.complete skeptic (40s) -- 1 finding
|
||||
act.start
|
||||
fixes.applied 3/4
|
||||
run.complete (6m12s)
|
||||
```
|
||||
|
||||
4. If no events found for the run_id, say: "No events found for run `<run_id>`."
|
||||
42
skills/af-replay/SKILL.md
Normal file
42
skills/af-replay/SKILL.md
Normal file
@@ -0,0 +1,42 @@
|
||||
---
|
||||
name: af-replay
|
||||
description: "Replay and analyze a recorded ArcheFlow run: decision timeline and weighted what-if. Usage: /af-replay <run-id> [--timeline|--whatif|--compare] [--weights arch=w,...]"
|
||||
user-invocable: true
|
||||
---
|
||||
|
||||
# ArcheFlow Run Replay
|
||||
|
||||
Inspect a completed or in-progress run logged in `.archeflow/events/<run_id>.jsonl`. Use this to study which archetypes drove outcomes and to simulate **weighted** consensus (what-if).
|
||||
|
||||
## Recording (during PDCA)
|
||||
|
||||
After each meaningful orchestration choice, log a **decision point** (in addition to `review.verdict` where applicable):
|
||||
|
||||
```bash
|
||||
./lib/archeflow-decision.sh <run_id> <phase> <archetype> '<input_summary>' '<decision>' <confidence> [parent_seq]
|
||||
```
|
||||
|
||||
Fields stored: `phase`, `archetype`, `input`, `decision`, `confidence`, `ts` (event timestamp). The event type is `decision.point`.
|
||||
|
||||
Lower-level alternative:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-event.sh "$RUN_ID" decision.point check guardian \
|
||||
'{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.85}' 7
|
||||
```
|
||||
|
||||
## Commands (from project root)
|
||||
|
||||
| Action | Shell |
|
||||
|--------|--------|
|
||||
| Timeline | `./lib/archeflow-replay.sh timeline <run_id>` |
|
||||
| What-if | `./lib/archeflow-replay.sh whatif <run_id> [--weights guardian=2,sage=0.5] [--threshold 0.5] [--json]` |
|
||||
| Both | `./lib/archeflow-replay.sh compare <run_id> [--weights ...]` |
|
||||
|
||||
- **Timeline** lists `decision.point` rows and `review.verdict` (check phase).
|
||||
- **What-if** reads the **last** `review.verdict` per archetype in check. **Original** outcome uses strict any-veto (any non-approve → BLOCK). **Replay** uses weighted mean strictness: each reviewer contributes weight × (1 if not approved, else 0); BLOCK if mean ≥ threshold (default 0.5).
|
||||
- **`--json`** emits machine-readable output for dashboards or scripts.
|
||||
|
||||
## Learning effectiveness
|
||||
|
||||
Correlate `decision.point` confidence and verdicts with cycle outcomes (`cycle.boundary`, `run.complete`) and `./lib/archeflow-score.sh extract` to see which archetypes add signal for which task shapes.
|
||||
40
skills/af-report/SKILL.md
Normal file
40
skills/af-report/SKILL.md
Normal file
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: af-report
|
||||
description: |
|
||||
Generate a full process report for an ArcheFlow run.
|
||||
<example>User: "/af-report"</example>
|
||||
<example>User: "/af-report 2026-04-06-jwt-auth"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Run Report
|
||||
|
||||
1. Parse `run_id` from args. If none provided, read the latest run_id from `.archeflow/events/index.jsonl`.
|
||||
2. Run `./lib/archeflow-report.sh .archeflow/events/<run_id>.jsonl` if the script exists. Display its output.
|
||||
3. If the script does not exist, read `.archeflow/events/<run_id>.jsonl` and produce a markdown report:
|
||||
|
||||
```markdown
|
||||
# ArcheFlow Report: <run_id>
|
||||
|
||||
## Overview
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Task | ... |
|
||||
| Workflow | fast/standard/thorough |
|
||||
| Cycles | N |
|
||||
| Duration | Xm Ys |
|
||||
| Total Cost | $X.XX |
|
||||
|
||||
## Phase Summary
|
||||
For each phase (Plan, Do, Check, Act): agents involved, duration, token cost, key outputs.
|
||||
|
||||
## Findings
|
||||
Table of all findings: severity, category, description, archetype source, resolution (fixed/dismissed/deferred).
|
||||
|
||||
## Fixes Applied
|
||||
List of fixes with before/after summary and which finding they addressed.
|
||||
|
||||
## Lessons Learned
|
||||
Any new lessons extracted to memory during this run.
|
||||
```
|
||||
|
||||
4. If no events found for the run_id, say: "No events found for run `<run_id>`."
|
||||
23
skills/af-score/SKILL.md
Normal file
23
skills/af-score/SKILL.md
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
name: af-score
|
||||
description: |
|
||||
Show archetype effectiveness scores across runs.
|
||||
<example>User: "/af-score"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Effectiveness Scores
|
||||
|
||||
1. Run `./lib/archeflow-score.sh list` if the script exists. Display its output.
|
||||
2. If the script does not exist, read `.archeflow/memory/effectiveness.jsonl` directly.
|
||||
3. Summarize per archetype as a table:
|
||||
|
||||
| Archetype | Runs | Signal/Noise | Fix Rate | Avg Cost |
|
||||
|-----------|------|--------------|----------|----------|
|
||||
| Guardian | ... | ... | ... | ... |
|
||||
| Skeptic | ... | ... | ... | ... |
|
||||
|
||||
- **Signal/Noise**: findings that led to actual fixes vs total findings raised.
|
||||
- **Fix Rate**: percentage of findings that were applied (not dismissed).
|
||||
- **Avg Cost**: mean token cost per review across runs.
|
||||
|
||||
4. If no effectiveness data exists, say: "No effectiveness data yet. Run `/af-run` at least once."
|
||||
25
skills/af-status/SKILL.md
Normal file
25
skills/af-status/SKILL.md
Normal file
@@ -0,0 +1,25 @@
|
||||
---
|
||||
name: af-status
|
||||
description: |
|
||||
Show ArcheFlow status — current/last run, active agents, findings.
|
||||
<example>User: "/af-status"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Status
|
||||
|
||||
1. Read `.archeflow/state.json` if it exists. Extract: task, phase, cycle, workflow, active agents, findings count, start time.
|
||||
2. If `state.json` does not exist, read the latest entry from `.archeflow/events/index.jsonl`. Extract run_id, task, last event type, timestamp.
|
||||
3. Calculate duration from start time to now (or to completion time if run finished).
|
||||
4. Report as a compact table:
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Run | `<run_id>` |
|
||||
| Task | `<task description>` |
|
||||
| Phase | `<current phase>` |
|
||||
| Cycle | `<cycle number>` |
|
||||
| Workflow | `<fast/standard/thorough>` |
|
||||
| Findings | `<count>` |
|
||||
| Duration | `<elapsed>` |
|
||||
|
||||
5. If no `state.json` and no `index.jsonl`, say: "No active or recent ArcheFlow runs."
|
||||
@@ -352,10 +352,12 @@ Emit events via `./lib/archeflow-event.sh <run_id> <type> <phase> <agent> '<json
|
||||
| After agent returns | `agent.complete` | archetype, duration_ms, artifacts, summary |
|
||||
| Phase boundary | `phase.transition` | from, to, artifacts_so_far |
|
||||
| Alternative chosen | `decision` | what, chosen, alternatives, rationale |
|
||||
| Orchestrator decision (replay) | `decision.point` | archetype, input, decision, confidence — use `./lib/archeflow-decision.sh` |
|
||||
| Reviewer verdict | `review.verdict` | archetype, verdict, findings[] |
|
||||
| Fix addressing review | `fix.applied` | source, finding, file, line |
|
||||
| End of PDCA cycle | `cycle.boundary` | cycle, max_cycles, exit_condition, convergence |
|
||||
| Shadow triggered | `shadow.detected` | archetype, shadow, trigger, action |
|
||||
| Policy halt | `wiggum.break` | trigger, run_state, unresolved_findings, hard/soft |
|
||||
| Run ends | `run.complete` | status, cycles, agents_total, fixes_total |
|
||||
|
||||
Parent rules: `run.start` has `parent: []`. Agents parent to the event that triggered them. Phase transitions fan-in from all completing events. Parallel agents share the same parent.
|
||||
@@ -403,6 +405,12 @@ Scores stored in `.archeflow/memory/effectiveness.jsonl`. After 10+ runs, recomm
|
||||
|
||||
---
|
||||
|
||||
## Run replay (decision log + what-if)
|
||||
|
||||
After key choices (routing, fast-path skip, escalation), emit `decision.point` via `./lib/archeflow-decision.sh` so runs can be inspected with `./lib/archeflow-replay.sh timeline|whatif|compare <run_id>`. Weighted what-if helps estimate how much each review archetype swayed the effective ship/block outcome. See skill `af-replay`.
|
||||
|
||||
---
|
||||
|
||||
## Dry-Run Mode
|
||||
|
||||
When `--dry-run`: Run Plan phase only. Display workflow, agent counts, confidence scores, cost estimate. Ask user to proceed. If yes, continue with `--start-from do`.
|
||||
|
||||
@@ -79,22 +79,32 @@ Every **45 minutes** or **3 completed tasks** (whichever first):
|
||||
| 95% budget spent | Complete current task, then STOP |
|
||||
| 100% budget | STOP immediately, commit WIP |
|
||||
|
||||
### Circuit Breaker
|
||||
### Wiggum Break (Circuit Breaker)
|
||||
|
||||
| Trigger | Action |
|
||||
Named after Chief Wiggum — policy enforcement AND the Ralph Loop's dad.
|
||||
When a Wiggum Break triggers, the system halts execution, saves state, and
|
||||
reports to the user. "Bake 'em away, toys."
|
||||
|
||||
**Hard breaks** (halt immediately, commit WIP):
|
||||
|
||||
| Trigger | Reason |
|
||||
|---------|--------|
|
||||
| 3 consecutive agent failures/timeouts | STOP. Infrastructure issue, not a code problem. |
|
||||
| 3 consecutive task failures in sprint | STOP. Something systemic is wrong. |
|
||||
| Same shadow detected 3+ times in one cycle | STOP. Task needs to be broken down or re-scoped. |
|
||||
| Test suite broken after merge | Auto-revert, STOP, report. |
|
||||
| 3 consecutive agent failures/timeouts | Infrastructure issue, not a code problem |
|
||||
| 3 consecutive task failures in sprint | Something systemic is wrong |
|
||||
| Same shadow detected 3+ times in one cycle | Task needs to be broken down or re-scoped |
|
||||
| Test suite broken after merge | Auto-revert, then halt |
|
||||
| 2+ oscillating findings (present→absent→present) | Fundamental tension in review criteria |
|
||||
|
||||
### Diminishing Returns
|
||||
**Soft breaks** (finish current task, then halt):
|
||||
|
||||
| Signal | Action |
|
||||
| Signal | Reason |
|
||||
|--------|--------|
|
||||
| Cycle N findings identical to cycle N-1 | STOP cycling. Present best result. |
|
||||
| Convergence score <0.5 for 2 consecutive cycles | STOP. "This needs a different approach." |
|
||||
| Reviewer finding count increases cycle over cycle | STOP. Implementation is diverging, not converging. |
|
||||
| Cycle N findings identical to cycle N-1 | No progress — present best result |
|
||||
| Convergence score <0.5 for 2 consecutive cycles | "This needs a different approach" |
|
||||
| Reviewer finding count increases cycle over cycle | Implementation is diverging, not converging |
|
||||
|
||||
When a Wiggum Break fires, emit a `wiggum.break` event with trigger, run state, and unresolved findings.
|
||||
The event log makes it easy to audit why a run was halted and whether the break was warranted.
|
||||
|
||||
### Context Pollution
|
||||
|
||||
|
||||
22
skills/using-archeflow/ACTIVATION.md
Normal file
22
skills/using-archeflow/ACTIVATION.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# ArcheFlow -- Active
|
||||
|
||||
Multi-agent orchestration using archetypal roles and PDCA quality cycles.
|
||||
|
||||
## Session Start
|
||||
|
||||
On activation, print ONE line then proceed silently:
|
||||
```
|
||||
archeflow v0.9.0 · 24 skills · <domain> domain
|
||||
```
|
||||
Domain: `writing` if `colette.yaml` exists, `research` if paper/thesis files, `code` otherwise.
|
||||
|
||||
## When to Use
|
||||
|
||||
| Need | Command |
|
||||
|------|---------|
|
||||
| Work the queue | `/af-sprint` |
|
||||
| Deep orchestration | `/af-run <task>` |
|
||||
| Code review | `/af-review` |
|
||||
| Simple fix / question | Skip ArcheFlow — just do it directly |
|
||||
|
||||
Do NOT use ArcheFlow for: single-line fixes, questions, reading code, config tweaks, git ops.
|
||||
@@ -7,7 +7,7 @@ description: Use at session start when implementing features, reviewing code, de
|
||||
|
||||
On activation, print ONE line then proceed silently:
|
||||
```
|
||||
archeflow v0.8.0 · 19 skills · <domain> domain
|
||||
archeflow v0.9.0 · 24 skills · <domain> domain
|
||||
```
|
||||
Domain auto-detected: `writing` if `colette.yaml` exists, `research` if paper/thesis files, `code` otherwise.
|
||||
|
||||
@@ -46,6 +46,7 @@ Do NOT use for: single-line fixes, questions, reading/exploring, config tweaks,
|
||||
| `/af-memory` | Cross-run lesson memory |
|
||||
| `/af-fanout` | Colette book fanout via agents |
|
||||
| `/af-dag` | DAG of current/last run |
|
||||
| `/af-replay <run_id>` | Decision timeline + weighted what-if on recorded events |
|
||||
|
||||
## Mini-Reflect Fallback
|
||||
|
||||
|
||||
71
tests/archeflow-dag.bats
Normal file
71
tests/archeflow-dag.bats
Normal file
@@ -0,0 +1,71 @@
|
||||
# Tests for archeflow-dag.sh — ASCII DAG rendering from JSONL events.
#
# Validates: basic rendering, parent relationships, color flags, missing file handling.
#
# Requires: bats-core; test_helper must define _common_setup (chdir to a temp
# sandbox) and export LIB_DIR pointing at the scripts under test.

setup() {
  load test_helper
  _common_setup

  # Create a standard events file with parent relationships:
  # a 5-event linear chain run.start → creator → phase.transition → maker →
  # run.complete, used as the fixture for all rendering tests below.
  cat > "$BATS_TEST_TMPDIR/dag-events.jsonl" <<'EVENTS'
{"ts":"2026-04-03T10:00:00Z","run_id":"dag-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"DAG test"}}
{"ts":"2026-04-03T10:01:00Z","run_id":"dag-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500}}
{"ts":"2026-04-03T10:02:00Z","run_id":"dag-run","seq":3,"parent":[2],"type":"phase.transition","phase":"do","agent":null,"data":{"from":"plan","to":"do"}}
{"ts":"2026-04-03T10:03:00Z","run_id":"dag-run","seq":4,"parent":[3],"type":"agent.complete","phase":"do","agent":"maker","data":{"archetype":"maker","duration_ms":120000,"tokens":3000}}
{"ts":"2026-04-03T10:04:00Z","run_id":"dag-run","seq":5,"parent":[4],"type":"run.complete","phase":"act","agent":null,"data":{"agents_total":2,"fixes_total":0}}
EVENTS
}

# Argument validation: no args must print usage and exit non-zero.
@test "dag: exits 1 with usage when called with no args" {
  run "$LIB_DIR/archeflow-dag.sh"
  [ "$status" -eq 1 ]
  [[ "$output" == *"Usage"* ]]
}

# A path that does not exist must be reported, not silently rendered empty.
@test "dag: exits 1 when events file not found" {
  run "$LIB_DIR/archeflow-dag.sh" nonexistent.jsonl
  [ "$status" -eq 1 ]
  [[ "$output" == *"not found"* ]]
}

# The root of the DAG is the run.start event, labeled with its seq number (#1).
@test "dag: renders run.start as root node" {
  run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
  [ "$status" -eq 0 ]
  [[ "$output" == *"#1"* ]]
  [[ "$output" == *"run.start"* ]]
}

# Agent completions should surface the archetype names from the fixture.
@test "dag: renders agent.complete events with archetype name" {
  run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
  [ "$status" -eq 0 ]
  [[ "$output" == *"creator"* ]]
  [[ "$output" == *"maker"* ]]
}

# Phase names from phase.transition events must appear in the output.
@test "dag: renders phase transitions" {
  run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
  [ "$status" -eq 0 ]
  [[ "$output" == *"plan"* ]]
  [[ "$output" == *"do"* ]]
}

# run.complete should summarize agents_total from the event data ("2 agents").
@test "dag: renders run.complete with agent/fix counts" {
  run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
  [ "$status" -eq 0 ]
  [[ "$output" == *"run.complete"* ]]
  [[ "$output" == *"2 agents"* ]]
}

# With --no-color the output must contain no ANSI escape sequences at all.
@test "dag: --no-color suppresses ANSI codes" {
  run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
  [ "$status" -eq 0 ]
  # Should not contain escape sequences
  [[ "$output" != *$'\033'* ]]
}

# Hierarchy is drawn with box-drawing connectors (├ or └).
@test "dag: uses tree-drawing characters for hierarchy" {
  run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
  [ "$status" -eq 0 ]
  # Should contain box-drawing characters (either unicode or ASCII connectors)
  [[ "$output" == *"├"* ]] || [[ "$output" == *"└"* ]]
}
|
||||
127
tests/archeflow-event.bats
Normal file
127
tests/archeflow-event.bats
Normal file
@@ -0,0 +1,127 @@
|
||||
# Tests for archeflow-event.sh — structured JSONL event logging.
#
# Validates: JSONL output format, sequence numbering, parent field handling,
# input validation, file/directory creation.
#
# Requires: bats-core and jq; test_helper must define _common_setup /
# _common_teardown (temp sandbox cwd) and export LIB_DIR.

setup() {
  load test_helper
  _common_setup
}

teardown() {
  _common_teardown
}

# Usage: run_id, type, phase, agent are all required (4 positional args).
@test "event: exits 1 with usage when called with fewer than 4 args" {
  run "$LIB_DIR/archeflow-event.sh" run1 type1 plan
  [ "$status" -eq 1 ]
  [[ "$output" == *"Usage"* ]]
}

# First call must create .archeflow/events/ and the per-run JSONL file.
@test "event: creates events directory and file on first call" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"test"}'
  [ "$status" -eq 0 ]
  [ -d ".archeflow/events" ]
  [ -f ".archeflow/events/test-run.jsonl" ]
}

# Sequence numbers are 1-based per run.
@test "event: first event has seq=1" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"test"}'
  [ "$status" -eq 0 ]
  local seq
  seq=$(head -1 ".archeflow/events/test-run.jsonl" | jq -r '.seq')
  [ "$seq" -eq 1 ]
}

# Sequence numbers increment across calls within the same run file.
@test "event: second event has seq=2" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"test"}' 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run agent.complete plan creator '{"dur":100}' "1" 2>/dev/null
  local count
  count=$(wc -l < ".archeflow/events/test-run.jsonl")
  [ "$count" -eq 2 ]
  local seq2
  seq2=$(tail -1 ".archeflow/events/test-run.jsonl" | jq -r '.seq')
  [ "$seq2" -eq 2 ]
}

# Each line of the output file must parse as JSON on its own.
@test "event: output is valid JSONL" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"hello"}' 2>/dev/null
  # jq will fail if the line is not valid JSON
  jq empty ".archeflow/events/test-run.jsonl"
}

# Positional args map to run_id/type/phase/agent; 5th arg becomes .data.
@test "event: fields are correctly populated" {
  "$LIB_DIR/archeflow-event.sh" test-run agent.complete do maker '{"tokens":500}' 2>/dev/null
  local event
  event=$(head -1 ".archeflow/events/test-run.jsonl")
  [ "$(echo "$event" | jq -r '.run_id')" = "test-run" ]
  [ "$(echo "$event" | jq -r '.type')" = "agent.complete" ]
  [ "$(echo "$event" | jq -r '.phase')" = "do" ]
  [ "$(echo "$event" | jq -r '.agent')" = "maker" ]
  [ "$(echo "$event" | jq -r '.data.tokens')" = "500" ]
}

# An empty agent arg is serialized as JSON null, not an empty string.
@test "event: empty agent becomes null in JSON" {
  "$LIB_DIR/archeflow-event.sh" test-run phase.transition do "" '{"from":"plan","to":"do"}' 2>/dev/null
  local agent
  agent=$(head -1 ".archeflow/events/test-run.jsonl" | jq -r '.agent')
  [ "$agent" = "null" ]
}

# Root events (no 6th arg) get parent: [].
@test "event: parent field is empty array for root events" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  local parent
  parent=$(head -1 ".archeflow/events/test-run.jsonl" | jq -c '.parent')
  [ "$parent" = "[]" ]
}

# A single numeric parent arg becomes a one-element JSON array.
@test "event: single parent is parsed correctly" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run agent.complete plan creator '{}' "1" 2>/dev/null
  local parent
  parent=$(tail -1 ".archeflow/events/test-run.jsonl" | jq -c '.parent')
  [ "$parent" = "[1]" ]
}

# A comma-separated parent list ("2,3") models fan-in and becomes [2,3].
@test "event: multiple parents (fan-in) are parsed correctly" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run a plan "" '{}' "1" 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run b plan "" '{}' "1" 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run merge plan "" '{}' "2,3" 2>/dev/null
  local parent
  parent=$(tail -1 ".archeflow/events/test-run.jsonl" | jq -c '.parent')
  [ "$parent" = "[2,3]" ]
}

# Malformed data payloads must be rejected, not written to the log.
@test "event: rejects invalid JSON data" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" 'not-json'
  [ "$status" -eq 1 ]
  [[ "$output" == *"invalid JSON"* ]]
}

# Non-numeric parent specifiers must be rejected.
@test "event: rejects invalid parent format" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' "abc"
  [ "$status" -eq 1 ]
  [[ "$output" == *"invalid parent format"* ]]
}

# Timestamps are second-precision ISO 8601 with a literal Z (UTC).
@test "event: timestamp is ISO 8601 UTC format" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  local ts
  ts=$(head -1 ".archeflow/events/test-run.jsonl" | jq -r '.ts')
  # Matches YYYY-MM-DDTHH:MM:SSZ
  [[ "$ts" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$ ]]
}

# Omitting the data arg defaults .data to an empty object.
@test "event: default data is empty object when omitted" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan agent 2>/dev/null
  local data
  data=$(head -1 ".archeflow/events/test-run.jsonl" | jq -c '.data')
  [ "$data" = "{}" ]
}

# Human-readable confirmation goes to stderr (captured here via 2>&1),
# keeping stdout clean for pipelines.
@test "event: confirmation message goes to stderr" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' "" 2>&1
  [[ "$output" == *"[archeflow-event]"* ]]
  [[ "$output" == *"#1"* ]]
}
||||
212
tests/archeflow-git.bats
Normal file
212
tests/archeflow-git.bats
Normal file
@@ -0,0 +1,212 @@
|
||||
# Tests for archeflow-git.sh — git branch/commit strategy for ArcheFlow runs.
#
# Validates: branch creation with correct naming, commit formatting,
# merge strategies, input validation, and safety guards.
#
# Requires: bats-core and git; test_helper's _common_setup must create a
# sandbox git repo on branch "main" and export LIB_DIR.

setup() {
  load test_helper
  _common_setup
}

teardown() {
  _common_teardown
}

# --- Usage ---

# Requires a command and a run_id; fewer args must print usage.
@test "git: exits 1 with usage when called with fewer than 2 args" {
  run "$LIB_DIR/archeflow-git.sh"
  [ "$status" -eq 1 ]
  [[ "$output" == *"Usage"* ]]
}

# Unrecognized subcommands must be rejected with a diagnostic.
@test "git: exits 1 for unknown command" {
  run "$LIB_DIR/archeflow-git.sh" nonexistent test-run
  [ "$status" -ne 0 ]
  [[ "$output" == *"Unknown command"* ]]
}

# --- init ---

# init creates and checks out a branch named archeflow/<run_id>.
@test "git init: creates branch with archeflow/ prefix" {
  run "$LIB_DIR/archeflow-git.sh" init test-run
  [ "$status" -eq 0 ]
  local current
  current=$(git branch --show-current)
  [ "$current" = "archeflow/test-run" ]
}

# init records the branch it started from so later merges know the target.
@test "git init: stores base branch in .archeflow/runs/<run_id>/base-branch" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  [ -f ".archeflow/runs/test-run/base-branch" ]
  local base
  base=$(cat ".archeflow/runs/test-run/base-branch")
  [ "$base" = "main" ]
}

# Re-running init for the same run_id must not clobber the existing branch.
@test "git init: fails if branch already exists" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  git checkout main --quiet
  run "$LIB_DIR/archeflow-git.sh" init test-run
  [ "$status" -ne 0 ]
  [[ "$output" == *"already exists"* ]]
}

# --- commit ---

# Commit subjects follow "archeflow(<phase>): <message>".
@test "git commit: uses conventional commit format by default" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  # Create a file to commit
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" commit test-run plan "initial plan" 2>/dev/null
  local msg
  msg=$(git log -1 --format=%s)
  [[ "$msg" == "archeflow(plan): initial plan" ]]
}
|
||||
|
||||
@test "git commit: stages event file automatically" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
mkdir -p .archeflow/events
|
||||
echo '{"test":true}' > .archeflow/events/test-run.jsonl
|
||||
"$LIB_DIR/archeflow-git.sh" commit test-run plan "test commit" 2>/dev/null
|
||||
|
||||
# Verify the event file was committed
|
||||
local committed_files
|
||||
committed_files=$(git diff-tree --no-commit-id --name-only -r HEAD)
|
||||
[[ "$committed_files" == *"test-run.jsonl"* ]]
|
||||
}
|
||||
|
||||
@test "git commit: stages extra files passed as arguments" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
echo "extra content" > extra.txt
|
||||
"$LIB_DIR/archeflow-git.sh" commit test-run do "with extras" extra.txt 2>/dev/null
|
||||
local committed_files
|
||||
committed_files=$(git diff-tree --no-commit-id --name-only -r HEAD)
|
||||
[[ "$committed_files" == *"extra.txt"* ]]
|
||||
}
|
||||
|
||||
@test "git commit: reports nothing to commit when no changes" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
# Commit the init artifacts first so there's a clean state
|
||||
git add -A && git commit -m "init artifacts" --quiet 2>/dev/null || true
|
||||
run bash -c "cd '$BATS_TEST_TMPDIR' && '$LIB_DIR/archeflow-git.sh' commit test-run plan 'empty' 2>&1"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Nothing to commit"* ]]
|
||||
}
|
||||
|
||||
@test "git commit: fails if not on the run branch" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
git checkout main --quiet
|
||||
run "$LIB_DIR/archeflow-git.sh" commit test-run plan "wrong branch"
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Expected to be on branch"* ]]
|
||||
}
|
||||
|
||||
# --- phase-commit ---
|
||||
|
||||
@test "git phase-commit: creates commit with phase transition message" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
mkdir -p .archeflow/events
|
||||
echo '{"test":true}' > .archeflow/events/test-run.jsonl
|
||||
"$LIB_DIR/archeflow-git.sh" phase-commit test-run plan 2>/dev/null
|
||||
local msg
|
||||
msg=$(git log -1 --format=%s)
|
||||
# Should contain the phase transition arrow
|
||||
[[ "$msg" == *"plan"* ]]
|
||||
[[ "$msg" == *"do"* ]]
|
||||
}
|
||||
|
||||
# --- merge ---
|
||||
|
||||
@test "git merge: squash merge is the default strategy" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
mkdir -p .archeflow/events
|
||||
echo '{"test":true}' > .archeflow/events/test-run.jsonl
|
||||
"$LIB_DIR/archeflow-git.sh" commit test-run plan "test" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-git.sh" merge test-run 2>/dev/null
|
||||
|
||||
local current
|
||||
current=$(git branch --show-current)
|
||||
[ "$current" = "main" ]
|
||||
|
||||
local msg
|
||||
msg=$(git log -1 --format=%s)
|
||||
[[ "$msg" == *"archeflow run test-run"* ]]
|
||||
}
|
||||
|
||||
@test "git merge: --no-ff creates a merge commit" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
mkdir -p .archeflow/events
|
||||
echo '{"test":true}' > .archeflow/events/test-run.jsonl
|
||||
"$LIB_DIR/archeflow-git.sh" commit test-run plan "test" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-git.sh" merge test-run --no-ff 2>/dev/null
|
||||
|
||||
local current
|
||||
current=$(git branch --show-current)
|
||||
[ "$current" = "main" ]
|
||||
|
||||
# no-ff merge commit should have 2 parents
|
||||
local parent_count
|
||||
parent_count=$(git cat-file -p HEAD | grep -c '^parent')
|
||||
[ "$parent_count" -eq 2 ]
|
||||
}
|
||||
|
||||
@test "git merge: rejects unknown merge strategy" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
mkdir -p .archeflow/events
|
||||
echo '{"test":true}' > .archeflow/events/test-run.jsonl
|
||||
"$LIB_DIR/archeflow-git.sh" commit test-run plan "test" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-git.sh" merge test-run --fast-forward
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Unknown merge strategy"* ]]
|
||||
}
|
||||
|
||||
@test "git merge: fails with uncommitted changes" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
echo "dirty" > dirty.txt
|
||||
git add dirty.txt
|
||||
run "$LIB_DIR/archeflow-git.sh" merge test-run
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Uncommitted changes"* ]]
|
||||
}
|
||||
|
||||
# --- format_message ---
|
||||
|
||||
@test "git commit: simple style uses 'phase: msg' format" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
# Create config with simple style
|
||||
mkdir -p .archeflow
|
||||
echo "commit_style: simple" > .archeflow/config.yaml
|
||||
mkdir -p .archeflow/events
|
||||
echo '{"test":true}' > .archeflow/events/test-run.jsonl
|
||||
"$LIB_DIR/archeflow-git.sh" commit test-run plan "simple test" 2>/dev/null
|
||||
local msg
|
||||
msg=$(git log -1 --format=%s)
|
||||
[ "$msg" = "plan: simple test" ]
|
||||
}
|
||||
|
||||
# --- status ---
|
||||
|
||||
@test "git status: shows branch info for existing run" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-git.sh" status test-run
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Branch: archeflow/test-run"* ]]
|
||||
[[ "$output" == *"Base: main"* ]]
|
||||
}
|
||||
|
||||
@test "git status: fails for nonexistent branch" {
|
||||
run "$LIB_DIR/archeflow-git.sh" status nonexistent
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"does not exist"* ]]
|
||||
}
|
||||
|
||||
# --- cleanup ---
|
||||
|
||||
@test "git cleanup: fails if currently on the run branch" {
|
||||
"$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-git.sh" cleanup test-run
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Cannot delete"* ]]
|
||||
}
|
||||
81
tests/archeflow-init.bats
Normal file
81
tests/archeflow-init.bats
Normal file
@@ -0,0 +1,81 @@
|
||||
# Tests for archeflow-init.sh — project initialization from templates.
|
||||
#
|
||||
# Validates: usage output, --list, --from (clone), and argument parsing.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
}
|
||||
|
||||
teardown() {
|
||||
_common_teardown
|
||||
}
|
||||
|
||||
@test "init: shows usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-init.sh"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
[[ "$output" == *"bundle-name"* ]]
|
||||
}
|
||||
|
||||
@test "init: --list shows template listing without errors" {
|
||||
run "$LIB_DIR/archeflow-init.sh" --list
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Templates"* ]]
|
||||
[[ "$output" == *"Bundles"* ]]
|
||||
}
|
||||
|
||||
@test "init: --from fails when source has no .archeflow dir" {
|
||||
local source_dir
|
||||
source_dir=$(mktemp -d)
|
||||
run "$LIB_DIR/archeflow-init.sh" --from "$source_dir"
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"No .archeflow/"* ]]
|
||||
rm -rf "$source_dir"
|
||||
}
|
||||
|
||||
@test "init: --from clones setup from another project" {
|
||||
# Create a source project with .archeflow structure
|
||||
local source_dir
|
||||
source_dir=$(mktemp -d)
|
||||
mkdir -p "$source_dir/.archeflow/teams" "$source_dir/.archeflow/workflows"
|
||||
echo "name: test-team" > "$source_dir/.archeflow/teams/test.yaml"
|
||||
echo "name: test-workflow" > "$source_dir/.archeflow/workflows/test.yaml"
|
||||
echo "bundle: test" > "$source_dir/.archeflow/config.yaml"
|
||||
|
||||
run "$LIB_DIR/archeflow-init.sh" --from "$source_dir"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/teams/test.yaml" ]
|
||||
[ -f ".archeflow/workflows/test.yaml" ]
|
||||
[ -f ".archeflow/config.yaml" ]
|
||||
rm -rf "$source_dir"
|
||||
}
|
||||
|
||||
@test "init: --from skips events and artifacts directories" {
|
||||
local source_dir
|
||||
source_dir=$(mktemp -d)
|
||||
mkdir -p "$source_dir/.archeflow/events" "$source_dir/.archeflow/artifacts"
|
||||
mkdir -p "$source_dir/.archeflow/teams"
|
||||
echo "name: test" > "$source_dir/.archeflow/teams/t.yaml"
|
||||
echo '{"test":true}' > "$source_dir/.archeflow/events/run.jsonl"
|
||||
echo "artifact" > "$source_dir/.archeflow/artifacts/test.txt"
|
||||
|
||||
run "$LIB_DIR/archeflow-init.sh" --from "$source_dir"
|
||||
[ "$status" -eq 0 ]
|
||||
[ ! -f ".archeflow/events/run.jsonl" ]
|
||||
[ ! -f ".archeflow/artifacts/test.txt" ]
|
||||
[[ "$output" == *"skipped events"* ]]
|
||||
rm -rf "$source_dir"
|
||||
}
|
||||
|
||||
@test "init: rejects unknown options" {
|
||||
run "$LIB_DIR/archeflow-init.sh" --nonexistent
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Unknown option"* ]]
|
||||
}
|
||||
|
||||
@test "init: --save fails with no .archeflow directory" {
|
||||
run "$LIB_DIR/archeflow-init.sh" --save test-save
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"No .archeflow/"* ]]
|
||||
}
|
||||
227
tests/archeflow-memory.bats
Normal file
227
tests/archeflow-memory.bats
Normal file
@@ -0,0 +1,227 @@
|
||||
# Tests for archeflow-memory.sh — cross-run lesson memory management.
|
||||
#
|
||||
# Validates: add, list, decay, forget, inject filtering, and JSONL format.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
}
|
||||
|
||||
teardown() {
|
||||
_common_teardown
|
||||
}
|
||||
|
||||
# --- Usage / error handling ---
|
||||
|
||||
@test "memory: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-memory.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "memory: exits 1 for unknown command" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" nonexistent
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Unknown command"* ]]
|
||||
}
|
||||
|
||||
# --- add ---
|
||||
|
||||
@test "memory add: creates lessons.jsonl and appends a valid JSONL line" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" add preference "Always validate inputs"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/memory/lessons.jsonl" ]
|
||||
jq empty ".archeflow/memory/lessons.jsonl"
|
||||
}
|
||||
|
||||
@test "memory add: lesson has correct fields" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Guardian misses SQL injection" 2>/dev/null
|
||||
[ "$(jq -r '.type' .archeflow/memory/lessons.jsonl)" = "pattern" ]
|
||||
[ "$(jq -r '.description' .archeflow/memory/lessons.jsonl)" = "Guardian misses SQL injection" ]
|
||||
[ "$(jq -r '.source' .archeflow/memory/lessons.jsonl)" = "user_feedback" ]
|
||||
[ "$(jq -r '.frequency' .archeflow/memory/lessons.jsonl)" = "1" ]
|
||||
[ "$(jq -r '.run_id' .archeflow/memory/lessons.jsonl)" = "manual" ]
|
||||
[ "$(jq -r '.domain' .archeflow/memory/lessons.jsonl)" = "general" ]
|
||||
}
|
||||
|
||||
@test "memory add: generates sequential IDs" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "first lesson" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "second lesson" 2>/dev/null
|
||||
local id1 id2
|
||||
id1=$(head -1 ".archeflow/memory/lessons.jsonl" | jq -r '.id')
|
||||
id2=$(tail -1 ".archeflow/memory/lessons.jsonl" | jq -r '.id')
|
||||
[ "$id1" = "m-001" ]
|
||||
[ "$id2" = "m-002" ]
|
||||
}
|
||||
|
||||
@test "memory add: generates tags from description" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Guardian misses SQL injection attacks" 2>/dev/null
|
||||
local tags_count
|
||||
tags_count=$(head -1 ".archeflow/memory/lessons.jsonl" | jq '.tags | length')
|
||||
[ "$tags_count" -gt 0 ]
|
||||
}
|
||||
|
||||
@test "memory add: exits 1 when description is missing" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" add pattern
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
# --- list ---
|
||||
|
||||
@test "memory list: shows message when no lessons exist" {
|
||||
run bash -c "'$LIB_DIR/archeflow-memory.sh' list 2>&1"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"No lessons"* ]]
|
||||
}
|
||||
|
||||
@test "memory list: shows table header and lesson data" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Test lesson for listing" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-memory.sh" list
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"ID"* ]]
|
||||
[[ "$output" == *"Freq"* ]]
|
||||
[[ "$output" == *"m-001"* ]]
|
||||
[[ "$output" == *"Test lesson for listing"* ]]
|
||||
}
|
||||
|
||||
# --- decay ---
|
||||
|
||||
@test "memory decay: increments runs_since_last_seen" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Decay test lesson" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-memory.sh" decay 2>/dev/null
|
||||
local runs_since
|
||||
runs_since=$(head -1 ".archeflow/memory/lessons.jsonl" | jq '.runs_since_last_seen')
|
||||
[ "$runs_since" -eq 1 ]
|
||||
}
|
||||
|
||||
@test "memory decay: decrements frequency after 10 runs" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Decay frequency test" 2>/dev/null
|
||||
# Set frequency=3 and runs_since=9 to trigger decay on next call
|
||||
local tmp=".archeflow/memory/lessons.jsonl.tmp"
|
||||
head -1 ".archeflow/memory/lessons.jsonl" | jq -c '.frequency = 3 | .runs_since_last_seen = 9' > "$tmp"
|
||||
mv "$tmp" ".archeflow/memory/lessons.jsonl"
|
||||
|
||||
"$LIB_DIR/archeflow-memory.sh" decay 2>/dev/null
|
||||
local freq
|
||||
freq=$(head -1 ".archeflow/memory/lessons.jsonl" | jq '.frequency')
|
||||
[ "$freq" -eq 2 ]
|
||||
}
|
||||
|
||||
@test "memory decay: archives lesson when frequency reaches 0" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Will be archived" 2>/dev/null
|
||||
# Set frequency=1 and runs_since=9 to trigger archival
|
||||
local tmp=".archeflow/memory/lessons.jsonl.tmp"
|
||||
head -1 ".archeflow/memory/lessons.jsonl" | jq -c '.frequency = 1 | .runs_since_last_seen = 9' > "$tmp"
|
||||
mv "$tmp" ".archeflow/memory/lessons.jsonl"
|
||||
|
||||
"$LIB_DIR/archeflow-memory.sh" decay 2>/dev/null
|
||||
|
||||
# Lesson should be gone from lessons file (file should be empty)
|
||||
local remaining
|
||||
remaining=$(wc -l < ".archeflow/memory/lessons.jsonl" | tr -d ' ')
|
||||
[ "$remaining" -eq 0 ]
|
||||
|
||||
# And present in archive
|
||||
[ -f ".archeflow/memory/archive.jsonl" ]
|
||||
local archived_count
|
||||
archived_count=$(wc -l < ".archeflow/memory/archive.jsonl" | tr -d ' ')
|
||||
[ "$archived_count" -eq 1 ]
|
||||
}
|
||||
|
||||
@test "memory decay: does nothing when no lessons exist" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" decay
|
||||
[ "$status" -eq 0 ]
|
||||
}
|
||||
|
||||
# --- forget ---
|
||||
|
||||
@test "memory forget: moves lesson to archive" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Will forget this" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-memory.sh" forget m-001 2>/dev/null
|
||||
|
||||
# Lessons file should be empty
|
||||
local remaining
|
||||
remaining=$(wc -l < ".archeflow/memory/lessons.jsonl" | tr -d ' ')
|
||||
[ "$remaining" -eq 0 ]
|
||||
|
||||
# Archive should have it
|
||||
[ -f ".archeflow/memory/archive.jsonl" ]
|
||||
local archived_id
|
||||
archived_id=$(head -1 ".archeflow/memory/archive.jsonl" | jq -r '.id')
|
||||
[ "$archived_id" = "m-001" ]
|
||||
}
|
||||
|
||||
@test "memory forget: exits 1 for nonexistent ID" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "test" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-memory.sh" forget m-999
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "memory forget: exits 1 when no lessons file exists" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" forget m-001
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"No lessons file"* ]]
|
||||
}
|
||||
|
||||
# --- inject ---
|
||||
|
||||
@test "memory inject: outputs nothing when no lessons file exists" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject code guardian
|
||||
[ "$status" -eq 0 ]
|
||||
[ -z "$output" ]
|
||||
}
|
||||
|
||||
@test "memory inject: outputs relevant lessons with frequency >= 2" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Test injection lesson" 2>/dev/null
|
||||
# Bump frequency to 2
|
||||
local tmp=".archeflow/memory/lessons.jsonl.tmp"
|
||||
jq -c '.frequency = 2' ".archeflow/memory/lessons.jsonl" > "$tmp"
|
||||
mv "$tmp" ".archeflow/memory/lessons.jsonl"
|
||||
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject "" ""
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Known Issues"* ]]
|
||||
[[ "$output" == *"Test injection lesson"* ]]
|
||||
}
|
||||
|
||||
@test "memory inject: skips lessons with frequency < 2 (except preferences)" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Low frequency lesson" 2>/dev/null
|
||||
# frequency is 1 by default, type is pattern -> should NOT be injected
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject "" ""
|
||||
[ "$status" -eq 0 ]
|
||||
[ -z "$output" ]
|
||||
}
|
||||
|
||||
@test "memory inject: always injects preferences regardless of frequency" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add preference "User prefers explicit error messages" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject "" ""
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"User prefers explicit error messages"* ]]
|
||||
}
|
||||
|
||||
# --- extract ---
|
||||
|
||||
@test "memory extract: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" extract nonexistent.jsonl
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "memory extract: extracts findings from review.verdict events" {
|
||||
# Create a mock events file with a review.verdict
|
||||
mkdir -p .archeflow/events
|
||||
cat > /tmp/test-events.jsonl <<'EOF'
|
||||
{"run_id":"test-run","seq":1,"type":"run.start","phase":"plan","data":{"task":"test"}}
|
||||
{"run_id":"test-run","seq":2,"type":"review.verdict","phase":"check","data":{"archetype":"guardian","verdict":"needs_changes","findings":[{"severity":"warning","description":"Missing input validation on user endpoint","category":"code"}]}}
|
||||
EOF
|
||||
|
||||
run "$LIB_DIR/archeflow-memory.sh" extract /tmp/test-events.jsonl
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/memory/lessons.jsonl" ]
|
||||
local desc
|
||||
desc=$(jq -r '.description' ".archeflow/memory/lessons.jsonl")
|
||||
[[ "$desc" == *"Missing input validation"* ]]
|
||||
rm -f /tmp/test-events.jsonl
|
||||
}
|
||||
78
tests/archeflow-progress.bats
Normal file
78
tests/archeflow-progress.bats
Normal file
@@ -0,0 +1,78 @@
|
||||
# Tests for archeflow-progress.sh — live progress file generation.
|
||||
#
|
||||
# Validates: markdown output structure, JSON mode, missing events handling, exit codes.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
|
||||
# Create standard events for progress tests
|
||||
mkdir -p .archeflow/events
|
||||
cat > ".archeflow/events/test-run.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"test-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Build feature","workflow":"standard","team":"default"}}
|
||||
{"ts":"2026-04-03T10:01:00Z","run_id":"test-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500,"estimated_cost_usd":0.02,"summary":"Planned"}}
|
||||
EVENTS
|
||||
}
|
||||
|
||||
@test "progress: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-progress.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "progress: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-progress.sh" nonexistent-run
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "progress: default mode generates progress.md" {
|
||||
run "$LIB_DIR/archeflow-progress.sh" test-run
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/progress.md" ]
|
||||
[[ "$output" == *"# ArcheFlow Run: test-run"* ]]
|
||||
[[ "$output" == *"Status:"* ]]
|
||||
[[ "$output" == *"Progress"* ]]
|
||||
}
|
||||
|
||||
@test "progress: json mode outputs valid JSON" {
|
||||
run "$LIB_DIR/archeflow-progress.sh" test-run --json
|
||||
[ "$status" -eq 0 ]
|
||||
echo "$output" | jq empty
|
||||
local run_id
|
||||
run_id=$(echo "$output" | jq -r '.run_id')
|
||||
[ "$run_id" = "test-run" ]
|
||||
}
|
||||
|
||||
@test "progress: json mode includes completed agents" {
|
||||
run "$LIB_DIR/archeflow-progress.sh" test-run --json
|
||||
[ "$status" -eq 0 ]
|
||||
local completed_count
|
||||
completed_count=$(echo "$output" | jq '.completed | length')
|
||||
[ "$completed_count" -eq 1 ]
|
||||
local agent
|
||||
agent=$(echo "$output" | jq -r '.completed[0].agent')
|
||||
[ "$agent" = "creator" ]
|
||||
}
|
||||
|
||||
@test "progress: json mode shows correct phase" {
|
||||
run "$LIB_DIR/archeflow-progress.sh" test-run --json
|
||||
[ "$status" -eq 0 ]
|
||||
local phase
|
||||
phase=$(echo "$output" | jq -r '.phase')
|
||||
[ "$phase" = "plan" ]
|
||||
}
|
||||
|
||||
@test "progress: reports error in json when events file missing" {
|
||||
run "$LIB_DIR/archeflow-progress.sh" missing-run --json
|
||||
# JSON mode returns the JSON even on error
|
||||
local error
|
||||
error=$(echo "$output" | jq -r '.error // empty')
|
||||
[[ "$error" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "progress: rejects unknown flags" {
|
||||
run "$LIB_DIR/archeflow-progress.sh" test-run --invalid
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Unknown flag"* ]]
|
||||
}
|
||||
62
tests/archeflow-replay.bats
Normal file
62
tests/archeflow-replay.bats
Normal file
@@ -0,0 +1,62 @@
|
||||
# Tests for archeflow-replay.sh — timeline, what-if, and compare modes.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
|
||||
mkdir -p .archeflow/events
|
||||
cat > ".archeflow/events/replay-run.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"replay-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"replay test"}}
|
||||
{"ts":"2026-04-03T10:05:00Z","run_id":"replay-run","seq":2,"parent":[1],"type":"decision.point","phase":"check","agent":"guardian","data":{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.88}}
|
||||
{"ts":"2026-04-03T10:06:00Z","run_id":"replay-run","seq":3,"parent":[1],"type":"review.verdict","phase":"check","agent":"guardian","data":{"archetype":"guardian","verdict":"needs_changes","findings":[]}}
|
||||
{"ts":"2026-04-03T10:07:00Z","run_id":"replay-run","seq":4,"parent":[1],"type":"review.verdict","phase":"check","agent":"sage","data":{"archetype":"sage","verdict":"approved","findings":[]}}
|
||||
{"ts":"2026-04-03T10:08:00Z","run_id":"replay-run","seq":5,"parent":[1],"type":"run.complete","phase":"act","agent":null,"data":{"agents_total":2,"fixes_total":0}}
|
||||
EVENTS
|
||||
}
|
||||
|
||||
@test "replay: usage without args" {
|
||||
run "$LIB_DIR/archeflow-replay.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "replay: timeline shows decision.point" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" timeline replay-run
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"decision.point"* ]]
|
||||
[[ "$output" == *"guardian"* ]]
|
||||
[[ "$output" == *"needs_changes"* ]]
|
||||
}
|
||||
|
||||
@test "replay: whatif strict blocks when any reviewer blocks" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" whatif replay-run
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"BLOCK"* ]]
|
||||
}
|
||||
|
||||
@test "replay: whatif weighted can ship when blocker is down-weighted" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" whatif replay-run --weights guardian=0.2,sage=3
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Weighted replay"* ]] || [[ "$output" == *"SHIP"* ]]
|
||||
[[ "$output" == *"SHIP"* ]]
|
||||
}
|
||||
|
||||
@test "replay: whatif --json is valid JSON" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" whatif replay-run --json
|
||||
[ "$status" -eq 0 ]
|
||||
echo "$output" | jq -e '.run_id == "replay-run"' >/dev/null
|
||||
}
|
||||
|
||||
@test "replay: compare includes timeline and whatif" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" compare replay-run
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Decision timeline"* ]]
|
||||
[[ "$output" == *"What-if replay"* ]]
|
||||
}
|
||||
|
||||
@test "decision: logs decision.point via wrapper" {
|
||||
run "$LIB_DIR/archeflow-decision.sh" replay-run check trickster 'diff only' 'edge_case' 0.61 1
|
||||
[ "$status" -eq 0 ]
|
||||
last=$(jq -r 'select(.type=="decision.point") | .data.decision' ".archeflow/events/replay-run.jsonl" | tail -1)
|
||||
[ "$last" = "edge_case" ]
|
||||
}
|
||||
80
tests/archeflow-report.bats
Normal file
80
tests/archeflow-report.bats
Normal file
@@ -0,0 +1,80 @@
|
||||
# Tests for archeflow-report.sh — Markdown process report generation from JSONL events.
|
||||
#
|
||||
# Validates: report output format, summary mode, missing file handling, jq dependency check.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
|
||||
# Create a standard events file used by multiple tests
|
||||
mkdir -p .archeflow/events
|
||||
cat > "$BATS_TEST_TMPDIR/events.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"test-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Write unit tests","workflow":"standard","team":"default"}}
|
||||
{"ts":"2026-04-03T10:01:00Z","run_id":"test-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500,"summary":"Designed test structure"}}
|
||||
{"ts":"2026-04-03T10:02:00Z","run_id":"test-run","seq":3,"parent":[2],"type":"phase.transition","phase":"do","agent":null,"data":{"from":"plan","to":"do"}}
|
||||
{"ts":"2026-04-03T10:05:00Z","run_id":"test-run","seq":4,"parent":[3],"type":"agent.complete","phase":"do","agent":"maker","data":{"archetype":"maker","duration_ms":180000,"tokens":3000,"summary":"Implemented tests"}}
|
||||
{"ts":"2026-04-03T10:06:00Z","run_id":"test-run","seq":5,"parent":[4],"type":"phase.transition","phase":"check","agent":null,"data":{"from":"do","to":"check"}}
|
||||
{"ts":"2026-04-03T10:07:00Z","run_id":"test-run","seq":6,"parent":[5],"type":"review.verdict","phase":"check","agent":"guardian","data":{"archetype":"guardian","verdict":"approved","findings":[]}}
|
||||
{"ts":"2026-04-03T10:08:00Z","run_id":"test-run","seq":7,"parent":[6],"type":"run.complete","phase":"act","agent":null,"data":{"status":"completed","cycles":1,"agents_total":3,"fixes_total":0,"duration_ms":480000}}
|
||||
EVENTS
|
||||
}
|
||||
|
||||
@test "report: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-report.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "report: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-report.sh" nonexistent.jsonl
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "report: full mode produces markdown with header and overview" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"# Process Report: Write unit tests"* ]]
|
||||
[[ "$output" == *"test-run"* ]]
|
||||
[[ "$output" == *"Overview"* ]]
|
||||
[[ "$output" == *"Status"* ]]
|
||||
[[ "$output" == *"completed"* ]]
|
||||
}
|
||||
|
||||
@test "report: full mode includes phase sections" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"PLAN"* ]]
|
||||
[[ "$output" == *"DO"* ]]
|
||||
[[ "$output" == *"CHECK"* ]]
|
||||
}
|
||||
|
||||
@test "report: summary mode outputs one-line summary" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl" --summary
|
||||
[ "$status" -eq 0 ]
|
||||
# Should be a single logical line with key stats
|
||||
[[ "$output" == *"[completed]"* ]]
|
||||
[[ "$output" == *"Write unit tests"* ]]
|
||||
[[ "$output" == *"1 cycles"* ]]
|
||||
[[ "$output" == *"test-run"* ]]
|
||||
}
|
||||
|
||||
@test "report: --output writes to file instead of stdout" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl" --output "$BATS_TEST_TMPDIR/report.md"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f "$BATS_TEST_TMPDIR/report.md" ]
|
||||
local content
|
||||
content=$(cat "$BATS_TEST_TMPDIR/report.md")
|
||||
[[ "$content" == *"# Process Report"* ]]
|
||||
}
|
||||
|
||||
@test "report: summary for in-progress run shows [in-progress]" {
|
||||
# Events file without run.complete
|
||||
cat > "$BATS_TEST_TMPDIR/in-progress.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"wip-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"WIP task","workflow":"fast","team":"default"}}
|
||||
EVENTS
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/in-progress.jsonl" --summary
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"[in-progress]"* ]]
|
||||
[[ "$output" == *"WIP task"* ]]
|
||||
}
|
||||
82
tests/archeflow-review.bats
Normal file
82
tests/archeflow-review.bats
Normal file
@@ -0,0 +1,82 @@
|
||||
# Tests for archeflow-review.sh — git diff extraction for code review.
|
||||
#
|
||||
# Validates: argument parsing, diff modes, stats output, empty diff handling.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
}
|
||||
|
||||
teardown() {
|
||||
_common_teardown
|
||||
}
|
||||
|
||||
@test "review: --help shows usage" {
|
||||
run "$LIB_DIR/archeflow-review.sh" --help
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
[[ "$output" == *"--branch"* ]]
|
||||
[[ "$output" == *"--commit"* ]]
|
||||
}
|
||||
|
||||
@test "review: exits 1 when no changes to review" {
|
||||
run "$LIB_DIR/archeflow-review.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"No changes"* ]]
|
||||
}
|
||||
|
||||
@test "review: shows diff for uncommitted changes" {
|
||||
echo "new content" > testfile.txt
|
||||
git add testfile.txt
|
||||
run "$LIB_DIR/archeflow-review.sh"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"testfile.txt"* ]]
|
||||
}
|
||||
|
||||
@test "review: --stat-only prints stats without diff content" {
|
||||
echo "stat content" > statfile.txt
|
||||
git add statfile.txt
|
||||
run "$LIB_DIR/archeflow-review.sh" --stat-only
|
||||
[ "$status" -eq 0 ]
|
||||
# stderr has stats, stdout should be empty (no diff)
|
||||
# But run captures both, so just check it ran ok
|
||||
[[ "$output" == *"Review Stats"* ]]
|
||||
}
|
||||
|
||||
@test "review: --branch fails for nonexistent branch" {
|
||||
run "$LIB_DIR/archeflow-review.sh" --branch nonexistent-branch-xyz
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "review: rejects unknown arguments" {
|
||||
run "$LIB_DIR/archeflow-review.sh" --unknown
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Unknown argument"* ]]
|
||||
}
|
||||
|
||||
@test "review: --branch shows diff against base" {
|
||||
# Create a feature branch with changes
|
||||
git checkout -b feat/test-review --quiet
|
||||
echo "feature" > feature.txt
|
||||
git add feature.txt
|
||||
git commit -m "feat: add feature" --quiet
|
||||
git checkout main --quiet
|
||||
|
||||
run "$LIB_DIR/archeflow-review.sh" --branch feat/test-review
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"feature.txt"* ]]
|
||||
}
|
||||
|
||||
@test "review: --commit shows diff for commit range" {
|
||||
echo "first" > first.txt
|
||||
git add first.txt
|
||||
git commit -m "first" --quiet
|
||||
echo "second" > second.txt
|
||||
git add second.txt
|
||||
git commit -m "second" --quiet
|
||||
|
||||
run "$LIB_DIR/archeflow-review.sh" --commit HEAD~1..HEAD
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"second.txt"* ]]
|
||||
}
|
||||
58
tests/archeflow-rollback.bats
Normal file
58
tests/archeflow-rollback.bats
Normal file
@@ -0,0 +1,58 @@
|
||||
# Tests for archeflow-rollback.sh — post-merge test and phase rollback.
#
# Validates: argument parsing, mutual exclusivity, phase validation,
# test-cmd config reading. Runs inside a fresh throwaway git repo.

setup() {
  load test_helper
  _common_setup
}

teardown() {
  _common_teardown
}
|
||||
|
||||
@test "rollback: exits with error when called with no args" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh"
|
||||
[ "$status" -ne 0 ]
|
||||
}
|
||||
|
||||
@test "rollback: rejects mutually exclusive --to and --test-cmd" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --to plan --test-cmd "true"
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"mutually exclusive"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: rejects invalid phase names" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --to invalid-phase
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"Invalid phase"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: accepts valid phase names (plan, do, check)" {
|
||||
# This will fail because no git branch exists, but should NOT fail on phase validation
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --to plan
|
||||
# Should fail later (archeflow-git.sh rollback) not on phase validation
|
||||
[[ "$output" != *"Invalid phase"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: exits 2 when no test command available" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"No test command"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: reads test_command from config.yaml" {
|
||||
mkdir -p .archeflow
|
||||
echo 'test_command: "echo ok"' > .archeflow/config.yaml
|
||||
# HEAD won't have archeflow in its message, but the script just warns and proceeds
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run
|
||||
# It should pick up the command and try to run it (test should pass -> exit 0)
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Tests passed"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: rejects unknown options" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --unknown-flag
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"Unknown option"* ]]
|
||||
}
|
||||
105
tests/archeflow-score.bats
Normal file
105
tests/archeflow-score.bats
Normal file
@@ -0,0 +1,105 @@
|
||||
# Tests for archeflow-score.sh — archetype effectiveness scoring.
#
# Validates: score extraction from events, report generation, input validation.

setup() {
  load test_helper
  _common_setup

  # Fixture: a completed run with two reviewers (guardian with two findings,
  # sage with none), one applied fix, one cycle, and a run.complete terminator.
  mkdir -p .archeflow/events .archeflow/memory
  cat > "$BATS_TEST_TMPDIR/scored-events.jsonl" <<'EVENTS'
{"ts":"2026-04-03T10:00:00Z","run_id":"score-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Score test"}}
{"ts":"2026-04-03T10:01:00Z","run_id":"score-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500,"estimated_cost_usd":0.02}}
{"ts":"2026-04-03T10:02:00Z","run_id":"score-run","seq":3,"parent":[2],"type":"agent.complete","phase":"do","agent":"maker","data":{"archetype":"maker","duration_ms":120000,"tokens":3000,"estimated_cost_usd":0.05}}
{"ts":"2026-04-03T10:03:00Z","run_id":"score-run","seq":4,"parent":[3],"type":"review.verdict","phase":"check","agent":"guardian","data":{"archetype":"guardian","verdict":"needs_changes","findings":[{"severity":"warning","description":"Missing validation","fix_required":true},{"severity":"info","description":"Consider logging","fix_required":false}]}}
{"ts":"2026-04-03T10:03:30Z","run_id":"score-run","seq":5,"parent":[3],"type":"review.verdict","phase":"check","agent":"sage","data":{"archetype":"sage","verdict":"approved","findings":[]}}
{"ts":"2026-04-03T10:04:00Z","run_id":"score-run","seq":6,"parent":[4],"type":"fix.applied","phase":"act","agent":null,"data":{"source":"guardian","finding":"Missing validation"}}
{"ts":"2026-04-03T10:05:00Z","run_id":"score-run","seq":7,"parent":[6],"type":"cycle.boundary","phase":"act","agent":null,"data":{"cycle":1,"max_cycles":3,"met":true,"next_action":"merge"}}
{"ts":"2026-04-03T10:06:00Z","run_id":"score-run","seq":8,"parent":[7],"type":"run.complete","phase":"act","agent":null,"data":{"status":"completed","cycles":1,"agents_total":4,"fixes_total":1}}
EVENTS
}
|
||||
|
||||
@test "score: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-score.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "score: exits 1 for unknown command" {
|
||||
run "$LIB_DIR/archeflow-score.sh" nonexistent
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Unknown command"* ]]
|
||||
}
|
||||
|
||||
@test "score extract: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-score.sh" extract nonexistent.jsonl
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "score extract: exits 1 for incomplete run (no run.complete)" {
|
||||
cat > "$BATS_TEST_TMPDIR/incomplete.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"incomplete","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Incomplete"}}
|
||||
EVENTS
|
||||
run "$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/incomplete.jsonl"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"run.complete"* ]]
|
||||
}
|
||||
|
||||
@test "score extract: creates effectiveness.jsonl with archetype scores" {
|
||||
run "$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/memory/effectiveness.jsonl" ]
|
||||
|
||||
# Should have scores for guardian and sage (the reviewers)
|
||||
local guardian_score
|
||||
guardian_score=$(grep '"guardian"' ".archeflow/memory/effectiveness.jsonl" | head -1)
|
||||
[ -n "$guardian_score" ]
|
||||
|
||||
# Verify JSONL is valid
|
||||
while IFS= read -r line; do
|
||||
echo "$line" | jq empty
|
||||
done < ".archeflow/memory/effectiveness.jsonl"
|
||||
}
|
||||
|
||||
@test "score extract: guardian has correct finding counts" {
|
||||
"$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl" 2>/dev/null
|
||||
local guardian
|
||||
guardian=$(grep '"guardian"' ".archeflow/memory/effectiveness.jsonl" | head -1)
|
||||
local total_findings
|
||||
total_findings=$(echo "$guardian" | jq '.findings_total')
|
||||
[ "$total_findings" -eq 2 ]
|
||||
local useful_findings
|
||||
useful_findings=$(echo "$guardian" | jq '.findings_useful')
|
||||
[ "$useful_findings" -eq 1 ]
|
||||
local fixes
|
||||
fixes=$(echo "$guardian" | jq '.fixes_applied')
|
||||
[ "$fixes" -eq 1 ]
|
||||
}
|
||||
|
||||
@test "score extract: composite score is between 0 and 1" {
|
||||
"$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl" 2>/dev/null
|
||||
while IFS= read -r line; do
|
||||
local score
|
||||
score=$(echo "$line" | jq '.composite_score')
|
||||
# score >= 0 and score <= 1
|
||||
[ "$(echo "$score >= 0" | bc)" -eq 1 ]
|
||||
[ "$(echo "$score <= 1" | bc)" -eq 1 ]
|
||||
done < ".archeflow/memory/effectiveness.jsonl"
|
||||
}
|
||||
|
||||
@test "score report: exits 1 when no effectiveness data" {
|
||||
run "$LIB_DIR/archeflow-score.sh" report
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"No effectiveness data"* ]]
|
||||
}
|
||||
|
||||
@test "score report: outputs markdown table with archetype data" {
|
||||
"$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-score.sh" report
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Archetype Effectiveness Report"* ]]
|
||||
[[ "$output" == *"Archetype"* ]]
|
||||
[[ "$output" == *"guardian"* ]]
|
||||
}
|
||||
40
tests/test_helper.bash
Normal file
40
tests/test_helper.bash
Normal file
@@ -0,0 +1,40 @@
|
||||
# test_helper.bash — Shared setup/teardown for ArcheFlow bats tests.
#
# Usage in .bats files:
#   setup() { load test_helper; _common_setup; }
#   teardown() { _common_teardown; }
#
# Provides:
#   - BATS_TEST_TMPDIR: unique temp directory per test
#   - Mock .archeflow/ structure via a git repo
#   - LIB_DIR: path to the lib/ scripts under test

LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../lib" && pwd)"

#######################################
# Per-test setup: temp dir + minimal git repo with one commit.
# Globals:  BATS_TEST_TMPDIR (written, exported)
# Returns:  non-zero if the environment could not be prepared
#######################################
_common_setup() {
  # Create a unique temp directory for this test; fail fast if mktemp does.
  BATS_TEST_TMPDIR="$(mktemp -d)" || return
  export BATS_TEST_TMPDIR

  # Work inside the temp dir so scripts create .archeflow/ there.
  # SC2164: cd can fail — don't keep going in the wrong directory.
  cd "$BATS_TEST_TMPDIR" || return

  # Initialize a minimal git repo (many scripts need it)
  git init --quiet
  # Pin the branch name: tests assume "main", but `git init` defaults to
  # "master" on older git / hosts with a different init.defaultBranch.
  git symbolic-ref HEAD refs/heads/main
  git config user.email "test@test.com"
  git config user.name "Test User"
  # Disable commit signing in tests (global config may have it enabled)
  git config commit.gpgsign false
  git config tag.gpgsign false

  # Create an initial commit so HEAD exists
  echo "init" > README.md
  git add README.md
  git commit -m "init" --quiet
}
|
||||
|
||||
#######################################
# Per-test cleanup: leave and delete the temp directory.
# Globals:  BATS_TEST_TMPDIR (read)
#######################################
_common_teardown() {
  # Return to a safe directory before cleanup (SC2164: check cd).
  cd /tmp || return

  # Guard against an unset/empty path — never run a bare `rm -rf ""`.
  if [[ -n "${BATS_TEST_TMPDIR:-}" ]]; then
    rm -rf -- "$BATS_TEST_TMPDIR"
  fi
}
|
||||
Reference in New Issue
Block a user