Compare commits
19 Commits
refactor/s
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 3ef956485f | |||
| 1e96d87f49 | |||
| d99f449083 | |||
| 58315ac982 | |||
| 24ea632207 | |||
| 55dde5f07a | |||
| 4f8e2a9962 | |||
| 506143d613 | |||
| 607a53f1bf | |||
| 6a49c21bbe | |||
| 6bae80b874 | |||
| 43a147676e | |||
| 14d70689ce | |||
| 130c04fa58 | |||
| 752177528f | |||
| a1667633ad | |||
| d94688ca1b | |||
| 8837a359ac | |||
| af1f4e7da7 |
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "archeflow",
|
||||
"description": "Multi-agent orchestration with Jungian archetypes. PDCA quality cycles, shadow detection, git worktree isolation. Zero dependencies — works with any Claude Code session.",
|
||||
"version": "0.7.0",
|
||||
"version": "0.9.0",
|
||||
"author": {
|
||||
"name": "Chris Nennemann"
|
||||
},
|
||||
@@ -14,12 +14,12 @@
|
||||
"shadow-detection", "workflows"
|
||||
],
|
||||
"skills": [
|
||||
"run", "orchestration", "plan-phase", "do-phase", "check-phase", "act-phase",
|
||||
"shadow-detection", "attention-filters", "convergence", "artifact-routing",
|
||||
"process-log", "memory", "effectiveness", "progress",
|
||||
"colette-bridge", "git-integration", "multi-project",
|
||||
"custom-archetypes", "workflow-design", "domains", "cost-tracking",
|
||||
"templates", "autonomous-mode", "using-archeflow", "presence"
|
||||
"run", "sprint", "review", "check-phase", "act-phase",
|
||||
"shadow-detection", "memory", "progress", "presence",
|
||||
"colette-bridge", "git-integration", "multi-project", "cost-tracking",
|
||||
"custom-archetypes", "workflow-design", "domains",
|
||||
"templates", "autonomous-mode", "using-archeflow",
|
||||
"af-status", "af-score", "af-dag", "af-report", "af-replay"
|
||||
],
|
||||
"hooks": "hooks/hooks.json"
|
||||
}
|
||||
|
||||
8
.gitignore
vendored
8
.gitignore
vendored
@@ -8,3 +8,11 @@ Thumbs.db
|
||||
# Editor
|
||||
*.swp
|
||||
*~
|
||||
# Paper build artifacts
|
||||
paper/*.aux
|
||||
paper/*.bbl
|
||||
paper/*.blg
|
||||
paper/*.log
|
||||
paper/*.out
|
||||
paper/*.pdf
|
||||
paper/*.toc
|
||||
|
||||
@@ -2,6 +2,11 @@
|
||||
|
||||
All notable changes to ArcheFlow are documented in this file.
|
||||
|
||||
## [0.9.0] -- 2026-04-06
|
||||
|
||||
### Added
|
||||
- Run replay: `decision.point` events via `archeflow-decision.sh`; `archeflow-replay.sh` with `timeline`, `whatif` (weighted archetype weights + threshold), and `compare`; skill `af-replay`; DAG labels for `decision.point`.
|
||||
|
||||
## [0.7.0] -- 2026-04-04
|
||||
|
||||
### Added
|
||||
|
||||
160
CLAUDE.md
160
CLAUDE.md
@@ -1,71 +1,119 @@
|
||||
# archeflow — Multi-Agent Orchestration Plugin for Claude Code
|
||||
|
||||
Workspace-level orchestration: parallel agent teams across project portfolios, PDCA cycles with Jungian archetype roles, sprint runner, and post-implementation review. Installed as a Claude Code plugin.
|
||||
|
||||
## Tech Stack
|
||||
|
||||
- **Runtime:** Bash (lib scripts) + Claude Code skill system (Markdown skills)
|
||||
- **No build step, no dependencies** — pure bash + markdown
|
||||
- **Plugin format:** Claude Code plugin (skills/, hooks/, agents/, templates/)
|
||||
|
||||
## Key Commands
|
||||
|
||||
```bash
|
||||
# Use via Claude Code slash commands:
|
||||
/af-sprint # Main mode: work the queue across projects
|
||||
/af-run <task> # Deep orchestration with PDCA cycles
|
||||
/af-review # Post-implementation security/quality review
|
||||
/af-status # Current run status
|
||||
/af-init # Initialize ArcheFlow in a project
|
||||
/af-score # Archetype effectiveness scores
|
||||
/af-memory # Cross-run lesson memory
|
||||
/af-report # Full process report
|
||||
/af-fanout # Colette book fanout via agents
|
||||
```
|
||||
PDCA quality cycles with Jungian archetype roles, corrective action framework, sprint runner, and post-implementation review. Zero dependencies — pure Bash + Markdown.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
skills/ Slash command implementations (one dir per skill)
|
||||
sprint/ /af-sprint — queue-driven parallel agent runner
|
||||
run/ /af-run — PDCA orchestration
|
||||
skills/ Slash commands and internal protocols (one SKILL.md per dir)
|
||||
run/ /af-run — self-contained PDCA orchestration (core skill)
|
||||
sprint/ /af-sprint — queue-driven parallel agent dispatch
|
||||
review/ /af-review — Guardian-led code review
|
||||
plan-phase/ PDCA Plan phase
|
||||
do-phase/ PDCA Do phase
|
||||
check-phase/ PDCA Check phase
|
||||
act-phase/ PDCA Act phase
|
||||
check-phase/ Shared reviewer protocol (used by run + review)
|
||||
act-phase/ Finding collection, fix routing, exit decisions
|
||||
shadow-detection/ Corrective action framework (archetype + system + policy)
|
||||
memory/ Cross-run lessons learned
|
||||
cost-tracking/ Token/cost awareness
|
||||
cost-tracking/ Token/cost awareness and budget enforcement
|
||||
domains/ Domain detection (code, writing, research)
|
||||
... ~25 skill directories
|
||||
hooks/
|
||||
hooks.json Hook definitions
|
||||
session-start/ Auto-activation on session start
|
||||
agents/ Archetype agent definitions
|
||||
explorer.md Divergent thinking, research
|
||||
creator.md Design, architecture
|
||||
maker.md Implementation
|
||||
guardian.md Security, risk, quality gates
|
||||
sage.md Wisdom, patterns, trade-offs
|
||||
skeptic.md Devil's advocate
|
||||
trickster.md Edge cases, unconventional approaches
|
||||
lib/ Bash helper scripts (git, DAG, events, progress, etc.)
|
||||
colette-bridge/ Writing context loader from colette.yaml
|
||||
multi-project/ Cross-repo orchestration with dependency DAG
|
||||
git-integration/ Per-phase commits, branch strategy, rollback
|
||||
templates/ Workflow/team bundle gallery
|
||||
autonomous-mode/ Unattended session protocol
|
||||
using-archeflow/ Session-start activation (auto-loaded via hook)
|
||||
agents/ Archetype personality definitions (one .md per archetype)
|
||||
lib/ Bash helper scripts (events, git, memory, progress, etc.)
|
||||
hooks/ Session-start hook (injects using-archeflow)
|
||||
templates/bundles/ Pre-configured workflow bundles
|
||||
docs/ Roadmap, dogfood notes, test reports
|
||||
```
|
||||
|
||||
## Domain Rules
|
||||
## Commands
|
||||
|
||||
- Skills are Markdown files with frontmatter — follow existing skill format exactly
|
||||
- Agents are archetype personas — maintain their distinct voice and perspective
|
||||
- Dogfood observations go to `archeflow/.archeflow/memory/lessons.jsonl`
|
||||
- Cost tracking: prefer cheap models for bulk ops, expensive for creative/review
|
||||
- PDCA cycle order is mandatory: Plan -> Do -> Check -> Act
|
||||
| Command | Purpose |
|
||||
|---------|---------|
|
||||
| `/af-run <task>` | PDCA orchestration with full agent cycle |
|
||||
| `/af-sprint` | Work the queue across projects |
|
||||
| `/af-review` | Review existing code changes |
|
||||
| `/af-status` | Current/last run status |
|
||||
| `/af-init` | Initialize ArcheFlow in a project |
|
||||
| `/af-score` | Archetype effectiveness scores |
|
||||
| `/af-memory` | Cross-run lesson memory |
|
||||
| `/af-report` | Full process report |
|
||||
| `/af-fanout` | Colette book fanout via agents |
|
||||
|
||||
## Do NOT
|
||||
## Core Concepts
|
||||
|
||||
- Add runtime dependencies — this must stay zero-dependency
|
||||
- Change archetype personalities without updating all referencing skills
|
||||
- Skip the Check phase in PDCA cycles (quality gate)
|
||||
- Modify hooks.json format without testing plugin reload
|
||||
- Use ArcheFlow to orchestrate simple single-file tasks (overhead not justified)
|
||||
### PDCA Cycle
|
||||
```
|
||||
Plan (Explorer + Creator) -> Do (Maker in worktree) -> Check (Guardian first, then others) -> Act (fix, merge, or cycle)
|
||||
```
|
||||
|
||||
### Archetypes
|
||||
Explorer (research), Creator (design), Maker (implement), Guardian (security), Skeptic (assumptions), Trickster (edge cases), Sage (quality). Each has a virtue and a shadow — see `shadow-detection` skill.
|
||||
|
||||
### Corrective Action Framework
|
||||
Three layers, one escalation protocol:
|
||||
- **Archetype shadows** — individual agent dysfunction
|
||||
- **System shadows** — orchestration-level issues (echo chamber, tunnel vision, scope creep)
|
||||
- **Policy boundaries** — operational limits (checkpoints, budgets, Wiggum Breaks)
|
||||
|
||||
### Workflows
|
||||
| Risk Level | Workflow | Agents |
|
||||
|------------|----------|--------|
|
||||
| Low | `fast` | Creator -> Maker -> Guardian |
|
||||
| Medium | `standard` | Explorer + Creator -> Maker -> Guardian + Skeptic + Sage |
|
||||
| High | `thorough` | Explorer + Creator -> Maker -> All 4 reviewers |
|
||||
|
||||
## Guardrails
|
||||
|
||||
### DO
|
||||
|
||||
- Keep skills self-contained. The `run` skill needs zero prerequisites — it was consolidated for a reason.
|
||||
- Write skills as operational instructions Claude can follow, not software specifications.
|
||||
- Use tables for reference data, numbered steps for protocols.
|
||||
- Emit events via `./lib/archeflow-event.sh` — but never let logging block orchestration.
|
||||
- Maintain the corrective action framework when adding new agent types.
|
||||
- Test skill changes by running `/af-run --dry-run` and verifying the flow.
|
||||
- Keep archetype personalities distinct — each agent definition in `agents/` has a specific voice.
|
||||
|
||||
### DO NOT
|
||||
|
||||
- **Add runtime dependencies.** This must stay zero-dependency (Bash + Markdown only).
|
||||
- **Bloat skills back up.** The consolidation from 27 to ~15 skills was intentional. Do not create new skills for internal implementation details — inline them.
|
||||
- **Write bash pseudo-code in skills.** Skills are Claude instructions, not shell scripts. Use one-liner commands or lib script references, not multi-line bash blocks.
|
||||
- **Duplicate protocol definitions.** Finding format lives in `check-phase`. Routing table lives in `act-phase`. Shadow detection lives in `shadow-detection`. One source of truth per concept.
|
||||
- **Skip the Check phase** in PDCA cycles. It's the quality gate.
|
||||
- **Change archetype personalities** without updating all referencing skills and agent definitions.
|
||||
- **Use ArcheFlow for trivial tasks.** Single-file fixes, config changes, questions — just do them directly.
|
||||
- **Let skills exceed ~200 lines.** If a skill is growing past this, it probably needs splitting or the content belongs in a lib script.
|
||||
|
||||
### Skill Writing Rules
|
||||
|
||||
1. **Frontmatter**: `name` (kebab-case), `description` (one-liner + `<example>` tags for user-invocable skills)
|
||||
2. **Structure**: Imperative voice. Lead with what to do, not why. Tables > prose. Steps > paragraphs.
|
||||
3. **Agent templates**: Keep Agent() spawn templates concise. Include only the prompt, subagent_type, and isolation mode.
|
||||
4. **Cross-references**: Use `archeflow:<skill-name>` backtick syntax to reference other skills. Avoid circular dependencies.
|
||||
5. **Bash commands**: One-liners only in skills. Multi-step logic belongs in `lib/` scripts.
|
||||
|
||||
### Cost Awareness
|
||||
|
||||
- Prefer cheap models (haiku) for analytical tasks (validation, diff scoring)
|
||||
- Use capable models (sonnet/opus) for creative tasks (writing, complex design)
|
||||
- Budget enforcement via `cost-tracking` skill and `.archeflow/config.yaml`
|
||||
- Track token spend per agent in events for post-run analysis
|
||||
|
||||
### Git Rules
|
||||
|
||||
- Signing: `git config gpg.format ssh`, key at `~/.ssh/id_ed25519_dev.pub`
|
||||
- Push: `GIT_SSH_COMMAND="ssh -i /home/c/.ssh/id_ed25519_dev -o IdentitiesOnly=yes" git push origin main`
|
||||
- Conventional commits: `feat:`, `fix:`, `chore:`, `docs:`, `refactor:`
|
||||
- No Co-Authored-By trailers
|
||||
- All work on worktree branches until explicitly merged
|
||||
- Merges use `--no-ff` (individually revertable)
|
||||
|
||||
## Dogfooding
|
||||
|
||||
When using ArcheFlow to develop ArcheFlow itself:
|
||||
- Log observations to `.archeflow/memory/lessons.jsonl`
|
||||
- Note friction points, shadow false positives, skill gaps
|
||||
- Test skill changes with `/af-run --dry-run` before committing
|
||||
|
||||
93
README.md
93
README.md
@@ -146,69 +146,61 @@ Shadow detection is quantitative, not vibes. Explorer output exceeding 2000 word
|
||||
|
||||
## Skills Reference
|
||||
|
||||
ArcheFlow ships with 24 skills organized by function.
|
||||
ArcheFlow ships with 19 skills organized by function. The `run` skill is self-contained -- no prerequisites needed.
|
||||
|
||||
### Core Orchestration
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| `archeflow:run` | Automated PDCA execution loop -- single-command orchestration with `--start-from`, `--dry-run`, and cycle-back |
|
||||
| `archeflow:orchestration` | Step-by-step PDCA execution guide for manual orchestration |
|
||||
| `archeflow:plan-phase` | Explorer and Creator output formats and protocols |
|
||||
| `archeflow:do-phase` | Maker implementation rules and worktree commit strategy |
|
||||
| `archeflow:check-phase` | Shared reviewer protocols and output format |
|
||||
| `archeflow:act-phase` | Post-Check decision logic: collect findings, route fixes, exit or cycle |
|
||||
| `archeflow:run` | Self-contained PDCA orchestration -- Plan/Do/Check/Act with adaptation rules, pipeline strategy, and cycle-back |
|
||||
| `archeflow:sprint` | Queue-driven parallel agent dispatch across projects (primary mode) |
|
||||
| `archeflow:review` | Guardian-led code review on diff/branch/commit range |
|
||||
| `archeflow:check-phase` | Shared reviewer protocol -- finding format, evidence requirements, attention filters |
|
||||
| `archeflow:act-phase` | Finding collection, fix routing, exit decisions |
|
||||
|
||||
### Quality and Safety
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| `archeflow:shadow-detection` | Quantitative dysfunction detection and automatic correction |
|
||||
| `archeflow:attention-filters` | Context optimization per archetype -- each agent gets only what it needs |
|
||||
| `archeflow:convergence` | Detects convergence, stalling, and oscillation in multi-cycle runs |
|
||||
| `archeflow:artifact-routing` | Inter-phase artifact protocol -- naming, storage, routing, archiving |
|
||||
|
||||
### Process Intelligence
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| `archeflow:process-log` | Event-sourced JSONL logging with DAG parent relationships |
|
||||
| `archeflow:shadow-detection` | Corrective action framework -- archetype shadows, system shadows, policy boundaries |
|
||||
| `archeflow:memory` | Cross-run memory that learns recurring findings and injects lessons |
|
||||
| `archeflow:effectiveness` | Archetype scoring on signal-to-noise, fix rate, cost efficiency |
|
||||
| `archeflow:progress` | Live progress file watchable from a second terminal |
|
||||
|
||||
### Integration
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| `archeflow:colette-bridge` | Bridges ArcheFlow with the Colette writing platform |
|
||||
| `archeflow:git-integration` | Git-per-phase commits, branch-per-run, rollback to any phase boundary |
|
||||
| `archeflow:git-integration` | Per-phase commits, branch-per-run, rollback |
|
||||
| `archeflow:multi-project` | Cross-repo orchestration with dependency DAG and shared budget |
|
||||
| `archeflow:cost-tracking` | Budget enforcement, per-agent cost aggregation, model tier recommendations |
|
||||
|
||||
### Configuration
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| `archeflow:domains` | Domain adapters for writing, research, and non-code workflows |
|
||||
| `archeflow:custom-archetypes` | Create domain-specific roles (database reviewer, compliance auditor, etc.) |
|
||||
| `archeflow:workflow-design` | Design custom workflows with per-phase archetype assignment and exit conditions |
|
||||
| `archeflow:domains` | Domain adapters for writing, research, and other non-code workflows |
|
||||
| `archeflow:cost-tracking` | Budget enforcement, per-agent cost aggregation, model tier recommendations |
|
||||
| `archeflow:workflow-design` | Design custom workflows with per-phase archetype assignment |
|
||||
| `archeflow:templates` | Template gallery for sharing workflows, teams, and setup bundles |
|
||||
| `archeflow:autonomous-mode` | Unattended overnight sessions with progress logging and safe stopping |
|
||||
| `archeflow:autonomous-mode` | Unattended sessions with corrective action checkpoints |
|
||||
| `archeflow:progress` | Live progress file watchable from a second terminal |
|
||||
| `archeflow:presence` | User-facing output format -- show outcomes, not mechanics |
|
||||
|
||||
### Meta
|
||||
|
||||
| Skill | Description |
|
||||
|-------|-------------|
|
||||
| `archeflow:using-archeflow` | Session-start skill -- activation criteria, workflow selection, quick reference |
|
||||
| `archeflow:using-archeflow` | Session-start activation -- decision tree, workflow selection, commands |
|
||||
|
||||
## Library Scripts
|
||||
|
||||
Eight shell scripts in `lib/` power the process infrastructure.
|
||||
Ten shell scripts in `lib/` power the process infrastructure.
|
||||
|
||||
| Script | Purpose | Usage |
|
||||
|--------|---------|-------|
|
||||
| `archeflow-event.sh` | Append structured JSONL events to a run log | `archeflow-event.sh <run_id> <type> <phase> <agent> '<json>'` |
|
||||
| `archeflow-decision.sh` | Log a `decision.point` (phase, archetype, input, decision, confidence) | `archeflow-decision.sh <run_id> check guardian 'diff' 'needs_changes' 0.85` |
|
||||
| `archeflow-replay.sh` | Timeline + weighted what-if over recorded verdicts | `archeflow-replay.sh compare <run_id> --weights sage=2,guardian=1` |
|
||||
| `archeflow-dag.sh` | Render ASCII DAG from JSONL events | `archeflow-dag.sh events.jsonl --color` |
|
||||
| `archeflow-report.sh` | Generate Markdown process report | `archeflow-report.sh events.jsonl --output report.md --dag` |
|
||||
| `archeflow-progress.sh` | Regenerate live progress file from events | `archeflow-progress.sh <run_id>` |
|
||||
@@ -341,47 +333,28 @@ archetypes: [explorer, creator, maker, guardian, db-specialist]
|
||||
|
||||
```
|
||||
archeflow/
|
||||
├── .claude-plugin/plugin.json # Plugin manifest (v0.5.0)
|
||||
├── .claude-plugin/plugin.json # Plugin manifest
|
||||
├── agents/ # 7 archetype personas (behavioral protocols)
|
||||
│ ├── explorer.md # Plan: research and context mapping
|
||||
│ ├── creator.md # Plan: solution design and proposals
|
||||
│ ├── maker.md # Do: implementation in isolated worktree
|
||||
│ ├── guardian.md # Check: security and reliability review
|
||||
│ ├── skeptic.md # Check: assumption challenging
|
||||
│ ├── trickster.md # Check: adversarial testing
|
||||
│ └── sage.md # Check: holistic quality review
|
||||
├── skills/ # 24 behavioral skills
|
||||
│ ├── run/ # Automated PDCA loop
|
||||
│ ├── orchestration/ # Manual PDCA execution guide
|
||||
│ ├── plan-phase/ # Plan protocols
|
||||
│ ├── do-phase/ # Do protocols
|
||||
│ ├── check-phase/ # Check protocols
|
||||
│ ├── act-phase/ # Act phase decision logic
|
||||
│ ├── shadow-detection/ # Dysfunction detection
|
||||
│ ├── attention-filters/ # Context optimization
|
||||
│ ├── convergence/ # Cycle convergence detection
|
||||
│ ├── artifact-routing/ # Inter-phase artifact protocol
|
||||
│ ├── process-log/ # Event-sourced JSONL logging
|
||||
│ ├── explorer.md, creator.md # Plan phase agents
|
||||
│ ├── maker.md # Do phase agent
|
||||
│ └── guardian.md, skeptic.md, # Check phase agents
|
||||
│ trickster.md, sage.md
|
||||
├── skills/ # 19 skills (consolidated from 27)
|
||||
│ ├── run/ # Self-contained PDCA orchestration (core)
|
||||
│ ├── sprint/ # Queue-driven parallel agent dispatch
|
||||
│ ├── review/ # Guardian-led code review
|
||||
│ ├── check-phase/ # Shared reviewer protocol + attention filters
|
||||
│ ├── act-phase/ # Finding collection + fix routing
|
||||
│ ├── shadow-detection/ # Corrective action framework (3 layers)
|
||||
│ ├── memory/ # Cross-run learning
|
||||
│ ├── effectiveness/ # Archetype scoring
|
||||
│ ├── progress/ # Live progress file
|
||||
│ ├── colette-bridge/ # Colette writing platform bridge
|
||||
│ ├── git-integration/ # Per-phase git commits
|
||||
│ ├── multi-project/ # Cross-repo orchestration
|
||||
│ ├── custom-archetypes/ # Domain-specific roles
|
||||
│ ├── workflow-design/ # Custom workflow design
|
||||
│ ├── domains/ # Domain adapters
|
||||
│ ├── cost-tracking/ # Budget and cost management
|
||||
│ ├── templates/ # Template gallery
|
||||
│ ├── autonomous-mode/ # Unattended sessions
|
||||
│ └── using-archeflow/ # Session-start activation
|
||||
├── lib/ # 8 shell scripts (process infrastructure)
|
||||
│ └── ... # + 12 config/integration skills
|
||||
├── lib/ # 10 shell scripts (events, git, memory, etc.)
|
||||
├── hooks/ # Auto-activation (SessionStart)
|
||||
├── examples/ # Walkthroughs, templates, custom archetypes
|
||||
└── docs/ # Roadmap, changelog
|
||||
```
|
||||
|
||||
The flow: skills define behavioral rules (what agents should do), agents define personas (how they think), lib scripts handle tooling (event logging, git, reporting), and hooks wire it all together at session start. Events are emitted at every phase transition, forming a DAG that can be rendered, reported, or scored after the run.
|
||||
Skills define behavioral rules, agents define personas, lib scripts handle tooling, hooks wire it together at session start. The `run` skill is self-contained -- it absorbed 8 previously separate skills (orchestration, plan-phase, do-phase, artifact-routing, process-log, convergence, effectiveness, attention-filters) into one 459-line operational guide.
|
||||
|
||||
## Philosophy
|
||||
|
||||
|
||||
235
docs/plans/archeflow-roadmap-v1.md
Normal file
235
docs/plans/archeflow-roadmap-v1.md
Normal file
@@ -0,0 +1,235 @@
|
||||
# ArcheFlow Roadmap — From Framework to Tool
|
||||
|
||||
Status: Planning (2026-04-06)
|
||||
Context: v0.8.0 shipped — consolidated skills, corrective action framework, 110 tests. The scaffolding is solid. Now make it genuinely useful.
|
||||
|
||||
## Guiding Principle
|
||||
|
||||
Every feature must close a feedback loop or remove friction. No features that add complexity without measurable improvement in either speed, cost, or quality.
|
||||
|
||||
---
|
||||
|
||||
## Tier 1: Make the Sprint Runner Smart (highest impact)
|
||||
|
||||
### 1.1 Queue from Git Issues
|
||||
|
||||
**Problem:** Manual `queue.json` is the biggest friction point. Nobody wants to maintain a JSON file by hand.
|
||||
|
||||
**Solution:** `./scripts/ws sync-issues` that:
|
||||
- Reads Gitea/GitHub issues via API (`gh issue list` or Gitea REST)
|
||||
- Maps labels to priority: `P0`=critical/blocker, `P1`=high, `P2`=medium, `P3`=low/enhancement
|
||||
- Maps labels to estimate: `size/S`, `size/M`, `size/L`, `size/XL` (default: M)
|
||||
- Extracts `depends_on` from "blocks #N" / "depends on #N" in issue body
|
||||
- Upserts into `queue.json` (doesn't overwrite manual edits, merges by issue ID)
|
||||
- Skips issues with `wontfix`, `duplicate`, `question` labels
|
||||
|
||||
**Scope:** One script in `scripts/`, ~100 lines. Gitea API + GitHub API (detect from remote URL). Needs API token in env var `GITEA_TOKEN` or `GITHUB_TOKEN`.
|
||||
|
||||
**Test:** bats tests with mock API responses (curl fixture files).
|
||||
|
||||
### 1.2 Cost Estimation
|
||||
|
||||
**Problem:** Users don't know what a sprint will cost before running it.
|
||||
|
||||
**Solution:** `/af-sprint --dry-run` shows estimated cost:
|
||||
```
|
||||
Sprint estimate: 7 tasks, ~18 agents, est. $1.20-$2.40, ~12 minutes
|
||||
P1: writing.colette fanout (L) — est. $0.50, 4 agents
|
||||
P1: tool.archeflow review (M) — est. $0.15, 2 agents
|
||||
...
|
||||
Proceed? [y/n]
|
||||
```
|
||||
|
||||
**How:** Track actual token counts per task size (S/M/L/XL) in `.archeflow/memory/cost-history.jsonl`. After 5+ tasks per size bucket, use median. Before that, use defaults: S=$0.05, M=$0.15, L=$0.50, XL=$1.50.
|
||||
|
||||
**Scope:** Update `sprint` skill with estimation section. Add cost logging to `archeflow-event.sh` (include `tokens_used` in `agent.complete` data). New script `lib/archeflow-cost.sh` for estimation.
|
||||
|
||||
### 1.3 Smart Workflow Selection
|
||||
|
||||
**Problem:** Current auto-selection uses keyword matching ("fix" -> pipeline). This is crude.
|
||||
|
||||
**Solution:** Analyze the actual task + codebase signals:
|
||||
|
||||
| Signal | Source | Workflow |
|
||||
|--------|--------|----------|
|
||||
| Files matching `auth|crypto|secret|token|session` | task description + file paths | -> thorough |
|
||||
| Public API changes (OpenAPI spec modified, exported functions changed) | git diff | -> thorough |
|
||||
| <3 files changed, all in same dir | git diff | -> fast/pipeline |
|
||||
| Test files only | git diff | -> pipeline |
|
||||
| Historical: this project's last 3 runs needed 0 cycles | memory | -> fast |
|
||||
| Historical: this project's last run had 2+ CRITICALs | memory | -> thorough |
|
||||
|
||||
**Scope:** Add to the `run` skill's Strategy Selection section. Read git diff stats + memory lessons before choosing. ~20 lines of logic replacing the current keyword table.
|
||||
|
||||
---
|
||||
|
||||
## Tier 2: Close the Learning Loop
|
||||
|
||||
### 2.1 Confidence Calibration
|
||||
|
||||
**Problem:** Creator's confidence scores (0.0-1.0) are self-reported and uncalibrated. A Creator that always says 0.8 but gets rejected 40% of the time is not useful.
|
||||
|
||||
**Solution:** After each `run.complete`, log calibration data:
|
||||
```jsonl
|
||||
{"run_id":"...","creator_confidence":{"task":0.8,"solution":0.7,"risk":0.6},"actual_outcome":"rejected","cycles":2,"criticals":1}
|
||||
```
|
||||
|
||||
At run start, inject calibration context into Creator prompt:
|
||||
```
|
||||
Your historical calibration: You rate task understanding at 0.8 avg,
|
||||
but 35% of runs with that score needed cycle-back. Consider scoring
|
||||
more conservatively.
|
||||
```
|
||||
|
||||
**Scope:** New field in `archeflow-memory.sh` calibration store. ~30 lines in `run` skill to log + inject. Needs 5+ runs before meaningful.
|
||||
|
||||
### 2.2 Archetype Auto-Tuning
|
||||
|
||||
**Problem:** The effectiveness scoring system exists (`archeflow-score.sh`) but nothing acts on it.
|
||||
|
||||
**Solution:** After 10+ runs, auto-generate recommendations:
|
||||
```
|
||||
Archetype Recommendations (based on 15 runs):
|
||||
Guardian: essential (caught real issues in 80% of runs)
|
||||
Sage: keep (useful findings in 60% of runs)
|
||||
Skeptic: demote to thorough-only (useful in 20%, mostly INFO)
|
||||
Trickster: keep for thorough (caught 2 bugs Guardian missed)
|
||||
```
|
||||
|
||||
Add to `/af-score` output. Store recommendation in config as `reviewers.recommended`:
|
||||
```yaml
|
||||
reviewers:
|
||||
recommended:
|
||||
always: [guardian]
|
||||
default: [sage]
|
||||
thorough_only: [skeptic, trickster]
|
||||
# Auto-generated 2026-04-06 from 15 runs. Override with explicit config.
|
||||
```
|
||||
|
||||
**Scope:** Update `archeflow-score.sh` with recommendation logic. Update `run` skill to read recommended config. Add to `af-score` skill display.
|
||||
|
||||
### 2.3 Campaign Memory
|
||||
|
||||
**Problem:** Related runs (e.g., "harden all API endpoints") don't share context.
|
||||
|
||||
**Solution:** Optional `--campaign <id>` flag on `/af-run`:
|
||||
- Links runs under a campaign ID
|
||||
- Cross-run context: "In Run 1, we found the auth pattern uses middleware X. In Run 2, the same pattern applies."
|
||||
- Campaign-level progress: "3/8 endpoints hardened, 2 CRITICALs remaining"
|
||||
- Campaign memory injected into Explorer/Creator prompts
|
||||
|
||||
**Scope:** New field in event schema. Campaign index in `.archeflow/campaigns/`. Update memory injection to filter by campaign. ~50 lines in `run` skill.
|
||||
|
||||
---
|
||||
|
||||
## Tier 3: Integrate with Real Workflow
|
||||
|
||||
### 3.1 Findings as PR Comments
|
||||
|
||||
**Problem:** Review findings live in `.archeflow/artifacts/`. Nobody reads artifact files — they read PR comments.
|
||||
|
||||
**Solution:** After Check phase, if a PR exists for the branch:
|
||||
```bash
|
||||
# Post each CRITICAL/WARNING as a PR review comment
|
||||
gh api repos/{owner}/{repo}/pulls/{pr}/comments \
|
||||
--field body="🛡️ **Guardian** [CRITICAL/security]\n\n${description}\n\nSuggested fix: ${fix}" \
|
||||
--field path="${file}" --field line="${line}"
|
||||
```
|
||||
|
||||
**Scope:** New `--pr <number>` flag on `/af-run` and `/af-review`. Script `lib/archeflow-pr.sh` for posting comments. Falls back gracefully if no PR or no API token.
|
||||
|
||||
### 3.2 CI Hook Mode
|
||||
|
||||
**Problem:** ArcheFlow runs manually. It should run automatically on PRs.
|
||||
|
||||
**Solution:** Lightweight CI integration:
|
||||
```yaml
|
||||
# .github/workflows/archeflow-review.yml (or Gitea equivalent)
|
||||
on: pull_request
|
||||
jobs:
|
||||
review:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- run: claude --plugin-dir ./archeflow -p "/af-review --branch ${{ github.head_ref }} --pr ${{ github.event.number }}"
|
||||
```
|
||||
|
||||
Only runs Guardian (fast, cheap). Posts findings as PR comments. No PDCA overhead.
|
||||
|
||||
**Scope:** Template workflow file in `examples/ci/`. Update `review` skill to support `--pr` flag. Documentation.
|
||||
|
||||
### 3.3 Watch Mode
|
||||
|
||||
**Problem:** You have to remember to run `/af-review` after pushing.
|
||||
|
||||
**Solution:** `/af-watch` — background process that monitors a branch:
|
||||
- Uses `git log --since` polling (every 60s)
|
||||
- On new commits: auto-run `/af-review` on the diff
|
||||
- Posts findings as PR comments if PR exists
|
||||
- Respects budget gate from corrective action framework
|
||||
|
||||
**Scope:** New skill `af-watch/SKILL.md` (~30 lines). Uses the `loop` skill infrastructure. Low priority — CI hook mode covers most use cases.
|
||||
|
||||
---
|
||||
|
||||
## Tier 4: Replay and Analysis
|
||||
|
||||
### 4.1 Decision Journal
|
||||
|
||||
**Problem:** No visibility into why ArcheFlow made specific choices during a run.
|
||||
|
||||
**Solution:** Already started with `archeflow-decision.sh` and `archeflow-replay.sh`. Extend:
|
||||
- Log every decision point: workflow selection, A1/A2/A3 triggers, fix routing, shadow detections
|
||||
- `/af-replay <run_id> --timeline` shows the decision chain
|
||||
- `/af-replay <run_id> --whatif --workflow thorough` simulates: "What would thorough have found?"
|
||||
|
||||
**Scope:** Mostly built. Needs integration into the `run` skill (emit `decision.point` events at each choice). The replay script needs the what-if simulation logic.
|
||||
|
||||
### 4.2 Run Comparison
|
||||
|
||||
**Problem:** No way to evaluate whether workflow X is better than workflow Y for a project.
|
||||
|
||||
**Solution:** `/af-replay compare <run_a> <run_b>`:
|
||||
```
|
||||
Run A (standard, 4m30s, $0.80): 5 findings, 4 resolved, 1 INFO remaining
|
||||
Run B (thorough, 12m, $2.10): 7 findings, 6 resolved, 1 INFO remaining
|
||||
Delta: +2 findings (both INFO), +165% cost, +167% time
|
||||
Verdict: Standard was sufficient for this task.
|
||||
```
|
||||
|
||||
**Scope:** Update `archeflow-replay.sh` with comparison mode. Needs at least 2 runs on similar tasks.
|
||||
|
||||
---
|
||||
|
||||
## Implementation Order
|
||||
|
||||
```
|
||||
v0.9.0 — Sprint Intelligence
|
||||
1.1 Queue from issues
|
||||
1.2 Cost estimation
|
||||
1.3 Smart workflow selection
|
||||
|
||||
v0.10.0 — Learning Loop
|
||||
2.1 Confidence calibration
|
||||
2.2 Archetype auto-tuning
|
||||
2.3 Campaign memory
|
||||
|
||||
v0.11.0 — Integration
|
||||
3.1 Findings as PR comments
|
||||
3.2 CI hook mode
|
||||
3.3 Watch mode (stretch)
|
||||
|
||||
v0.12.0 — Analysis
|
||||
4.1 Decision journal (mostly done)
|
||||
4.2 Run comparison
|
||||
```
|
||||
|
||||
Each version is independently shippable. No version depends on a later one.
|
||||
|
||||
## What NOT to Build
|
||||
|
||||
- **Web dashboard** — Terminal is the interface. Don't add a server.
|
||||
- **Embedding-based memory** — Keyword matching works. Don't add vector DBs.
|
||||
- **Agent marketplace** — Focus on the 7 built-in archetypes being excellent.
|
||||
- **Multi-user collaboration** — ArcheFlow is a single-user tool. Git is the collaboration layer.
|
||||
- **Plugin system for plugins** — ArcheFlow IS a plugin. Don't go meta.
|
||||
@@ -1,5 +1,11 @@
|
||||
# ArcheFlow — Status Log
|
||||
|
||||
## 2026-04-06: Run replay (v0.9.0)
|
||||
|
||||
- `lib/archeflow-decision.sh` — append `decision.point` (phase, archetype, input, decision, confidence).
|
||||
- `lib/archeflow-replay.sh` — `timeline` / `whatif` (weighted archetypes, threshold) / `compare`; optional `--json`.
|
||||
- Skill `af-replay`, plugin bump, DAG renders `decision.point`, `tests/archeflow-replay.bats`.
|
||||
|
||||
## 2026-04-04: Triple Release Sprint (v0.4 → v0.6)
|
||||
|
||||
### What happened
|
||||
|
||||
@@ -7,7 +7,7 @@ const path = require("path");
|
||||
|
||||
try {
|
||||
const pluginRoot = path.resolve(__dirname, "..");
|
||||
const skillFile = path.join(pluginRoot, "skills", "using-archeflow", "SKILL.md");
|
||||
const skillFile = path.join(pluginRoot, "skills", "using-archeflow", "ACTIVATION.md");
|
||||
|
||||
if (!fs.existsSync(skillFile)) {
|
||||
console.log("{}");
|
||||
|
||||
@@ -87,6 +87,9 @@ EVENTS_PARSED=$(jq -r '
|
||||
elif .type == "agent.complete" then
|
||||
(.data.archetype // .agent // "unknown") + " (" + .phase + ")" +
|
||||
(if (.data.tokens // 0) > 0 then " [" + (.data.tokens | tostring) + " tok]" else "" end)
|
||||
elif .type == "decision.point" then
|
||||
(.data.archetype // .agent // "?") + " → " + (.data.decision // "?") +
|
||||
" (conf " + ((.data.confidence // 0) | tostring) + ")"
|
||||
elif .type == "decision" then
|
||||
"decision: " + (.data.what // "unknown") + " → " + (.data.chosen // "unknown")
|
||||
elif .type == "phase.transition" then
|
||||
@@ -209,7 +212,7 @@ render_node() {
|
||||
local colored_label
|
||||
case "$type" in
|
||||
phase.transition) colored_label="${C_TRANS}${label}${C_RESET}" ;;
|
||||
decision) colored_label="${C_DECISION}${label}${C_RESET}" ;;
|
||||
decision|decision.point) colored_label="${C_DECISION}${label}${C_RESET}" ;;
|
||||
review.verdict) colored_label="${C_VERDICT}${label}${C_RESET}" ;;
|
||||
*) colored_label="${pc}${label}${C_RESET}" ;;
|
||||
esac
|
||||
|
||||
48
lib/archeflow-decision.sh
Executable file
48
lib/archeflow-decision.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
|
||||
# archeflow-decision.sh — Log a PDCA decision point for run replay / effectiveness analysis.
|
||||
#
|
||||
# Appends a decision.point event to .archeflow/events/<run_id>.jsonl with:
|
||||
# phase, archetype (agent + data.archetype), input, decision, confidence, ts (via event layer)
|
||||
#
|
||||
# Usage:
|
||||
# ./lib/archeflow-decision.sh <run_id> <phase> <archetype> '<input>' '<decision>' <confidence> [parent_seq]
|
||||
#
|
||||
# Examples:
|
||||
# ./lib/archeflow-decision.sh 2026-04-06-auth check guardian \
|
||||
# 'diff + proposal risks' 'needs_changes' 0.82 7
|
||||
# ./lib/archeflow-decision.sh 2026-04-06-auth act "" 'route findings' 'send_to_maker' 0.9
|
||||
#
|
||||
# confidence: 0.0–1.0 (orchestrator-estimated certainty in the recorded choice)
|
||||
#
|
||||
# Requires: jq (via archeflow-event.sh)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
if [[ $# -lt 6 ]]; then
|
||||
echo "Usage: $0 <run_id> <phase> <archetype> '<input>' '<decision>' <confidence> [parent_seq]" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
RUN_ID="$1"
|
||||
PHASE="$2"
|
||||
ARCH="$3"
|
||||
INPUT="$4"
|
||||
DECISION="$5"
|
||||
CONF_RAW="$6"
|
||||
PARENT="${7:-}"
|
||||
|
||||
if ! [[ "$CONF_RAW" =~ ^[0-9]*\.?[0-9]+$ ]]; then
|
||||
echo "Error: confidence must be a number (e.g. 0.85)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
DATA=$(jq -cn \
|
||||
--arg a "$ARCH" \
|
||||
--arg i "$INPUT" \
|
||||
--arg d "$DECISION" \
|
||||
--argjson c "$CONF_RAW" \
|
||||
'{archetype:$a, input:$i, decision:$d, confidence:$c}')
|
||||
|
||||
exec "$LIB_DIR/archeflow-event.sh" "$RUN_ID" decision.point "$PHASE" "$ARCH" "$DATA" "$PARENT"
|
||||
@@ -8,6 +8,9 @@
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster agent.complete plan creator '{"duration_ms":167522}' 2
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster phase.transition do "" '{"from":"plan","to":"do"}' 3,4
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster fix.applied act "" '{"source":"guardian"}' 8
|
||||
# ./lib/archeflow-event.sh 2026-04-03-der-huster decision.point check guardian \
|
||||
# '{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.85}' 7
|
||||
# # Or use: ./lib/archeflow-decision.sh <run_id> <phase> <arch> '<input>' '<decision>' <confidence> [parent]
|
||||
#
|
||||
# Parent seqs: comma-separated seq numbers of causal parent events (DAG).
|
||||
# "2" → single parent [2]
|
||||
|
||||
228
lib/archeflow-replay.sh
Executable file
228
lib/archeflow-replay.sh
Executable file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env bash
|
||||
# archeflow-replay.sh — Inspect recorded runs: decision timeline and weighted what-if replay.
|
||||
#
|
||||
# Usage:
|
||||
# archeflow-replay.sh timeline <run_id>
|
||||
# archeflow-replay.sh whatif <run_id> [--weights arch=w,arch2=w2] [--threshold 0.5] [--json]
|
||||
# archeflow-replay.sh compare <run_id> [--weights ...] [--threshold ...] [--json]
|
||||
#
|
||||
# Events file: .archeflow/events/<run_id>.jsonl (relative to current working directory)
|
||||
#
|
||||
# whatif / compare:
|
||||
# - Loads check-phase review.verdict events (last verdict per archetype).
|
||||
# - Original gate (strict): BLOCK if any reviewer is not approved.
|
||||
# - Replay gate (weighted): BLOCK if sum(weight * strict) / sum(weight) >= threshold,
|
||||
# where strict=1 for non-approved verdicts, else 0. Default weight per archetype is 1.0.
|
||||
#
|
||||
# Requires: jq
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
if [[ $# -lt 2 ]]; then
|
||||
echo "Usage: $0 {timeline|whatif|compare} <run_id> [options]" >&2
|
||||
echo "" >&2
|
||||
echo " timeline <run_id> Decision timeline (decision.point + review.verdict)" >&2
|
||||
echo " whatif <run_id> [--weights k=v,...] [--threshold 0.5] [--json]" >&2
|
||||
echo " compare <run_id> (timeline + whatif summary)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
COMMAND="$1"
|
||||
RUN_ID="$2"
|
||||
shift 2
|
||||
|
||||
if ! command -v jq &>/dev/null; then
|
||||
echo "Error: jq is required." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
EVENT_FILE=".archeflow/events/${RUN_ID}.jsonl"
|
||||
|
||||
resolve_event_file() {
|
||||
if [[ ! -f "$EVENT_FILE" ]]; then
|
||||
echo "Error: event file not found: $EVENT_FILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
cmd_timeline() {
|
||||
resolve_event_file
|
||||
echo "## Decision timeline — run_id=${RUN_ID}"
|
||||
echo ""
|
||||
local cnt
|
||||
cnt=$(jq -s '[.[] | select(.type == "decision.point")] | length' "$EVENT_FILE")
|
||||
if [[ "$cnt" -gt 0 ]]; then
|
||||
echo "### decision.point (${cnt})"
|
||||
jq -r 'select(.type == "decision.point")
|
||||
| "- \(.ts) [\(.phase)] \(.data.archetype // .agent // "?") \(.data.decision) conf=\(.data.confidence // "n/a") input=\(.data.input // "")"' \
|
||||
"$EVENT_FILE"
|
||||
echo ""
|
||||
else
|
||||
echo "### decision.point"
|
||||
echo "(none — emit with ./lib/archeflow-decision.sh during the run)"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
echo "### review.verdict (check phase)"
|
||||
if jq -e -s '[.[] | select(.type == "review.verdict" and .phase == "check")] | length > 0' "$EVENT_FILE" >/dev/null 2>&1; then
|
||||
jq -r 'select(.type == "review.verdict" and .phase == "check")
|
||||
| "- \(.ts) \(.data.archetype // .agent // "?") verdict=\(.data.verdict) findings=\((.data.findings // []) | length)"' \
|
||||
"$EVENT_FILE"
|
||||
else
|
||||
echo "(none)"
|
||||
fi
|
||||
echo ""
|
||||
}
|
||||
|
||||
parse_weights_to_json() {
|
||||
local raw="${1:-}"
|
||||
local obj='{}'
|
||||
if [[ -z "$raw" ]]; then
|
||||
echo '{}'
|
||||
return
|
||||
fi
|
||||
IFS=',' read -ra pairs <<< "$raw"
|
||||
for pair in "${pairs[@]}"; do
|
||||
[[ -z "$pair" ]] && continue
|
||||
local k="${pair%%=*}"
|
||||
local v="${pair#*=}"
|
||||
k=$(echo "$k" | tr '[:upper:]' '[:lower:]' | xargs)
|
||||
v=$(echo "$v" | xargs)
|
||||
if [[ -z "$k" || "$k" == "$pair" ]]; then
|
||||
echo "Error: invalid weight entry (use arch=1.5): $pair" >&2
|
||||
exit 1
|
||||
fi
|
||||
obj=$(echo "$obj" | jq --arg k "$k" --argjson v "$v" '. + {($k): $v}')
|
||||
done
|
||||
echo "$obj"
|
||||
}
|
||||
|
||||
cmd_whatif() {
|
||||
local weights_str=""
|
||||
local threshold="0.5"
|
||||
local json_out="false"
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--weights)
|
||||
weights_str="$2"
|
||||
shift 2
|
||||
;;
|
||||
--threshold)
|
||||
threshold="$2"
|
||||
shift 2
|
||||
;;
|
||||
--json)
|
||||
json_out="true"
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
resolve_event_file
|
||||
local weights_json
|
||||
weights_json="$(parse_weights_to_json "$weights_str")"
|
||||
|
||||
local result
|
||||
result=$(jq -s --argjson weights "$weights_json" --argjson thr "$threshold" --arg run_id "$RUN_ID" '
|
||||
def strict($v):
|
||||
if $v == null then 1
|
||||
else ($v | ascii_downcase) as $lv
|
||||
| if ($lv == "approved" or $lv == "approve") then 0 else 1 end
|
||||
end;
|
||||
|
||||
def norm_key: ascii_downcase;
|
||||
|
||||
([.[] | select(.type == "review.verdict" and .phase == "check")]
|
||||
| sort_by(.seq)
|
||||
| reduce .[] as $e ({}; . + { (($e.data.archetype // $e.agent // "unknown") | norm_key): $e })
|
||||
) as $last |
|
||||
|
||||
($last | keys) as $keys |
|
||||
if ($keys | length) == 0 then
|
||||
{
|
||||
run_id: $run_id,
|
||||
error: "no check-phase review.verdict events; nothing to simulate"
|
||||
}
|
||||
else
|
||||
[ $keys[] as $k | $last[$k] as $ev |
|
||||
($weights[($k | norm_key)] // 1.0) as $w
|
||||
| strict($ev.data.verdict) as $s
|
||||
| {
|
||||
archetype: ($ev.data.archetype // $ev.agent // $k),
|
||||
verdict: ($ev.data.verdict // "unknown"),
|
||||
weight: $w,
|
||||
strict: $s,
|
||||
weighted_contrib: ($w * $s)
|
||||
}
|
||||
] as $rows |
|
||||
($rows | map(.weighted_contrib) | add) as $num |
|
||||
($rows | map(.weight) | add) as $den |
|
||||
(if $den > 0 then ($num / $den) else 0 end) as $ratio |
|
||||
(if ($rows | map(.strict) | max) == 1 then "BLOCK" else "SHIP" end) as $strict_out |
|
||||
(if $ratio >= $thr then "BLOCK" else "SHIP" end) as $replay_out |
|
||||
{
|
||||
run_id: $run_id,
|
||||
threshold: $thr,
|
||||
weights_used: $weights,
|
||||
strict_any_veto: {
|
||||
outcome: $strict_out,
|
||||
description: "BLOCK if any reviewer verdict is not approved"
|
||||
},
|
||||
weighted_replay: {
|
||||
weighted_strictness: ($ratio * 1000 | round / 1000),
|
||||
outcome: $replay_out,
|
||||
description: ("BLOCK if weighted strictness >= " + ($thr | tostring))
|
||||
},
|
||||
reviewers: $rows
|
||||
}
|
||||
end
|
||||
' "$EVENT_FILE")
|
||||
|
||||
if [[ "$json_out" == "true" ]]; then
|
||||
echo "$result"
|
||||
else
|
||||
echo "$result" | jq -r '
|
||||
if .error then "Error: \(.error)" else
|
||||
"# What-if replay — run_id=\(.run_id)\n",
|
||||
"",
|
||||
"## Outcomes",
|
||||
"| Model | Result |",
|
||||
"|-------|--------|",
|
||||
"| Original (any non-approve → BLOCK) | \(.strict_any_veto.outcome) |",
|
||||
"| Weighted replay (threshold=\(.threshold)) | \(.weighted_replay.outcome) |",
|
||||
"",
|
||||
"## Weighted strictness",
|
||||
"\(.weighted_replay.weighted_strictness) (0 = all approved, 1 = all blocking)",
|
||||
"",
|
||||
"## Per reviewer",
|
||||
"| Archetype | Verdict | Weight | Strict | w×strict |",
|
||||
"|-----------|---------|--------|--------|----------|",
|
||||
(.reviewers[] | "| \(.archetype) | \(.verdict) | \(.weight) | \(.strict) | \(.weighted_contrib) |"),
|
||||
"",
|
||||
(if (.weights_used | length) > 0 then
|
||||
"## Custom weights applied\n" + (.weights_used | to_entries | map("- \(.key): \(.value)") | join("\n")) + "\n"
|
||||
else empty end)
|
||||
end
|
||||
'
|
||||
fi
|
||||
}
|
||||
|
||||
cmd_compare() {
|
||||
cmd_timeline
|
||||
echo ""
|
||||
cmd_whatif "$@"
|
||||
}
|
||||
|
||||
case "$COMMAND" in
|
||||
timeline) cmd_timeline ;;
|
||||
whatif) cmd_whatif "$@" ;;
|
||||
compare) cmd_compare "$@" ;;
|
||||
*)
|
||||
echo "Unknown command: $COMMAND" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
18
paper/Makefile
Normal file
18
paper/Makefile
Normal file
@@ -0,0 +1,18 @@
|
||||
# Build the ArcheFlow paper
|
||||
# Usage: make (build PDF)
|
||||
# make clean (remove build artifacts)
|
||||
|
||||
MAIN = archeflow
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
all: $(MAIN).pdf
|
||||
|
||||
$(MAIN).pdf: $(MAIN).tex references.bib
|
||||
pdflatex $(MAIN)
|
||||
bibtex $(MAIN)
|
||||
pdflatex $(MAIN)
|
||||
pdflatex $(MAIN)
|
||||
|
||||
clean:
|
||||
rm -f $(MAIN).{aux,bbl,blg,log,out,pdf,toc,lof,lot,nav,snm,vrb}
|
||||
880
paper/archeflow.tex
Normal file
880
paper/archeflow.tex
Normal file
@@ -0,0 +1,880 @@
|
||||
\documentclass[11pt,a4paper]{article}
|
||||
|
||||
% ---- Packages ----
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{amsmath,amssymb}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{listings}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{tikz}
|
||||
\usetikzlibrary{shapes,arrows.meta,positioning,fit,calc}
|
||||
\usepackage[numbers]{natbib}
|
||||
\usepackage{geometry}
|
||||
\geometry{margin=1in}
|
||||
|
||||
% ---- Listings style ----
|
||||
\lstset{
|
||||
basicstyle=\ttfamily\small,
|
||||
breaklines=true,
|
||||
frame=single,
|
||||
framesep=3pt,
|
||||
columns=flexible,
|
||||
keepspaces=true,
|
||||
showstringspaces=false,
|
||||
commentstyle=\color{gray},
|
||||
keywordstyle=\color{blue!70!black},
|
||||
}
|
||||
|
||||
% ---- Title ----
|
||||
\title{%
|
||||
ArcheFlow: Multi-Agent Orchestration with\\
|
||||
Archetypal Roles and PDCA Quality Cycles%
|
||||
}
|
||||
|
||||
\author{
|
||||
Christian Nennemann\\
|
||||
Independent Researcher\\
|
||||
\texttt{chris@nennemann.de}\\
|
||||
\texttt{https://github.com/XORwell/archeflow}
|
||||
}
|
||||
|
||||
\date{April 2026}
|
||||
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
% ============================================================
|
||||
\begin{abstract}
|
||||
We present \textsc{ArcheFlow}, an open-source orchestration framework for
|
||||
multi-agent software engineering that assigns \emph{archetypal roles}---derived
|
||||
from Jungian analytical psychology---to LLM agents and coordinates them through
|
||||
\emph{Plan--Do--Check--Act} (PDCA) quality cycles. Each of seven archetypes
|
||||
(Explorer, Creator, Maker, Guardian, Skeptic, Trickster, Sage) carries a defined
|
||||
cognitive virtue and a quantitatively detected \emph{shadow}---a failure mode
|
||||
triggered when the virtue becomes excessive. The framework implements a
|
||||
three-layer corrective action system (archetype shadows, system shadows, policy
|
||||
boundaries) that detects and mitigates agent dysfunction during autonomous
|
||||
operation. We describe ArcheFlow's architecture as a zero-dependency plugin for
|
||||
Claude Code, detail its attention filtering, feedback routing, convergence
|
||||
detection, and effectiveness scoring mechanisms, and discuss connections to
|
||||
recent work on persona stability in language models
|
||||
\citep{lu2026assistant}. ArcheFlow demonstrates that structured persona
|
||||
assignment with shadow detection can maintain productive agent behavior across
|
||||
extended autonomous sessions spanning multiple projects and quality domains
|
||||
(code, prose, research). The system is publicly available under the MIT license.
|
||||
\end{abstract}
|
||||
|
||||
% ============================================================
|
||||
\section{Introduction}
|
||||
\label{sec:introduction}
|
||||
|
||||
The rise of agentic coding assistants---tools that autonomously write, test,
|
||||
review, and commit code---has created a new class of software engineering
|
||||
challenges. While individual LLM agents can produce competent code, the quality
|
||||
of autonomous output degrades under conditions that are well-known from human
|
||||
software teams: reviewers who rubber-stamp, architects who over-engineer,
|
||||
implementers who ignore specifications, and testers who optimize for coverage
|
||||
metrics rather than real defects.
|
||||
|
||||
These failure modes are not merely analogies. \citet{lu2026assistant}
|
||||
demonstrate that language models occupy a measurable \emph{persona space} and
|
||||
can drift from their trained Assistant identity during extended conversations,
|
||||
particularly under emotional or philosophical pressure. Their ``Assistant
|
||||
Axis''---a dominant directional component in activation space---predicts when
|
||||
models will exhibit uncharacteristic behavior. If a single model drifts, a
|
||||
multi-agent system where each agent maintains a distinct persona faces
|
||||
compounded persona management challenges.
|
||||
|
||||
ArcheFlow addresses this problem by drawing on two established frameworks:
|
||||
\begin{enumerate}
|
||||
\item \textbf{Jungian archetypal psychology} \citep{jung1968archetypes}, which
|
||||
provides a taxonomy of cognitive orientations---each with a productive
|
||||
\emph{virtue} and a destructive \emph{shadow}---that map naturally onto
|
||||
software engineering roles.
|
||||
\item \textbf{PDCA quality cycles} \citep{deming1986out}, which provide a
|
||||
convergence mechanism for iterative refinement with measurable exit criteria.
|
||||
\end{enumerate}
|
||||
|
||||
The contribution of this paper is threefold:
|
||||
\begin{itemize}
|
||||
\item We present a \emph{shadow detection framework} that quantitatively
|
||||
identifies agent dysfunction---not through sentiment analysis or output
|
||||
classification, but through structural metrics (output length, finding ratios,
|
||||
scope violations) specific to each archetype's failure mode (Section~\ref{sec:shadows}).
|
||||
\item We describe \emph{attention filters} and \emph{feedback routing} mechanisms
|
||||
that constrain what each agent sees and where its output flows, preventing the
|
||||
information overload and echo chamber effects that plague na\"ive multi-agent
|
||||
systems (Section~\ref{sec:attention}).
|
||||
\item We demonstrate that PDCA convergence detection---including oscillation
|
||||
analysis and divergence scoring---provides principled stopping criteria for
|
||||
iterative review cycles (Section~\ref{sec:convergence}).
|
||||
\end{itemize}
|
||||
|
||||
ArcheFlow is implemented as a zero-dependency plugin (Bash + Markdown) for
|
||||
Claude Code\footnote{\url{https://claude.ai/claude-code}}, Anthropic's CLI
|
||||
coding assistant. It has been used in production across a portfolio of 10--30
|
||||
repositories spanning code, creative writing, and academic research.
|
||||
|
||||
% ============================================================
|
||||
\section{Related Work}
|
||||
\label{sec:related}
|
||||
|
||||
\subsection{Multi-Agent Software Engineering}
|
||||
|
||||
Multi-agent systems for software engineering have proliferated since 2024.
|
||||
\citet{hong2024metagpt} propose MetaGPT, which assigns human-like roles
|
||||
(product manager, architect, engineer) to LLM agents and enforces structured
|
||||
communication through Standardized Operating Procedures (SOPs). ChatDev
|
||||
\citep{qian2024chatdev} simulates a virtual software company with role-playing
|
||||
agents communicating through natural language chat. SWE-Agent
|
||||
\citep{yang2024sweagent} focuses on single-agent benchmark performance on
|
||||
GitHub issues, demonstrating that tool-augmented agents can resolve real-world
|
||||
bugs.
|
||||
|
||||
These systems share a common limitation: roles are defined by \emph{job
|
||||
descriptions} rather than \emph{cognitive orientations}. A ``product manager''
|
||||
agent may behave identically to a ``tech lead'' agent when both receive the same
|
||||
context, because the role boundary is semantic rather than structural. ArcheFlow
|
||||
addresses this through attention filters (Section~\ref{sec:attention}) that
|
||||
physically restrict what each agent perceives, ensuring that role differences
|
||||
manifest in behavior rather than merely in prompts.
|
||||
|
||||
\subsection{Persona Stability in Language Models}
|
||||
|
||||
\citet{lu2026assistant} identify the ``Assistant Axis'' in LLM activation
|
||||
space---a linear direction capturing the degree to which a model operates in its
|
||||
default helpful mode versus an alternative persona. Their key findings are
|
||||
directly relevant to multi-agent orchestration:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Persona space is low-dimensional}: only 4--19 principal
|
||||
components explain 70\% of persona variance across 275 character archetypes.
|
||||
\item \textbf{Drift is predictable}: user message embeddings predict response
|
||||
position along the Assistant Axis ($R^2 = 0.53$--$0.77$).
|
||||
\item \textbf{Drift correlates with harm}: models are more liable to produce
|
||||
harmful outputs when drifted from the Assistant identity ($r = 0.39$--$0.52$).
|
||||
\end{enumerate}
|
||||
|
||||
ArcheFlow's shadow detection (Section~\ref{sec:shadows}) can be understood as an
|
||||
\emph{application-level} analog to activation capping: where \citet{lu2026assistant}
|
||||
constrain neural activations to maintain persona stability, ArcheFlow constrains
|
||||
\emph{behavioral outputs} through quantitative triggers and corrective prompts.
|
||||
Both approaches recognize that productive personas require active stabilization,
|
||||
not merely initial assignment.
|
||||
|
||||
\subsection{Quality Cycles in Software Engineering}
|
||||
|
||||
The Plan--Do--Check--Act (PDCA) cycle, formalized by \citet{deming1986out} and
|
||||
rooted in Shewhart's statistical process control \citep{shewhart1939statistical},
|
||||
is the dominant quality improvement framework in manufacturing and has been
|
||||
applied to software engineering through agile retrospectives and continuous
|
||||
improvement. To our knowledge, ArcheFlow is the first system to apply PDCA
|
||||
cycles to multi-agent LLM orchestration with formal convergence detection and
|
||||
oscillation analysis.
|
||||
|
||||
\subsection{Jungian Archetypes in Computing}
|
||||
|
||||
While Jungian archetypes have been applied in user experience design
|
||||
\citep{hartson2012ux}, brand strategy, and game design, their application to
|
||||
AI agent systems is novel. The closest related work is in computational
|
||||
creativity, where archetypal narratives have been used to structure story
|
||||
generation \citep{winston2011strong}. ArcheFlow extends this to software
|
||||
engineering by mapping archetypal virtues and shadows to measurable engineering
|
||||
outcomes.
|
||||
|
||||
% ============================================================
|
||||
\section{Architecture}
|
||||
\label{sec:architecture}
|
||||
|
||||
ArcheFlow is a plugin for Claude Code that operates entirely through prompt
|
||||
engineering, shell scripts, and file-based communication. It has zero runtime
|
||||
dependencies beyond Bash and a compatible LLM backend.
|
||||
|
||||
\begin{figure}[t]
|
||||
\centering
|
||||
\begin{tikzpicture}[
|
||||
node distance=1.2cm and 2cm,
|
||||
phase/.style={draw, rounded corners, minimum width=2.5cm, minimum height=0.8cm, font=\small\bfseries},
|
||||
agent/.style={draw, rounded corners, minimum width=2cm, minimum height=0.6cm, font=\small, fill=blue!5},
|
||||
arrow/.style={-{Stealth[length=3mm]}, thick},
|
||||
label/.style={font=\scriptsize, text=gray},
|
||||
]
|
||||
|
||||
% PDCA Cycle
|
||||
\node[phase, fill=yellow!20] (plan) {Plan};
|
||||
\node[phase, fill=green!20, right=of plan] (do) {Do};
|
||||
\node[phase, fill=orange!20, right=of do] (check) {Check};
|
||||
\node[phase, fill=red!15, right=of check] (act) {Act};
|
||||
|
||||
% Plan agents
|
||||
\node[agent, below left=0.8cm and 0.3cm of plan] (explorer) {Explorer};
|
||||
\node[agent, below right=0.8cm and 0.3cm of plan] (creator) {Creator};
|
||||
|
||||
% Do agent
|
||||
\node[agent, below=0.8cm of do] (maker) {Maker};
|
||||
|
||||
% Check agents
|
||||
\node[agent, below left=0.8cm and -0.2cm of check] (guardian) {Guardian};
|
||||
\node[agent, below=0.8cm of check] (skeptic) {Skeptic};
|
||||
\node[agent, below right=0.8cm and -0.2cm of check] (sage) {Sage};
|
||||
|
||||
% Arrows
|
||||
\draw[arrow] (plan) -- (do);
|
||||
\draw[arrow] (do) -- (check);
|
||||
\draw[arrow] (check) -- (act);
|
||||
\draw[arrow, dashed] (act.south) -- ++(0,-0.5) -| node[label, below, pos=0.25] {cycle back} (plan.south);
|
||||
|
||||
% Agent connections
|
||||
\draw[-] (plan.south) -- (explorer.north);
|
||||
\draw[-] (plan.south) -- (creator.north);
|
||||
\draw[-] (do.south) -- (maker.north);
|
||||
\draw[-] (check.south) -- (guardian.north);
|
||||
\draw[-] (check.south) -- (skeptic.north);
|
||||
\draw[-] (check.south) -- (sage.north);
|
||||
|
||||
\end{tikzpicture}
|
||||
\caption{ArcheFlow PDCA cycle with archetypal agent assignments. The dashed arrow represents cycle-back when reviewers find issues. A Trickster agent (not shown) joins the Check phase in \texttt{thorough} workflows.}
|
||||
\label{fig:pdca}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Components}
|
||||
|
||||
The system comprises four component types:
|
||||
|
||||
\begin{description}
|
||||
\item[Agent personas] (\texttt{agents/*.md}): Behavioral protocols for each
|
||||
archetype, defining the agent's cognitive lens, output format, and quality
|
||||
criteria. Each persona is a Markdown file loaded as a system prompt.
|
||||
|
||||
\item[Skills] (\texttt{skills/*/SKILL.md}): Operational instructions that
|
||||
Claude Code follows to orchestrate the PDCA cycle. The core \texttt{run} skill
|
||||
(466 lines) is self-contained---it encodes the complete orchestration protocol
|
||||
including workflow selection, agent spawning, attention filtering, convergence
|
||||
checking, and exit decisions.
|
||||
|
||||
\item[Library scripts] (\texttt{lib/*.sh}): Ten Bash scripts handling
|
||||
infrastructure concerns: JSONL event logging, git operations (per-phase
|
||||
commits, branch management, rollback), cross-run memory, progress tracking,
|
||||
effectiveness scoring, and run replay.
|
||||
|
||||
\item[Hooks] (\texttt{hooks/}): Session-start hook that auto-activates
|
||||
ArcheFlow and injects the domain detection logic.
|
||||
\end{description}
|
||||
|
||||
\subsection{Execution Modes}
|
||||
|
||||
ArcheFlow provides three execution modes optimized for different use cases:
|
||||
|
||||
\begin{description}
|
||||
\item[Sprint] (\texttt{/af-sprint}): Queue-driven parallel dispatch. Reads a
|
||||
priority-ordered task queue, spawns 3--5 agents across different projects
|
||||
simultaneously, collects results, commits, and starts the next batch. Designed
|
||||
for throughput over ceremony.
|
||||
|
||||
\item[Review] (\texttt{/af-review}): Guardian-led post-implementation review
|
||||
on existing diffs, branches, or commit ranges. No planning or implementation
|
||||
orchestration---pure quality analysis.
|
||||
|
||||
\item[Run] (\texttt{/af-run}): Full PDCA orchestration for complex tasks
|
||||
requiring structured exploration, design, implementation, and multi-perspective
|
||||
review.
|
||||
\end{description}
|
||||
|
||||
\subsection{Domain Adaptation}
|
||||
|
||||
ArcheFlow adapts its terminology and quality criteria based on domain detection:
|
||||
\texttt{code} (diffs, tests, security), \texttt{writing} (voice consistency,
|
||||
dialect authenticity, narrative structure), and \texttt{research} (source quality,
|
||||
argument coherence, citation accuracy). Domain is auto-detected from project
|
||||
contents or specified in configuration.
|
||||
|
||||
% ============================================================
|
||||
\section{The Seven Archetypes}
|
||||
\label{sec:archetypes}
|
||||
|
||||
Each archetype embodies a cognitive orientation with a defined virtue (productive
|
||||
mode) and shadow (destructive mode). \Cref{tab:archetypes} summarizes the
|
||||
complete taxonomy.
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\caption{The seven ArcheFlow archetypes with their PDCA phase assignments,
|
||||
cognitive virtues, and shadow failure modes.}
|
||||
\label{tab:archetypes}
|
||||
\begin{tabular}{@{}llllll@{}}
|
||||
\toprule
|
||||
\textbf{Archetype} & \textbf{Phase} & \textbf{Virtue} & \textbf{Shadow} & \textbf{Model Tier} \\
|
||||
\midrule
|
||||
Explorer & Plan & Contextual Clarity & Rabbit Hole & Haiku \\
|
||||
Creator & Plan & Decisive Framing & Over-Architect & Sonnet \\
|
||||
Maker & Do & Execution Discipline & Rogue & Sonnet \\
|
||||
Guardian & Check & Threat Intuition & Paranoid & Sonnet \\
|
||||
Skeptic & Check & Assumption Surfacing & Paralytic & Haiku \\
|
||||
Trickster & Check & Adversarial Creativity & False Alarm & Haiku \\
|
||||
Sage & Check & Maintainability Judgment & Bureaucrat & Haiku \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The archetype--shadow pairing is not metaphorical; it is the core mechanism
|
||||
for maintaining agent quality. The virtue describes \emph{what} the archetype
|
||||
contributes; the shadow describes what happens when that contribution becomes
|
||||
excessive. An Explorer who never stops researching (Rabbit Hole) delays the
|
||||
entire pipeline. A Guardian who rejects everything (Paranoid) prevents any
|
||||
code from shipping.
|
||||
|
||||
\subsection{Cost-Aware Model Assignment}
|
||||
|
||||
Not all archetypes require the same model capability. Analytical tasks
|
||||
(exploration, assumption checking, code quality review) can be performed by
|
||||
cheaper models (Haiku), while creative tasks (architecture design,
|
||||
implementation, security analysis) benefit from more capable models (Sonnet).
|
||||
This tiered assignment reduces per-run costs by 40--60\% compared to using the
|
||||
most capable model for all agents, with no observed quality degradation in
|
||||
analytical roles.
|
||||
|
||||
% ============================================================
|
||||
\section{Shadow Detection and Corrective Action}
|
||||
\label{sec:shadows}
|
||||
|
||||
\subsection{Archetype Shadows}
|
||||
|
||||
Shadow detection is \emph{quantitative, not sentiment-based}. Each archetype has
|
||||
a specific trigger condition derived from structural properties of its output:
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Shadow detection triggers. Each trigger is evaluated automatically
|
||||
after the agent completes.}
|
||||
\label{tab:shadows}
|
||||
\begin{tabular}{@{}lll@{}}
|
||||
\toprule
|
||||
\textbf{Archetype} & \textbf{Shadow} & \textbf{Trigger} \\
|
||||
\midrule
|
||||
Explorer & Rabbit Hole & Output $> 2000$ words without Recommendation section \\
|
||||
Creator & Over-Architect & $> 2$ new abstractions for a single feature \\
|
||||
Maker & Rogue & No tests in changeset, or files outside proposal scope \\
|
||||
Guardian & Paranoid & CRITICAL:WARNING ratio $> 2{:}1$, or zero approvals \\
|
||||
Skeptic & Paralytic & $> 7$ challenges with $< 50\%$ having alternatives \\
|
||||
Trickster & False Alarm & Findings in untouched code, or $> 10$ total findings \\
|
||||
Sage & Bureaucrat & Review length $> 2\times$ code change length \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The escalation protocol follows a three-strike pattern:
|
||||
\begin{enumerate}
|
||||
\item \textbf{First detection}: Inject a correction prompt that names the
|
||||
shadow and redirects the agent toward its virtue.
|
||||
\item \textbf{Second detection} (same shadow, same run): Replace the agent
|
||||
with a fresh instance.
|
||||
\item \textbf{Third detection}: Escalate to the user for manual intervention.
|
||||
\end{enumerate}
|
||||
|
||||
\subsection{System Shadows}
|
||||
|
||||
Beyond individual archetype dysfunction, ArcheFlow monitors for
|
||||
\emph{system-level} failure modes:
|
||||
|
||||
\begin{description}
|
||||
\item[Echo Chamber]: Multiple reviewers produce identical findings, suggesting
|
||||
they are confirming each other rather than applying independent judgment.
|
||||
Detected when $> 60\%$ of findings across reviewers share the same
|
||||
file-and-category tuple.
|
||||
|
||||
\item[Tunnel Vision]: All findings cluster in a single file or module while
|
||||
the changeset spans multiple. Detected when $> 80\%$ of findings target
|
||||
$< 20\%$ of changed files.
|
||||
|
||||
\item[Scope Creep]: Maker modifies files not mentioned in the Creator's
|
||||
proposal. Detected by comparing \texttt{do-maker-files.txt} against the
|
||||
proposal's file list.
|
||||
\end{description}
|
||||
|
||||
\subsection{Policy Boundaries and the Wiggum Break}
|
||||
|
||||
The third layer enforces operational limits through budget gates, cycle
|
||||
limits, and checkpoint policies. When limits are exceeded, the system
|
||||
triggers a \emph{Wiggum Break}\footnote{Named after Chief Wiggum from
|
||||
\emph{The Simpsons}---a nod to both ``policy enforcement'' and the
|
||||
Ralph Loop plugin for Claude Code.}---a circuit breaker that halts
|
||||
execution, saves state, and reports to the user.
|
||||
|
||||
Wiggum Breaks are classified as \emph{hard} (halt immediately) or
|
||||
\emph{soft} (finish current task, then halt):
|
||||
|
||||
\begin{description}
|
||||
\item[Hard breaks]: 3 consecutive agent failures, 3 consecutive shadow
|
||||
detections in one run, test suite broken after merge, 2+ oscillating
|
||||
findings.
|
||||
\item[Soft breaks]: convergence score $< 0.5$ for 2 consecutive cycles,
|
||||
findings unchanged between cycles, budget $> 95\%$ spent.
|
||||
\end{description}
|
||||
|
||||
Each Wiggum Break emits a \texttt{wiggum.break} event capturing the
|
||||
trigger, run state, and unresolved findings for post-run analysis.
|
||||
|
||||
\subsection{Connection to the Assistant Axis}
|
||||
|
||||
The shadow detection framework addresses the same fundamental problem identified
|
||||
by \citet{lu2026assistant}: models drift from productive personas during
|
||||
extended operation. Where their work identifies drift in activation space and
|
||||
proposes activation capping as a mitigation, ArcheFlow operates at the
|
||||
\emph{behavioral} level---detecting drift through output structure rather than
|
||||
internal representations, and correcting through prompt injection rather than
|
||||
activation manipulation.
|
||||
|
||||
This application-level approach has a practical advantage: it requires no access
|
||||
to model internals and works with any LLM backend, including API-only models
|
||||
where activation-level interventions are impossible. The tradeoff is that
|
||||
behavioral detection is necessarily coarser than activation-level measurement
|
||||
and can only detect drift after it manifests in output, not before.
|
||||
|
||||
% ============================================================
|
||||
\section{Attention Filters and Information Flow}
|
||||
\label{sec:attention}
|
||||
|
||||
A key design principle is that each agent receives \emph{only the information
|
||||
relevant to its role}. This is implemented through \emph{attention filters}---rules
|
||||
governing which artifacts from prior phases are injected into each agent's
|
||||
context.
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Attention filter matrix. Each agent receives only the artifacts marked
|
||||
with \checkmark.}
|
||||
\label{tab:attention}
|
||||
\begin{tabular}{@{}lccccc@{}}
|
||||
\toprule
|
||||
\textbf{Agent} & \textbf{Task} & \textbf{Explorer} & \textbf{Creator} & \textbf{Diff} & \textbf{Reviews} \\
|
||||
\midrule
|
||||
Explorer & \checkmark & & & & \\
|
||||
Creator & \checkmark & \checkmark & & & \\
|
||||
Maker & \checkmark & & \checkmark & & \\
|
||||
Guardian & & & (risks) & \checkmark & \\
|
||||
Skeptic & & & \checkmark & & \\
|
||||
Sage & & & \checkmark & \checkmark & \\
|
||||
Trickster & & & & \checkmark & \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The rationale for attention filtering is twofold:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Independence}: Reviewers who see each other's findings tend to
|
||||
converge on a shared narrative rather than applying independent judgment. By
|
||||
isolating reviewer inputs, ArcheFlow ensures that each reviewer contributes a
|
||||
genuinely distinct perspective.
|
||||
|
||||
\item \textbf{Focus}: An agent given everything tends to address everything,
|
||||
producing diluted analysis. The Trickster, for example, receives \emph{only}
|
||||
the diff---no design rationale, no risk analysis---forcing it to evaluate the
|
||||
code purely on its own terms.
|
||||
\end{enumerate}
|
||||
|
||||
In PDCA cycle 2+, the feedback from the Act phase is routed selectively:
|
||||
Creator-routed issues go to the Creator, Maker-routed issues go to the Maker.
|
||||
Neither sees the other's feedback, preventing defensive responses to criticism
|
||||
that was directed elsewhere.
|
||||
|
||||
% ============================================================
|
||||
\section{Feedback Routing}
|
||||
\label{sec:routing}
|
||||
|
||||
When the Check phase identifies issues, the Act phase must decide where to route
|
||||
each finding for the next cycle. ArcheFlow uses a deterministic routing table
|
||||
based on the source archetype and finding category:
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Feedback routing table. Findings are routed to the agent best equipped
|
||||
to address them, preventing cross-contamination.}
|
||||
\label{tab:routing}
|
||||
\begin{tabular}{@{}llll@{}}
|
||||
\toprule
|
||||
\textbf{Source} & \textbf{Category} & \textbf{Routes To} & \textbf{Rationale} \\
|
||||
\midrule
|
||||
Guardian & security, breaking-change & Creator & Design must change \\
|
||||
Guardian & reliability, dependency & Creator & Architectural decision \\
|
||||
Skeptic & design, scalability & Creator & Assumptions need revision \\
|
||||
Sage & quality, consistency & Maker & Implementation refinement \\
|
||||
Sage & testing & Maker & Test gap, not design flaw \\
|
||||
Trickster & reliability (design flaw) & Creator & Needs redesign \\
|
||||
Trickster & reliability (test gap) & Maker & Needs more tests \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
The disambiguation principle: if fixing the issue requires changing the
|
||||
\emph{approach}, route to Creator. If it requires changing the \emph{code within
|
||||
the existing approach}, route to Maker. Findings that persist across two
|
||||
consecutive cycles are escalated to the user rather than cycled indefinitely.
|
||||
|
||||
% ============================================================
|
||||
\section{Convergence Detection}
|
||||
\label{sec:convergence}
|
||||
|
||||
\subsection{Convergence Score}
|
||||
|
||||
In PDCA cycle 2+, ArcheFlow compares current findings against the previous cycle
|
||||
and classifies each as \textsc{New}, \textsc{Resolved}, \textsc{Persistent}, or
|
||||
\textsc{Regressed}. The convergence score is:
|
||||
|
||||
\begin{equation}
|
||||
C = \frac{|\textsc{Resolved}|}{|\textsc{Resolved}| + |\textsc{New}| + |\textsc{Regressed}|}
|
||||
\label{eq:convergence}
|
||||
\end{equation}
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Convergence score interpretation and corresponding actions.}
|
||||
\label{tab:convergence}
|
||||
\begin{tabular}{@{}lll@{}}
|
||||
\toprule
|
||||
\textbf{Score Range} & \textbf{Status} & \textbf{Action} \\
|
||||
\midrule
|
||||
$C > 0.8$ & Converging & Continue if cycles remain \\
|
||||
$0.5 \leq C \leq 0.8$ & Stalling & Continue with caution \\
|
||||
$C < 0.5$ & Diverging & Stop if 2 consecutive diverging cycles \\
|
||||
$C = 0$ & Stuck & Stop immediately \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Oscillation Detection}
|
||||
|
||||
A finding is \emph{oscillating} if it was present in cycle $n-2$, absent in
|
||||
cycle $n-1$, and present again in cycle $n$. Two or more oscillating findings
|
||||
trigger an immediate stop with escalation to the user, as oscillation indicates
|
||||
a fundamental tension in the review criteria that automated cycles cannot
|
||||
resolve.
|
||||
|
||||
\subsection{Adaptive Workflow Escalation}
|
||||
|
||||
Convergence detection interacts with workflow selection through Rule A1: if a
|
||||
\texttt{fast} workflow is in use and Guardian finds $\geq 2$ CRITICAL findings, the next
|
||||
cycle escalates to \texttt{standard} (adding Skeptic and Sage reviewers). Once
|
||||
escalated, the workflow remains escalated for the duration of the run.
|
||||
|
||||
Conversely, Rule A2 provides a \emph{fast-path}: if Guardian finds zero CRITICAL
|
||||
and zero WARNING findings, remaining reviewers are skipped entirely, and the
|
||||
system proceeds directly to Act. This optimization reduces the cost of runs
|
||||
where the Maker's implementation is clean.
|
||||
|
||||
% ============================================================
|
||||
\section{Evidence Validation}
|
||||
\label{sec:evidence}
|
||||
|
||||
Reviewer findings are subject to evidence validation before they influence
|
||||
routing decisions. A CRITICAL or WARNING finding is downgraded to INFO if:
|
||||
|
||||
\begin{itemize}
|
||||
\item It uses \emph{banned hedging phrases} without supporting evidence:
|
||||
``might be'', ``could potentially'', ``appears to'', ``seems like'', ``may not''.
|
||||
\item It contains \emph{no evidence}: no command output, code citation, line
|
||||
reference, or reproduction steps.
|
||||
\end{itemize}
|
||||
|
||||
This mechanism addresses a well-known failure mode of LLM reviewers: generating
|
||||
plausible-sounding but unsupported concerns. By requiring evidence for
|
||||
high-severity findings, ArcheFlow forces reviewers to ground their analysis in
|
||||
the actual changeset rather than speculation.
|
||||
|
||||
Downgrades are tracked in the event log but do \emph{not} modify the original
|
||||
artifact files, preserving the complete reviewer output for post-run analysis.
|
||||
|
||||
% ============================================================
|
||||
\section{Effectiveness Scoring}
|
||||
\label{sec:effectiveness}
|
||||
|
||||
After each completed run, ArcheFlow scores review archetypes across five
|
||||
dimensions:
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\caption{Effectiveness scoring dimensions and their weights.}
|
||||
\label{tab:effectiveness}
|
||||
\begin{tabular}{@{}lp{7cm}r@{}}
|
||||
\toprule
|
||||
\textbf{Dimension} & \textbf{Description} & \textbf{Weight} \\
|
||||
\midrule
|
||||
Signal-to-noise & Ratio of useful findings to total findings & 0.30 \\
|
||||
Fix rate & Fraction of findings that led to applied fixes & 0.25 \\
|
||||
Cost efficiency & Useful findings per dollar of model inference cost & 0.20 \\
|
||||
Accuracy & Fraction not contradicted by other reviewers & 0.15 \\
|
||||
Cycle impact & Whether findings contributed to cycle exit decision & 0.10 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
Scores accumulate in a cross-run memory file
|
||||
(\texttt{.archeflow/memory/effectiveness.jsonl}). After 10+ completed runs,
|
||||
the system recommends model tier changes (e.g., promoting a Haiku-tier reviewer
|
||||
to Sonnet if its signal-to-noise is consistently high) and, in extreme cases,
|
||||
archetype removal for persistently low-scoring reviewers.
|
||||
|
||||
% ============================================================
|
||||
\section{Cross-Run Memory}
|
||||
\label{sec:memory}
|
||||
|
||||
ArcheFlow maintains a lesson-learning system that persists across runs. When
|
||||
recurring findings are detected---the same category of issue appearing in
|
||||
multiple runs---the system stores a lesson and injects it into future agents
|
||||
as additional context.
|
||||
|
||||
Lessons decay over time: each lesson has a relevance counter that increments on
|
||||
reuse and decrements on irrelevance. Lessons that fall below a threshold are
|
||||
archived rather than injected, preventing the accumulation of stale guidance.
|
||||
|
||||
The memory system also performs regression detection: if a previously resolved
|
||||
issue reappears, it is flagged as a regression with higher priority than a
|
||||
fresh finding.
|
||||
|
||||
% ============================================================
|
||||
\section{Implementation}
|
||||
\label{sec:implementation}
|
||||
|
||||
ArcheFlow is implemented in approximately 6,700 lines across three layers:
|
||||
|
||||
\begin{itemize}
|
||||
\item \textbf{Skills} (19 Markdown files, $\sim$2,500 lines): Operational
|
||||
instructions for Claude Code, written as imperative protocols. The core
|
||||
\texttt{run} skill encodes the complete PDCA orchestration in 466 lines.
|
||||
|
||||
\item \textbf{Agent personas} (7 Markdown files, $\sim$700 lines): Behavioral
|
||||
protocols defining each archetype's cognitive lens, output format, and
|
||||
self-review checklist.
|
||||
|
||||
\item \textbf{Library scripts} (10 Bash scripts, $\sim$3,500 lines): Event
|
||||
logging, git operations, memory management, progress tracking, effectiveness
|
||||
scoring, and run replay.
|
||||
\end{itemize}
|
||||
|
||||
The system uses no database, no API server, and no runtime dependencies beyond
|
||||
Bash 4+ and a Claude Code installation. All state is stored in JSONL event logs
|
||||
and Markdown artifact files. This zero-dependency architecture was a deliberate
|
||||
design choice: orchestration infrastructure that itself requires complex setup
|
||||
and maintenance undermines the autonomy it is supposed to enable.
|
||||
|
||||
\subsection{Git Integration}
|
||||
|
||||
ArcheFlow creates per-phase commits, enabling fine-grained rollback. The Maker
|
||||
operates in a git worktree---an isolated working copy---so its changes do not
|
||||
affect the main branch until explicitly merged. If post-merge tests fail, the
|
||||
system auto-reverts the merge and cycles back with ``integration test failure''
|
||||
feedback.
|
||||
|
||||
\subsection{Run Replay}
|
||||
|
||||
All orchestration decisions are logged as \texttt{decision.point} events,
|
||||
enabling post-hoc analysis. The replay system provides:
|
||||
\begin{itemize}
|
||||
\item \textbf{Timeline view}: chronological sequence of all decisions with
|
||||
confidence scores.
|
||||
\item \textbf{Weighted what-if}: re-evaluation of the ship/block outcome
|
||||
using different reviewer weights, answering questions like ``would the outcome
|
||||
have changed if we weighted Guardian 2x and Sage 0.5x?''
|
||||
\item \textbf{Cross-run comparison}: side-by-side analysis of decision
|
||||
patterns across runs.
|
||||
\end{itemize}
|
||||
|
||||
% ============================================================
|
||||
\section{Multi-Domain Application}
|
||||
\label{sec:domains}
|
||||
|
||||
ArcheFlow's archetype system extends beyond code. The framework has been
|
||||
deployed across three domains:
|
||||
|
||||
\subsection{Software Engineering}
|
||||
|
||||
The primary domain. Archetypes map to standard engineering roles: Explorer
|
||||
performs codebase research, Creator designs architecture, Maker writes code,
|
||||
and the Check-phase archetypes review for security (Guardian), design flaws
|
||||
(Skeptic), edge cases (Trickster), and overall quality (Sage).
|
||||
|
||||
\subsection{Creative Writing}
|
||||
|
||||
In writing mode, the same archetype structure applies with adapted quality
|
||||
criteria. Custom archetypes (story-explorer, story-sage) replace or augment
|
||||
the defaults. The framework integrates with Colette, a voice profiling system
|
||||
that maintains consistent authorial voice across chapters. Quality gates check
|
||||
for voice consistency, dialect authenticity, and narrative structure rather
|
||||
than test coverage and security.
|
||||
|
||||
\subsection{Academic Research}
|
||||
|
||||
In research mode, quality criteria shift to source quality, argument coherence,
|
||||
citation accuracy, and methodological rigor. The Guardian reviews for logical
|
||||
fallacies and unsupported claims rather than security vulnerabilities.
|
||||
|
||||
% ============================================================
|
||||
\section{Discussion}
|
||||
\label{sec:discussion}
|
||||
|
||||
\subsection{Archetypes vs. Role Descriptions}
|
||||
|
||||
The key distinction between ArcheFlow's approach and prior multi-agent systems
|
||||
is the \emph{shadow} mechanism. A role description tells an agent what to do;
|
||||
an archetype tells an agent what to do \emph{and what doing too much of it
|
||||
looks like}. This bidirectional specification creates a bounded operating
|
||||
range for each agent, preventing the unbounded optimization that leads to
|
||||
dysfunction.
|
||||
|
||||
The connection to \citet{lu2026assistant}'s persona axis is instructive.
|
||||
They show that model personas exist on a continuum, with the Assistant identity
|
||||
at one extreme and theatrical/mystical identities at the other. ArcheFlow's
|
||||
archetypes deliberately position agents \emph{away} from the default Assistant
|
||||
toward specific cognitive orientations---but the shadow mechanism prevents them
|
||||
from drifting too far, maintaining a productive operating range analogous to
|
||||
what \citeauthor{lu2026assistant} achieve through activation capping.
|
||||
|
||||
\subsection{Wiggum Breaks as Human-in-the-Loop Boundaries}
|
||||
|
||||
A central question in autonomous agent systems is: \emph{when should the
|
||||
system stop acting and ask a human?} Most frameworks treat this as an
|
||||
implementation detail---a timeout, a retry limit, an exception handler.
|
||||
ArcheFlow treats it as a first-class architectural concept through the
|
||||
\emph{Wiggum Break}.
|
||||
|
||||
The Wiggum Break defines the \textbf{formal boundary between autonomous and
|
||||
human-supervised operation}. It is not a failure mode; it is the system's
|
||||
\emph{designed} response to situations where autonomous resolution is
|
||||
provably unproductive:
|
||||
|
||||
\begin{itemize}
|
||||
\item \textbf{Oscillation} (finding present $\to$ absent $\to$ present)
|
||||
indicates a genuine tension in the review criteria that no amount of
|
||||
cycling will resolve---only human judgment about which criterion takes
|
||||
priority.
|
||||
|
||||
\item \textbf{Divergence} (convergence score $< 0.5$ for two consecutive
|
||||
cycles) indicates that the implementation is getting worse with each
|
||||
iteration---the agents lack the context or capability to solve the
|
||||
problem, and continuing wastes resources.
|
||||
|
||||
\item \textbf{Repeated shadow detection} (same dysfunction three times)
|
||||
indicates that the corrective action framework has exhausted its
|
||||
options---the task structure is incompatible with the assigned archetype,
|
||||
and a human must re-scope.
|
||||
\end{itemize}
|
||||
|
||||
This framing inverts the typical HITL paradigm. Rather than asking
|
||||
``how much autonomy should the system have?'' and pre-defining approval
|
||||
gates, ArcheFlow asks ``under what conditions is autonomy
|
||||
\emph{provably unproductive}?'' and derives the HITL boundary from
|
||||
convergence theory. The system runs autonomously by default and escalates
|
||||
only when it can demonstrate---through quantitative metrics, not
|
||||
heuristics---that continued autonomous operation will not improve the
|
||||
outcome.
|
||||
|
||||
This approach has three advantages over pre-defined approval gates:
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Adaptive autonomy}: Simple tasks never trigger a Wiggum
|
||||
Break; complex tasks trigger one quickly. The HITL boundary adapts to
|
||||
task difficulty without manual configuration.
|
||||
|
||||
\item \textbf{Auditable escalation}: Every Wiggum Break emits a
|
||||
\texttt{wiggum.break} event with the trigger condition, run state, and
|
||||
unresolved findings. The human receives not just a request for help,
|
||||
but a structured summary of \emph{why} autonomous resolution failed
|
||||
and what specifically needs their judgment.
|
||||
|
||||
\item \textbf{Minimal interruption}: Pre-defined gates (``approve every
|
||||
PR'', ``review every design'') interrupt the human on tasks the system
|
||||
could have handled autonomously. Convergence-derived breaks interrupt
|
||||
only when the system has evidence that it cannot proceed productively.
|
||||
\end{enumerate}
|
||||
|
||||
The Wiggum Break thus operationalizes a principle from resilience
|
||||
engineering: the system should be \emph{autonomy-seeking} (preferring to
|
||||
resolve issues itself) but \emph{escalation-ready} (able to produce a
|
||||
useful handoff when self-resolution fails). The quality of the handoff---not
|
||||
just the fact of escalation---is what makes HITL effective.
|
||||
|
||||
\subsection{Limitations}
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{No activation-level control}: ArcheFlow operates purely at the
|
||||
prompt level. It cannot detect persona drift before it manifests in output,
|
||||
unlike activation-level approaches \citep{lu2026assistant}.
|
||||
|
||||
\item \textbf{Single LLM backend}: The current implementation targets Claude
|
||||
Code. While the architectural principles are model-agnostic, the skill and
|
||||
hook system is specific to Claude Code's plugin API.
|
||||
|
||||
\item \textbf{Evaluation methodology}: We have not conducted controlled
|
||||
experiments comparing ArcheFlow's output quality against baselines (single-agent,
|
||||
role-based multi-agent without shadows, PDCA without archetypes). The system
|
||||
has been evaluated through production use across real projects, which
|
||||
demonstrates practical utility but not causal attribution.
|
||||
|
||||
\item \textbf{Shadow trigger thresholds}: The quantitative thresholds
|
||||
(e.g., 2000 words for Rabbit Hole, ratio $> 2{:}1$ for Paranoid) were
|
||||
determined empirically through iterative use and may not generalize across
|
||||
all codebases and domains.
|
||||
\end{enumerate}
|
||||
|
||||
\subsection{Future Work}
|
||||
|
||||
\begin{enumerate}
|
||||
\item \textbf{Activation-level integration}: Combining behavioral shadow
|
||||
detection with the Assistant Axis measurement from \citet{lu2026assistant}
|
||||
could provide earlier and more reliable drift detection, particularly for
|
||||
open-weight models where activations are accessible.
|
||||
|
||||
\item \textbf{Controlled evaluation}: A systematic comparison across standard
|
||||
benchmarks (SWE-bench, HumanEval) would establish whether the archetype +
|
||||
PDCA approach provides measurable quality improvements over simpler
|
||||
orchestration strategies.
|
||||
|
||||
\item \textbf{Archetype discovery}: Rather than hand-designing archetypes,
|
||||
the persona space analysis from \citet{lu2026assistant} could be used to
|
||||
identify \emph{natural} cognitive orientations that models adopt, potentially
|
||||
revealing useful archetypes that human intuition would not suggest.
|
||||
|
||||
\item \textbf{Cross-model persona stability}: Investigating whether shadow
|
||||
triggers calibrated for one model family transfer to others, or whether
|
||||
per-model calibration is necessary.
|
||||
\end{enumerate}
|
||||
|
||||
% ============================================================
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
|
||||
ArcheFlow demonstrates that multi-agent LLM orchestration benefits from
|
||||
structured persona management---not just telling agents \emph{what to do},
|
||||
but actively monitoring and correcting \emph{how they do it}. The combination
|
||||
of Jungian archetypes (providing a principled taxonomy of cognitive virtues and
|
||||
their failure modes) with PDCA quality cycles (providing convergence guarantees
|
||||
and principled stopping criteria) produces an orchestration framework that
|
||||
maintains productive agent behavior across extended autonomous sessions.
|
||||
|
||||
The shadow detection mechanism---quantitative triggers for archetype-specific
|
||||
dysfunction---addresses the same persona stability challenge identified by
|
||||
\citet{lu2026assistant} at the application level, requiring no access to model
|
||||
internals and working with any LLM backend. While coarser than activation-level
|
||||
approaches, behavioral shadow detection is practical, interpretable, and
|
||||
immediately deployable.
|
||||
|
||||
ArcheFlow is open-source under the MIT license and available at
|
||||
\url{https://github.com/XORwell/archeflow}.
|
||||
|
||||
% ============================================================
|
||||
\section*{Acknowledgments}
|
||||
|
||||
The author thanks the Claude Code team at Anthropic for building the plugin
|
||||
infrastructure that made ArcheFlow possible, and the authors of
|
||||
\citet{lu2026assistant} for the Assistant Axis framework that informed the
|
||||
theoretical grounding of shadow detection.
|
||||
|
||||
% ============================================================
|
||||
\bibliographystyle{plainnat}
|
||||
\bibliography{references}
|
||||
|
||||
\end{document}
|
||||
89
paper/references.bib
Normal file
89
paper/references.bib
Normal file
@@ -0,0 +1,89 @@
|
||||
@article{lu2026assistant,
|
||||
title={The Assistant Axis: Situating and Stabilizing the Default Persona of Language Models},
|
||||
author={Lu, Christina and Gallagher, Jack and Michala, Jonathan and Fish, Kyle and Lindsey, Jack},
|
||||
journal={arXiv preprint arXiv:2601.10387},
|
||||
year={2026},
|
||||
url={https://arxiv.org/abs/2601.10387}
|
||||
}
|
||||
|
||||
@book{jung1968archetypes,
|
||||
title={The Archetypes and the Collective Unconscious},
|
||||
author={Jung, Carl Gustav},
|
||||
year={1968},
|
||||
publisher={Princeton University Press},
|
||||
edition={2nd},
|
||||
series={Collected Works of C.G. Jung},
|
||||
volume={9}
|
||||
}
|
||||
|
||||
@book{deming1986out,
|
||||
title={Out of the Crisis},
|
||||
author={Deming, W. Edwards},
|
||||
year={1986},
|
||||
publisher={MIT Press},
|
||||
address={Cambridge, MA}
|
||||
}
|
||||
|
||||
@book{shewhart1939statistical,
|
||||
title={Statistical Method from the Viewpoint of Quality Control},
|
||||
author={Shewhart, Walter Andrew},
|
||||
year={1939},
|
||||
publisher={Graduate School of the Department of Agriculture},
|
||||
address={Washington, DC}
|
||||
}
|
||||
|
||||
@article{hong2024metagpt,
|
||||
title={MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework},
|
||||
author={Hong, Sirui and Zhuge, Mingchen and Chen, Jonathan and Zheng, Xiawu and Cheng, Yuheng and Zhang, Ceyao and Wang, Jinlin and Wang, Zili and Yau, Steven Ka Shing and Lin, Zijuan and Zhou, Liyang and Ran, Chenyu and Xiao, Lingfeng and Wu, Chenglin and Schmidhuber, J{\"u}rgen},
|
||||
journal={arXiv preprint arXiv:2308.00352},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2308.00352}
|
||||
}
|
||||
|
||||
@article{qian2024chatdev,
|
||||
title={ChatDev: Communicative Agents for Software Development},
|
||||
author={Qian, Chen and Liu, Wei and Liu, Hongzhang and Chen, Nuo and Dang, Yufan and Li, Jiahao and Yang, Cheng and Chen, Weize and Su, Yusheng and Cong, Xin and Xu, Juyuan and Li, Dahai and Liu, Zhiyuan and Sun, Maosong},
|
||||
journal={arXiv preprint arXiv:2307.07924},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2307.07924}
|
||||
}
|
||||
|
||||
@article{yang2024sweagent,
|
||||
title={SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering},
|
||||
author={Yang, John and Jimenez, Carlos E and Wettig, Alexander and Lieret, Kilian and Yao, Shunyu and Narasimhan, Karthik and Press, Ofir},
|
||||
journal={arXiv preprint arXiv:2405.15793},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2405.15793}
|
||||
}
|
||||
|
||||
@article{chen2025persona,
|
||||
title={Persona Vectors: Monitoring and Controlling Character Traits via Activation Directions},
|
||||
author={Chen, Yiwei and others},
|
||||
journal={arXiv preprint arXiv:2507.21509},
|
||||
year={2025},
|
||||
url={https://arxiv.org/abs/2507.21509}
|
||||
}
|
||||
|
||||
@article{bai2022constitutional,
|
||||
title={Constitutional AI: Harmlessness from AI Feedback},
|
||||
author={Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
|
||||
journal={arXiv preprint arXiv:2212.08073},
|
||||
year={2022},
|
||||
url={https://arxiv.org/abs/2212.08073}
|
||||
}
|
||||
|
||||
@book{hartson2012ux,
|
||||
title={The UX Book: Process and Guidelines for Ensuring a Quality User Experience},
|
||||
author={Hartson, Rex and Pyla, Pardha S.},
|
||||
year={2012},
|
||||
publisher={Morgan Kaufmann},
|
||||
address={Burlington, MA}
|
||||
}
|
||||
|
||||
@inproceedings{winston2011strong,
|
||||
title={The Strong Story Hypothesis and the Directed Perception Hypothesis},
|
||||
author={Winston, Patrick Henry},
|
||||
booktitle={AAAI Fall Symposium: Advances in Cognitive Systems},
|
||||
year={2011},
|
||||
pages={345--352}
|
||||
}
|
||||
194
paper/taxonomy-refs.bib
Normal file
194
paper/taxonomy-refs.bib
Normal file
@@ -0,0 +1,194 @@
|
||||
% ---- Agent Frameworks ----
|
||||
|
||||
@article{hong2024metagpt,
|
||||
title={MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework},
|
||||
author={Hong, Sirui and Zhuge, Mingchen and Chen, Jonathan and Zheng, Xiawu and Cheng, Yuheng and Zhang, Ceyao and Wang, Jinlin and Wang, Zili and Yau, Steven Ka Shing and Lin, Zijuan and Zhou, Liyang and Ran, Chenyu and Xiao, Lingfeng and Wu, Chenglin and Schmidhuber, J{\"u}rgen},
|
||||
journal={arXiv preprint arXiv:2308.00352},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2308.00352}
|
||||
}
|
||||
|
||||
@article{qian2024chatdev,
|
||||
title={ChatDev: Communicative Agents for Software Development},
|
||||
author={Qian, Chen and Liu, Wei and Liu, Hongzhang and Chen, Nuo and Dang, Yufan and Li, Jiahao and Yang, Cheng and Chen, Weize and Su, Yusheng and Cong, Xin and Xu, Juyuan and Li, Dahai and Liu, Zhiyuan and Sun, Maosong},
|
||||
journal={arXiv preprint arXiv:2307.07924},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2307.07924}
|
||||
}
|
||||
|
||||
@article{wu2023autogen,
|
||||
title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation},
|
||||
author={Wu, Qingyun and Bansal, Gagan and Zhang, Jieyu and Wu, Yiran and Li, Beibin and Zhu, Erkang and Jiang, Li and Zhang, Xiaoyun and Zhang, Shaokun and Liu, Jiale and Awadallah, Ahmed Hassan and White, Ryen W. and Burger, Doug and Wang, Chi},
|
||||
journal={arXiv preprint arXiv:2308.08155},
|
||||
year={2023},
|
||||
url={https://arxiv.org/abs/2308.08155}
|
||||
}
|
||||
|
||||
@article{yang2024sweagent,
|
||||
title={SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering},
|
||||
author={Yang, John and Jimenez, Carlos E and Wettig, Alexander and Lieret, Kilian and Yao, Shunyu and Narasimhan, Karthik and Press, Ofir},
|
||||
journal={arXiv preprint arXiv:2405.15793},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2405.15793}
|
||||
}
|
||||
|
||||
@article{nennemann2026archeflow,
|
||||
title={ArcheFlow: Multi-Agent Orchestration with Archetypal Roles and PDCA Quality Cycles},
|
||||
author={Nennemann, Christian},
|
||||
journal={arXiv preprint},
|
||||
year={2026},
|
||||
url={https://github.com/XORwell/archeflow}
|
||||
}
|
||||
|
||||
@article{nguyen2024agilecoder,
|
||||
title={AgileCoder: Dynamic Collaborative Agents for Software Development based on Agile Methodology},
|
||||
author={Nguyen, Minh Huynh and Chau, Thang Phan and Nguyen, Phong X. and Bui, Nghi D. Q.},
|
||||
journal={arXiv preprint arXiv:2406.11912},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2406.11912}
|
||||
}
|
||||
|
||||
@article{patel2026sixsigma,
|
||||
title={The Six Sigma Agent: Achieving Enterprise-Grade Reliability in LLM Systems Through Consensus-Driven Decomposed Execution},
|
||||
author={Patel, Rushi and Surendira, Bala and George, Allen and Kapale, Kiran},
|
||||
journal={arXiv preprint arXiv:2601.22290},
|
||||
year={2026},
|
||||
url={https://arxiv.org/abs/2601.22290}
|
||||
}
|
||||
|
||||
@article{shinn2023reflexion,
|
||||
title={Reflexion: Language Agents with Verbal Reinforcement Learning},
|
||||
author={Shinn, Noah and Cassano, Federico and Gopinath, Ashwin and Narasimhan, Karthik and Yao, Shunyu},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={36},
|
||||
year={2023},
|
||||
url={https://arxiv.org/abs/2303.11366}
|
||||
}
|
||||
|
||||
@article{xia2024eddops,
|
||||
title={Evaluation-Driven Development and Operations of LLM Agents: A Process Model and Reference Architecture},
|
||||
author={Xia, Boming and Lu, Qinghua and Zhu, Liming and Xing, Zhenchang and Zhao, Dehai and Zhang, Hao},
|
||||
journal={arXiv preprint arXiv:2411.13768},
|
||||
year={2024},
|
||||
url={https://arxiv.org/abs/2411.13768}
|
||||
}
|
||||
|
||||
@article{rasheed2024survey,
|
||||
title={LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead},
|
||||
author={Rasheed, Zeeshan and others},
|
||||
journal={ACM Transactions on Software Engineering and Methodology},
|
||||
year={2025},
|
||||
url={https://arxiv.org/abs/2404.04834}
|
||||
}
|
||||
|
||||
@article{li2023camel,
|
||||
title={CAMEL: Communicative Agents for ``Mind'' Exploration of Large Language Model Society},
|
||||
author={Li, Guohao and Hammoud, Hasan Abed Al Kader and Itani, Hani and Khizbullin, Dmitrii and Ghanem, Bernard},
|
||||
journal={Advances in Neural Information Processing Systems},
|
||||
volume={36},
|
||||
year={2023},
|
||||
url={https://arxiv.org/abs/2303.17760}
|
||||
}
|
||||
|
||||
% ---- Persona Stability ----
|
||||
|
||||
@article{lu2026assistant,
|
||||
title={The Assistant Axis: Situating and Stabilizing the Default Persona of Language Models},
|
||||
author={Lu, Christina and Gallagher, Jack and Michala, Jonathan and Fish, Kyle and Lindsey, Jack},
|
||||
journal={arXiv preprint arXiv:2601.10387},
|
||||
year={2026},
|
||||
url={https://arxiv.org/abs/2601.10387}
|
||||
}
|
||||
|
||||
% ---- PM/OM Foundations ----
|
||||
|
||||
@book{deming1986out,
|
||||
title={Out of the Crisis},
|
||||
author={Deming, W. Edwards},
|
||||
year={1986},
|
||||
publisher={MIT Press},
|
||||
address={Cambridge, MA}
|
||||
}
|
||||
|
||||
@book{shewhart1939statistical,
|
||||
title={Statistical Method from the Viewpoint of Quality Control},
|
||||
author={Shewhart, Walter Andrew},
|
||||
year={1939},
|
||||
publisher={Graduate School of the Department of Agriculture},
|
||||
address={Washington, DC}
|
||||
}
|
||||
|
||||
@book{goldratt1984goal,
|
||||
title={The Goal: A Process of Ongoing Improvement},
|
||||
author={Goldratt, Eliyahu M. and Cox, Jeff},
|
||||
year={1984},
|
||||
publisher={North River Press},
|
||||
address={Great Barrington, MA}
|
||||
}
|
||||
|
||||
@book{ohno1988toyota,
|
||||
title={Toyota Production System: Beyond Large-Scale Production},
|
||||
author={Ohno, Taiichi},
|
||||
year={1988},
|
||||
publisher={Productivity Press},
|
||||
address={Portland, OR}
|
||||
}
|
||||
|
||||
@book{womack1996lean,
|
||||
title={Lean Thinking: Banish Waste and Create Wealth in Your Corporation},
|
||||
author={Womack, James P. and Jones, Daniel T.},
|
||||
year={1996},
|
||||
publisher={Simon \& Schuster},
|
||||
address={New York}
|
||||
}
|
||||
|
||||
@article{cooper1990stagegate,
|
||||
title={Stage-Gate Systems: A New Tool for Managing New Products},
|
||||
author={Cooper, Robert G.},
|
||||
journal={Business Horizons},
|
||||
volume={33},
|
||||
number={3},
|
||||
pages={44--54},
|
||||
year={1990},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
|
||||
@article{snowden2007cynefin,
|
||||
title={A Leader's Framework for Decision Making},
|
||||
author={Snowden, David J. and Boone, Mary E.},
|
||||
journal={Harvard Business Review},
|
||||
volume={85},
|
||||
number={11},
|
||||
pages={68--76},
|
||||
year={2007}
|
||||
}
|
||||
|
||||
@book{altshuller1999innovation,
|
||||
title={The Innovation Algorithm: TRIZ, Systematic Innovation and Technical Creativity},
|
||||
author={Altshuller, Genrich},
|
||||
year={1999},
|
||||
publisher={Technical Innovation Center},
|
||||
address={Worcester, MA}
|
||||
}
|
||||
|
||||
@article{boyd1976destruction,
|
||||
title={Destruction and Creation},
|
||||
author={Boyd, John R.},
|
||||
year={1976},
|
||||
note={Unpublished manuscript, widely circulated}
|
||||
}
|
||||
|
||||
@book{schwaber2020scrum,
|
||||
title={The Scrum Guide},
|
||||
author={Schwaber, Ken and Sutherland, Jeff},
|
||||
year={2020},
|
||||
publisher={Scrum.org},
|
||||
note={Available at \url{https://scrumguides.org}}
|
||||
}
|
||||
|
||||
@techreport{mil1949fmea,
|
||||
title={MIL-P-1629: Procedures for Performing a Failure Mode, Effects and Criticality Analysis},
|
||||
institution={United States Department of Defense},
|
||||
year={1949},
|
||||
note={Revised as MIL-STD-1629A, 1980}
|
||||
}
|
||||
805
paper/taxonomy.tex
Normal file
805
paper/taxonomy.tex
Normal file
@@ -0,0 +1,805 @@
|
||||
\documentclass[11pt,a4paper]{article}
|
||||
|
||||
% ---- Packages ----
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{amsmath,amssymb}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{listings}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{tikz}
|
||||
\usetikzlibrary{shapes,arrows.meta,positioning,fit,calc,matrix}
|
||||
\usepackage[numbers]{natbib}
|
||||
\usepackage{geometry}
|
||||
\usepackage{enumitem}
|
||||
\geometry{margin=1in}
|
||||
|
||||
% ---- Colors ----
|
||||
\definecolor{highfit}{HTML}{2E7D32}
|
||||
\definecolor{medfit}{HTML}{F57F17}
|
||||
\definecolor{lowfit}{HTML}{C62828}
|
||||
\definecolor{neutral}{HTML}{546E7A}
|
||||
|
||||
% ---- Title ----
|
||||
\title{%
|
||||
From Factory Floor to Token Stream:\\
|
||||
A Taxonomy of Operations Management Methods\\
|
||||
for LLM Agent Orchestration%
|
||||
}
|
||||
|
||||
\author{
|
||||
Christian Nennemann\\
|
||||
Independent Researcher\\
|
||||
\texttt{chris@nennemann.de}
|
||||
}
|
||||
|
||||
\date{April 2026}
|
||||
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
% ============================================================
|
||||
\begin{abstract}
|
||||
Multi-agent systems built on large language models (LLMs) increasingly adopt
|
||||
metaphors from human project management---sprints, standups, code review---yet
|
||||
draw from a remarkably narrow slice of the operations management literature.
|
||||
This paper presents a systematic taxonomy of twelve established PM/OM methods,
|
||||
evaluates their structural compatibility with LLM agent constraints (stateless
|
||||
invocation, cheap cloning, deterministic dysfunction, absence of human
|
||||
psychology), and identifies which methods are underexploited, which are
|
||||
inapplicable, and which require fundamental adaptation. We find that methods
|
||||
designed for \emph{flow optimization} (Kanban, Theory of Constraints) and
|
||||
\emph{rapid decision-making} (OODA Loop) are structurally well-suited to
|
||||
agent orchestration but remain largely unexplored, while methods centered on
|
||||
\emph{human psychology} (Scrum ceremonies, Design Thinking empathy phases)
|
||||
transfer poorly without significant reformulation. We propose a decision
|
||||
framework for selecting orchestration methods based on task complexity, agent
|
||||
count, and quality requirements, and identify five open research directions
|
||||
at the intersection of operations management and agentic AI.
|
||||
\end{abstract}
|
||||
|
||||
% ============================================================
|
||||
\section{Introduction}
|
||||
\label{sec:intro}
|
||||
|
||||
The dominant paradigm for multi-agent LLM systems borrows from agile software
|
||||
development: agents are organized into ``teams'' with role-based
|
||||
specialization, tasks are decomposed into work items, and results are reviewed
|
||||
before merging \citep{hong2024metagpt, qian2024chatdev}. This borrowing is
|
||||
natural---the humans building these systems are software engineers familiar
|
||||
with agile methods---but it is also narrow. The operations management
|
||||
literature contains dozens of methods developed over a century of industrial
|
||||
practice, each encoding different assumptions about workflow structure, quality
|
||||
assurance, failure modes, and coordination costs.
|
||||
|
||||
Not all of these methods are equally applicable to LLM agents. Agents differ
|
||||
from human workers in five structurally important ways:
|
||||
|
||||
\begin{enumerate}[label=\textbf{C\arabic*}]
|
||||
\item \label{c:stateless} \textbf{Stateless invocation}: Agents do not
|
||||
retain memory between invocations unless explicitly persisted. Human team
|
||||
members accumulate institutional knowledge automatically.
|
||||
|
||||
\item \label{c:cloning} \textbf{Cheap to clone, expensive to coordinate}:
|
||||
Spawning a new agent costs milliseconds and cents; coordinating two agents
|
||||
costs tokens and latency. For human teams, the inverse holds---hiring is
|
||||
expensive, coordination is (comparatively) cheap.
|
||||
|
||||
\item \label{c:dysfunction} \textbf{Deterministic dysfunction}: LLM agents
|
||||
fail in predictable, repeatable patterns---verbosity, scope creep, false
|
||||
positives---rather than the varied, context-dependent failures of human
|
||||
cognition \citep{nennemann2026archeflow}.
|
||||
|
||||
\item \label{c:psychology} \textbf{No psychology}: Agents have no morale,
|
||||
fatigue, ego, or office politics. Methods designed to manage human
|
||||
psychology (retrospectives, team-building, conflict resolution) have no
|
||||
direct function.
|
||||
|
||||
\item \label{c:speed} \textbf{Cycle speed}: Agents complete tasks in
|
||||
seconds to minutes, enabling iteration frequencies that would be
|
||||
impractical for human teams. Methods that assume week-long or month-long
|
||||
cycles can be compressed.
|
||||
\end{enumerate}
|
||||
|
||||
These constraints define a \emph{fitness landscape}: some PM/OM methods gain
|
||||
effectiveness when applied to agents (because agents remove friction those
|
||||
methods were designed to manage), while others lose their raison d'\^etre
|
||||
(because they solve human problems agents don't have).
|
||||
|
||||
This paper contributes:
|
||||
\begin{itemize}
|
||||
\item A systematic taxonomy of twelve PM/OM methods evaluated against the
|
||||
five agent constraints (\ref{c:stateless}--\ref{c:speed}).
|
||||
\item A compatibility matrix scoring each method's structural fit for
|
||||
agent orchestration (\S\ref{sec:matrix}).
|
||||
\item A decision framework for practitioners selecting orchestration
|
||||
strategies (\S\ref{sec:decision}).
|
||||
\item Five open research directions at the intersection of operations
|
||||
management theory and agentic AI (\S\ref{sec:future}).
|
||||
\end{itemize}
|
||||
|
||||
% ============================================================
|
||||
\section{Background: Current Agent Orchestration Landscape}
|
||||
\label{sec:background}
|
||||
|
||||
\subsection{Frameworks and Their Implicit PM Models}
|
||||
|
||||
The current generation of multi-agent LLM frameworks implicitly adopts
|
||||
project management concepts, though rarely with explicit attribution to
|
||||
PM/OM theory.
|
||||
|
||||
\textbf{MetaGPT} \citep{hong2024metagpt} assigns human job titles (product
|
||||
manager, architect, engineer) and enforces communication through Standardized
|
||||
Operating Procedures (SOPs)---an implicit adoption of \emph{waterfall}
|
||||
phase gates with role-based access control.
|
||||
|
||||
\textbf{ChatDev} \citep{qian2024chatdev} simulates a software company with
|
||||
sequential phases (design, coding, testing, documentation). Despite the
|
||||
``company'' framing, the execution model is a \emph{linear pipeline} with
|
||||
pair-programming-style chat between adjacent roles.
|
||||
|
||||
\textbf{AgileCoder} \citep{nguyen2024agilecoder} is the first framework to
|
||||
explicitly adopt sprint-based iteration, assigning Scrum Master and Product
|
||||
Manager roles to LLM agents with a Dynamic Code Graph Generator tracking
|
||||
inter-file dependencies between sprints.
|
||||
|
||||
\textbf{CrewAI} organizes agents into ``crews'' with a ``manager'' agent
|
||||
orchestrating task delegation---an implicit \emph{hierarchical management}
|
||||
model with single-point-of-failure coordination.
|
||||
|
||||
\textbf{AutoGen} \citep{wu2023autogen} provides a conversation-based
|
||||
framework where agents negotiate through multi-turn dialogue. The implicit
|
||||
model is \emph{committee decision-making}---all agents see all messages,
|
||||
consensus emerges through discussion.
|
||||
|
||||
\textbf{The Six Sigma Agent} \citep{patel2026sixsigma} decomposes tasks
|
||||
into atomic dependency trees, executes each node $n$ times with independent
|
||||
LLM samples, and uses consensus voting to achieve defect rates scaling as
|
||||
$O(p^{\lceil n/2 \rceil})$---reaching 3.4 DPMO (the Six Sigma threshold)
|
||||
at $n=13$.
|
||||
|
||||
\textbf{Reflexion} \citep{shinn2023reflexion} implements a de facto PDCA
|
||||
loop through verbal reinforcement: Plan $\to$ Act $\to$ Evaluate (Check)
|
||||
$\to$ Reflect (Act), though it does not name this structure explicitly.
|
||||
|
||||
\textbf{ArcheFlow} \citep{nennemann2026archeflow} explicitly applies PDCA
|
||||
quality cycles with Jungian archetypal roles, representing the first
|
||||
framework to deliberately adopt a named PM/OM methodology with formal
|
||||
convergence criteria.
|
||||
|
||||
\subsection{The Gap}
|
||||
|
||||
Despite the variety of frameworks, the PM/OM methods actually employed
|
||||
cluster tightly around four approaches: (1) waterfall-style sequential
|
||||
phases (MetaGPT, ChatDev), (2) role-based team simulation (CAMEL
|
||||
\citep{li2023camel}, CrewAI), (3) informal ``manager'' delegation
|
||||
(AutoGen), and (4) agile sprints (AgileCoder). The Six Sigma Agent
|
||||
\citep{patel2026sixsigma} is a notable exception---the only framework to
|
||||
explicitly name a PM/OM method as its primary architectural contribution.
|
||||
|
||||
Methods from lean manufacturing, constraint theory, military
|
||||
decision-making, innovation management, and failure analysis remain
|
||||
unexplored in the peer-reviewed agent orchestration literature, despite
|
||||
strong structural compatibility with agent constraints.
|
||||
|
||||
% ============================================================
|
||||
\section{Taxonomy of PM/OM Methods}
|
||||
\label{sec:taxonomy}
|
||||
|
||||
We evaluate twelve methods spanning five categories: iterative improvement,
|
||||
flow optimization, decision-making, innovation management, and quality
|
||||
engineering. For each method, we describe the core mechanism, evaluate
|
||||
structural compatibility with agent constraints \ref{c:stateless}--\ref{c:speed},
|
||||
identify the primary adaptation required, and assess overall fitness.
|
||||
|
||||
% ---- 3.1 Iterative Improvement ----
|
||||
\subsection{Iterative Improvement Methods}
|
||||
|
||||
\subsubsection{PDCA (Plan--Do--Check--Act)}
|
||||
\label{sec:pdca}
|
||||
|
||||
\textbf{Origin}: Shewhart \citep{shewhart1939statistical}, popularized by
|
||||
Deming \citep{deming1986out}.
|
||||
|
||||
\textbf{Mechanism}: Four-phase cycle repeated until quality targets are met.
|
||||
Each cycle narrows the gap between current and desired state through
|
||||
structured feedback.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. PDCA's phase structure maps directly
|
||||
to agent orchestration: Plan (research + design agents), Do (implementation
|
||||
agent), Check (review agents), Act (routing + merge decisions). The cycle
|
||||
abstraction handles the core challenge of ``when to stop iterating'' through
|
||||
convergence metrics. Demonstrated in ArcheFlow \citep{nennemann2026archeflow}.
|
||||
|
||||
\textbf{Key adaptation}: Convergence detection must be automated (human PDCA
|
||||
relies on subjective judgment). ArcheFlow addresses this with a convergence
|
||||
score based on finding classification (new, resolved, persistent, regressed)
|
||||
and oscillation detection.
|
||||
|
||||
\textbf{Constraint fit}: Stateless (\ref{c:stateless})---artifacts persist
|
||||
state between cycles. Cloning (\ref{c:cloning})---fresh agents per cycle
|
||||
avoid accumulated bias. Speed (\ref{c:speed})---cycles complete in minutes,
|
||||
enabling 2--3 cycles where humans would manage one.
|
||||
|
||||
\subsubsection{Scrum}
|
||||
\label{sec:scrum}
|
||||
|
||||
\textbf{Origin}: Schwaber \& Sutherland, 1995.
|
||||
|
||||
\textbf{Mechanism}: Time-boxed sprints with defined roles (Product Owner,
|
||||
Scrum Master, Development Team), ceremonies (planning, daily standup,
|
||||
review, retrospective), and artifacts (backlog, sprint board, burndown).
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Low--Medium}. Scrum's ceremony-heavy
|
||||
structure exists primarily to manage human coordination challenges: standups
|
||||
maintain shared awareness (agents can share a filesystem), retrospectives
|
||||
address interpersonal friction (agents have none), sprint planning negotiates
|
||||
capacity (agents have deterministic throughput). The useful kernel---time-boxed
|
||||
work with a prioritized backlog---is trivially implementable without Scrum's
|
||||
overhead.
|
||||
|
||||
\textbf{Key adaptation}: Strip ceremonies, keep the backlog + sprint
|
||||
structure. ``Daily standups'' become status file reads. ``Retrospectives''
|
||||
become cross-run memory extraction. The Scrum Master role is pure overhead
|
||||
for agents.
|
||||
|
||||
\textbf{Constraint fit}: Psychology (\ref{c:psychology})---most Scrum
|
||||
ceremonies solve human problems. Speed (\ref{c:speed})---sprint length
|
||||
compresses from weeks to minutes. Cloning (\ref{c:cloning})---team
|
||||
stability (a Scrum value) is irrelevant when agents are stateless.
|
||||
|
||||
\subsubsection{DMAIC (Six Sigma)}
|
||||
\label{sec:dmaic}
|
||||
|
||||
\textbf{Origin}: Motorola, 1986; systematized by General Electric.
|
||||
|
||||
\textbf{Mechanism}: Define--Measure--Analyze--Improve--Control. Unlike PDCA,
|
||||
DMAIC emphasizes \emph{statistical measurement} of process capability and
|
||||
explicitly separates analysis (understanding the problem) from improvement
|
||||
(fixing it).
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. The Define--Measure--Analyze
|
||||
front-loading is valuable for agents: it forces explicit quality metrics
|
||||
\emph{before} implementation, preventing the common failure mode of agents
|
||||
optimizing for the wrong objective. The Control phase---establishing
|
||||
monitoring to prevent regression---maps to cross-run memory systems.
|
||||
|
||||
\textbf{Key adaptation}: Agents can compute statistical process control
|
||||
metrics (defect rates, cycle times, sigma levels) automatically from event
|
||||
logs. The ``Measure'' phase, which is expensive and tedious for humans,
|
||||
becomes a strength: agents can instrument everything.
|
||||
|
||||
\textbf{Constraint fit}: Speed (\ref{c:speed})---full DMAIC in minutes.
|
||||
Dysfunction (\ref{c:dysfunction})---agent failure modes have measurable
|
||||
baselines, making sigma calculations meaningful. Stateless
|
||||
(\ref{c:stateless})---Control phase requires persistent monitoring, which
|
||||
must be explicitly built.
|
||||
|
||||
% ---- 3.2 Flow Optimization ----
|
||||
\subsection{Flow Optimization Methods}
|
||||
|
||||
\subsubsection{Kanban}
|
||||
\label{sec:kanban}
|
||||
|
||||
\textbf{Origin}: Toyota Production System, Taiichi Ohno, 1950s.
|
||||
|
||||
\textbf{Mechanism}: Pull-based workflow with explicit work-in-progress (WIP)
|
||||
limits. Work items flow through columns (stages); new work is pulled only
|
||||
when capacity is available. No iterations---continuous flow.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. Kanban's WIP limits directly address
|
||||
a critical agent challenge: \emph{coordination cost scaling}. Without WIP
|
||||
limits, spawning more agents increases throughput initially but eventually
|
||||
degrades quality due to coordination overhead (conflicting changes, merge
|
||||
conflicts, context fragmentation). Kanban provides a principled mechanism for
|
||||
determining optimal concurrency.
|
||||
|
||||
\textbf{Key adaptation}: WIP limits should be \emph{dynamic}, adjusting
|
||||
based on observed coordination costs (merge conflicts, finding duplications)
|
||||
rather than fixed. The pull mechanism maps naturally: agents poll a task
|
||||
queue and pull the highest-priority item they can handle.
|
||||
|
||||
\textbf{Constraint fit}: Cloning (\ref{c:cloning})---WIP limits are
|
||||
\emph{exactly} the missing constraint for cheap-to-clone agents. Speed
|
||||
(\ref{c:speed})---flow metrics (lead time, cycle time, throughput) update
|
||||
in real-time. Psychology (\ref{c:psychology})---no ``swarming'' or
|
||||
``blocked item'' social dynamics to manage.
|
||||
|
||||
\subsubsection{Theory of Constraints (TOC)}
|
||||
\label{sec:toc}
|
||||
|
||||
\textbf{Origin}: Goldratt, \emph{The Goal}, 1984.
|
||||
|
||||
\textbf{Mechanism}: Identify the system's constraint (bottleneck), exploit
|
||||
it (maximize its throughput), subordinate everything else to it, elevate it
|
||||
(invest to remove it), repeat. The Five Focusing Steps.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. In multi-agent pipelines, the
|
||||
bottleneck is typically the most capable (and expensive) agent: the
|
||||
implementation agent that must run on a powerful model, or the security
|
||||
reviewer that requires deep context. TOC provides a framework for
|
||||
organizing the entire pipeline around this constraint.
|
||||
|
||||
\textbf{Key adaptation}: ``Exploit the constraint'' means ensuring the
|
||||
bottleneck agent never waits for input. Pre-compute its context, batch
|
||||
its inputs, and schedule cheaper agents (research, formatting, validation)
|
||||
to run during its processing time. ``Subordinate'' means cheaper agents
|
||||
should produce output in the format the bottleneck needs, not in whatever
|
||||
format is easiest for them.
|
||||
|
||||
\textbf{Constraint fit}: Cloning (\ref{c:cloning})---non-bottleneck agents
|
||||
are cheap to overprovision. Speed (\ref{c:speed})---constraint shifts can
|
||||
be detected and responded to within a single run. Dysfunction
|
||||
(\ref{c:dysfunction})---bottleneck agent's failure mode has outsized impact,
|
||||
justifying targeted shadow detection.
|
||||
|
||||
\subsubsection{Lean / Toyota Production System}
|
||||
\label{sec:lean}
|
||||
|
||||
\textbf{Origin}: Ohno, 1988; Womack \& Jones, 1996.
|
||||
|
||||
\textbf{Mechanism}: Eliminate waste (\emph{muda}), reduce variability
|
||||
(\emph{mura}), avoid overburden (\emph{muri}). Seven wastes: overproduction,
|
||||
waiting, transport, overprocessing, inventory, motion, defects.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. The seven wastes map
|
||||
surprisingly well to agent systems:
|
||||
|
||||
\begin{itemize}[nosep]
|
||||
\item \textbf{Overproduction}: Agents generating output nobody reads
|
||||
(verbose research reports, unused alternative proposals).
|
||||
\item \textbf{Waiting}: Agents idle while waiting for predecessor output
|
||||
(sequential pipeline where parallel would work).
|
||||
\item \textbf{Transport}: Redundant context passing (sending full codebase
|
||||
to agents that need only a diff).
|
||||
\item \textbf{Overprocessing}: Running thorough review on trivial changes.
|
||||
\item \textbf{Inventory}: Accumulated artifacts from prior cycles that
|
||||
are never referenced.
|
||||
\item \textbf{Motion}: Agents reading files they don't need, exploring
|
||||
irrelevant code paths.
|
||||
\item \textbf{Defects}: Findings that are false positives, requiring
|
||||
rework to dismiss.
|
||||
\end{itemize}
|
||||
|
||||
\textbf{Key adaptation}: Lean's ``respect for people'' pillar has no direct
|
||||
analog. The technical pillar (continuous improvement, waste elimination)
|
||||
transfers fully.
|
||||
|
||||
% ---- 3.3 Decision-Making ----
|
||||
\subsection{Decision-Making Methods}
|
||||
|
||||
\subsubsection{OODA Loop (Observe--Orient--Decide--Act)}
|
||||
\label{sec:ooda}
|
||||
|
||||
\textbf{Origin}: John Boyd, 1976. Military strategy for air combat; later
|
||||
generalized to competitive decision-making.
|
||||
|
||||
\textbf{Mechanism}: Continuous loop of Observe (gather data), Orient (analyze
|
||||
context, update mental models), Decide (select course of action), Act
|
||||
(execute). The key insight is that the \emph{speed} of the loop---not any
|
||||
individual decision's quality---determines competitive advantage. ``Getting
|
||||
inside the opponent's OODA loop'' means acting faster than the adversary can
|
||||
react.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. OODA is structurally similar to PDCA
|
||||
but optimized for speed over thoroughness. For agent systems, this maps to
|
||||
scenarios requiring rapid adaptation: adversarial testing, incident response,
|
||||
market-reactive coding, or any context where the problem space changes
|
||||
during execution.
|
||||
|
||||
\textbf{Key adaptation}: Boyd's ``Orient'' phase---updating mental models
|
||||
based on new information---is the hardest to implement for stateless agents.
|
||||
It requires either persistent state (a world model that updates across
|
||||
iterations) or a ``fast reorientation'' agent that rapidly synthesizes new
|
||||
information into an updated context.
|
||||
|
||||
\textbf{Constraint fit}: Speed (\ref{c:speed})---agents can OODA at
|
||||
superhuman frequency. Stateless (\ref{c:stateless})---the Orient phase
|
||||
needs explicit state management. Psychology (\ref{c:psychology})---Boyd's
|
||||
concept of ``mental agility'' translates to model selection: smaller, faster
|
||||
models for rapid OODA; larger models for deep Orient phases.
|
||||
|
||||
\subsubsection{Cynefin Framework}
|
||||
\label{sec:cynefin}
|
||||
|
||||
\textbf{Origin}: Snowden \& Boone, 2007.
|
||||
|
||||
\textbf{Mechanism}: Classify problems into five domains---\textsc{Clear}
|
||||
(obvious cause-effect), \textsc{Complicated} (expert analysis needed),
|
||||
\textsc{Complex} (emergent, probe-sense-respond), \textsc{Chaotic}
|
||||
(act first, then sense), \textsc{Confused} (unknown domain)---and apply
|
||||
domain-appropriate strategies.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. Cynefin provides a
|
||||
\emph{meta-framework}: instead of choosing one orchestration method for all
|
||||
tasks, classify the task first, then select the appropriate method:
|
||||
|
||||
\begin{itemize}[nosep]
|
||||
\item \textsc{Clear}: Single agent, no review (``fix this typo'').
|
||||
\item \textsc{Complicated}: Expert agent with review (PDCA fast workflow).
|
||||
\item \textsc{Complex}: Multiple competing proposals, let results emerge
|
||||
(PDCA standard/thorough with parallel alternatives).
|
||||
\item \textsc{Chaotic}: Act immediately, stabilize, then analyze (OODA
|
||||
with hotfix agent, then PDCA for proper fix).
|
||||
\end{itemize}
|
||||
|
||||
\textbf{Key adaptation}: Task classification must be automated. Proxies:
|
||||
number of files affected, cross-module dependencies, security sensitivity,
|
||||
test coverage of affected area.
|
||||
|
||||
% ---- 3.4 Innovation Management ----
|
||||
\subsection{Innovation Management Methods}
|
||||
|
||||
\subsubsection{Stage-Gate}
|
||||
\label{sec:stagegate}
|
||||
|
||||
\textbf{Origin}: Cooper, 1990.
|
||||
|
||||
\textbf{Mechanism}: Innovation projects pass through stages (scoping,
|
||||
business case, development, testing, launch), separated by gates where a
|
||||
cross-functional team decides: Go, Kill, Hold, or Recycle. The gate
|
||||
decision is binary---no ``continue with reservations.''
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium}. The gate mechanism maps well to
|
||||
agent confidence checks: a Creator agent's proposal either meets the
|
||||
confidence threshold (Go) or doesn't (Kill/Recycle). However, Stage-Gate
|
||||
assumes expensive stages (weeks/months of human work), making Kill decisions
|
||||
high-stakes. For agents, stages are cheap (minutes), reducing the value of
|
||||
formal gate decisions.
|
||||
|
||||
\textbf{Key adaptation}: Gates become lightweight confidence checks rather
|
||||
than committee reviews. The ``Kill'' decision---rare and painful in human
|
||||
innovation---should be common and cheap for agents. Explore multiple
|
||||
proposals in parallel, gate aggressively, continue only the best.
|
||||
|
||||
\subsubsection{Design Thinking}
|
||||
\label{sec:designthinking}
|
||||
|
||||
\textbf{Origin}: IDEO / Stanford d.school, 2000s.
|
||||
|
||||
\textbf{Mechanism}: Five phases: Empathize (understand the user),
|
||||
Define (frame the problem), Ideate (generate solutions), Prototype (build
|
||||
quickly), Test (get feedback). Emphasis on user empathy and divergent
|
||||
thinking.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Low}. Design Thinking's core value
|
||||
proposition---\emph{empathy with users}---is precisely what LLM agents
|
||||
cannot genuinely do. Agents can simulate empathy (generate persona-based
|
||||
scenarios), but the insight that comes from observing real users in context
|
||||
has no agent equivalent. The Ideate phase (divergent brainstorming) is
|
||||
feasible but produces quantity over quality without the ``empathy filter''
|
||||
that makes Design Thinking effective.
|
||||
|
||||
\textbf{Key adaptation}: If used, the Empathize phase must be replaced
|
||||
with explicit user research artifacts (personas, journey maps, interview
|
||||
transcripts) provided as input. This transforms Design Thinking from a
|
||||
discovery method into a synthesis method---fundamentally changing its nature.
|
||||
|
||||
\subsubsection{TRIZ}
|
||||
\label{sec:triz}
|
||||
|
||||
\textbf{Origin}: Altshuller, 1946--1985. Theory of Inventive Problem
|
||||
Solving.
|
||||
|
||||
\textbf{Mechanism}: Problems contain contradictions (improving one parameter
|
||||
worsens another). TRIZ provides a contradiction matrix mapping 39 engineering
|
||||
parameters to 40 inventive principles. Instead of compromise, TRIZ seeks
|
||||
solutions that resolve the contradiction.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium}. TRIZ's structured problem-solving
|
||||
is well-suited to agents: the contradiction matrix is a lookup table, and
|
||||
agents can systematically apply inventive principles. However, TRIZ requires
|
||||
\emph{reformulating the problem as a contradiction}---a creative step that
|
||||
is itself challenging for agents.
|
||||
|
||||
\textbf{Key adaptation}: Provide the contradiction matrix as context. Train
|
||||
agents to identify the ``improving parameter'' and ``worsening parameter''
|
||||
in engineering tasks (e.g., ``improving security worsens performance'').
|
||||
Use TRIZ principles as a structured brainstorming prompt for the Creator
|
||||
archetype.
|
||||
|
||||
% ---- 3.5 Quality Engineering ----
|
||||
\subsection{Quality Engineering Methods}
|
||||
|
||||
\subsubsection{FMEA (Failure Mode and Effects Analysis)}
|
||||
\label{sec:fmea}
|
||||
|
||||
\textbf{Origin}: US Military, 1949; adopted by automotive (AIAG) and
|
||||
aerospace.
|
||||
|
||||
\textbf{Mechanism}: For each component/process step, systematically
|
||||
enumerate: (1) potential failure modes, (2) effects of each failure,
|
||||
(3) causes, (4) current controls, (5) risk priority number
|
||||
(severity $\times$ occurrence $\times$ detection). Address highest-RPN
|
||||
items first.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{High}. FMEA's systematic enumeration is
|
||||
exactly what LLM agents excel at: given a design, enumerate everything that
|
||||
could go wrong, assess severity, and propose mitigations. The Risk Priority
|
||||
Number provides a quantitative framework for prioritizing review effort---more
|
||||
principled than the common ``CRITICAL/WARNING/INFO'' severity classification.
|
||||
|
||||
\textbf{Key adaptation}: Use FMEA \emph{before} implementation (as part of
|
||||
the Plan phase) rather than only during review. An FMEA agent analyzes the
|
||||
Creator's proposal and generates a failure mode table; the Maker then
|
||||
implements with awareness of high-RPN failure modes; the Guardian validates
|
||||
that mitigations are in place.
|
||||
|
||||
\textbf{Constraint fit}: Dysfunction (\ref{c:dysfunction})---agents' own
|
||||
failure modes can be pre-enumerated via FMEA, creating a meta-level
|
||||
quality system. Cloning (\ref{c:cloning})---FMEA agents are cheap
|
||||
(analytical, not creative), enabling systematic coverage.
|
||||
|
||||
\subsubsection{Statistical Process Control (SPC)}
|
||||
\label{sec:spc}
|
||||
|
||||
\textbf{Origin}: Shewhart, 1920s.
|
||||
|
||||
\textbf{Mechanism}: Monitor process outputs over time using control charts.
|
||||
Distinguish \emph{common cause} variation (inherent to the process) from
|
||||
\emph{special cause} variation (attributable to specific events). React only
|
||||
to special causes; reduce common cause variation through process improvement.
|
||||
|
||||
\textbf{Agent fitness}: \textsc{Medium--High}. SPC requires historical data,
|
||||
which agent orchestration systems naturally generate (event logs, finding
|
||||
counts, cycle times, token usage). Control charts over agent effectiveness
|
||||
scores can distinguish between normal variation (``Guardian found 2 issues
|
||||
this run vs. 1 last run'') and genuine degradation (``Guardian's false
|
||||
positive rate spiked after a model update'').
|
||||
|
||||
\textbf{Key adaptation}: Sufficient run history is needed to establish
|
||||
control limits. Early runs operate without SPC; after 10--20 runs,
|
||||
control limits become meaningful. Model updates reset control limits
|
||||
(new process = new baseline).
|
||||
|
||||
% ============================================================
|
||||
\section{Compatibility Matrix}
|
||||
\label{sec:matrix}
|
||||
|
||||
Table~\ref{tab:matrix} scores each method against the five agent constraints,
|
||||
producing an overall fitness assessment.
|
||||
|
||||
\begin{table}[t]
|
||||
\centering
|
||||
\small
|
||||
\caption{Compatibility matrix: PM/OM methods scored against agent constraints.
|
||||
\textcolor{highfit}{\textbf{+}} = method benefits from this constraint;
|
||||
\textcolor{lowfit}{\textbf{--}} = method is undermined;
|
||||
\textcolor{neutral}{\textbf{0}} = neutral.
|
||||
Overall fitness: H = High, M = Medium, L = Low.}
|
||||
\label{tab:matrix}
|
||||
\begin{tabular}{@{}l*{5}{c}c@{}}
|
||||
\toprule
|
||||
\textbf{Method} &
|
||||
\textbf{C1} &
|
||||
\textbf{C2} &
|
||||
\textbf{C3} &
|
||||
\textbf{C4} &
|
||||
\textbf{C5} &
|
||||
\textbf{Fit} \\
|
||||
\midrule
|
||||
PDCA & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
Scrum & \textcolor{lowfit}{--} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textbf{L--M} \\
|
||||
DMAIC & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textbf{M--H} \\
|
||||
Kanban & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
TOC & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
Lean & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textbf{M--H} \\
|
||||
OODA & \textcolor{lowfit}{--} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
Cynefin & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textbf{M--H} \\
|
||||
Stage-Gate & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{lowfit}{--} & \textbf{M} \\
|
||||
Design Think. & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{lowfit}{--} & \textcolor{neutral}{0} & \textbf{L} \\
|
||||
TRIZ & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{neutral}{0} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textbf{M} \\
|
||||
FMEA & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{H} \\
|
||||
SPC & \textcolor{lowfit}{--} & \textcolor{neutral}{0} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textcolor{highfit}{+} & \textbf{M--H} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Analysis}
|
||||
|
||||
Several patterns emerge from the compatibility matrix:
|
||||
|
||||
\textbf{High-fitness methods share three properties}: they are
|
||||
\emph{mechanistic} (decisions follow rules, not judgment), \emph{flow-oriented}
|
||||
(optimize throughput, not team dynamics), and \emph{metric-driven} (quality
|
||||
is quantified, not discussed). PDCA, Kanban, TOC, OODA, and FMEA all share
|
||||
this profile.
|
||||
|
||||
\textbf{Low-fitness methods are psychology-dependent}: Scrum and Design
|
||||
Thinking derive their primary value from managing human cognitive and social
|
||||
limitations. Without those limitations, the methods become overhead.
|
||||
|
||||
\textbf{The ``Cheap Clone'' constraint is universally beneficial}: every
|
||||
method either benefits from or is neutral to the ability to spawn agents
|
||||
cheaply. This suggests that agent orchestration should generally favor
|
||||
\emph{parallelism}---run multiple approaches simultaneously, then
|
||||
select the best result.
|
||||
|
||||
\textbf{``Stateless'' is the most disruptive constraint}: methods that
|
||||
assume accumulated knowledge (Scrum's team velocity, SPC's control charts,
|
||||
DMAIC's baseline measurements) require explicit persistence mechanisms that
|
||||
agents don't provide natively.
|
||||
|
||||
% ============================================================
|
||||
\section{Hybrid Approaches and Method Composition}
|
||||
\label{sec:hybrid}
|
||||
|
||||
The methods in our taxonomy are not mutually exclusive. Effective agent
|
||||
orchestration likely requires combining methods at different levels:
|
||||
|
||||
\subsection{Proposed Three-Layer Architecture}
|
||||
|
||||
\begin{description}
|
||||
\item[Strategic layer (Cynefin)]: Classify the task and select the
|
||||
appropriate orchestration method. Simple tasks get a single agent;
|
||||
complicated tasks get PDCA; complex tasks get parallel competing
|
||||
approaches; chaotic tasks get OODA.
|
||||
|
||||
\item[Operational layer (PDCA/OODA + Kanban)]: Execute the selected
|
||||
method with flow control. Kanban WIP limits prevent coordination
|
||||
overload. PDCA provides quality convergence for standard tasks; OODA
|
||||
provides rapid adaptation for time-sensitive tasks.
|
||||
|
||||
\item[Quality layer (FMEA + SPC + TOC)]: Monitor execution quality.
|
||||
FMEA front-loads failure analysis in the Plan phase. SPC monitors
|
||||
long-term agent effectiveness trends. TOC identifies and optimizes
|
||||
around bottleneck agents.
|
||||
\end{description}
|
||||
|
||||
\subsection{ArcheFlow as a Case Study}
|
||||
|
||||
ArcheFlow \citep{nennemann2026archeflow} already implements elements of
|
||||
this three-layer architecture, though without explicitly naming all methods:
|
||||
|
||||
\begin{itemize}[nosep]
|
||||
\item \textbf{Strategic}: Workflow selection (fast/standard/thorough)
|
||||
functions as a simplified Cynefin classification.
|
||||
\item \textbf{Operational}: PDCA cycles with convergence detection;
|
||||
sprint mode with WIP-limited parallel dispatch (implicit Kanban).
|
||||
\item \textbf{Quality}: Shadow detection (behavioral FMEA for agent
|
||||
failure modes); effectiveness scoring (rudimentary SPC); Guardian
|
||||
fast-path (TOC---don't waste the bottleneck on clean code); ``Wiggum
|
||||
Break'' circuit breakers (hard/soft halt conditions with event logging).
|
||||
\end{itemize}
|
||||
|
||||
The gap is in explicit TOC application (identifying and optimizing around
|
||||
the most expensive agent) and in OODA integration for time-sensitive tasks.
|
||||
|
||||
% ============================================================
|
||||
\section{Decision Framework}
|
||||
\label{sec:decision}
|
||||
|
||||
We propose a practitioner-oriented decision framework for selecting
|
||||
orchestration methods based on three dimensions:
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\begin{tikzpicture}[
|
||||
box/.style={draw, rounded corners, minimum width=3.5cm, minimum height=0.7cm, font=\small, fill=#1},
|
||||
arrow/.style={-{Stealth[length=3mm]}, thick},
|
||||
]
|
||||
|
||||
% Decision tree
|
||||
\node[box=yellow!20] (start) {Task arrives};
|
||||
\node[box=orange!15, below=0.8cm of start] (cynefin) {Classify (Cynefin)};
|
||||
|
||||
\node[box=green!15, below left=1cm and 2cm of cynefin] (clear) {Clear};
|
||||
\node[box=green!15, below left=1cm and 0cm of cynefin] (complicated) {Complicated};
|
||||
\node[box=blue!10, below right=1cm and 0cm of cynefin] (complex) {Complex};
|
||||
\node[box=red!10, below right=1cm and 2cm of cynefin] (chaotic) {Chaotic};
|
||||
|
||||
\node[box=white, below=0.7cm of clear, text width=2.5cm, align=center, font=\scriptsize] (m1) {Single agent\\No review};
|
||||
\node[box=white, below=0.7cm of complicated, text width=2.5cm, align=center, font=\scriptsize] (m2) {PDCA fast\\+ FMEA};
|
||||
\node[box=white, below=0.7cm of complex, text width=2.5cm, align=center, font=\scriptsize] (m3) {PDCA thorough\\+ parallel proposals};
|
||||
\node[box=white, below=0.7cm of chaotic, text width=2.5cm, align=center, font=\scriptsize] (m4) {OODA\\then PDCA};
|
||||
|
||||
\draw[arrow] (start) -- (cynefin);
|
||||
\draw[arrow] (cynefin) -- (clear);
|
||||
\draw[arrow] (cynefin) -- (complicated);
|
||||
\draw[arrow] (cynefin) -- (complex);
|
||||
\draw[arrow] (cynefin) -- (chaotic);
|
||||
\draw[arrow] (clear) -- (m1);
|
||||
\draw[arrow] (complicated) -- (m2);
|
||||
\draw[arrow] (complex) -- (m3);
|
||||
\draw[arrow] (chaotic) -- (m4);
|
||||
|
||||
\end{tikzpicture}
|
||||
\caption{Decision framework for selecting agent orchestration method
|
||||
based on Cynefin task classification.}
|
||||
\label{fig:decision}
|
||||
\end{figure}
|
||||
|
||||
\textbf{Cross-cutting concerns} apply regardless of classification:
|
||||
\begin{itemize}[nosep]
|
||||
\item \textbf{Kanban WIP limits}: Always. Prevents coordination overload.
|
||||
\item \textbf{TOC awareness}: Identify the costliest agent; schedule
|
||||
others around it.
|
||||
\item \textbf{SPC monitoring}: After 10+ runs, establish control limits
|
||||
for agent effectiveness.
|
||||
\item \textbf{Lean waste audit}: Periodically review token usage patterns
|
||||
for waste (unused artifacts, redundant context, overprocessing).
|
||||
\end{itemize}
|
||||
|
||||
% ============================================================
|
||||
\section{Open Research Directions}
|
||||
\label{sec:future}
|
||||
|
||||
\subsection{Adaptive Method Selection}
|
||||
|
||||
Current frameworks use a fixed orchestration method. An adaptive system
|
||||
would classify each incoming task (Cynefin), select the appropriate method,
|
||||
and switch methods mid-execution if the task's nature changes (e.g.,
|
||||
a ``complicated'' task reveals unexpected complexity during exploration).
|
||||
This requires a \emph{method-aware orchestrator} that understands the
|
||||
assumptions and exit criteria of each method.
|
||||
|
||||
\subsection{Kanban for Agent Swarms}
|
||||
|
||||
As agent counts increase beyond 5--10, coordination costs dominate.
|
||||
Kanban's WIP limits and flow metrics provide a theoretical basis for
|
||||
determining optimal agent concurrency, but empirical studies are needed
|
||||
to establish how coordination cost scales with agent count across
|
||||
different task types and model capabilities.
|
||||
|
||||
\subsection{OODA for Adversarial Agent Scenarios}
|
||||
|
||||
Boyd's OODA loop was designed for competitive environments where speed of
|
||||
decision-making determines the winner. Applications include adversarial
|
||||
testing (red team agents vs. blue team agents), competitive code generation
|
||||
(multiple agents racing to solve a problem), and incident response
|
||||
(rapid diagnosis and mitigation under time pressure).
|
||||
|
||||
\subsection{Cross-Method Quality Metrics}
|
||||
|
||||
Each PM/OM method defines quality differently: PDCA uses convergence scores,
|
||||
Six Sigma uses sigma levels, Lean uses waste ratios, SPC uses control
|
||||
limits. A unified quality metric for agent orchestration---one that allows
|
||||
meaningful comparison across methods---does not yet exist.
|
||||
|
||||
\subsection{FMEA for Agent Failure Modes}
|
||||
|
||||
Agent failure modes (hallucination, scope creep, false positive reviews,
|
||||
persona drift \citep{lu2026assistant}) can be systematically enumerated
|
||||
using FMEA methodology. A comprehensive FMEA catalog for LLM agents---with
|
||||
severity, occurrence, and detection ratings calibrated from empirical
|
||||
data---would provide a foundation for designing more robust orchestration
|
||||
systems.
|
||||
|
||||
% ============================================================
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
|
||||
The operations management literature offers a rich toolkit for agent
|
||||
orchestration that extends far beyond the agile methods currently dominant
|
||||
in the field. Our taxonomy reveals that the highest-fitness methods---PDCA,
|
||||
Kanban, TOC, OODA, and FMEA---share a common profile: mechanistic,
|
||||
flow-oriented, and metric-driven. Methods centered on human psychology
|
||||
(Scrum, Design Thinking) transfer poorly without fundamental reformulation.
|
||||
|
||||
The key insight is that LLM agents are not ``fast humans.'' They have
|
||||
fundamentally different constraint profiles---cheap to clone, expensive to
|
||||
coordinate, stateless, psychologically inert---and these differences make
|
||||
some PM/OM methods \emph{more} effective (OODA loops at superhuman speed,
|
||||
FMEA with exhaustive enumeration) while rendering others irrelevant
|
||||
(standups without psychology, retrospectives without learning).
|
||||
|
||||
We encourage the agent orchestration community to look beyond agile sprints
|
||||
and role-playing frameworks toward the broader operations management
|
||||
tradition. A century of industrial practice has much to teach us about
|
||||
orchestrating intelligent agents---if we take the time to translate.
|
||||
|
||||
% ============================================================
|
||||
\section*{Acknowledgments}
|
||||
|
||||
The author thanks the operations management and quality engineering
|
||||
communities whose work, developed over decades for human organizations,
|
||||
provides the theoretical foundation for this analysis.
|
||||
|
||||
% ============================================================
|
||||
\bibliographystyle{plainnat}
|
||||
\bibliography{taxonomy-refs}
|
||||
|
||||
\end{document}
|
||||
34
scripts/run-tests.sh
Executable file
34
scripts/run-tests.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env bash
# run-tests.sh — Run all ArcheFlow bats tests.
#
# Usage: ./scripts/run-tests.sh [bats-args...]
# Examples:
#   ./scripts/run-tests.sh                    # Run all tests
#   ./scripts/run-tests.sh --filter "event"   # Run only event tests
#   ./scripts/run-tests.sh -t                 # TAP output
#
# Environment:
#   BATS   Path to the bats binary (default: auto-detect from PATH,
#          then ~/.local/bin/bats).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
TESTS_DIR="$PROJECT_DIR/tests"

# Locate the bats binary: honor $BATS if the caller set it, otherwise
# check PATH, then the conventional user-local install location.
BATS="${BATS:-}"
if [[ -z "$BATS" ]]; then
  if command -v bats &>/dev/null; then
    BATS="bats"
  elif [[ -x "$HOME/.local/bin/bats" ]]; then
    BATS="$HOME/.local/bin/bats"
  else
    echo "ERROR: bats not found. Install bats-core or set BATS env var." >&2
    exit 1
  fi
fi

# Fail early with a clear message instead of handing bats a literal
# 'tests/*.bats' pattern when the directory is missing or has no tests.
if [[ ! -d "$TESTS_DIR" ]]; then
  echo "ERROR: tests directory not found: $TESTS_DIR" >&2
  exit 1
fi
shopt -s nullglob
test_files=("$TESTS_DIR"/*.bats)
shopt -u nullglob
if (( ${#test_files[@]} == 0 )); then
  echo "ERROR: no .bats files found in $TESTS_DIR" >&2
  exit 1
fi

echo "Running ArcheFlow tests..."
echo "  bats: $($BATS --version)"
echo "  tests: $TESTS_DIR"
echo ""

# exec replaces this shell so bats' exit status is the script's status.
exec "$BATS" "$@" "${test_files[@]}"
|
||||
@@ -1,292 +1,46 @@
|
||||
---
|
||||
name: act-phase
|
||||
description: |
|
||||
Use after the Check phase completes. Collects reviewer findings, prioritizes them, routes fixes to the right agent or tool, applies fixes systematically, and decides whether to exit or cycle.
|
||||
Use after the Check phase completes. Collects reviewer findings, routes fixes, applies them, decides whether to exit or cycle.
|
||||
<example>Automatically loaded during orchestration after Check phase</example>
|
||||
<example>User: "Run just the act phase on existing findings"</example>
|
||||
---
|
||||
|
||||
# Act Phase
|
||||
|
||||
After all reviewers complete, the Act phase turns findings into fixes and decides whether the cycle is done. This is the bridge between "what's wrong" and "what we do about it."
|
||||
|
||||
## Overview
|
||||
Turn Check phase findings into fixes, then decide: exit or cycle.
|
||||
|
||||
```
|
||||
Check phase output → Collect → Prioritize → Route → Fix → Verify → Exit or Cycle
|
||||
Check output → Collect → Deduplicate → Route → Fix → Exit or Cycle
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 1: Finding Collection
|
||||
## Step 1: Collect and Consolidate Findings
|
||||
|
||||
Parse all reviewer outputs into one consolidated findings table. Use the standardized format from the `check-phase` skill.
|
||||
Parse all reviewer outputs into one table grouped by severity (CRITICAL / WARNING / INFO):
|
||||
|
||||
```markdown
|
||||
## Findings Summary — Cycle N
|
||||
|
||||
### CRITICAL (must fix before next cycle)
|
||||
| # | Source | Location | Category | Description | Suggested Fix |
|
||||
|---|--------|----------|----------|-------------|---------------|
|
||||
| 1 | guardian | src/auth/handler.ts:48 | security | Empty string bypasses validation | Add length check |
|
||||
| 2 | trickster | src/api/parse.ts:92 | reliability | Null input causes crash | Guard with null check |
|
||||
|
||||
### WARNING (should fix)
|
||||
| # | Source | Location | Category | Description | Suggested Fix |
|
||||
|---|--------|----------|----------|-------------|---------------|
|
||||
| 3 | sage | tests/auth.test.ts:15 | testing | Test names don't describe behavior | Rename to "should reject expired tokens" |
|
||||
| 4 | guardian | src/auth/handler.ts:52 | security | Missing rate limit | Add rate limiter middleware |
|
||||
|
||||
### INFO (nice to have)
|
||||
| # | Source | Location | Category | Description | Suggested Fix |
|
||||
|---|--------|----------|----------|-------------|---------------|
|
||||
| 5 | skeptic | src/auth/handler.ts:30 | design | Consider caching validated tokens | Add TTL cache |
|
||||
```
|
||||
|
||||
### Deduplication
|
||||
|
||||
Before listing findings, deduplicate across reviewers (same rule as `check-phase`):
|
||||
- Same file + same category + similar description = one finding
|
||||
- Use the higher severity
|
||||
- Credit all sources: `guardian + skeptic`
|
||||
- Don't double-count in severity tallies
|
||||
Same file + same category + similar description = one finding. Use the higher severity, credit all sources (e.g. `guardian + skeptic`).
|
||||
|
||||
### Cross-Cycle Tracking
|
||||
### Cross-Cycle Tracking (cycle > 1)
|
||||
|
||||
Compare against prior cycle findings (if cycle > 1):
|
||||
- **Resolved:** Finding from cycle N-1 no longer present → mark resolved, do not re-raise
|
||||
- **Persisting:** Same location + category still present → increment `cycle_count`
|
||||
- **New:** Finding not seen before → add with `cycle_count: 1`
|
||||
Compare against prior cycle findings:
|
||||
- **Resolved** — no longer present, mark resolved, do not re-raise
|
||||
- **Persisting** — same location + category, increment `cycle_count`
|
||||
- **New** — first appearance, `cycle_count: 1`
|
||||
|
||||
If a finding persists for 2+ consecutive cycles, flag for user escalation (see Step 5).
|
||||
Finding persisting 2+ cycles = flag for escalation (see Step 4).
|
||||
|
||||
---
|
||||
|
||||
## Step 2: Fix Routing
|
||||
|
||||
Not all findings are fixed the same way. Route each finding based on its nature:
|
||||
|
||||
| Category | Fix Route | Rationale |
|
||||
|----------|-----------|-----------|
|
||||
| `security` | Spawn Maker with targeted instructions | Security fixes need tested code changes |
|
||||
| `reliability` | Spawn Maker with targeted instructions | Same — code-level fix with test |
|
||||
| `breaking-change` | Route to Creator in next cycle | Design decision needed |
|
||||
| `design` | Route to Creator in next cycle | Architecture change, not a patch |
|
||||
| `dependency` | Spawn Maker with targeted instructions | Package update or removal |
|
||||
| `quality` | Spawn Maker or apply directly | Depends on scope (see below) |
|
||||
| `testing` | Spawn Maker with targeted instructions | Tests need to be written and run |
|
||||
| `consistency` | Apply directly or spawn Maker | Naming/style → direct. Pattern change → Maker |
|
||||
|
||||
### Direct Fix (no agent)
|
||||
|
||||
Apply directly with Edit tool when **all** of these are true:
|
||||
- The fix is mechanical (typo, naming, formatting, import order)
|
||||
- No behavioral change
|
||||
- No test update needed
|
||||
- Exactly one file affected
|
||||
|
||||
Examples: rename a variable, fix a typo in a string, reorder imports, fix indentation.
|
||||
|
||||
### Maker Fix (spawn agent)
|
||||
|
||||
Spawn a targeted Maker when the fix involves:
|
||||
- Code logic changes
|
||||
- New or modified tests
|
||||
- Multiple files
|
||||
- Any behavioral change
|
||||
|
||||
Provide the Maker with:
|
||||
1. The specific finding(s) to address (not all findings — just the routed ones)
|
||||
2. The file and line location
|
||||
3. The suggested fix from the reviewer
|
||||
4. The Maker's original branch (to apply fixes on top)
|
||||
|
||||
```
|
||||
Agent(
|
||||
description: "Fix: <finding description>",
|
||||
prompt: "You are the MAKER archetype.
|
||||
Apply this fix on branch: <maker's branch>
|
||||
|
||||
Finding: <source> | <severity> | <category>
|
||||
Location: <file:line>
|
||||
Issue: <description>
|
||||
Suggested fix: <fix>
|
||||
|
||||
Rules:
|
||||
1. Fix ONLY this issue — no other changes
|
||||
2. Add/update tests if the fix changes behavior
|
||||
3. Run existing tests — nothing may break
|
||||
4. Commit with message: 'fix: <description>'
|
||||
Do NOT refactor surrounding code.",
|
||||
isolation: "worktree",
|
||||
mode: "bypassPermissions"
|
||||
)
|
||||
```
|
||||
|
||||
### Writing/Prose Fix (domain-specific)
|
||||
|
||||
For writing projects (books, stories), voice or prose findings need special context:
|
||||
|
||||
```
|
||||
Agent(
|
||||
description: "Fix: voice drift in <file>",
|
||||
prompt: "You are the MAKER archetype.
|
||||
Apply this prose fix on branch: <maker's branch>
|
||||
|
||||
Finding: <source> | <severity> | <category>
|
||||
Location: <file:line>
|
||||
Issue: <description>
|
||||
|
||||
Voice profile to match: <load from .archeflow/config.yaml or project voice profile>
|
||||
|
||||
Rules:
|
||||
1. Fix the flagged passage to match the voice profile
|
||||
2. Do not rewrite surrounding paragraphs
|
||||
3. Preserve the narrative intent — only change voice/style
|
||||
4. Commit with message: 'fix: <description>'",
|
||||
isolation: "worktree",
|
||||
mode: "bypassPermissions"
|
||||
)
|
||||
```
|
||||
|
||||
### Design Fix (route to next cycle)
|
||||
|
||||
Findings that require design changes are NOT fixed in the Act phase. They become structured feedback for the Creator in the next PDCA cycle. Collect them into `act-feedback.md` (see Step 5).
|
||||
|
||||
---
|
||||
|
||||
## Step 3: Fix Application Protocol
|
||||
|
||||
Apply fixes in severity order: CRITICAL first, then WARNING, then INFO. Within the same severity, fix in file order (reduces context switching).
|
||||
|
||||
### For each fix:
|
||||
|
||||
1. **Apply the change** (direct edit or via Maker agent)
|
||||
2. **Emit `fix.applied` event:**
|
||||
```json
|
||||
{
|
||||
"type": "fix.applied",
|
||||
"phase": "act",
|
||||
"agent": "maker",
|
||||
"data": {
|
||||
"source": "guardian",
|
||||
"finding": "Empty string bypasses validation",
|
||||
"file": "src/auth/handler.ts",
|
||||
"line": 48,
|
||||
"severity": "CRITICAL",
|
||||
"before": "<old code>",
|
||||
"after": "<new code>"
|
||||
},
|
||||
"parent": [<seq of the review.verdict that found it>]
|
||||
}
|
||||
```
|
||||
3. **Targeted re-check** (if the fix is non-trivial):
|
||||
- Re-run only the reviewer that raised the finding
|
||||
- Scope the re-check to just the changed file(s)
|
||||
- If the re-check raises new findings → add them to the findings list with source `re-check:<reviewer>`
|
||||
|
||||
### Batching Maker Fixes
|
||||
|
||||
If multiple findings route to the same Maker and affect the same file or tightly coupled files, batch them into a single Maker spawn:
|
||||
|
||||
```
|
||||
Agent(
|
||||
description: "Fix: 3 findings in src/auth/",
|
||||
prompt: "You are the MAKER archetype.
|
||||
Apply these fixes on branch: <maker's branch>
|
||||
|
||||
1. [CRITICAL] src/auth/handler.ts:48 — Empty string bypass → Add length check
|
||||
2. [WARNING] src/auth/handler.ts:52 — Missing rate limit → Add middleware
|
||||
3. [WARNING] tests/auth.test.ts:15 — Bad test names → Rename to behavior descriptions
|
||||
|
||||
Fix all three. Commit each as a separate commit.
|
||||
Run tests after all fixes."
|
||||
)
|
||||
```
|
||||
|
||||
Batch only within the same functional area. Don't batch unrelated fixes — the Maker loses focus.
|
||||
|
||||
---
|
||||
|
||||
## Step 4: Exit Decision
|
||||
|
||||
After all fixes are applied, evaluate exit conditions:
|
||||
|
||||
### Decision Tree
|
||||
|
||||
```
|
||||
┌─ Count remaining CRITICAL findings (including from re-checks)
|
||||
│
|
||||
├─ CRITICAL = 0 AND completion criteria met (if defined)
|
||||
│ └─ EXIT: Proceed to merge
|
||||
│
|
||||
├─ CRITICAL = 0 AND completion criteria NOT met
|
||||
│ └─ CYCLE: Feed back "completion criteria failing" to Creator
|
||||
│
|
||||
├─ CRITICAL > 0 AND cycles_remaining > 0
|
||||
│ └─ CYCLE: Build feedback, go to Plan phase
|
||||
│
|
||||
├─ CRITICAL > 0 AND cycles_remaining = 0
|
||||
│ └─ STOP: Report to user with unresolved findings
|
||||
│
|
||||
└─ Same CRITICAL finding persisted 2+ cycles
|
||||
└─ ESCALATE: Stop and ask user for guidance
|
||||
```
|
||||
|
||||
### Emit `cycle.boundary` event:
|
||||
```json
|
||||
{
|
||||
"type": "cycle.boundary",
|
||||
"phase": "act",
|
||||
"data": {
|
||||
"cycle": 1,
|
||||
"max_cycles": 2,
|
||||
"exit_condition": "all_approved",
|
||||
"met": false,
|
||||
"critical_remaining": 1,
|
||||
"warning_remaining": 2,
|
||||
"info_remaining": 1,
|
||||
"fixes_applied": 3,
|
||||
"design_issues_forwarded": 1,
|
||||
"next_action": "cycle"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 5: Cycle Feedback Protocol
|
||||
|
||||
When cycling back, produce `act-feedback.md` as a structured handoff. This replaces dumping raw findings.
|
||||
|
||||
```markdown
|
||||
## Cycle N Feedback → Cycle N+1
|
||||
|
||||
### For Creator (design changes needed)
|
||||
| # | Source | Severity | Category | Issue | Cycles Open |
|
||||
|---|--------|----------|----------|-------|-------------|
|
||||
| 1 | guardian | CRITICAL | security | SQL injection in user input | 1 |
|
||||
| 2 | skeptic | WARNING | design | Assumes single-tenant only | 1 |
|
||||
|
||||
### For Maker (implementation fixes needed)
|
||||
| # | Source | Severity | Category | Issue | Cycles Open |
|
||||
|---|--------|----------|----------|-------|-------------|
|
||||
| 3 | sage | WARNING | testing | Test assertions too weak | 1 |
|
||||
| 4 | trickster | WARNING | reliability | Error path not tested | 1 |
|
||||
|
||||
### Resolved in This Cycle
|
||||
| # | Source | Issue | How Resolved |
|
||||
|---|--------|-------|--------------|
|
||||
| 5 | guardian | Missing rate limit | Added rate limiter middleware (commit abc123) |
|
||||
| 6 | sage | Test names unclear | Renamed to behavior descriptions (commit def456) |
|
||||
|
||||
### Persisting Issues (escalation candidates)
|
||||
| # | Source | Issue | Cycles Open | Action |
|
||||
|---|--------|-------|-------------|--------|
|
||||
| — | — | — | — | — |
|
||||
```
|
||||
|
||||
**Routing rules** (canonical table — matches orchestration and artifact-routing skills):
|
||||
This is the **canonical routing table** (single source of truth for the whole system):
|
||||
|
||||
| Source | Category | Routes to | Reason |
|
||||
|--------|----------|-----------|--------|
|
||||
@@ -296,76 +50,91 @@ When cycling back, produce `act-feedback.md` as a structured handoff. This repla
|
||||
| Sage | quality, consistency | Maker | Implementation refinement |
|
||||
| Sage | testing | Maker | Test gap, not design flaw |
|
||||
| Trickster | reliability (design flaw) | Creator | Needs redesign |
|
||||
| Trickster | reliability (test gap) | Maker | Needs more tests |
|
||||
| Trickster | testing | Maker | Edge case not covered |
|
||||
| Trickster | reliability (test gap), testing | Maker | Needs more tests |
|
||||
|
||||
**Disambiguation rule:** When in doubt: if the fix requires changing the approach, route to Creator. If it requires changing the code within the existing approach, route to Maker.
|
||||
**Disambiguation:** If the fix requires changing the approach → Creator. If it requires changing code within the existing approach → Maker.
|
||||
|
||||
### Direct Fix (no agent)
|
||||
|
||||
Apply with Edit tool when **all** are true:
|
||||
- Mechanical (typo, naming, formatting, import order)
|
||||
- No behavioral change
|
||||
- No test update needed
|
||||
- Single file
|
||||
|
||||
### Maker Fix (spawn agent)
|
||||
|
||||
Spawn a targeted Maker when the fix involves code logic, tests, multiple files, or behavioral changes. Batch findings in the same file area into one Maker spawn.
|
||||
|
||||
```
|
||||
Agent(
|
||||
description: "Fix: <description>",
|
||||
prompt: "You are the MAKER archetype.
|
||||
Branch: <maker's branch>
|
||||
Findings:
|
||||
1. [CRITICAL] file:line — issue → suggested fix
|
||||
2. [WARNING] file:line — issue → suggested fix
|
||||
Rules: fix ONLY these issues, add/update tests if behavior changes,
|
||||
run tests, commit each fix separately as 'fix: <description>'.
|
||||
Do NOT refactor surrounding code.",
|
||||
isolation: "worktree",
|
||||
mode: "bypassPermissions"
|
||||
)
|
||||
```
|
||||
|
||||
### Design Fix (route to Creator)
|
||||
|
||||
Design findings are NOT fixed in Act. Collect them into `act-feedback.md` for the Creator in the next cycle (see Step 5).
|
||||
|
||||
---
|
||||
|
||||
## Step 6: Incremental Runs
|
||||
## Step 3: Fix Application
|
||||
|
||||
Support starting the orchestration from any phase by reusing existing artifacts.
|
||||
Apply in severity order: CRITICAL → WARNING → INFO. Within same severity, group by file.
|
||||
|
||||
### `--start-from check`
|
||||
|
||||
Re-run Check + Act on existing Do artifacts:
|
||||
1. Read `.archeflow/artifacts/<run_id>/` for Maker branch and implementation summary
|
||||
2. Verify the Maker branch still exists (`git branch --list`)
|
||||
3. Spawn reviewers against the existing branch
|
||||
4. Proceed through Act phase normally
|
||||
|
||||
### `--start-from act`
|
||||
|
||||
Re-run Act with existing Check findings:
|
||||
1. Read `.archeflow/artifacts/<run_id>/` for Check phase consolidated output
|
||||
2. Parse findings from the stored reviewer outputs
|
||||
3. Skip finding collection (already done) — proceed from Step 2 (Fix Routing)
|
||||
|
||||
### `--start-from do`
|
||||
|
||||
Re-run Do + Check + Act with existing Plan:
|
||||
1. Read `.archeflow/artifacts/<run_id>/` for Creator's proposal
|
||||
2. Verify proposal exists and is parseable
|
||||
3. Spawn Maker with the existing proposal
|
||||
4. Proceed through Check and Act normally
|
||||
|
||||
### Artifact Verification
|
||||
|
||||
Before starting from a mid-point, verify required artifacts exist:
|
||||
|
||||
```
|
||||
--start-from do → needs: proposal (Creator output)
|
||||
--start-from check → needs: proposal + implementation (Maker branch + summary)
|
||||
--start-from act → needs: proposal + implementation + review outputs
|
||||
```
|
||||
|
||||
If artifacts are missing, report which ones and abort. Don't guess or generate placeholders.
|
||||
|
||||
### Event Continuity
|
||||
|
||||
For incremental runs, emit events with `parent` pointing to the existing artifacts' events:
|
||||
1. Read the existing `<run_id>.jsonl` to find the last `seq` number
|
||||
2. Continue sequence numbering from there
|
||||
3. Set `parent` on the first new event to point to the last event of the prior phase
|
||||
For each fix:
|
||||
1. Apply the change (direct edit or via Maker agent)
|
||||
2. Emit `fix.applied` event with source, finding, file, severity, before/after
|
||||
3. For non-trivial fixes: re-run only the originating reviewer scoped to changed files. New findings from re-check get added with source `re-check:<reviewer>`
|
||||
|
||||
---
|
||||
|
||||
## Act Phase Checklist (Quick Reference)
|
||||
## Step 4: Exit Decision
|
||||
|
||||
```
|
||||
□ Parse all reviewer outputs into consolidated findings table
|
||||
□ Deduplicate across reviewers
|
||||
□ Compare against prior cycle findings (if cycle > 1)
|
||||
□ Route each finding: direct fix / Maker / Creator feedback
|
||||
□ Apply direct fixes first (fastest)
|
||||
□ Spawn Maker(s) for code fixes (batch by file area)
|
||||
□ Emit fix.applied event for each fix
|
||||
□ Re-check non-trivial fixes with the originating reviewer
|
||||
□ Count remaining CRITICALs after all fixes
|
||||
□ Check completion criteria (if defined)
|
||||
□ Decide: exit / cycle / escalate
|
||||
□ If cycling: produce act-feedback.md with routed findings
|
||||
□ If exiting: proceed to merge (see orchestration skill Step 4)
|
||||
□ Emit cycle.boundary event
|
||||
CRITICAL = 0 AND criteria met → EXIT: proceed to merge
|
||||
CRITICAL = 0 AND criteria NOT met → CYCLE: feedback to Creator
|
||||
CRITICAL > 0 AND cycles remaining → CYCLE: build feedback, go to Plan
|
||||
CRITICAL > 0 AND no cycles left → STOP: report unresolved to user
|
||||
Same CRITICAL persists 2+ cycles → ESCALATE: ask user for guidance
|
||||
```
|
||||
|
||||
Emit `cycle.boundary` event with: cycle number, max_cycles, critical/warning/info remaining, fixes applied, next action.
|
||||
|
||||
---
|
||||
|
||||
## Step 5: Cycle Feedback
|
||||
|
||||
When cycling back, produce `act-feedback.md`:
|
||||
|
||||
```markdown
|
||||
## Cycle N → Cycle N+1
|
||||
|
||||
### For Creator (design changes needed)
|
||||
| # | Source | Severity | Category | Issue | Cycles Open |
|
||||
|---|--------|----------|----------|-------|-------------|
|
||||
|
||||
### For Maker (implementation fixes needed)
|
||||
| # | Source | Severity | Category | Issue | Cycles Open |
|
||||
|---|--------|----------|----------|-------|-------------|
|
||||
|
||||
### Resolved This Cycle
|
||||
| # | Source | Issue | How Resolved |
|
||||
|---|--------|-------|--------------|
|
||||
|
||||
### Persisting Issues (escalation candidates)
|
||||
| # | Source | Issue | Cycles Open | Action |
|
||||
|---|--------|-------|-------------|--------|
|
||||
```
|
||||
|
||||
Route findings into Creator vs Maker sections using the routing table in Step 2.
|
||||
|
||||
34
skills/af-dag/SKILL.md
Normal file
34
skills/af-dag/SKILL.md
Normal file
@@ -0,0 +1,34 @@
|
||||
---
|
||||
name: af-dag
|
||||
description: |
|
||||
Show the DAG of the current or last ArcheFlow run.
|
||||
<example>User: "/af-dag"</example>
|
||||
<example>User: "/af-dag 2026-04-06-jwt-auth"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Run DAG
|
||||
|
||||
1. Parse `run_id` from args. If none provided, read the latest run_id from `.archeflow/events/index.jsonl`.
|
||||
2. Run `./lib/archeflow-dag.sh .archeflow/events/<run_id>.jsonl` if the script exists. Display its output.
|
||||
3. If the script does not exist, read `.archeflow/events/<run_id>.jsonl` and render a text DAG:
|
||||
- Each node is an event (phase transitions, agent starts/completes, findings).
|
||||
- Show parent relationships via indentation.
|
||||
- Mark completed events with `[done]`, active with `[running]`, failed with `[FAIL]`.
|
||||
|
||||
Example output:
|
||||
```
|
||||
run.start 2026-04-06-jwt-auth
|
||||
plan.start
|
||||
agent.complete explorer (42s)
|
||||
agent.complete creator (68s)
|
||||
do.start
|
||||
agent.complete maker (180s)
|
||||
check.start
|
||||
agent.complete guardian (55s) -- 3 findings
|
||||
agent.complete skeptic (40s) -- 1 finding
|
||||
act.start
|
||||
fixes.applied 3/4
|
||||
run.complete (6m12s)
|
||||
```
|
||||
|
||||
4. If no events found for the run_id, say: "No events found for run `<run_id>`."
|
||||
42
skills/af-replay/SKILL.md
Normal file
42
skills/af-replay/SKILL.md
Normal file
@@ -0,0 +1,42 @@
|
||||
---
|
||||
name: af-replay
|
||||
description: "Replay and analyze a recorded ArcheFlow run: decision timeline and weighted what-if. Usage: /af-replay <run-id> [--timeline|--whatif|--compare] [--weights arch=w,...]"
|
||||
user-invocable: true
|
||||
---
|
||||
|
||||
# ArcheFlow Run Replay
|
||||
|
||||
Inspect a completed or in-progress run logged in `.archeflow/events/<run_id>.jsonl`. Use this to study which archetypes drove outcomes and to simulate **weighted** consensus (what-if).
|
||||
|
||||
## Recording (during PDCA)
|
||||
|
||||
After each meaningful orchestration choice, log a **decision point** (in addition to `review.verdict` where applicable):
|
||||
|
||||
```bash
|
||||
./lib/archeflow-decision.sh <run_id> <phase> <archetype> '<input_summary>' '<decision>' <confidence> [parent_seq]
|
||||
```
|
||||
|
||||
Fields stored: `phase`, `archetype`, `input`, `decision`, `confidence`, `ts` (event timestamp). The event type is `decision.point`.
|
||||
|
||||
Lower-level alternative:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-event.sh "$RUN_ID" decision.point check guardian \
|
||||
'{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.85}' 7
|
||||
```
|
||||
|
||||
## Commands (from project root)
|
||||
|
||||
| Action | Shell |
|
||||
|--------|--------|
|
||||
| Timeline | `./lib/archeflow-replay.sh timeline <run_id>` |
|
||||
| What-if | `./lib/archeflow-replay.sh whatif <run_id> [--weights guardian=2,sage=0.5] [--threshold 0.5] [--json]` |
|
||||
| Both | `./lib/archeflow-replay.sh compare <run_id> [--weights ...]` |
|
||||
|
||||
- **Timeline** lists `decision.point` rows and `review.verdict` (check phase).
|
||||
- **What-if** reads the **last** `review.verdict` per archetype in check. **Original** outcome uses strict any-veto (any non-approve → BLOCK). **Replay** uses weighted mean strictness: each reviewer contributes weight × (1 if not approved, else 0); BLOCK if mean ≥ threshold (default 0.5).
|
||||
- **`--json`** emits machine-readable output for dashboards or scripts.
|
||||
|
||||
## Learning effectiveness
|
||||
|
||||
Correlate `decision.point` confidence and verdicts with cycle outcomes (`cycle.boundary`, `run.complete`) and `./lib/archeflow-score.sh extract` to see which archetypes add signal for which task shapes.
|
||||
40
skills/af-report/SKILL.md
Normal file
40
skills/af-report/SKILL.md
Normal file
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: af-report
|
||||
description: |
|
||||
Generate a full process report for an ArcheFlow run.
|
||||
<example>User: "/af-report"</example>
|
||||
<example>User: "/af-report 2026-04-06-jwt-auth"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Run Report
|
||||
|
||||
1. Parse `run_id` from args. If none provided, read the latest run_id from `.archeflow/events/index.jsonl`.
|
||||
2. Run `./lib/archeflow-report.sh .archeflow/events/<run_id>.jsonl` if the script exists. Display its output.
|
||||
3. If the script does not exist, read `.archeflow/events/<run_id>.jsonl` and produce a markdown report:
|
||||
|
||||
```markdown
|
||||
# ArcheFlow Report: <run_id>
|
||||
|
||||
## Overview
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Task | ... |
|
||||
| Workflow | fast/standard/thorough |
|
||||
| Cycles | N |
|
||||
| Duration | Xm Ys |
|
||||
| Total Cost | $X.XX |
|
||||
|
||||
## Phase Summary
|
||||
For each phase (Plan, Do, Check, Act): agents involved, duration, token cost, key outputs.
|
||||
|
||||
## Findings
|
||||
Table of all findings: severity, category, description, archetype source, resolution (fixed/dismissed/deferred).
|
||||
|
||||
## Fixes Applied
|
||||
List of fixes with before/after summary and which finding they addressed.
|
||||
|
||||
## Lessons Learned
|
||||
Any new lessons extracted to memory during this run.
|
||||
```
|
||||
|
||||
4. If no events found for the run_id, say: "No events found for run `<run_id>`."
|
||||
23
skills/af-score/SKILL.md
Normal file
23
skills/af-score/SKILL.md
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
name: af-score
|
||||
description: |
|
||||
Show archetype effectiveness scores across runs.
|
||||
<example>User: "/af-score"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Effectiveness Scores
|
||||
|
||||
1. Run `./lib/archeflow-score.sh list` if the script exists. Display its output.
|
||||
2. If the script does not exist, read `.archeflow/memory/effectiveness.jsonl` directly.
|
||||
3. Summarize per archetype as a table:
|
||||
|
||||
| Archetype | Runs | Signal/Noise | Fix Rate | Avg Cost |
|
||||
|-----------|------|--------------|----------|----------|
|
||||
| Guardian | ... | ... | ... | ... |
|
||||
| Skeptic | ... | ... | ... | ... |
|
||||
|
||||
- **Signal/Noise**: findings that led to actual fixes vs total findings raised.
|
||||
- **Fix Rate**: percentage of findings that were applied (not dismissed).
|
||||
- **Avg Cost**: mean token cost per review across runs.
|
||||
|
||||
4. If no effectiveness data exists, say: "No effectiveness data yet. Run `/af-run` at least once."
|
||||
25
skills/af-status/SKILL.md
Normal file
25
skills/af-status/SKILL.md
Normal file
@@ -0,0 +1,25 @@
|
||||
---
|
||||
name: af-status
|
||||
description: |
|
||||
Show ArcheFlow status — current/last run, active agents, findings.
|
||||
<example>User: "/af-status"</example>
|
||||
---
|
||||
|
||||
# ArcheFlow Status
|
||||
|
||||
1. Read `.archeflow/state.json` if it exists. Extract: task, phase, cycle, workflow, active agents, findings count, start time.
|
||||
2. If `state.json` does not exist, read the latest entry from `.archeflow/events/index.jsonl`. Extract run_id, task, last event type, timestamp.
|
||||
3. Calculate duration from start time to now (or to completion time if run finished).
|
||||
4. Report as a compact table:
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Run | `<run_id>` |
|
||||
| Task | `<task description>` |
|
||||
| Phase | `<current phase>` |
|
||||
| Cycle | `<cycle number>` |
|
||||
| Workflow | `<fast/standard/thorough>` |
|
||||
| Findings | `<count>` |
|
||||
| Duration | `<elapsed>` |
|
||||
|
||||
5. If no `state.json` and no `index.jsonl`, say: "No active or recent ArcheFlow runs."
|
||||
@@ -1,121 +0,0 @@
|
||||
---
|
||||
name: attention-filters
|
||||
description: Use when spawning archetype agents to decide what context each agent receives. Reduces token waste and sharpens focus by passing only relevant artifacts.
|
||||
---
|
||||
|
||||
# Attention Filters
|
||||
|
||||
Each archetype needs different context. Pass only what's relevant — not everything.
|
||||
|
||||
| Archetype | Receives | Does NOT Receive |
|
||||
|-----------|----------|-----------------|
|
||||
| Explorer | Task description, codebase access | Prior proposals or reviews |
|
||||
| Creator | Explorer's research + task description | Implementation details |
|
||||
| Maker | Creator's proposal | Explorer's research, reviews |
|
||||
| Guardian | Maker's git diff + proposal risk section | Explorer's research |
|
||||
| Skeptic | Creator's proposal (focus: assumptions) | Git diff details |
|
||||
| Trickster | Maker's git diff only | Everything else |
|
||||
| Sage | Proposal + implementation + diff | Explorer's raw research |
|
||||
|
||||
## Why This Matters
|
||||
|
||||
- **Token cost:** A Guardian reading the Explorer's 2000-word research wastes ~2600 tokens on irrelevant context
|
||||
- **Focus:** An agent with too much context drifts from its archetype's concern
|
||||
- **Shadow prevention:** Over-loading context encourages rabbit-holing (Explorer) and scope creep (Maker)
|
||||
|
||||
## In Practice
|
||||
|
||||
When spawning a Check-phase agent, include only the filtered context in the prompt:
|
||||
|
||||
```
|
||||
# Guardian receives:
|
||||
"Review these changes: <git diff output>
|
||||
The proposal identified these risks: <risks section only>
|
||||
Verdict: APPROVED or REJECTED with findings."
|
||||
|
||||
# NOT:
|
||||
"Here is the full research, the full proposal, the full implementation,
|
||||
the full git log, and everything else we have..."
|
||||
```
|
||||
|
||||
## Prompt Construction Templates
|
||||
|
||||
### Explorer
|
||||
- **Receives:** Task description, file tree (max 200 lines), prior-cycle feedback (if cycle 2+)
|
||||
- **Excludes:** Creator proposals, Maker diffs, reviewer outputs
|
||||
- **Token target:** ~2000 tokens input
|
||||
|
||||
### Creator
|
||||
- **Receives:** Task description, Explorer research (if available), prior-cycle feedback (if cycle 2+)
|
||||
- **Excludes:** Maker diffs, reviewer outputs
|
||||
- **Token target:** ~3000 tokens input
|
||||
|
||||
### Maker
|
||||
- **Receives:** Creator's proposal (full), test strategy section, file list
|
||||
- **Excludes:** Explorer research, reviewer outputs, prior-cycle feedback
|
||||
- **Token target:** ~2500 tokens input
|
||||
|
||||
### Guardian
|
||||
- **Receives:** Maker's git diff, proposal risk section, test results
|
||||
- **Excludes:** Explorer research, Creator rationale, Skeptic/Sage outputs
|
||||
- **Token target:** ~2000 tokens input
|
||||
|
||||
### Skeptic
|
||||
- **Receives:** Creator's proposal (assumptions + architecture decision), confidence scores
|
||||
- **Excludes:** Git diff details, Explorer raw research, other reviewer outputs
|
||||
- **Token target:** ~1500 tokens input
|
||||
|
||||
### Trickster
|
||||
- **Receives:** Maker's git diff only, attack surface summary (file types + entry points)
|
||||
- **Excludes:** Proposal, research, other reviewer outputs
|
||||
- **Token target:** ~1500 tokens input
|
||||
|
||||
### Sage
|
||||
- **Receives:** Creator's proposal, Maker's implementation summary + diff, test results
|
||||
- **Excludes:** Explorer raw research, other reviewer verdicts
|
||||
- **Token target:** ~2500 tokens input
|
||||
|
||||
## Token Budget Targets
|
||||
|
||||
| Archetype | Fast | Standard | Thorough |
|
||||
|-----------|------|----------|----------|
|
||||
| Explorer | skip | 2000 | 3000 |
|
||||
| Creator | 2000 | 3000 | 4000 |
|
||||
| Maker | 2000 | 2500 | 3000 |
|
||||
| Guardian | 1500 | 2000 | 2500 |
|
||||
| Skeptic | skip | 1500 | 2000 |
|
||||
| Trickster | skip | skip | 1500 |
|
||||
| Sage | skip | 2500 | 3000 |
|
||||
|
||||
"skip" means the archetype is not spawned in that workflow tier.
|
||||
|
||||
## Cycle-Back Filtering
|
||||
|
||||
When injecting prior-cycle feedback into cycle 2+:
|
||||
|
||||
1. **Summary only** — pass the structured feedback table (issue, source, severity), not full reviewer artifacts
|
||||
2. **Strip resolved items** — if a finding was marked Fixed in the Act phase, exclude it
|
||||
3. **Compress context** — prior proposal diffs reduce to "What Changed" section only (not full re-proposal)
|
||||
4. **Cap at 500 tokens** — if feedback exceeds this, summarize by severity (CRITICAL first, then WARNING, drop INFO)
|
||||
|
||||
## Filter Verification Checklist
|
||||
|
||||
Before spawning each agent, verify:
|
||||
|
||||
- [ ] Prompt contains ONLY the artifacts listed in that archetype's "Receives" above
|
||||
- [ ] No cross-contamination from other reviewers' outputs
|
||||
- [ ] Token count is within 20% of the target for the current workflow tier
|
||||
- [ ] Prior-cycle feedback (if any) is summarized, not raw
|
||||
- [ ] Excluded artifacts are genuinely absent (search for keywords like file paths from excluded sources)
|
||||
|
||||
## Context Isolation
|
||||
|
||||
Attention filters control *what* each agent receives. Context isolation controls *how* that context is constructed — ensuring agents operate on provided facts, not ambient knowledge.
|
||||
|
||||
### Rules
|
||||
|
||||
1. **No session bleed.** Agents receive fresh context only — constructed from task description, artifact files, or extracted sections. They must not inherit session state, chat history, or prior agent prompts.
|
||||
2. **No cross-agent contamination.** An agent receives another agent's output only if the attention filter table above explicitly allows it. Guardian does not see Skeptic's output. Skeptic does not see the Maker's diff. Violations produce unreliable reviews.
|
||||
3. **Controller-constructed only.** All agent context is assembled by the orchestrator from: (a) the task description, (b) artifact files on disk, or (c) extracted sections of those artifacts. Agents never pull their own context.
|
||||
4. **No ambient knowledge.** Agents cannot "remember" findings from prior phases or cycles unless that information is explicitly injected via the cycle-back filtering protocol above. An agent that references information not in its prompt is hallucinating.
|
||||
5. **Verification.** Before spawning each agent, confirm the constructed prompt has zero references to other agents' raw outputs that are not in the "Receives" column. Search for file paths, archetype names, and finding descriptions from excluded sources.
|
||||
@@ -1,221 +1,70 @@
|
||||
---
|
||||
name: autonomous-mode
|
||||
description: Use when the user wants to run ArcheFlow orchestrations unattended — overnight sessions, batch processing multiple tasks, or fully autonomous coding. Handles self-organization, progress logging, and safe stopping.
|
||||
description: Use when the user wants to run ArcheFlow orchestrations unattended -- overnight sessions, batch processing multiple tasks, or fully autonomous coding. Handles self-organization, progress logging, and safe stopping.
|
||||
---
|
||||
|
||||
# Autonomous Mode
|
||||
|
||||
ArcheFlow orchestrations can run fully autonomously because the archetypes self-organize through the PDCA cycle. The user sets the task queue, walks away, and reviews results later.
|
||||
|
||||
## How Autonomous Mode Works
|
||||
|
||||
The PDCA cycle provides natural quality gates at every turn of the spiral:
|
||||
- **Plan** phase produces a proposal — reviewable artifact
|
||||
- **Do** phase produces committed code in a worktree — isolated, reversible
|
||||
- **Check** phase produces approval/rejection — automatic quality control
|
||||
- **Act** phase either merges (safe) or cycles back (self-correcting)
|
||||
|
||||
No unreviewed code reaches the main branch. Ever. That's what makes overnight runs safe.
|
||||
|
||||
## Starting an Autonomous Session
|
||||
|
||||
```
|
||||
You are entering AUTONOMOUS MODE.
|
||||
|
||||
Task queue:
|
||||
1. "Add input validation to all API endpoints" (thorough)
|
||||
2. "Refactor auth middleware to use JWT" (standard)
|
||||
3. "Fix pagination bug in search results" (fast)
|
||||
4. "Add rate limiting to public endpoints" (standard)
|
||||
|
||||
Rules:
|
||||
- Process tasks sequentially (one orchestration at a time)
|
||||
- Log progress to .archeflow/session-log.md after each task
|
||||
- If a task fails after max cycles: log findings, skip to next task
|
||||
- If 3 consecutive tasks fail: STOP and wait for user
|
||||
- Commit and push after each successful merge
|
||||
- Never force-push. Never modify main history.
|
||||
```
|
||||
|
||||
## Session Log — Full Visibility
|
||||
|
||||
Every autonomous session writes to `.archeflow/session-log.md`:
|
||||
|
||||
```markdown
|
||||
# ArcheFlow Autonomous Session
|
||||
**Started:** 2026-04-02 22:00 UTC
|
||||
**Mode:** autonomous
|
||||
**Tasks:** 4 queued
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Add input validation to all API endpoints
|
||||
**Workflow:** thorough | **Status:** COMPLETED
|
||||
**Cycles:** 2 of 3
|
||||
**Cycle 1:** Guardian REJECTED (missing sanitization on 2 endpoints)
|
||||
**Cycle 2:** All APPROVED
|
||||
**Files changed:** 8 | **Tests added:** 24
|
||||
**Branch:** merged to main (commit abc1234)
|
||||
**Duration:** 12 min | **Completed:** 22:12 UTC
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Refactor auth middleware to use JWT
|
||||
**Workflow:** standard | **Status:** COMPLETED
|
||||
**Cycles:** 1 of 2
|
||||
**Cycle 1:** All APPROVED (clean implementation)
|
||||
**Files changed:** 5 | **Tests added:** 15
|
||||
**Branch:** merged to main (commit def5678)
|
||||
**Duration:** 8 min | **Completed:** 22:20 UTC
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Fix pagination bug in search results
|
||||
**Workflow:** fast | **Status:** COMPLETED
|
||||
**Cycles:** 1 of 1
|
||||
**Cycle 1:** Guardian APPROVED
|
||||
**Files changed:** 2 | **Tests added:** 3
|
||||
**Branch:** merged to main (commit ghi9012)
|
||||
**Duration:** 4 min | **Completed:** 22:24 UTC
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Add rate limiting to public endpoints
|
||||
**Workflow:** standard | **Status:** FAILED (max cycles)
|
||||
**Cycles:** 2 of 2
|
||||
**Cycle 1:** Skeptic REJECTED (Redis dependency not in Docker setup)
|
||||
**Cycle 2:** Guardian REJECTED (race condition in token bucket)
|
||||
**Unresolved:** Race condition in concurrent token bucket decrement
|
||||
**Branch:** archeflow/maker-xyz (NOT merged — available for manual review)
|
||||
**Duration:** 15 min | **Completed:** 22:39 UTC
|
||||
|
||||
---
|
||||
|
||||
## Session Summary
|
||||
**Completed:** 3 of 4 tasks
|
||||
**Failed:** 1 (rate limiting — needs human input on concurrency design)
|
||||
**Total duration:** 39 min
|
||||
**Files changed:** 15 | **Tests added:** 42
|
||||
**Ended:** 22:39 UTC
|
||||
```
|
||||
|
||||
## Safety Mechanisms
|
||||
|
||||
### Automatic Stop Conditions
|
||||
The session halts and waits for the user when:
|
||||
- **3 consecutive failures:** Something systemic is wrong
|
||||
- **Destructive action detected:** Force push, branch deletion, schema drop
|
||||
- **Shadow escalation:** Same shadow detected 3+ times across tasks
|
||||
- **Budget exceeded:** If cost tracking is enabled, stop at budget limit
|
||||
- **Test suite broken:** If existing tests fail after merge, halt immediately and revert
|
||||
|
||||
### Everything is Reversible
|
||||
- Code changes live on worktree branches until explicitly merged
|
||||
- Merges use `--no-ff` — every merge commit is individually revertable
|
||||
- The session log captures every decision for post-hoc review
|
||||
- Failed tasks leave their branches intact for manual inspection
|
||||
|
||||
### User Controls
|
||||
The user can at any time:
|
||||
- **Cancel:** Kill the session. All incomplete work stays on branches.
|
||||
- **Pause:** Stop after current task completes. Resume later.
|
||||
- **Skip:** Skip the current task, move to the next one.
|
||||
- **Review:** Read `.archeflow/session-log.md` for real-time progress.
|
||||
- **Intervene:** Jump into a worktree branch and fix something manually.
|
||||
ArcheFlow orchestrations run fully autonomously through the PDCA cycle's natural quality gates. No unreviewed code reaches main.
|
||||
|
||||
## Task Queue Formats
|
||||
|
||||
### Simple (inline)
|
||||
**Inline:**
|
||||
```
|
||||
Tasks:
|
||||
1. "Fix the login bug" (fast)
|
||||
2. "Add user profile page" (standard)
|
||||
```
|
||||
|
||||
### From File
|
||||
Create `.archeflow/queue.md`:
|
||||
**From file (`.archeflow/queue.md`):**
|
||||
```markdown
|
||||
- [ ] Fix the login bug | fast
|
||||
- [ ] Add user profile page | standard
|
||||
- [ ] Security audit of payment flow | thorough
|
||||
- [x] Refactor database queries | standard (completed)
|
||||
- [ ] Add user profile page | standard | depends: fix login
|
||||
- [ ] Security audit | thorough | done: Guardian approves AND load_test.sh passes
|
||||
```
|
||||
|
||||
### With Dependencies
|
||||
```markdown
|
||||
- [ ] Add user model (standard)
|
||||
- [ ] Add user API endpoints (standard) | depends: user model
|
||||
- [ ] Add user UI (standard) | depends: user API endpoints
|
||||
```
|
||||
Dependencies are processed in order: a task with `depends: X` waits until X completes successfully. Tasks without dependencies or with resolved dependencies can run in parallel (see Parallel Team Orchestration in the orchestration skill).
|
||||
Tasks with `depends:` wait for the named task to complete. Tasks with `done:` have completion criteria checked in the Act phase.
|
||||
|
||||
### With Completion Criteria
|
||||
```markdown
|
||||
- [ ] Fix login bug | fast | done: login_test.py passes
|
||||
- [ ] Add rate limiting | standard | done: Guardian approves AND load_test.sh passes
|
||||
```
|
||||
Completion criteria are checked in the Act phase. If the test command fails even when reviewers approve, the task cycles back.
|
||||
## Safety Mechanisms
|
||||
|
||||
### Automatic Stop Conditions
|
||||
|
||||
- **3 consecutive failures:** Something systemic is wrong
|
||||
- **Test suite broken:** Halt immediately, revert last merge
|
||||
- **Budget exceeded:** Stop at limit
|
||||
- **Shadow escalation:** Same shadow detected 3+ times across tasks
|
||||
- **Destructive action detected:** Force push, branch deletion, schema drop
|
||||
|
||||
### Everything is Reversible
|
||||
|
||||
- Code lives on worktree branches until explicitly merged
|
||||
- Merges use `--no-ff` (individually revertable)
|
||||
- Failed tasks leave branches intact for inspection
|
||||
|
||||
### User Controls
|
||||
|
||||
- **Cancel:** Kill session, incomplete work stays on branches
|
||||
- **Pause:** Stop after current task, resume later
|
||||
- **Skip:** Move to next task
|
||||
- **Review:** Read `.archeflow/session-log.md` for progress
|
||||
|
||||
## Session Log
|
||||
|
||||
Every session writes to `.archeflow/session-log.md` with per-task entries:
|
||||
- Workflow, status, cycles, reviewer verdicts
|
||||
- Files changed, tests added
|
||||
- Branch and commit info
|
||||
- Duration and timestamps
|
||||
- Session summary at the end
|
||||
|
||||
## Budget-Aware Scheduling
|
||||
|
||||
Set a token or cost budget for the session. The orchestrator tracks estimated cost per task and adapts:
|
||||
|
||||
```
|
||||
Budget: $5.00 (or ~2M tokens)
|
||||
```
|
||||
|
||||
| Budget Remaining | Action |
|
||||
|-----------------|--------|
|
||||
| > 50% | Run tasks at their selected workflow level |
|
||||
| 25-50% | Downgrade `thorough` → `standard`, `standard` → `fast` |
|
||||
| < 25% | Run remaining tasks as `fast` only |
|
||||
| Exhausted | Stop. Log remaining tasks as "skipped — budget exhausted" |
|
||||
| > 50% | Run at selected workflow level |
|
||||
| 25-50% | Downgrade thorough to standard, standard to fast |
|
||||
| < 25% | All tasks as fast only |
|
||||
| Exhausted | Stop, log remaining as skipped |
|
||||
|
||||
Budget is tracked per-task in the session log. Estimated cost per agent by model tier:
|
||||
## Auto-Resume
|
||||
|
||||
| Tier | Model | Est. Cost/Agent |
|
||||
|------|-------|----------------|
|
||||
| cheap | Haiku | ~$0.01 |
|
||||
| standard | Sonnet | ~$0.05 |
|
||||
| premium | Opus | ~$0.25 |
|
||||
|
||||
A standard workflow (6 agents, mostly Sonnet) costs ~$0.30. A thorough workflow (8 agents) costs ~$0.50. These are rough estimates — actual cost depends on context size and output length.
|
||||
|
||||
## Auto-Resume on Interruption
|
||||
|
||||
If a session is interrupted (crash, timeout, user cancel), save state for resumption:
|
||||
|
||||
### On Interruption
|
||||
Write `.archeflow/state.json`:
|
||||
```json
|
||||
{
|
||||
"session_id": "...",
|
||||
"current_task": 2,
|
||||
"current_phase": "check",
|
||||
"current_cycle": 1,
|
||||
"completed_tasks": [1],
|
||||
"queue": ["task3", "task4"],
|
||||
"worktree_branch": "archeflow/maker-abc",
|
||||
"timestamp": "2026-04-03T22:15:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
### On Next Session Start
|
||||
If `.archeflow/state.json` exists:
|
||||
1. Report: "Found interrupted ArcheFlow session from [timestamp]. Task [N] was in [phase] phase."
|
||||
2. Offer: "Resume from where we left off? Or start fresh?"
|
||||
3. If resume: pick up from the saved phase. The worktree branch is still intact.
|
||||
4. If fresh: clean up state file and worktrees, start over.
|
||||
|
||||
## Overnight Session Checklist
|
||||
|
||||
Before starting an autonomous overnight session:
|
||||
|
||||
1. **Clean working tree:** `git status` — no uncommitted changes
|
||||
2. **Tests passing:** Run the full test suite. Don't start on a broken baseline.
|
||||
3. **Task queue defined:** Either inline or in `.archeflow/queue.md`
|
||||
4. **Workflow selected per task:** Match risk level to workflow type
|
||||
5. **Budget set (optional):** If cost matters, set a token/dollar limit
|
||||
6. **Push access:** Verify git push works (SSH key, auth token)
|
||||
|
||||
Then: set it, forget it, read the session log in the morning.
|
||||
On interruption, save state to `.archeflow/state.json` (current task, phase, cycle, completed tasks, worktree branch). On next session start, offer to resume or start fresh.
|
||||
|
||||
@@ -1,233 +1,110 @@
|
||||
---
|
||||
name: check-phase
|
||||
description: Use when you are acting as Guardian, Skeptic, Sage, or Trickster archetype in the Check phase. Defines shared review rules and output format.
|
||||
description: Use when acting as Guardian, Skeptic, Sage, or Trickster in the Check phase. Defines review rules, finding format, attention filters, and spawning protocol.
|
||||
---
|
||||
|
||||
# Check Phase
|
||||
|
||||
Multiple reviewers examine the Maker's implementation in parallel. Each agent definition has its specific protocol — this skill defines the shared rules.
|
||||
Reviewers examine the Maker's implementation. This skill defines shared rules, finding format, and spawning protocol.
|
||||
|
||||
## Shared Rules
|
||||
|
||||
1. **Read the proposal first.** Review against the intended design, not invented requirements.
|
||||
2. **Read the actual code.** Use `git diff` on the Maker's branch. Don't review descriptions alone.
|
||||
3. **Structured findings.** Use the standardized finding format below for every issue.
|
||||
4. **Clear verdict:** `APPROVED` or `REJECTED` with rationale.
|
||||
5. **Status tokens are separate from verdicts.** The `STATUS: DONE` line signals the agent finished successfully. The `APPROVED`/`REJECTED` verdict is domain output. A reviewer can be `STATUS: DONE` with verdict `REJECTED` — that is normal. Parse both independently.
|
||||
1. Review against the proposal's intended design, not invented requirements.
|
||||
2. Read actual code via `git diff` on the Maker's branch.
|
||||
3. Use the finding format below for every issue.
|
||||
4. Give a clear verdict: `APPROVED` or `REJECTED` with rationale.
|
||||
5. `STATUS: DONE` signals agent completion. `APPROVED`/`REJECTED` is domain output. Both are parsed independently.
|
||||
|
||||
## Finding Format
|
||||
|
||||
Every finding must use this format for cross-cycle tracking:
|
||||
|
||||
```
|
||||
| Location | Severity | Category | Description | Fix |
|
||||
|----------|----------|----------|-------------|-----|
|
||||
| src/auth/handler.ts:48 | CRITICAL | security | Empty string bypasses validation | Add length check before processing |
|
||||
```
|
||||
| src/auth/handler.ts:48 | CRITICAL | security | Empty string bypasses validation | Add length check |
|
||||
|
||||
**Severity:**
|
||||
- **CRITICAL** — Must fix. Blocks approval.
|
||||
- **WARNING** — Should fix. Doesn't block alone.
|
||||
- **INFO** — Nice to have. Never blocks.
|
||||
**Severity:** CRITICAL = must fix, blocks approval. WARNING = should fix, doesn't block alone. INFO = nice to have, never blocks.
|
||||
|
||||
**Categories** (use consistently for cross-cycle tracking):
|
||||
- `security` — Injection, auth bypass, data exposure, secrets
|
||||
- `reliability` — Error handling, edge cases, race conditions, crashes
|
||||
- `design` — Architecture, assumptions, scalability, coupling
|
||||
- `breaking-change` — API compatibility, schema migrations, removals
|
||||
- `dependency` — New deps, version conflicts, license issues
|
||||
- `quality` — Readability, maintainability, naming, duplication
|
||||
- `testing` — Missing tests, weak assertions, untested paths
|
||||
- `consistency` — Deviates from codebase patterns
|
||||
**Categories:** `security` `reliability` `design` `breaking-change` `dependency` `quality` `testing` `consistency`
|
||||
|
||||
## Consolidated Output
|
||||
## Evidence Requirements
|
||||
|
||||
After all reviewers finish, compile:
|
||||
Every CRITICAL or WARNING must include concrete evidence. Without evidence, downgrade to INFO.
|
||||
|
||||
```markdown
|
||||
## Check Phase Results — Cycle N
|
||||
**Valid evidence:** command output, exit codes, code citations with line numbers, git diff excerpts, reproduction steps.
|
||||
|
||||
### Guardian: APPROVED
|
||||
| Location | Severity | Category | Description | Fix |
|
||||
|----------|----------|----------|-------------|-----|
|
||||
| src/auth/handler.ts:52 | WARNING | security | Missing rate limit | Add rate limiter middleware |
|
||||
**Banned in CRITICAL/WARNING:** "might be", "could potentially", "appears to", "seems like", "may not". Rewrite with evidence or downgrade.
|
||||
|
||||
### Skeptic: APPROVED
|
||||
| Location | Severity | Category | Description | Fix |
|
||||
|----------|----------|----------|-------------|-----|
|
||||
| src/auth/handler.ts:30 | INFO | design | Consider caching validated tokens | Add TTL cache for token validation |
|
||||
For each CRITICAL/WARNING, state: (1) what was tested, (2) what was observed, (3) what correct behavior should be.
|
||||
|
||||
### Sage: APPROVED
|
||||
| Location | Severity | Category | Description | Fix |
|
||||
|----------|----------|----------|-------------|-----|
|
||||
| tests/auth.test.ts:15 | WARNING | testing | Test names don't describe behavior | Rename to "should reject expired tokens" |
|
||||
## Attention Filters
|
||||
|
||||
### Trickster: REJECTED
|
||||
| Location | Severity | Category | Description | Fix |
|
||||
|----------|----------|----------|-------------|-----|
|
||||
| src/auth/handler.ts:48 | CRITICAL | reliability | Empty string bypasses validation | Add `if (!token || token.trim() === '')` guard |
|
||||
Each archetype receives only relevant context. Do not pass everything.
|
||||
|
||||
### Deduplication
|
||||
If two reviewers raise the same issue (same file + same category), merge:
|
||||
| Guardian + Skeptic | CRITICAL | security | Input not sanitized (src/api.ts:30) | Add validation |
|
||||
| Archetype | Receives | Excludes |
|
||||
|-----------|----------|----------|
|
||||
| Guardian | Maker's git diff + proposal risk section + test results | Explorer research, Creator rationale, other reviewers |
|
||||
| Skeptic | Creator's proposal (assumptions + architecture) + confidence scores | Git diff, Explorer research, other reviewers |
|
||||
| Sage | Creator's proposal + Maker's diff + implementation summary + test results | Explorer raw research, other reviewer verdicts |
|
||||
| Trickster | Maker's git diff + attack surface summary (file types + entry points) | Proposal, research, other reviewers |
|
||||
|
||||
Use the higher severity. Don't double-count in the verdict.
|
||||
**Token budget targets:**
|
||||
|
||||
### Verdict: REJECTED — 1 critical finding
|
||||
→ Build cycle feedback (see orchestration skill) and feed to Plan phase
|
||||
```
|
||||
| Archetype | Fast | Standard | Thorough |
|
||||
|-----------|------|----------|----------|
|
||||
| Guardian | 1500 | 2000 | 2500 |
|
||||
| Skeptic | skip | 1500 | 2000 |
|
||||
| Trickster | skip | skip | 1500 |
|
||||
| Sage | skip | 2500 | 3000 |
|
||||
|
||||
**Context isolation:** Agents receive fresh, controller-constructed context only. No session bleed, no cross-agent contamination, no ambient knowledge. Verify zero references to excluded artifacts before spawning.
|
||||
|
||||
**Cycle-back filtering (cycle 2+):** Pass structured feedback table only (not full reviewer artifacts). Strip resolved items. Cap at 500 tokens — summarize by severity if exceeded.
|
||||
|
||||
## Reviewer Spawning Protocol
|
||||
|
||||
This section defines the exact sequence for spawning reviewers in the Check phase.
|
||||
|
||||
### Step 1: Guardian First (mandatory)
|
||||
|
||||
Guardian always runs first, before any other reviewer. It receives the Maker's git diff and the proposal's risk section only.
|
||||
|
||||
**Context for Guardian:**
|
||||
- `git diff main...<maker-branch>` (the actual code changes)
|
||||
- Risk section from `plan-creator.md` (if present)
|
||||
- Do NOT include: Explorer research, full proposal, other reviewer outputs
|
||||
|
||||
```
|
||||
Agent(
|
||||
description: "Guardian: security and risk review for <task>",
|
||||
prompt: "You are the GUARDIAN archetype.
|
||||
Review the diff: <maker's diff>
|
||||
Proposal risks: <risk section from plan-creator.md>
|
||||
Assess: security vulnerabilities, reliability risks, breaking changes, dependency risks.
|
||||
Output: APPROVED or REJECTED with findings in the standardized format.
|
||||
Each finding: | Location | Severity | Category | Description | Fix |",
|
||||
model: <resolve_model guardian $WORKFLOW>
|
||||
)
|
||||
```
|
||||
Guardian always runs first. It receives the Maker's git diff and the proposal's risk section only.
|
||||
|
||||
Save output to `.archeflow/artifacts/${RUN_ID}/check-guardian.md`.
|
||||
|
||||
### Step 2: A2 Fast-Path Evaluation
|
||||
|
||||
After Guardian completes, parse its output before spawning other reviewers:
|
||||
After Guardian completes, count CRITICAL and WARNING findings in its output. If both are zero, and not escalated, and not first cycle of a thorough workflow — skip remaining reviewers and proceed to Act phase.
|
||||
|
||||
```bash
|
||||
CRITICAL_COUNT=$(grep -c "| CRITICAL |" ".archeflow/artifacts/${RUN_ID}/check-guardian.md" || true)
|
||||
WARNING_COUNT=$(grep -c "| WARNING |" ".archeflow/artifacts/${RUN_ID}/check-guardian.md" || true)
|
||||
### Step 3: Parallel Remaining Reviewers
|
||||
|
||||
# A2 fast-path: skip remaining reviewers if Guardian is clean
|
||||
# Exception: first cycle of thorough workflows always spawns all reviewers
|
||||
if [[ "$CRITICAL_COUNT" -eq 0 && "$WARNING_COUNT" -eq 0 \
|
||||
&& "$ESCALATED" != "true" \
|
||||
&& ! ("$WORKFLOW" == "thorough" && "$CYCLE" -eq 1) ]]; then
|
||||
echo "Guardian fast-path: 0 CRITICAL, 0 WARNING — skipping remaining reviewers."
|
||||
# Proceed directly to Act phase
|
||||
fi
|
||||
```
|
||||
|
||||
### Step 3: Parallel Reviewer Spawning
|
||||
|
||||
If A2 does not trigger, spawn remaining reviewers in parallel based on workflow:
|
||||
If A2 does not trigger, spawn remaining reviewers in parallel:
|
||||
|
||||
| Workflow | Reviewers (after Guardian) |
|
||||
|----------|--------------------------|
|
||||
| `fast` | None (Guardian only) |
|
||||
| `fast` (escalated via A1) | Skeptic + Sage |
|
||||
| `fast` (escalated) | Skeptic + Sage |
|
||||
| `standard` | Skeptic + Sage |
|
||||
| `thorough` | Skeptic + Sage + Trickster |
|
||||
|
||||
Spawn all applicable reviewers in a single message with multiple Agent calls:
|
||||
Each reviewer gets context per the attention filters above.
|
||||
|
||||
```
|
||||
# Standard workflow example — spawn Skeptic and Sage in parallel:
|
||||
Agent(
|
||||
description: "Skeptic: challenge assumptions for <task>",
|
||||
prompt: "<Skeptic prompt with Creator's proposal>",
|
||||
model: <resolve_model skeptic $WORKFLOW>
|
||||
)
|
||||
### Step 4: Collect and Consolidate
|
||||
|
||||
Agent(
|
||||
description: "Sage: holistic quality review for <task>",
|
||||
prompt: "<Sage prompt with proposal + diff + implementation summary>",
|
||||
model: <resolve_model sage $WORKFLOW>
|
||||
)
|
||||
For each reviewer: save to `.archeflow/artifacts/${RUN_ID}/check-<archetype>.md`, emit `review.verdict` event, record sequence number.
|
||||
|
||||
**Deduplication:** If two reviewers raise the same issue (same file + same category), merge into one finding using the higher severity. Don't double-count.
|
||||
|
||||
**Verdict:** Count CRITICAL findings across all reviewers (after dedup). Any CRITICAL = `REJECTED`. Otherwise `APPROVED`.
|
||||
|
||||
Example consolidated output:
|
||||
|
||||
```markdown
|
||||
## Check Phase Results — Cycle 1
|
||||
### Guardian: APPROVED
|
||||
| Location | Severity | Category | Description | Fix |
|
||||
|----------|----------|----------|-------------|-----|
|
||||
| src/auth.ts:52 | WARNING | security | Missing rate limit | Add rate limiter |
|
||||
### Verdict: APPROVED — 0 critical, 1 warning
|
||||
```
|
||||
|
||||
Each reviewer gets context per the attention filters defined in `archeflow:orchestration`:
|
||||
- **Skeptic:** Creator's proposal (assumptions section focus)
|
||||
- **Sage:** Creator's proposal + Maker's diff + implementation summary
|
||||
- **Trickster:** Maker's diff only
|
||||
## Timeout Handling
|
||||
|
||||
### Step 4: Collect Results
|
||||
Each reviewer has a **5-minute timeout**. On timeout: emit `agent.complete` with `"error": true`, log WARNING, treat as no findings, proceed.
|
||||
|
||||
Wait for all spawned reviewers to return. For each:
|
||||
1. Save output to `.archeflow/artifacts/${RUN_ID}/check-<archetype>.md`
|
||||
2. Emit `review.verdict` event with findings
|
||||
3. Record sequence number for DAG parent tracking
|
||||
|
||||
### Timeout Handling
|
||||
|
||||
Each reviewer has a **5-minute timeout**. If a reviewer does not return within 5 minutes:
|
||||
1. Emit `agent.complete` with `"error": true, "reason": "timeout"`
|
||||
2. Log a WARNING — do not block the run
|
||||
3. Treat the timed-out reviewer as having delivered no findings (neither approved nor rejected)
|
||||
4. Proceed with available verdicts
|
||||
|
||||
If Guardian times out, this is a blocking failure — abort the Check phase and report to the user.
|
||||
|
||||
### Re-Check Protocol (Act Phase Fixes)
|
||||
|
||||
When the Act phase routes findings back to the Maker and the Maker applies fixes in a subsequent cycle, the Check phase re-runs with the updated diff. Reviewers who previously rejected should focus on whether their specific findings were addressed. The structured feedback from `act-feedback.md` provides the mapping of which findings were routed where.
|
||||
|
||||
---
|
||||
|
||||
## Evidence Requirements
|
||||
|
||||
Every CRITICAL or WARNING finding must include concrete evidence. Findings without evidence are downgraded to INFO.
|
||||
|
||||
### Evidence Types
|
||||
|
||||
| Type | Example | When Required |
|
||||
|------|---------|---------------|
|
||||
| Command output | `npm test` output showing failure | Test-related findings |
|
||||
| Exit code | `exit code 1 from eslint` | Tool-based validation |
|
||||
| Code citation | `src/auth.ts:48 — \`if (token) { ... }\`` | Logic or security findings |
|
||||
| Git diff | `+ db.query(userInput)` (unsanitized) | Implementation review |
|
||||
| Reproduction steps | "1. Send POST with empty body, 2. Observe 500" | Runtime behavior findings |
|
||||
|
||||
### Banned Phrases
|
||||
|
||||
The following phrases are not permitted in CRITICAL or WARNING findings. They indicate speculation, not evidence:
|
||||
|
||||
- "might be"
|
||||
- "could potentially"
|
||||
- "appears to"
|
||||
- "seems like"
|
||||
- "may not"
|
||||
|
||||
A finding using these phrases must either be rewritten with evidence or downgraded to INFO.
|
||||
|
||||
### Verification Protocol
|
||||
|
||||
For each CRITICAL or WARNING finding, state:
|
||||
|
||||
1. **What was tested** — the specific code path, input, or scenario examined
|
||||
2. **What was observed** — the actual behavior or code construct found
|
||||
3. **What correct behavior should be** — the expected alternative
|
||||
|
||||
### Downgrade Rule
|
||||
|
||||
If a reviewer produces a CRITICAL or WARNING finding without any of the evidence types above, the orchestrator downgrades it to INFO and emits a `decision` event:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-event.sh "$RUN_ID" decision check "" \
|
||||
'{"what":"evidence_downgrade","from":"CRITICAL","to":"INFO","finding":"<description>","reviewer":"<archetype>","reason":"no evidence provided"}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Why Structured Findings Matter
|
||||
|
||||
The standardized format enables:
|
||||
- **Cross-cycle tracking:** Same category + location = same issue. Can detect resolution or regression.
|
||||
- **Feedback routing:** Security/design findings → Creator. Quality/testing findings → Maker.
|
||||
- **Shadow detection:** CRITICAL:WARNING ratios, finding counts, and category distributions are measurable.
|
||||
- **Metrics:** Severity counts feed into the orchestration summary.
|
||||
**Exception:** Guardian timeout is blocking — abort Check phase and report to user.
|
||||
|
||||
@@ -9,384 +9,91 @@ description: |
|
||||
<example>User: "archeflow:run" in a project with colette.yaml</example>
|
||||
---
|
||||
|
||||
# Colette Bridge — Writing Context Auto-Loader
|
||||
# Colette Bridge -- Writing Context Auto-Loader
|
||||
|
||||
When ArcheFlow detects `colette.yaml` in the project root, this skill automatically loads voice profiles, personas, character sheets, and project rules into a context bundle that every agent receives (filtered by archetype role).
|
||||
When `colette.yaml` exists in the project root, this skill loads voice profiles, personas, character sheets, and project rules into a context bundle filtered per archetype.
|
||||
|
||||
## Prerequisites
|
||||
## Activation
|
||||
|
||||
- `archeflow:domains` — Colette Bridge sets domain to `writing` automatically
|
||||
- `archeflow:artifact-routing` — bundle is injected via the artifact routing system
|
||||
- `archeflow:run` — bridge hooks into run initialization
|
||||
|
||||
## Trigger
|
||||
|
||||
At `run.start`, after domain detection but before the Plan phase:
|
||||
|
||||
1. Check if `colette.yaml` exists in the project root
|
||||
2. If found, activate Colette Bridge
|
||||
3. If not found, skip silently (no error, no warning)
|
||||
|
||||
When the bridge activates, it emits a decision event:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-event.sh "$RUN_ID" decision init "" \
|
||||
'{"what":"colette_bridge","chosen":"activated","signal":"colette.yaml found","files_resolved":<count>}'
|
||||
```
|
||||
|
||||
---
|
||||
At `run.start`, after domain detection but before Plan phase:
|
||||
1. Check for `colette.yaml` in project root
|
||||
2. If found: activate bridge, set domain to `writing`
|
||||
3. If not found: skip silently
|
||||
|
||||
## File Resolution
|
||||
|
||||
Colette projects reference files by ID (e.g., `vp-giesing-gschichten-v1`) but the actual YAML files may live in different locations. The bridge resolves files using this search order:
|
||||
Colette projects reference files by ID (e.g., `vp-giesing-gschichten-v1`). The bridge resolves them:
|
||||
|
||||
### Search Priority (highest first)
|
||||
| Priority | Location |
|
||||
|----------|----------|
|
||||
| 1 | Explicit path in `colette.yaml` (has `/` or `.yaml`) |
|
||||
| 2 | Project root subdirectories (`./profiles/<id>.yaml`) |
|
||||
| 3 | Parent `writing.colette/` dir (`../writing.colette/profiles/<id>.yaml`) |
|
||||
|
||||
| Priority | Location | Example |
|
||||
|----------|----------|---------|
|
||||
| 1 | Explicit path in `colette.yaml` | `voice.profile: ../writing.colette/profiles/custom.yaml` |
|
||||
| 2 | Project root subdirectories | `./profiles/vp-giesing-gschichten-v1.yaml` |
|
||||
| 3 | Parent directory + `writing.colette/` | `../writing.colette/profiles/vp-giesing-gschichten-v1.yaml` |
|
||||
**What gets resolved:**
|
||||
|
||||
### What Gets Resolved
|
||||
|
||||
| Source | colette.yaml field | Search paths |
|
||||
|--------|-------------------|-------------|
|
||||
| Voice profile | `voice.profile` | `profiles/<id>.yaml`, `../writing.colette/profiles/<id>.yaml` |
|
||||
| Persona | `writing.persona` or inferred from profile | `personas/<id>.yaml`, `../writing.colette/personas/<id>.yaml` |
|
||||
| Source | colette.yaml field | Search subdirs |
|
||||
|--------|-------------------|----------------|
|
||||
| Voice profile | `voice.profile` | `profiles/` |
|
||||
| Persona | `writing.persona` or inferred from profile | `personas/` |
|
||||
| Characters | Auto-discovered | `characters/*.yaml` |
|
||||
| Series config | `series` section (if present) | `colette.yaml` itself, `../writing.colette/series/<name>.yaml` |
|
||||
| Series config | `series` section | `colette.yaml` itself |
|
||||
| Project rules | Always | `CLAUDE.md` in project root |
|
||||
|
||||
### Resolution Procedure
|
||||
|
||||
```
|
||||
for each reference in colette.yaml:
|
||||
1. If the field contains a path (has / or .yaml) → use as-is, verify exists
|
||||
2. If the field contains an ID (e.g., "vp-giesing-gschichten-v1"):
|
||||
a. Check ./profiles/<id>.yaml (or ./personas/<id>.yaml)
|
||||
b. Check ../writing.colette/profiles/<id>.yaml (or ../writing.colette/personas/<id>.yaml)
|
||||
c. If not found → warn in event log, skip this file
|
||||
3. For characters/ → glob characters/*.yaml in project root
|
||||
4. For CLAUDE.md → check project root
|
||||
```
|
||||
|
||||
If a referenced file cannot be found at any location, emit a warning event but do not abort:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-event.sh "$RUN_ID" decision init "" \
|
||||
'{"what":"colette_bridge_warning","chosen":"skip","file":"vp-giesing-gschichten-v1","reason":"not found in any search path"}'
|
||||
```
|
||||
|
||||
---
|
||||
Missing files emit a warning event but do not abort the run.
|
||||
|
||||
## Context Bundle
|
||||
|
||||
The bridge generates `.archeflow/context/colette-bundle.md` — a summarized, token-efficient Markdown file that agents receive as part of their prompt context.
|
||||
Generated at `.archeflow/context/colette-bundle.md`. Summarized, not raw YAML. Target: under 1500 tokens.
|
||||
|
||||
### Bundle Structure
|
||||
|
||||
```markdown
|
||||
# Writing Context (auto-loaded from Colette)
|
||||
|
||||
## Voice Profile: <id>
|
||||
**Tone:** <tone_summary from meta>
|
||||
**Perspective:** <perspektive>
|
||||
**Density:** <dichte>
|
||||
**Attitude:** <haltung>
|
||||
**Sharpness:** <schaerfe>
|
||||
**Humor:** <humor>
|
||||
**Tempo:** <tempo>
|
||||
**Reader relationship:** <leser_beziehung>
|
||||
|
||||
### Forbidden
|
||||
- <each item from verboten>
|
||||
|
||||
### Allowed
|
||||
- <each item from erlaubt>
|
||||
|
||||
### Style models
|
||||
- <each item from vorbilder, name only + one-word tag>
|
||||
|
||||
## Persona: <id>
|
||||
**Name:** <name>
|
||||
**Bio:** <bio, max 2 sentences>
|
||||
**Genres:** <genres, comma-separated>
|
||||
|
||||
### Rules
|
||||
- <each item from rules>
|
||||
|
||||
## Characters
|
||||
### <name> (<role>)
|
||||
- **Age:** <age>
|
||||
- **Key traits:** <first 3 personality items>
|
||||
- **Speech:** <speech_pattern, first sentence only>
|
||||
- **Relationships:** <key relationships, one line each>
|
||||
|
||||
[Repeated for each character in characters/*.yaml]
|
||||
|
||||
## Series Context
|
||||
[Only if series config found in colette.yaml]
|
||||
- **Shared concepts:** <list>
|
||||
- **Glossary:** <key terms>
|
||||
- **Forbidden cross-story:** <items>
|
||||
|
||||
## Project Rules (from CLAUDE.md)
|
||||
[Key writing rules extracted from CLAUDE.md, summarized as bullet points]
|
||||
- <rule 1>
|
||||
- <rule 2>
|
||||
- ...
|
||||
```
|
||||
|
||||
### Summarization Rules
|
||||
|
||||
The bundle is **summarized**, not a raw YAML dump. This reduces token cost:
|
||||
|
||||
- Voice profile dimensions: key name + value (no YAML formatting, no `dimensionen:` wrapper)
|
||||
- Verboten/erlaubt: bullet list, strip explanation after the dash if over 15 words
|
||||
**Summarization rules:**
|
||||
- Voice dimensions: key + value (no YAML wrapper)
|
||||
- Verboten/erlaubt: bullet list, truncate items over 15 words
|
||||
- Characters: name, role, age, top 3 traits, first sentence of speech pattern, relationships
|
||||
- Persona bio: max 2 sentences
|
||||
- CLAUDE.md: extract only rules/style sections, skip meta/git/cost config
|
||||
- Target: bundle should be under 1500 tokens for a typical project
|
||||
|
||||
---
|
||||
- CLAUDE.md: only writing rules, skip meta/git/cost config
|
||||
|
||||
## Caching
|
||||
|
||||
The bundle is regenerated only when source files have changed. Cache validation uses file modification times.
|
||||
|
||||
### Cache Check Procedure
|
||||
|
||||
```
|
||||
bundle_path = .archeflow/context/colette-bundle.md
|
||||
|
||||
if bundle_path does not exist → generate
|
||||
if bundle_path exists:
|
||||
bundle_mtime = mtime of bundle_path
|
||||
for each resolved source file:
|
||||
if source_mtime > bundle_mtime → regenerate, break
|
||||
if no source file is newer → use cached bundle
|
||||
```
|
||||
|
||||
When the cache is valid, emit:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-event.sh "$RUN_ID" decision init "" \
|
||||
'{"what":"colette_bundle_cache","chosen":"reuse","reason":"all sources older than bundle"}'
|
||||
```
|
||||
|
||||
When regenerating:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-event.sh "$RUN_ID" decision init "" \
|
||||
'{"what":"colette_bundle_cache","chosen":"regenerate","reason":"<file> modified since last bundle"}'
|
||||
```
|
||||
|
||||
---
|
||||
Bundle regenerated only when source file mtimes are newer than the bundle. If all sources are older, reuse cached bundle.
|
||||
|
||||
## Per-Agent Attention Filters
|
||||
|
||||
Not every agent needs the full bundle. The bridge defines attention filters that control which sections each archetype receives. This extends the base attention filters from `archeflow:attention-filters`.
|
||||
Not every agent needs the full bundle:
|
||||
|
||||
| Archetype | Bundle sections injected | Rationale |
|
||||
|-----------|------------------------|-----------|
|
||||
| **Explorer** | Full bundle | Needs all context for research — setting, characters, voice, rules |
|
||||
| **Creator** | Voice dimensions + persona rules + characters | Designs outline — needs to know who speaks how, who exists, what's allowed |
|
||||
| **Maker** | Full bundle | Writes prose — needs voice for style, characters for dialogue, rules for guardrails |
|
||||
| **Guardian** | Characters + series shared_concepts | Checks consistency — needs character facts and cross-story constraints |
|
||||
| **Sage** | Voice profile (full, including verboten/erlaubt) + persona rules | Checks voice drift — needs the complete voice spec and persona constraints |
|
||||
| **Trickster** | Characters + series glossary | Tests continuity — needs character facts and terminology for contradiction checks |
|
||||
| Archetype | Receives |
|
||||
|-----------|----------|
|
||||
| Explorer | Full bundle |
|
||||
| Creator | Voice dimensions + persona rules + characters |
|
||||
| Maker | Full bundle |
|
||||
| Guardian | Characters + series shared_concepts |
|
||||
| Sage | Full voice profile (incl. verboten/erlaubt) + persona rules |
|
||||
| Trickster | Characters + series glossary |
|
||||
|
||||
### Filter Implementation
|
||||
|
||||
When injecting the bundle into an agent prompt, extract only the relevant sections:
|
||||
|
||||
```
|
||||
# For Guardian:
|
||||
Extract: "## Characters" section (all characters)
|
||||
Extract: "## Series Context" section (if present)
|
||||
Skip: everything else
|
||||
|
||||
# For Sage:
|
||||
Extract: "## Voice Profile" section (full, with forbidden/allowed)
|
||||
Extract: "## Persona" section (rules subsection)
|
||||
Skip: characters, series, project rules
|
||||
|
||||
# For Explorer and Maker:
|
||||
Inject: full bundle as-is
|
||||
```
|
||||
|
||||
The filtering happens at prompt assembly time, not at bundle generation time. One bundle, multiple filtered views.
|
||||
|
||||
### Custom Archetypes
|
||||
|
||||
Custom archetypes (e.g., `story-explorer`, `story-sage`) inherit the filter of their closest base archetype:
|
||||
|
||||
| Custom archetype | Inherits filter from | Override |
|
||||
|-----------------|---------------------|----------|
|
||||
| `story-explorer` | Explorer | Full bundle |
|
||||
| `story-sage` | Sage | Full voice profile + persona rules |
|
||||
| `story-guardian` | Guardian | Characters + series |
|
||||
|
||||
If a custom archetype needs a different filter, define it in the archetype's markdown frontmatter:
|
||||
Custom archetypes inherit the filter of their closest base archetype. Override with `colette_filter` in archetype frontmatter:
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: story-sage
|
||||
colette_filter: [voice_profile, persona, characters]
|
||||
---
|
||||
```
|
||||
|
||||
The `colette_filter` field accepts section keys: `voice_profile`, `persona`, `characters`, `series`, `project_rules`, `full`.
|
||||
Section keys: `voice_profile`, `persona`, `characters`, `series`, `project_rules`, `full`.
|
||||
|
||||
---
|
||||
|
||||
## Integration with Run Skill
|
||||
|
||||
The Colette Bridge hooks into `archeflow:run` initialization. The sequence is:
|
||||
## Run Integration
|
||||
|
||||
```
|
||||
run.start
|
||||
├── Domain detection (from archeflow:domains)
|
||||
│ └── colette.yaml found → domain = writing
|
||||
├── Colette Bridge activation
|
||||
│ ├── Resolve files (voice profile, persona, characters, CLAUDE.md)
|
||||
│ ├── Check bundle cache
|
||||
│ ├── Generate/refresh bundle → .archeflow/context/colette-bundle.md
|
||||
│ └── Register bundle path in artifact routing
|
||||
└── Continue to Plan phase
|
||||
+-- Domain detection -> colette.yaml found -> domain = writing
|
||||
+-- Colette Bridge activation
|
||||
| +-- Resolve files
|
||||
| +-- Check/refresh bundle cache
|
||||
| +-- Register bundle in artifact routing
|
||||
+-- Continue to Plan phase
|
||||
```
|
||||
|
||||
### Artifact Routing Registration
|
||||
|
||||
The bundle path is registered so that every phase's context injection includes the (filtered) bundle:
|
||||
|
||||
```
|
||||
artifact_routing.register_context(
|
||||
path = ".archeflow/context/colette-bundle.md",
|
||||
inject_at = "all_phases",
|
||||
filter_by = "archetype" # Apply per-agent attention filters
|
||||
)
|
||||
```
|
||||
|
||||
In practice, this means the run skill prepends the filtered bundle content to each agent's prompt, after the standard task description but before phase-specific artifacts.
|
||||
|
||||
### Prompt Injection Order
|
||||
|
||||
```
|
||||
1. Archetype definition (from SKILL.md or custom archetype .md)
|
||||
2. Domain-specific review focus (from archeflow:domains)
|
||||
**Prompt injection order:**
|
||||
1. Archetype definition
|
||||
2. Domain-specific review focus
|
||||
3. Colette bundle (filtered for this archetype)
|
||||
4. Task description
|
||||
5. Phase-specific artifacts (Explorer output, Creator proposal, etc.)
|
||||
5. Phase-specific artifacts
|
||||
6. Cycle feedback (if cycle 2+)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Example: Giesing Gschichten
|
||||
|
||||
Given this `colette.yaml`:
|
||||
|
||||
```yaml
|
||||
project:
|
||||
name: "Giesing Gschichten"
|
||||
author: "C. Nennemann"
|
||||
language: de
|
||||
type: fiction
|
||||
|
||||
voice:
|
||||
profile: vp-giesing-gschichten-v1
|
||||
|
||||
writing:
|
||||
target_words: 6000
|
||||
style: "Ich-Erzaehler, lakonisch, Eberhofer-meets-Grossstadt"
|
||||
```
|
||||
|
||||
The bridge:
|
||||
|
||||
1. Reads `voice.profile: vp-giesing-gschichten-v1`
|
||||
2. Searches for `./profiles/vp-giesing-gschichten-v1.yaml` — not found
|
||||
3. Searches for `../writing.colette/profiles/vp-giesing-gschichten-v1.yaml` — found
|
||||
4. Infers persona from voice profile ID pattern or searches `personas/` — finds `giesinger.yaml` at `../writing.colette/personas/giesinger.yaml`
|
||||
5. Globs `characters/*.yaml` — finds `alex.yaml` (and others if present)
|
||||
6. Reads `CLAUDE.md` for writing rules
|
||||
7. Generates bundle:
|
||||
|
||||
```markdown
|
||||
# Writing Context (auto-loaded from Colette)
|
||||
|
||||
## Voice Profile: vp-giesing-gschichten-v1
|
||||
**Tone:** Lakonisch, warmherzig-genervt, trockener Humor
|
||||
**Perspective:** Ich-Erzaehler (Alex), nah dran, subjektiv
|
||||
**Density:** Alltagsdetails die Atmosphaere schaffen
|
||||
**Attitude:** Lakonisch, leicht genervt, aber mit Herz
|
||||
**Sharpness:** Beobachtungsscharf, sprachlich reduziert
|
||||
**Humor:** Trocken, Understatement, absurde Situationen
|
||||
**Tempo:** Gemaechlich mit Spannungsspitzen, Slow Burn
|
||||
**Reader relationship:** Kumpel am Stammtisch
|
||||
|
||||
### Forbidden
|
||||
- Hochdeutsch-Sterilitaet
|
||||
- Krimi-Klischees (CSI, Profiler, Tatort)
|
||||
- Lederhosen-Kitsch und Oktoberfest-Folklore
|
||||
- Dialekt-Overkill
|
||||
- Moralisieren oder Erklaeren
|
||||
- Kuenstliche Spannungsaufbauten
|
||||
- Adverb-Orgien und Adjektiv-Ketten
|
||||
- Infodumps
|
||||
|
||||
### Allowed
|
||||
- Bairische Einsprengsel in Hochdeutsch-Prosa
|
||||
- Essen und Trinken als Leitmotiv
|
||||
- Kiffer-Humor und Slow-Motion-Beobachtungen
|
||||
- Gentrification-Satire
|
||||
- Echte Giesinger Orte und Strassen
|
||||
- Skurrile Nachbarn
|
||||
- Kriminalplot aus dem Alltag
|
||||
- Kurze, lakonische Dialoge
|
||||
|
||||
### Style models
|
||||
- Rita Falk (Erzaehlton), Wolf Haas (lakonisch), Helmut Dietl (Muenchner Milieu), Friedrich Ani (duester), Bukowski (Anti-Held)
|
||||
|
||||
## Persona: giesinger
|
||||
**Name:** Der Giesinger
|
||||
**Bio:** Erzaehlt Geschichten aus Muenchen-Giesing. Eberhofer meets Grossstadt.
|
||||
**Genres:** Krimi, Kurzgeschichte, Milieustudie
|
||||
|
||||
### Rules
|
||||
- Ich-Erzaehler, immer — Alex erzaehlt
|
||||
- Hauptsaechlich Hochdeutsch mit bairischen Einsprengsel
|
||||
- Jede Geschichte hat einen Kriminalplot
|
||||
- Essen/Trinken in jeder Geschichte
|
||||
- Echte Giesinger Orte und Strassen
|
||||
- Humor durch Understatement
|
||||
- Alex ist kein Ermittler
|
||||
- Figuren reden wie echte Menschen
|
||||
|
||||
## Characters
|
||||
### Alex (protagonist)
|
||||
- **Age:** Mitte 30
|
||||
- **Key traits:** Lakonisch, funktionaler Kiffer, unmotiviert aber nicht dumm
|
||||
- **Speech:** Kurze Saetze, Hochdeutsch mit bairischen Einsprengsel.
|
||||
- **Relationships:** Mo — Nachbar, Kumpel und Unruhestifter
|
||||
|
||||
## Project Rules (from CLAUDE.md)
|
||||
- Jede Geschichte beginnt mit einer Alltagsszene
|
||||
- Kriminalplot ergibt sich organisch aus dem Alltag
|
||||
- Essen/Trinken in jeder Geschichte
|
||||
- Echte Giesinger Orte verwenden
|
||||
- Kein Moralisieren, kein Erklaerbaer
|
||||
- Ende muss nicht alles aufloesen
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Summarize, don't dump.** Raw YAML wastes tokens and confuses agents. The bundle is a curated briefing.
|
||||
2. **Cache aggressively.** Voice profiles and characters rarely change mid-run. Only regenerate when mtimes change.
|
||||
3. **Filter per agent.** A Guardian checking plot consistency does not need the full voice profile. A Sage checking voice drift does not need character sheets.
|
||||
4. **Graceful degradation.** Missing files are warned about, not fatal. A project with `colette.yaml` but no characters/ still works — the Characters section is simply empty.
|
||||
5. **One bundle, filtered views.** Generate the full bundle once. Filter at injection time per archetype. This keeps caching simple.
|
||||
6. **Additive to existing skills.** The bridge does not replace domain detection or artifact routing — it hooks into them. Remove the bridge, everything still works (just without auto-loaded writing context).
|
||||
|
||||
@@ -8,320 +8,87 @@ description: |
|
||||
<example>Automatically active when budget is configured</example>
|
||||
---
|
||||
|
||||
# Cost Tracking — Budget-Aware Orchestration
|
||||
# Cost Tracking -- Budget-Aware Orchestration
|
||||
|
||||
Every ArcheFlow orchestration consumes LLM tokens. This skill tracks costs per agent and per run, enforces budgets, and recommends cost-optimal model assignments.
|
||||
Tracks costs per agent and per run, enforces budgets, and selects cost-optimal models.
|
||||
|
||||
## Model Pricing Table
|
||||
## Model Pricing
|
||||
|
||||
Current pricing (update when models change):
|
||||
| Model | Input ($/M tok) | Output ($/M tok) |
|
||||
|-------|----------------:|-----------------:|
|
||||
| claude-opus-4-6 | 15.00 | 75.00 |
|
||||
| claude-sonnet-4-6 | 3.00 | 15.00 |
|
||||
| claude-haiku-4-5 | 0.80 | 4.00 |
|
||||
|
||||
| Model | Input ($/M tokens) | Output ($/M tokens) | Notes |
|
||||
|-------|--------------------:|---------------------:|-------|
|
||||
| `claude-opus-4-6` | 15.00 | 75.00 | Highest quality, use sparingly |
|
||||
| `claude-sonnet-4-6` | 3.00 | 15.00 | Good balance of quality and cost |
|
||||
| `claude-haiku-4-5` | 0.80 | 4.00 | Cheap, fast, good for structured tasks |
|
||||
**Prompt caching:** 90% discount on cached input tokens. Structure system prompts for cache hits.
|
||||
**Batches API:** 50% discount. Use for non-time-sensitive bulk ops.
|
||||
|
||||
**Prompt caching** (when applicable): 90% discount on cached input tokens. The orchestrator should structure system prompts to maximize cache hits (archetype instructions, voice profiles, and domain context are cache-friendly since they repeat across agents in a run).
|
||||
|
||||
**Batches API**: 50% discount on all tokens. Use for non-time-sensitive bulk operations (validation passes, consistency checks).
|
||||
|
||||
## Per-Agent Cost Tracking
|
||||
|
||||
Every `agent.complete` event includes cost data:
|
||||
|
||||
```jsonl
|
||||
{
|
||||
"type": "agent.complete",
|
||||
"data": {
|
||||
"archetype": "story-explorer",
|
||||
"duration_ms": 87605,
|
||||
"tokens_input": 15000,
|
||||
"tokens_output": 6000,
|
||||
"tokens_cache_read": 8000,
|
||||
"model": "haiku",
|
||||
"estimated_cost_usd": 0.02,
|
||||
"summary": "3 plot directions developed, recommended C"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Cost Calculation
|
||||
## Cost Calculation
|
||||
|
||||
```
|
||||
cost = (tokens_input - tokens_cache_read) * input_price / 1_000_000
|
||||
+ tokens_cache_read * input_price * 0.10 / 1_000_000
|
||||
+ tokens_output * output_price / 1_000_000
|
||||
cost = (input - cache_read) * input_price/1M
|
||||
+ cache_read * input_price * 0.10/1M
|
||||
+ output * output_price/1M
|
||||
```
|
||||
|
||||
If exact token counts are unavailable (Claude Code doesn't always expose them), estimate based on character count:
|
||||
If exact tokens unavailable, estimate: `tokens ~= chars / 4`. Mark with `cost_estimated: true`.
|
||||
|
||||
```
|
||||
estimated_tokens = character_count / 4 # rough heuristic
|
||||
```
|
||||
## Default Model Assignments
|
||||
|
||||
Mark estimated costs with `"cost_estimated": true` in the event data so reports can distinguish measured from estimated values.
|
||||
| Archetype | Code | Writing |
|
||||
|-----------|------|---------|
|
||||
| Explorer | haiku | haiku |
|
||||
| Creator | sonnet | sonnet |
|
||||
| Maker | sonnet | **sonnet** |
|
||||
| Guardian | haiku | haiku |
|
||||
| Skeptic | haiku | haiku |
|
||||
| Sage | sonnet | **sonnet** |
|
||||
| Trickster | haiku | haiku |
|
||||
|
||||
## Run-Level Aggregation
|
||||
Opus is user-opt-in only (team preset `model_overrides`).
|
||||
|
||||
The `run.complete` event includes cost totals:
|
||||
**Resolution order:** team preset override > domain override > archetype default.
|
||||
|
||||
```jsonl
|
||||
{
|
||||
"type": "run.complete",
|
||||
"data": {
|
||||
"status": "completed",
|
||||
"total_tokens_input": 95000,
|
||||
"total_tokens_output": 33000,
|
||||
"total_tokens_cache_read": 42000,
|
||||
"total_cost_usd": 1.45,
|
||||
"budget_usd": 10.00,
|
||||
"budget_remaining_usd": 8.55,
|
||||
"agents_total": 5,
|
||||
"cost_by_phase": {
|
||||
"plan": 0.35,
|
||||
"do": 0.72,
|
||||
"check": 0.38
|
||||
},
|
||||
"cost_by_model": {
|
||||
"haiku": 0.12,
|
||||
"sonnet": 1.33
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
## Pre-Agent Cost Estimates
|
||||
|
||||
### Cost Summary in Orchestration Report
|
||||
| Archetype | Typical Input | Typical Output |
|
||||
|-----------|-------------:|---------------:|
|
||||
| Explorer | 8k | 4k |
|
||||
| Creator | 12k | 6k |
|
||||
| Maker | 15k | 12k |
|
||||
| Guardian | 10k | 3k |
|
||||
| Skeptic | 8k | 3k |
|
||||
| Sage | 12k | 4k |
|
||||
| Trickster | 8k | 4k |
|
||||
|
||||
After each orchestration, the report includes a cost section:
|
||||
|
||||
```markdown
|
||||
## Cost Summary
|
||||
| Phase | Model(s) | Tokens (in/out) | Cost |
|
||||
|-------|----------|-----------------|------|
|
||||
| Plan | haiku, sonnet | 32k / 12k | $0.35 |
|
||||
| Do | sonnet | 40k / 15k | $0.72 |
|
||||
| Check | haiku, sonnet | 23k / 6k | $0.38 |
|
||||
| **Total** | | **95k / 33k** | **$1.45** |
|
||||
|
||||
Budget: $10.00 | Spent: $1.45 | Remaining: $8.55
|
||||
```
|
||||
After 10+ runs, use actual averages from `metrics.jsonl` instead.
|
||||
|
||||
## Budget Configuration
|
||||
|
||||
Budgets are defined in team presets or `.archeflow/config.yaml`:
|
||||
|
||||
```yaml
|
||||
# .archeflow/config.yaml
|
||||
budget:
|
||||
per_run_usd: 10.00 # Max cost per orchestration run
|
||||
per_agent_usd: 3.00 # Max cost per individual agent
|
||||
daily_usd: 50.00 # Max daily spend across all runs
|
||||
warn_at_percent: 75 # Warn when this % of budget is consumed
|
||||
per_run_usd: 10.00
|
||||
per_agent_usd: 3.00
|
||||
daily_usd: 50.00
|
||||
warn_at_percent: 75
|
||||
```
|
||||
|
||||
```yaml
|
||||
# Team preset override
|
||||
name: story-development
|
||||
domain: writing
|
||||
budget:
|
||||
per_run_usd: 5.00 # Writing runs are usually cheaper
|
||||
```
|
||||
|
||||
Team preset budget overrides the global config for that run.
|
||||
|
||||
### Budget Precedence
|
||||
|
||||
1. Team preset `budget` (if set)
|
||||
2. `.archeflow/config.yaml` `budget`
|
||||
3. No budget (unlimited) — costs are still tracked but not enforced
|
||||
Team preset budget overrides global config. No budget = unlimited (costs still tracked).
|
||||
|
||||
## Budget Enforcement
|
||||
|
||||
Budget checks happen at two points:
|
||||
**Pre-agent:** Estimate cost. If > remaining budget: stop (autonomous) or warn (attended).
|
||||
|
||||
### 1. Pre-Agent Check (before spawning)
|
||||
**Post-agent:** Update total. Warn at threshold. Stop if budget exceeded.
|
||||
|
||||
Before each agent is spawned, estimate its cost and check against remaining budget:
|
||||
## Cost Optimization
|
||||
|
||||
```
|
||||
estimated_agent_cost = estimate_tokens(archetype, task_complexity) * model_price
|
||||
remaining_budget = budget - sum(costs_so_far)
|
||||
|
||||
if estimated_agent_cost > remaining_budget:
|
||||
WARN: "Estimated cost for {archetype} (${estimated}) would exceed remaining budget (${remaining}). Continue? [y/N]"
|
||||
```
|
||||
|
||||
**In autonomous mode**: if budget would be exceeded, STOP the run and report. Do not prompt — there is no one to answer.
|
||||
|
||||
**In attended mode**: warn and ask the user. They can approve the overage or stop.
|
||||
|
||||
### 2. Post-Agent Check (after completion)
|
||||
|
||||
After each agent completes, update the running total and check:
|
||||
|
||||
```
|
||||
if total_cost > budget * warn_at_percent / 100:
|
||||
WARN: "Budget ${warn_at_percent}% consumed (${total_cost} of ${budget})"
|
||||
|
||||
if total_cost > budget:
|
||||
STOP: "Budget exceeded (${total_cost} of ${budget}). Run halted."
|
||||
```
|
||||
|
||||
### Pre-Agent Cost Estimation
|
||||
|
||||
Rough token estimates by archetype (calibrate over time with actual data from `metrics.jsonl`):
|
||||
|
||||
| Archetype | Typical Input | Typical Output | Notes |
|
||||
|-----------|-------------:|---------------:|-------|
|
||||
| Explorer | 8k | 4k | Research, reads many files |
|
||||
| Creator | 12k | 6k | Receives Explorer output, produces plan |
|
||||
| Maker | 15k | 12k | Largest output (implementation/prose) |
|
||||
| Guardian | 10k | 3k | Reads diff, structured output |
|
||||
| Skeptic | 8k | 3k | Reads proposal, structured challenges |
|
||||
| Sage | 12k | 4k | Reads diff + proposal |
|
||||
| Trickster | 8k | 4k | Reads diff, generates test cases |
|
||||
|
||||
These are starting estimates. After 10+ runs, use actual averages from `metrics.jsonl` instead.
|
||||
|
||||
## Cost-Aware Model Selection
|
||||
|
||||
Each archetype has a recommended model tier based on the quality requirements of its role:
|
||||
|
||||
### Default Model Assignments (Code Domain)
|
||||
|
||||
| Archetype | Model | Rationale |
|
||||
|-----------|-------|-----------|
|
||||
| Explorer | haiku | Research is structured extraction — cheap model handles it well |
|
||||
| Creator | sonnet | Design decisions need reasoning quality |
|
||||
| Maker | sonnet | Implementation needs quality to avoid rework cycles |
|
||||
| Guardian | haiku | Security/risk review is checklist-driven — structured and cheap |
|
||||
| Skeptic | haiku | Challenge generation follows patterns — cheap |
|
||||
| Sage | sonnet | Holistic quality judgment needs nuance |
|
||||
| Trickster | haiku | Adversarial testing is systematic — cheap |
|
||||
|
||||
### Writing Domain Overrides
|
||||
|
||||
Writing tasks need higher quality for prose-generating agents:
|
||||
|
||||
| Archetype | Model | Rationale |
|
||||
|-----------|-------|-----------|
|
||||
| Explorer / story-explorer | haiku | Research is still cheap |
|
||||
| Creator | sonnet | Outline design needs narrative judgment |
|
||||
| Maker | **sonnet** | Prose quality is the product — cannot be cheap |
|
||||
| Guardian | haiku | Plot/continuity checks are structured |
|
||||
| Skeptic | haiku | Premise challenges are structured |
|
||||
| Sage / story-sage | **sonnet** | Voice and craft judgment need taste |
|
||||
| Trickster | haiku | Reader-confusion analysis is systematic |
|
||||
|
||||
**When to escalate to opus**: Only for final-pass prose polishing on high-stakes content (book manuscripts, not short stories). Never for review or research agents. The user must explicitly opt in via:
|
||||
|
||||
```yaml
|
||||
# Team preset
|
||||
model_overrides:
|
||||
maker: opus # Only for final polish pass
|
||||
```
|
||||
|
||||
### Domain-Driven Model Selection
|
||||
|
||||
The effective model for each agent is resolved in this order:
|
||||
|
||||
1. **Team preset `model_overrides`** (highest priority — explicit choice)
|
||||
2. **Domain `model_overrides`** (from `.archeflow/domains/<name>.yaml`)
|
||||
3. **Archetype default** (from the table above)
|
||||
4. **Custom archetype `model` field** (from archetype YAML frontmatter)
|
||||
|
||||
Example resolution for `story-sage` in a writing run:
|
||||
- Team preset says nothing about story-sage → skip
|
||||
- Writing domain says `story-sage: sonnet` → **use sonnet**
|
||||
- Archetype YAML says `model: sonnet` → would have been used if domain didn't specify
|
||||
|
||||
## Cost Optimization Strategies
|
||||
|
||||
### 1. Prompt Caching
|
||||
|
||||
Structure prompts so that stable content comes first (maximizes cache prefix hits):
|
||||
|
||||
```
|
||||
[System prompt — archetype instructions] ← cached across agents in same run
|
||||
[Domain context — voice profile, persona] ← cached across agents in same run
|
||||
[Phase context — Explorer output, proposal] ← changes per agent
|
||||
[Task-specific instructions] ← changes per agent
|
||||
```
|
||||
|
||||
Estimated savings: 30-50% on input tokens for runs with 5+ agents.
|
||||
|
||||
### 2. Guardian Fast-Path (A2)
|
||||
|
||||
When Guardian approves with 0 issues, skip Skeptic/Sage/Trickster. This saves 2-3 agent calls per cycle. See `archeflow:orchestration` skill, rule A2.
|
||||
|
||||
Typical savings: $0.30-0.80 per skipped cycle (depending on models).
|
||||
|
||||
### 3. Explorer Cache
|
||||
|
||||
Reuse recent Explorer research instead of re-running. See `archeflow:orchestration` skill, Explorer Cache section.
|
||||
|
||||
Typical savings: $0.02-0.05 per cache hit (haiku Explorer).
|
||||
|
||||
### 4. Batches API for Bulk Operations
|
||||
|
||||
When running consistency checks, validation passes, or other non-time-sensitive work across multiple files, use the Batches API (50% discount):
|
||||
|
||||
```yaml
|
||||
# Mark agents as batch-eligible in team presets
|
||||
batch_eligible:
|
||||
- guardian # Structured review, can wait
|
||||
- skeptic # Challenge generation, can wait
|
||||
```
|
||||
|
||||
Only use batches when the user is not waiting for real-time results (overnight runs, autonomous mode).
|
||||
|
||||
### 5. Early Termination
|
||||
|
||||
If the first cycle produces a clean Guardian pass (A2 fast-path) AND the Maker's self-review checklist is clean, skip the remaining cycles even if `max_cycles > 1`. This avoids spending tokens on unnecessary verification.
|
||||
1. **Prompt caching:** Stable content first (archetype instructions, voice profiles). Saves 30-50% on input.
|
||||
2. **Guardian fast-path (A2):** 0 issues = skip remaining reviewers. Saves $0.30-0.80/cycle.
|
||||
3. **Explorer cache:** Reuse recent research. Saves $0.02-0.05/hit.
|
||||
4. **Batches API:** For autonomous/overnight review passes (50% discount).
|
||||
5. **Early termination:** Clean Guardian + clean Maker self-review = skip remaining cycles.
|
||||
|
||||
## Daily Cost Tracking
|
||||
|
||||
Across runs, maintain a daily cost ledger:
|
||||
|
||||
```
|
||||
.archeflow/costs/<YYYY-MM-DD>.jsonl
|
||||
```
|
||||
|
||||
Each line is one run's cost summary:
|
||||
|
||||
```jsonl
|
||||
{"run_id":"2026-04-03-der-huster","cost_usd":1.45,"tokens_input":95000,"tokens_output":33000,"models":{"haiku":2,"sonnet":3},"domain":"writing"}
|
||||
{"run_id":"2026-04-03-auth-refactor","cost_usd":2.10,"tokens_input":120000,"tokens_output":45000,"models":{"haiku":3,"sonnet":2},"domain":"code"}
|
||||
```
|
||||
|
||||
Daily budget enforcement reads this file to check `daily_usd` limits before starting new runs.
|
||||
|
||||
### Cost Report Command
|
||||
|
||||
```bash
|
||||
# Show today's costs
|
||||
./lib/archeflow-costs.sh today
|
||||
|
||||
# Show costs for a date range
|
||||
./lib/archeflow-costs.sh 2026-04-01 2026-04-03
|
||||
|
||||
# Show costs for a specific run
|
||||
./lib/archeflow-costs.sh run 2026-04-03-der-huster
|
||||
```
|
||||
|
||||
## Integration with Other Skills
|
||||
|
||||
- **`orchestration`**: Calls pre-agent and post-agent budget checks. Includes cost summary in orchestration report.
|
||||
- **`process-log`**: Cost data is embedded in `agent.complete` and `run.complete` events. No separate cost events needed.
|
||||
- **`domains`**: Reads `model_overrides` from the active domain to determine effective model per agent.
|
||||
- **`autonomous-mode`**: Enforces budget strictly (no prompts — just stop on budget exceeded). Uses daily budget to limit overnight spend.
|
||||
- **`workflow-design`**: Custom workflows can specify per-phase model assignments that override domain defaults.
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Track always, enforce optionally.** Cost data is in every event regardless of whether a budget is set. Budget enforcement is opt-in.
|
||||
2. **Estimate before spend.** Always estimate before spawning an agent. Surprises are worse than slightly inaccurate estimates.
|
||||
3. **Cheapest model that works.** Default to haiku. Upgrade to sonnet only when the task demonstrably needs it. Opus is user-opt-in only.
|
||||
4. **Transparent.** Every cost shows up in the orchestration report. No hidden token spend.
|
||||
5. **Learn from history.** After enough runs, replace estimates with actual averages from `metrics.jsonl`.
|
||||
Ledger at `.archeflow/costs/<YYYY-MM-DD>.jsonl`. One line per run with cost, tokens, models, domain. Daily budget enforcement reads this before starting new runs.
|
||||
|
||||
@@ -1,181 +1,58 @@
|
||||
---
|
||||
name: custom-archetypes
|
||||
description: Use when the user wants to create domain-specific archetypes — specialized agent roles beyond the 7 built-in ones. For example a database reviewer, compliance auditor, or accessibility tester.
|
||||
description: Use when the user wants to create domain-specific archetypes -- specialized agent roles beyond the 7 built-in ones.
|
||||
---
|
||||
|
||||
# Custom Archetypes
|
||||
|
||||
ArcheFlow's 7 built-in archetypes cover general software engineering. Custom archetypes add **domain expertise** — a database specialist, a compliance auditor, an accessibility reviewer.
|
||||
Add domain expertise beyond the 7 built-ins: database specialist, compliance auditor, accessibility reviewer, etc.
|
||||
|
||||
## When to Create One
|
||||
## When to Create
|
||||
|
||||
- A recurring review concern isn't covered by built-in archetypes
|
||||
- A recurring review concern isn't covered by built-ins
|
||||
- You need domain knowledge (GDPR, PCI-DSS, WCAG, SQL optimization)
|
||||
- The same custom instructions are used in multiple orchestrations
|
||||
- Same custom instructions used across multiple orchestrations
|
||||
|
||||
## Archetype Definition
|
||||
## Definition Format
|
||||
|
||||
Create a markdown file in your project at `.archeflow/archetypes/<id>.md`:
|
||||
Create `.archeflow/archetypes/<id>.md`:
|
||||
|
||||
```markdown
|
||||
# <Name>
|
||||
|
||||
## Identity
|
||||
**ID:** <lowercase-with-hyphens>
|
||||
**Role:** <one sentence — what this archetype does>
|
||||
**Lens:** <the question this archetype always asks>
|
||||
**Role:** <one sentence>
|
||||
**Lens:** <the one question this archetype always asks>
|
||||
**Model tier:** cheap | standard | premium
|
||||
|
||||
## Behavior
|
||||
<System prompt injected into the agent. Define:
|
||||
- What to look for
|
||||
- How to evaluate
|
||||
- What output format to use
|
||||
- Decision criteria for approve/reject>
|
||||
<System prompt: what to look for, how to evaluate, output format, decision criteria>
|
||||
|
||||
## Outputs
|
||||
<What message types this archetype produces>
|
||||
- Research (if it gathers info)
|
||||
- Proposal (if it designs)
|
||||
- Challenge (if it critiques)
|
||||
- RiskAssessment (if it assesses risk)
|
||||
- QualityReport (if it reviews quality)
|
||||
- Implementation (if it writes code)
|
||||
<Message types: Research, Proposal, Challenge, RiskAssessment, QualityReport, Implementation>
|
||||
|
||||
## Shadow
|
||||
**Name:** <the dysfunction>
|
||||
**Strength inverted:** <how the core strength becomes destructive>
|
||||
**Symptoms:**
|
||||
- <observable behavior 1>
|
||||
- <observable behavior 2>
|
||||
- <observable behavior 3>
|
||||
**Name:** <dysfunction name>
|
||||
**Strength inverted:** <how core strength becomes destructive>
|
||||
**Symptoms:** <3 observable behaviors>
|
||||
**Correction:** <specific prompt to course-correct>
|
||||
```
|
||||
|
||||
## Examples
|
||||
## Composition
|
||||
|
||||
### Database Specialist
|
||||
```markdown
|
||||
# Database Specialist
|
||||
Combine two archetypes into a focused super-reviewer:
|
||||
|
||||
## Identity
|
||||
**ID:** db-specialist
|
||||
**Role:** Reviews database schemas, queries, and migration safety
|
||||
**Lens:** "Will this scale? Will this corrupt data?"
|
||||
**Model tier:** standard
|
||||
|
||||
## Behavior
|
||||
You review database changes for:
|
||||
1. Schema design — normalization, index coverage, constraint integrity
|
||||
2. Query performance — would an EXPLAIN ANALYZE show problems?
|
||||
3. Migration safety — backward compatible? Zero-downtime possible?
|
||||
4. Data integrity — foreign keys, unique constraints, NOT NULL where needed
|
||||
|
||||
Output APPROVED or REJECTED with findings including:
|
||||
- Table/column/query location
|
||||
- Severity (CRITICAL/WARNING/INFO)
|
||||
- Specific fix
|
||||
|
||||
## Outputs
|
||||
- Challenge
|
||||
- QualityReport
|
||||
|
||||
## Shadow
|
||||
**Name:** Schema Perfectionist
|
||||
**Strength inverted:** Database expertise becomes over-normalization and premature optimization
|
||||
**Symptoms:**
|
||||
- Demanding 3NF for a 10-row config table
|
||||
- Requiring indexes for queries that run once a day
|
||||
- Blocking on theoretical scale issues for an app with 50 users
|
||||
**Correction:** "Optimize for the current order of magnitude. If the app has 1000 users, design for 10,000. Not for 10 million."
|
||||
```
|
||||
|
||||
### Compliance Auditor
|
||||
```markdown
|
||||
# Compliance Auditor
|
||||
|
||||
## Identity
|
||||
**ID:** compliance-auditor
|
||||
**Role:** Verifies code changes against regulatory requirements
|
||||
**Lens:** "Could this get us fined?"
|
||||
**Model tier:** premium
|
||||
|
||||
## Behavior
|
||||
You audit changes against:
|
||||
1. GDPR — personal data handling, consent, right to deletion
|
||||
2. PCI-DSS — payment data storage, transmission, access controls
|
||||
3. Logging — are sensitive fields being logged? PII in error messages?
|
||||
4. Data retention — are we keeping data longer than allowed?
|
||||
|
||||
Reference specific regulation articles in findings.
|
||||
|
||||
## Outputs
|
||||
- RiskAssessment
|
||||
|
||||
## Shadow
|
||||
**Name:** Regulation Zealot
|
||||
**Strength inverted:** Compliance awareness becomes impossible-to-satisfy requirements
|
||||
**Symptoms:**
|
||||
- Citing regulations irrelevant to the change
|
||||
- Requiring legal review for non-PII code
|
||||
- Blocking internal tools with customer-facing compliance standards
|
||||
**Correction:** "Match the compliance level to the data classification. Internal admin tools don't need PCI-DSS Level 1 controls."
|
||||
```
|
||||
|
||||
## Using Custom Archetypes
|
||||
|
||||
Reference them by ID when orchestrating:
|
||||
|
||||
```
|
||||
# In the orchestration skill, add to Check phase:
|
||||
Agent(
|
||||
description: "db-specialist: review schema changes",
|
||||
prompt: "<contents of .archeflow/archetypes/db-specialist.md>
|
||||
Review the changes in branch: <maker's branch>
|
||||
..."
|
||||
)
|
||||
```
|
||||
|
||||
Or in a custom workflow, include them in the check phase archetypes list.
|
||||
|
||||
## Archetype Composition
|
||||
|
||||
Combine two archetypes into a focused super-reviewer when you need a specific perspective but don't want to spawn two agents:
|
||||
|
||||
```markdown
|
||||
# .archeflow/archetypes/security-breaker.md
|
||||
|
||||
## Identity
|
||||
**ID:** security-breaker
|
||||
**Composed of:** Guardian + Trickster
|
||||
**Role:** Security review with active exploitation attempts
|
||||
**Lens:** "Can I break the security model? How?"
|
||||
**Model tier:** standard
|
||||
|
||||
## Behavior
|
||||
Combine Guardian's checklist-driven security review with Trickster's
|
||||
adversarial testing. For each Guardian finding, attempt to exploit it.
|
||||
Only report findings you can actually reproduce.
|
||||
|
||||
## Shadow
|
||||
**Name:** Security Theater
|
||||
**Strength inverted:** Both shadows compound — paranoid blocking + noise
|
||||
**Correction:** "Only report findings with reproduction steps. Max 5."
|
||||
```
|
||||
|
||||
**Rules for composition:**
|
||||
- Max 2 archetypes combined (more defeats the purpose)
|
||||
- Max 2 archetypes combined
|
||||
- Combined shadow must address both source shadows
|
||||
- Use when spawning both separately would waste tokens on overlapping context
|
||||
|
||||
## Team Presets
|
||||
|
||||
Save common team configurations for your project in `.archeflow/teams/`:
|
||||
Save team configs in `.archeflow/teams/<name>.yaml`:
|
||||
|
||||
```yaml
|
||||
# .archeflow/teams/backend.yaml
|
||||
name: backend
|
||||
description: Standard backend development team
|
||||
plan: [explorer, creator]
|
||||
do: [maker]
|
||||
check: [guardian, sage]
|
||||
@@ -183,23 +60,12 @@ exit: all_approved
|
||||
max_cycles: 2
|
||||
```
|
||||
|
||||
```yaml
|
||||
# .archeflow/teams/security-audit.yaml
|
||||
name: security-audit
|
||||
description: Security-focused review team
|
||||
plan: [explorer, creator]
|
||||
do: [maker]
|
||||
check: [guardian, trickster, compliance-auditor]
|
||||
exit: all_approved
|
||||
max_cycles: 3
|
||||
```
|
||||
Reference custom archetypes by ID in the `check` (or any phase) list.
|
||||
|
||||
Use in orchestration: `"Use the backend team preset"` or `"Run security-audit workflow on this change"`
|
||||
## Rules
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **One concern per archetype.** Don't make a "full-stack reviewer."
|
||||
2. **Concrete shadow.** Vague shadows don't get detected. Use observable symptoms.
|
||||
3. **Right model tier.** Analytical → cheap. Creative → standard. Judgment-heavy → premium.
|
||||
4. **Specific lens.** The one question the archetype asks. This focuses behavior.
|
||||
5. **Composition over sprawl.** Combine before creating from scratch. 2 composed > 3 separate.
|
||||
1. One concern per archetype
|
||||
2. Concrete shadow with observable symptoms
|
||||
3. Right model tier: analytical = cheap, creative = standard, judgment = premium
|
||||
4. Specific lens question focuses behavior
|
||||
5. Compose before creating from scratch
|
||||
|
||||
@@ -10,363 +10,92 @@ description: |
|
||||
|
||||
# Domain Adapter System
|
||||
|
||||
ArcheFlow's PDCA pipeline and archetype system are domain-agnostic. This skill defines how to adapt them to specific domains (writing, code, research, etc.) so that events, metrics, reviews, and context use terminology that makes sense for the work being done.
|
||||
Adapts the PDCA pipeline and archetype system to specific domains (writing, code, research) so events, metrics, reviews, and context use domain-appropriate terminology.
|
||||
|
||||
## Domain Registry
|
||||
|
||||
Domain definitions live in `.archeflow/domains/<name>.yaml`. Each domain maps ArcheFlow's generic concepts to domain-specific equivalents and configures what metrics to track, what reviewers should focus on, and what context agents need.
|
||||
Domain definitions live in `.archeflow/domains/<name>.yaml`. Each maps generic concepts to domain-specific equivalents.
|
||||
|
||||
### Writing Domain
|
||||
### Concept Mapping
|
||||
|
||||
```yaml
|
||||
# .archeflow/domains/writing.yaml
|
||||
name: writing
|
||||
description: "Creative writing — stories, novels, non-fiction"
|
||||
| Generic Concept | Code | Writing | Research |
|
||||
|----------------|------|---------|----------|
|
||||
| implementation | code changes | draft/prose | draft/analysis |
|
||||
| tests | automated tests | consistency checks | citation verification |
|
||||
| files_changed | files changed | word count delta | section count |
|
||||
| test_coverage | test coverage % | voice drift score | source coverage |
|
||||
| code_review | code review | prose review | peer review |
|
||||
| build | build/compile | compile/export | compile (LaTeX/PDF) |
|
||||
| deploy | deploy | publish | submit/publish |
|
||||
| bug | bug | continuity error | unsupported claim |
|
||||
| feature | feature | scene/chapter | section |
|
||||
|
||||
# Concept mapping — how generic ArcheFlow terms translate
|
||||
concepts:
|
||||
implementation: "draft/prose"
|
||||
tests: "consistency checks"
|
||||
files_changed: "word count delta"
|
||||
test_coverage: "voice drift score"
|
||||
code_review: "prose review"
|
||||
build: "compile/export"
|
||||
deploy: "publish"
|
||||
refactor: "revision"
|
||||
bug: "continuity error"
|
||||
feature: "scene/chapter"
|
||||
PR: "manuscript submission"
|
||||
### Metrics by Domain
|
||||
|
||||
# Metrics — what to track instead of lines/files/tests
|
||||
metrics:
|
||||
- word_count
|
||||
- voice_drift_score
|
||||
- dialect_density
|
||||
- essen_count # Giesing Gschichten rule: food in every scene
|
||||
- scene_count
|
||||
- dialogue_ratio
|
||||
| Code | Writing | Research |
|
||||
|------|---------|----------|
|
||||
| files_changed | word_count | word_count |
|
||||
| lines_added/removed | voice_drift_score | citation_count |
|
||||
| tests_added | dialect_density | source_diversity |
|
||||
| tests_passing | scene_count | claim_count |
|
||||
| coverage_delta | dialogue_ratio | unsupported_claims |
|
||||
|
||||
# Review focus areas — override default Guardian/Sage lenses
|
||||
review_focus:
|
||||
guardian:
|
||||
- plot_coherence
|
||||
- character_consistency
|
||||
- timeline_accuracy
|
||||
- continuity
|
||||
sage:
|
||||
- voice_consistency
|
||||
- prose_quality
|
||||
- dialect_authenticity
|
||||
- forbidden_pattern_violations
|
||||
skeptic:
|
||||
- premise_strength
|
||||
- character_motivation
|
||||
- ending_satisfaction
|
||||
trickster:
|
||||
- reader_confusion_points
|
||||
- pacing_dead_spots
|
||||
- suspension_of_disbelief_breaks
|
||||
### Review Focus by Domain
|
||||
|
||||
# Context injection — what extra files agents should read per phase
|
||||
context:
|
||||
always:
|
||||
- "voice profile YAML (profiles/*.yaml)"
|
||||
- "persona YAML (personas/*.yaml)"
|
||||
- "character sheets (characters/*.yaml)"
|
||||
plan_phase:
|
||||
- "series config (colette.yaml)"
|
||||
- "previous stories (if series, for continuity)"
|
||||
- "story brief / premise"
|
||||
do_phase:
|
||||
- "scene outline from Creator"
|
||||
- "voice profile (for style reference)"
|
||||
check_phase:
|
||||
- "voice profile (for Sage drift scoring)"
|
||||
- "outline (for Guardian coherence check)"
|
||||
- "character sheets (for consistency)"
|
||||
| Reviewer | Code | Writing | Research |
|
||||
|----------|------|---------|----------|
|
||||
| Guardian | security, breaking changes, deps, error handling | plot coherence, character consistency, timeline, continuity | factual accuracy, citation validity, logic, methodology |
|
||||
| Sage | code quality, coverage, docs, patterns | voice consistency, prose quality, dialect authenticity | argument structure, clarity, tone, completeness |
|
||||
| Skeptic | design assumptions, scalability, edge cases | premise strength, motivation, ending satisfaction | (default) |
|
||||
| Trickster | malformed input, races, error paths, dep failures | reader confusion, pacing dead spots, disbelief breaks | (default) |
|
||||
|
||||
# Model preferences — domain-specific overrides
|
||||
model_overrides:
|
||||
maker: sonnet # Prose quality matters more than for code
|
||||
story-sage: sonnet # Needs taste for voice evaluation
|
||||
```
|
||||
### Model Overrides
|
||||
|
||||
### Code Domain (Default)
|
||||
Domains can override default model assignments:
|
||||
|
||||
```yaml
|
||||
# .archeflow/domains/code.yaml
|
||||
name: code
|
||||
description: "Software development — applications, libraries, infrastructure"
|
||||
| Domain | Override | Rationale |
|
||||
|--------|----------|-----------|
|
||||
| Writing | maker: sonnet | Prose quality is the product |
|
||||
| Writing | story-sage: sonnet | Voice evaluation needs taste |
|
||||
| Research | maker: sonnet | Analysis quality matters |
|
||||
| Code | (none) | Defaults are calibrated for code |
|
||||
|
||||
concepts:
|
||||
implementation: "code changes"
|
||||
tests: "automated tests"
|
||||
files_changed: "files changed"
|
||||
test_coverage: "test coverage %"
|
||||
code_review: "code review"
|
||||
build: "build/compile"
|
||||
deploy: "deploy"
|
||||
refactor: "refactor"
|
||||
bug: "bug"
|
||||
feature: "feature"
|
||||
PR: "pull request"
|
||||
### Context Injection by Domain
|
||||
|
||||
metrics:
|
||||
- files_changed
|
||||
- lines_added
|
||||
- lines_removed
|
||||
- tests_added
|
||||
- tests_passing
|
||||
- coverage_delta
|
||||
Domains declare which extra files agents should read per phase. Context injection is additive (on top of standard ArcheFlow context).
|
||||
|
||||
review_focus:
|
||||
guardian:
|
||||
- security_vulnerabilities
|
||||
- breaking_changes
|
||||
- dependency_risks
|
||||
- error_handling
|
||||
sage:
|
||||
- code_quality
|
||||
- test_coverage
|
||||
- documentation
|
||||
- pattern_consistency
|
||||
skeptic:
|
||||
- design_assumptions
|
||||
- scalability
|
||||
- alternative_approaches
|
||||
- edge_cases
|
||||
trickster:
|
||||
- malformed_input
|
||||
- concurrency_races
|
||||
- error_path_exploitation
|
||||
- dependency_failures
|
||||
|
||||
context:
|
||||
always:
|
||||
- "README.md"
|
||||
- ".archeflow/config.yaml"
|
||||
plan_phase:
|
||||
- "relevant source files (Explorer identifies)"
|
||||
- "existing tests for affected area"
|
||||
do_phase:
|
||||
- "Creator's proposal"
|
||||
- "test fixtures and helpers"
|
||||
check_phase:
|
||||
- "git diff from Maker"
|
||||
- "proposal risk section"
|
||||
|
||||
model_overrides: {}
|
||||
# Code domain uses default archetype model assignments
|
||||
```
|
||||
|
||||
### Research Domain (Example Extension)
|
||||
|
||||
```yaml
|
||||
# .archeflow/domains/research.yaml
|
||||
name: research
|
||||
description: "Academic or technical research — papers, analysis, literature review"
|
||||
|
||||
concepts:
|
||||
implementation: "draft/analysis"
|
||||
tests: "citation verification"
|
||||
files_changed: "section count"
|
||||
test_coverage: "source coverage"
|
||||
code_review: "peer review"
|
||||
build: "compile (LaTeX/PDF)"
|
||||
deploy: "submit/publish"
|
||||
|
||||
metrics:
|
||||
- word_count
|
||||
- citation_count
|
||||
- source_diversity
|
||||
- claim_count
|
||||
- unsupported_claims
|
||||
|
||||
review_focus:
|
||||
guardian:
|
||||
- factual_accuracy
|
||||
- citation_validity
|
||||
- logical_coherence
|
||||
- methodology_soundness
|
||||
sage:
|
||||
- argument_structure
|
||||
- prose_clarity
|
||||
- academic_tone
|
||||
- completeness
|
||||
|
||||
context:
|
||||
always:
|
||||
- "bibliography/references"
|
||||
- "research brief"
|
||||
plan_phase:
|
||||
- "prior literature notes"
|
||||
- "methodology constraints"
|
||||
check_phase:
|
||||
- "citation database"
|
||||
- "claims vs. evidence mapping"
|
||||
|
||||
model_overrides:
|
||||
maker: sonnet # Research writing needs quality
|
||||
```
|
||||
| Phase | Code | Writing |
|
||||
|-------|------|---------|
|
||||
| always | README.md, config.yaml | voice profile, persona, characters |
|
||||
| plan | relevant source files, existing tests | series config, previous stories, brief |
|
||||
| do | Creator's proposal, test fixtures | scene outline, voice profile |
|
||||
| check | git diff, risk section | voice profile (Sage), outline (Guardian), characters |
|
||||
|
||||
## Domain Detection
|
||||
|
||||
ArcheFlow auto-detects the domain based on project markers. Detection runs once at `run.start` and the result is stored in the run's event stream.
|
||||
Auto-detects at `run.start`. Result stored in event stream.
|
||||
|
||||
### Detection Priority (highest first)
|
||||
|
||||
| Priority | Signal | Domain | Rationale |
|
||||
|----------|--------|--------|-----------|
|
||||
| 1 | CLI flag `--domain <name>` | as specified | Explicit override always wins |
|
||||
| 2 | Team preset has `domain: <name>` | as specified | Preset knows its domain |
|
||||
| 3 | `colette.yaml` exists in project root | `writing` | Colette is the writing platform |
|
||||
| 4 | `*.bib` or `references/` exists | `research` | Bibliography signals research |
|
||||
| 5 | `package.json` exists | `code` | Node.js project |
|
||||
| 6 | `Cargo.toml` exists | `code` | Rust project |
|
||||
| 7 | `pyproject.toml` exists | `code` | Python project |
|
||||
| 8 | `go.mod` exists | `code` | Go project |
|
||||
| 9 | `Makefile` or `CMakeLists.txt` exists | `code` | C/C++ project |
|
||||
| 10 | No markers found | `code` | Default fallback |
|
||||
|
||||
### Detection in Team Presets
|
||||
|
||||
Team presets can declare their domain explicitly:
|
||||
|
||||
```yaml
|
||||
# .archeflow/teams/story-development.yaml
|
||||
name: story-development
|
||||
domain: writing # <-- explicit domain
|
||||
description: "Kurzgeschichten-Entwicklung"
|
||||
plan: [story-explorer, creator]
|
||||
do: [maker]
|
||||
check: [guardian, story-sage]
|
||||
```
|
||||
|
||||
When `domain` is set in the preset, detection is skipped entirely.
|
||||
|
||||
### Detection Event
|
||||
|
||||
Domain detection emits a decision event:
|
||||
|
||||
```jsonl
|
||||
{"ts":"...","run_id":"...","seq":1,"parent":[],"type":"decision","phase":"init","agent":null,"data":{"what":"domain_detection","chosen":"writing","signal":"colette.yaml exists","alternatives":[{"id":"code","reason_rejected":"No code project markers found"}]}}
|
||||
```
|
||||
|
||||
## How Domains Affect Orchestration
|
||||
|
||||
### 1. Concept Translation in Reports
|
||||
|
||||
The orchestration report and session log use domain-translated terms:
|
||||
|
||||
```markdown
|
||||
# Code domain report
|
||||
- **Files changed:** 4 files, +120 -30 lines
|
||||
- **Tests added:** 8 new tests
|
||||
|
||||
# Writing domain report (same data, different framing)
|
||||
- **Word count delta:** +6004 words across 7 scenes
|
||||
- **Consistency checks:** voice drift 0.12, 2 continuity fixes applied
|
||||
```
|
||||
|
||||
### 2. Domain-Specific Event Data
|
||||
|
||||
Events include domain-relevant metrics in their `data` payload:
|
||||
|
||||
```jsonl
|
||||
// Writing domain — agent.complete
|
||||
{"type":"agent.complete","data":{"archetype":"maker","duration_ms":180000,"word_count":6004,"voice_drift":0.12,"scenes":7,"dialogue_ratio":0.35,"essen_count":4}}
|
||||
|
||||
// Code domain — agent.complete
|
||||
{"type":"agent.complete","data":{"archetype":"maker","duration_ms":90000,"files_changed":5,"tests_added":12,"coverage_delta":"+3%","lines_added":245,"lines_removed":80}}
|
||||
|
||||
// Writing domain — run.complete
|
||||
{"type":"run.complete","data":{"status":"completed","word_count":6004,"voice_drift_final":0.08,"scenes":7,"dialect_density":0.15,"cycles":1}}
|
||||
|
||||
// Code domain — run.complete
|
||||
{"type":"run.complete","data":{"status":"completed","files_changed":4,"tests_total":20,"coverage":"87%","cycles":2}}
|
||||
```
|
||||
|
||||
### 3. Review Focus Override
|
||||
|
||||
When a domain defines `review_focus`, reviewers receive domain-specific instructions instead of the defaults:
|
||||
|
||||
```
|
||||
# Without domain adapter (code defaults):
|
||||
Guardian → "Check for security vulnerabilities, breaking changes..."
|
||||
|
||||
# With writing domain adapter:
|
||||
Guardian → "Check for plot coherence, character consistency, timeline accuracy, continuity..."
|
||||
```
|
||||
|
||||
The orchestration skill reads the domain's `review_focus` and injects it into the reviewer prompt. The archetype's base personality (virtue, shadow, lens) stays the same — only the checklist changes.
|
||||
|
||||
### 4. Context Injection
|
||||
|
||||
The domain's `context` config tells the orchestrator which additional files to pass to each agent:
|
||||
|
||||
```
|
||||
# Plan phase in writing domain:
|
||||
# Orchestrator automatically includes voice profile, persona, character sheets, series config
|
||||
# alongside the standard task description and Explorer output
|
||||
|
||||
# Check phase in writing domain:
|
||||
# Guardian gets the outline (for coherence)
|
||||
# Sage gets the voice profile (for drift scoring)
|
||||
```
|
||||
|
||||
Context injection is additive — domain context is added on top of ArcheFlow's standard context rules (task description, prior phase output, etc.).
|
||||
|
||||
### 5. Model Overrides
|
||||
|
||||
If the domain specifies `model_overrides`, those override the default model assignment for the listed archetypes:
|
||||
|
||||
```
|
||||
# Default: Maker uses whatever the workflow assigns (often haiku for cheap tasks)
|
||||
# Writing domain: Maker uses sonnet (prose quality matters)
|
||||
# Research domain: Maker uses sonnet (analysis quality matters)
|
||||
```
|
||||
|
||||
Model overrides interact with cost tracking — the cost-tracking skill reads the effective model assignment (after domain overrides) for its estimates.
|
||||
| Priority | Signal | Domain |
|
||||
|----------|--------|--------|
|
||||
| 1 | CLI `--domain <name>` | as specified |
|
||||
| 2 | Team preset `domain:` field | as specified |
|
||||
| 3 | `colette.yaml` exists | writing |
|
||||
| 4 | `*.bib` or `references/` exists | research |
|
||||
| 5 | `package.json`, `Cargo.toml`, `pyproject.toml`, `go.mod`, `Makefile` | code |
|
||||
| 6 | No markers | code (default) |
|
||||
|
||||
## Adding a New Domain
|
||||
|
||||
1. Create `.archeflow/domains/<name>.yaml` following the schema above
|
||||
2. Add detection signals to the priority table (or rely on `--domain` / team preset)
|
||||
3. Define custom archetypes if needed (e.g., `story-explorer` for writing)
|
||||
4. Test with `--domain <name> --dry-run` to verify detection and context injection
|
||||
1. Create `.archeflow/domains/<name>.yaml` with `name`, `concepts`, `metrics` (minimum required)
|
||||
2. Optionally add `review_focus`, `context`, `model_overrides`
|
||||
3. Missing sections fall back to `code` domain defaults
|
||||
4. Test with `--domain <name> --dry-run`
|
||||
|
||||
### Minimum Viable Domain
|
||||
## How Domains Affect Orchestration
|
||||
|
||||
Only `name`, `concepts`, and `metrics` are required. Everything else has sensible defaults:
|
||||
|
||||
```yaml
|
||||
name: legal
|
||||
description: "Legal document drafting and review"
|
||||
|
||||
concepts:
|
||||
implementation: "draft"
|
||||
tests: "compliance checks"
|
||||
code_review: "legal review"
|
||||
|
||||
metrics:
|
||||
- clause_count
|
||||
- citation_count
|
||||
- compliance_score
|
||||
```
|
||||
|
||||
Missing sections fall back to the `code` domain defaults.
|
||||
|
||||
## Integration with Other Skills
|
||||
|
||||
- **`orchestration`**: Reads domain config at `run.start`, applies concept translation, context injection, model overrides, and review focus throughout the run
|
||||
- **`process-log`**: Domain-specific event data fields are included in `agent.complete` and `run.complete` payloads
|
||||
- **`cost-tracking`**: Reads `model_overrides` from the active domain to calculate accurate cost estimates
|
||||
- **`custom-archetypes`**: Domain-specific archetypes (e.g., `story-explorer`, `story-sage`) are defined per-project and referenced in team presets
|
||||
- **`workflow-design`**: Custom workflows can reference a domain explicitly
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Additive, not replacing.** Domains add context and translate terms. They do not change the PDCA cycle, archetype system, or event schema.
|
||||
2. **Graceful degradation.** If no domain config exists, everything works as before (code domain defaults).
|
||||
3. **One domain per run.** A run operates in exactly one domain. Multi-domain projects use separate runs.
|
||||
4. **Domain config is data, not code.** YAML files, no scripts. Portable across projects.
|
||||
- **Reports** use domain-translated terms (e.g., "word count delta" instead of "files changed")
|
||||
- **Events** include domain-relevant metrics in `agent.complete` and `run.complete` payloads
|
||||
- **Reviewers** receive domain-specific focus checklists (archetype personality stays the same)
|
||||
- **Context injection** adds domain-declared files to each agent's prompt
|
||||
- **Model overrides** change which model an archetype uses (interacts with cost-tracking)
|
||||
- **One domain per run.** Multi-domain projects use separate runs.
|
||||
|
||||
@@ -6,263 +6,86 @@ description: |
|
||||
Enables rollback to any phase boundary and full audit trail via git history.
|
||||
<example>Automatically loaded by archeflow:run when git.enabled is true</example>
|
||||
<example>User: "archeflow rollback --to plan"</example>
|
||||
<example>User: "Show me the git history for this run"</example>
|
||||
---
|
||||
|
||||
# Git Integration — Per-Phase Commit Strategy
|
||||
# Git Integration -- Per-Phase Commit Strategy
|
||||
|
||||
Every ArcheFlow run creates a dedicated branch. Each phase transition and agent completion produces a commit. At run completion, the branch is merged back to the base branch. On failure, the branch stays intact for inspection or rollback.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- `archeflow:orchestration` — workflow rules and safety constraints
|
||||
- `archeflow:process-log` — event schema (git events are emitted alongside process events)
|
||||
- `archeflow:artifact-routing` — artifact paths that get committed
|
||||
|
||||
## Helper Script
|
||||
|
||||
All git operations go through the helper script:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-git.sh <command> <run_id> [args...]
|
||||
```
|
||||
|
||||
See `lib/archeflow-git.sh` for full usage. The skill describes *when* to call the script; the script handles *how*.
|
||||
|
||||
---
|
||||
Every run creates branch `archeflow/<run_id>`. Each phase transition and agent completion produces a commit. On success, merge back. On failure, branch stays for inspection.
|
||||
|
||||
## Branch Strategy
|
||||
|
||||
```
|
||||
main (or current base branch)
|
||||
└── archeflow/<run_id> # Created at run.start
|
||||
├── commit: "archeflow(plan): explorer research"
|
||||
├── commit: "archeflow(plan): creator outline"
|
||||
├── commit: "archeflow(plan→do): phase transition"
|
||||
├── commit: "archeflow(do): maker draft"
|
||||
├── commit: "archeflow(do→check): phase transition"
|
||||
├── commit: "archeflow(check): guardian review"
|
||||
├── commit: "archeflow(check): sage review"
|
||||
├── commit: "archeflow(check→act): phase transition"
|
||||
├── commit: "archeflow(act): apply 6 fixes"
|
||||
├── commit: "archeflow(act): cycle 1 complete"
|
||||
└── commit: "archeflow(run): complete — <summary>"
|
||||
main
|
||||
+-- archeflow/<run_id>
|
||||
+-- archeflow(plan): explorer research
|
||||
+-- archeflow(plan): creator outline
|
||||
+-- archeflow(plan->do): phase transition
|
||||
+-- archeflow(do): maker draft
|
||||
+-- archeflow(check): guardian review
|
||||
+-- archeflow(act): cycle 1 complete
|
||||
+-- archeflow(run): complete
|
||||
```
|
||||
|
||||
Branch naming: `archeflow/<run_id>` (e.g., `archeflow/2026-04-03-jwt-auth`).
|
||||
|
||||
---
|
||||
|
||||
## Commit Points
|
||||
|
||||
| Trigger | What to commit | Message format |
|
||||
|---------|---------------|----------------|
|
||||
| After `agent.complete` | Agent artifacts + any created/modified files | `archeflow(<phase>): <archetype> <summary>` |
|
||||
| After `phase.transition` | All artifacts from completed phase | `archeflow(<from>→<to>): phase transition` |
|
||||
| After each `fix.applied` | The fixed file | `archeflow(fix): <source> — <finding summary>` |
|
||||
| After `cycle.boundary` | Everything staged | `archeflow(act): cycle <N> <status>` |
|
||||
| After `run.complete` | Final state + process report | `archeflow(run): complete — <summary>` |
|
||||
|
||||
---
|
||||
| Trigger | Message format |
|
||||
|---------|----------------|
|
||||
| `agent.complete` | `archeflow(<phase>): <archetype> <summary>` |
|
||||
| `phase.transition` | `archeflow(<from>-><to>): phase transition` |
|
||||
| `fix.applied` | `archeflow(fix): <source> -- <finding>` |
|
||||
| `cycle.boundary` | `archeflow(act): cycle <N> <status>` |
|
||||
| `run.complete` | `archeflow(run): complete -- <summary>` |
|
||||
|
||||
## Commit Protocol
|
||||
|
||||
1. **Stage only relevant files.** Never `git add -A`. Stage:
|
||||
- `.archeflow/artifacts/<run_id>/` — artifacts produced by the current agent/phase
|
||||
- `.archeflow/events/<run_id>.jsonl` — updated event log
|
||||
- Any project files created or modified by the current agent (from `do-maker-files.txt` or explicit file list)
|
||||
2. **Exclude ephemeral files.** Never commit:
|
||||
- `.archeflow/progress.md` (live progress display, ephemeral)
|
||||
- `.archeflow/explorer-cache/` (local cache, not run-specific)
|
||||
- `.archeflow/session-log.md` (separate concern)
|
||||
3. **Use conventional commit format:** `archeflow(<scope>): <message>`
|
||||
4. **Signing:** If `git.signing_key` is configured, pass `-c user.signingkey=<key>` to `git commit`.
|
||||
- Stage only relevant files: `.archeflow/artifacts/<run_id>/`, event log, project files from the Maker
|
||||
- Never `git add -A`
|
||||
- Exclude: `progress.md`, `explorer-cache/`, `session-log.md`
|
||||
- Use conventional commit format
|
||||
- Signing opt-in via `git.signing_key` config
|
||||
|
||||
### Integration with the Run Skill
|
||||
All operations go through `./lib/archeflow-git.sh`:
|
||||
|
||||
The `archeflow:run` skill calls git operations at these points:
|
||||
| Run event | Command |
|
||||
|-----------|---------|
|
||||
| `run.start` | `init <run_id>` (create+switch branch) |
|
||||
| `agent.complete` | `commit <run_id> <phase> "<msg>" [files]` |
|
||||
| `phase.transition` | `phase-commit <run_id> <phase>` |
|
||||
| `run.complete` (ok) | `merge <run_id> [--squash|--no-ff]` |
|
||||
| `run.complete` (fail) | branch preserved |
|
||||
|
||||
```
|
||||
run.start → ./lib/archeflow-git.sh init <run_id>
|
||||
agent.complete → ./lib/archeflow-git.sh commit <run_id> <phase> "<archetype> <summary>" [files...]
|
||||
phase.transition → ./lib/archeflow-git.sh phase-commit <run_id> <phase>
|
||||
fix.applied → ./lib/archeflow-git.sh commit <run_id> fix "<source> — <finding>"
|
||||
cycle.boundary → ./lib/archeflow-git.sh commit <run_id> act "cycle <N> <status>"
|
||||
run.complete (ok) → ./lib/archeflow-git.sh merge <run_id> [--squash|--no-ff]
|
||||
run.complete (fail) → branch preserved, not merged
|
||||
```
|
||||
## Merge
|
||||
|
||||
---
|
||||
|
||||
## Run Lifecycle
|
||||
|
||||
### 1. Initialization (`run.start`)
|
||||
|
||||
```bash
|
||||
./lib/archeflow-git.sh init <run_id>
|
||||
```
|
||||
|
||||
This will:
|
||||
1. Verify a clean working tree (or stash uncommitted changes)
|
||||
2. Create branch `archeflow/<run_id>` from current HEAD
|
||||
3. Switch to the new branch
|
||||
|
||||
### 2. During Execution (phase commits)
|
||||
|
||||
After each agent completes or phase transitions, the run skill calls:
|
||||
|
||||
```bash
|
||||
# After an agent completes:
|
||||
./lib/archeflow-git.sh commit <run_id> plan "explorer research" \
|
||||
.archeflow/artifacts/<run_id>/plan-explorer.md
|
||||
|
||||
# After a phase transition:
|
||||
./lib/archeflow-git.sh phase-commit <run_id> plan
|
||||
```
|
||||
|
||||
The `commit` command stages artifact directories and event logs automatically. Additional files can be passed as trailing arguments.
|
||||
|
||||
The `phase-commit` command stages all artifacts matching the phase prefix and commits with a transition message.
|
||||
|
||||
### 3. Completion (merge)
|
||||
|
||||
```bash
|
||||
# Success — squash merge (default):
|
||||
./lib/archeflow-git.sh merge <run_id> --squash
|
||||
|
||||
# Success — preserve history:
|
||||
./lib/archeflow-git.sh merge <run_id> --no-ff
|
||||
|
||||
# Failure or user abort:
|
||||
# Do nothing. Branch stays for inspection.
|
||||
echo "Branch archeflow/<run_id> preserved for inspection."
|
||||
```
|
||||
|
||||
The merge command:
|
||||
1. Verifies all changes on the branch are committed
|
||||
2. Switches to the base branch (main or wherever the run started)
|
||||
3. Merges with the chosen strategy
|
||||
4. If squash: creates a single commit with `feat: <task summary>`
|
||||
5. Does NOT delete the branch (user may want to inspect)
|
||||
|
||||
### 4. Cleanup (optional, after inspection)
|
||||
|
||||
```bash
|
||||
./lib/archeflow-git.sh cleanup <run_id>
|
||||
```
|
||||
|
||||
Deletes the branch after the user has confirmed the merge is correct.
|
||||
|
||||
---
|
||||
1. Verify all changes committed
|
||||
2. Switch to base branch
|
||||
3. Merge with configured strategy (squash default)
|
||||
4. Branch NOT auto-deleted (user may inspect)
|
||||
|
||||
## Rollback
|
||||
|
||||
Roll back to the end of any completed phase:
|
||||
`./lib/archeflow-git.sh rollback <run_id> --to <target>`
|
||||
|
||||
```bash
|
||||
./lib/archeflow-git.sh rollback <run_id> --to plan
|
||||
```
|
||||
Targets: `plan`, `do`, `check`, `act`, `cycle-N`. Only works on the `archeflow/<run_id>` branch. Resets to the last commit for the target phase and trims the event JSONL accordingly.
|
||||
|
||||
This will:
|
||||
1. Find the last commit for the target phase by searching commit messages
|
||||
2. Show the user what commits will be lost (everything after the target)
|
||||
3. Perform `git reset --hard <commit>` on the branch
|
||||
4. Trim the events JSONL to remove events that occurred after the rollback point
|
||||
## Post-Merge Validation
|
||||
|
||||
**Supported rollback targets:** `plan`, `do`, `check`, `act`, or any cycle number (`cycle-1`, `cycle-2`).
|
||||
|
||||
**Safety:** Rollback only works on the run's branch, never on main. The script verifies you are on `archeflow/<run_id>` before proceeding.
|
||||
|
||||
---
|
||||
|
||||
## Status
|
||||
|
||||
View the git state of a run:
|
||||
|
||||
```bash
|
||||
./lib/archeflow-git.sh status <run_id>
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
Branch: archeflow/2026-04-03-jwt-auth
|
||||
Base: main (3 commits ahead)
|
||||
|
||||
Commits:
|
||||
abc1234 archeflow(plan): explorer research
|
||||
def5678 archeflow(plan): creator outline
|
||||
ghi9012 archeflow(plan→do): phase transition
|
||||
jkl3456 archeflow(do): maker implementation
|
||||
|
||||
Current phase: do
|
||||
Files changed (total): 8
|
||||
Uncommitted changes: none
|
||||
```
|
||||
|
||||
---
|
||||
After the merge, the run skill executes the project's test suite (from `test_command` in config) with a 5-minute timeout. If the tests fail: `git revert --no-edit HEAD`.
|
||||
|
||||
## Configuration
|
||||
|
||||
In `.archeflow/config.yaml` or a team preset:
|
||||
|
||||
```yaml
|
||||
git:
|
||||
enabled: true # Default: true. Set false to disable all git operations.
|
||||
branch_prefix: "archeflow/" # Default. The run_id is appended.
|
||||
commit_style: conventional # conventional (archeflow(<scope>): msg) | simple (<phase>: msg)
|
||||
enabled: true
|
||||
branch_prefix: "archeflow/"
|
||||
merge_strategy: squash # squash | no-ff | rebase
|
||||
auto_push: false # Push branch to remote after each commit
|
||||
signing_key: null # SSH key path for signed commits (e.g., ~/.ssh/id_ed25519.pub)
|
||||
auto_push: false
|
||||
signing_key: null
|
||||
```
|
||||
|
||||
The helper script reads this config if it exists. All values have sensible defaults.
|
||||
|
||||
---
|
||||
|
||||
## Post-Merge Rollback
|
||||
|
||||
After merging, the run skill validates the merge by running the project's test suite. If tests fail, the merge is automatically reverted.
|
||||
|
||||
### Script
|
||||
|
||||
```bash
|
||||
./lib/archeflow-rollback.sh <run_id> [--test-cmd <cmd>]
|
||||
```
|
||||
|
||||
**Behavior:**
|
||||
1. Reads `test_command` from `.archeflow/config.yaml` (or uses `--test-cmd` override)
|
||||
2. Runs the test suite with a 5-minute timeout
|
||||
3. If tests pass: exits 0 (merge is good)
|
||||
4. If tests fail: runs `git revert --no-edit HEAD`, emits a `decision` event, exits 1
|
||||
5. Verifies HEAD is an ArcheFlow merge commit before reverting (warning if not, proceeds anyway)
|
||||
|
||||
**Integration with run skill:** Called in section 4c (All Approved) after `archeflow-git.sh merge`. If it returns non-zero, the orchestrator cycles back with "integration test failure" feedback or reports to the user if max cycles are reached.
|
||||
|
||||
**Configuration:** Set `test_command` in `.archeflow/config.yaml`:
|
||||
```yaml
|
||||
test_command: "npm test" # or "pytest", "cargo test", etc.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Safety Rules
|
||||
|
||||
These rules are inherited from `archeflow:orchestration` and reinforced here:
|
||||
|
||||
1. **Never force-push.** No `--force`, no `--force-with-lease`. If a push fails, diagnose and fix.
|
||||
2. **Never modify main history.** Merges are forward-only. No rebasing main.
|
||||
3. **Branch stays intact on failure.** If a run fails or is aborted, the branch is preserved for inspection. Never auto-delete failed branches.
|
||||
4. **All commits are individually revertable.** Each commit represents a discrete unit of work.
|
||||
5. **Worktree mode compatibility.** If the Maker runs in a worktree, git-integration commits go to the worktree's branch. The merge happens at the run level, not the worktree level. The Maker's worktree branch is a sub-branch of `archeflow/<run_id>`.
|
||||
6. **Clean merge or abort.** If a merge produces conflicts, do not force-resolve. Report the conflict, leave the branch intact, and let the user decide.
|
||||
7. **No signing by default.** Signing is opt-in via config. If configured, all commits on the branch are signed.
|
||||
|
||||
---
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Git is the audit trail.** Every phase transition is a commit. `git log` tells the full story of a run.
|
||||
2. **Rollback is cheap.** Reset to any phase boundary, re-run from there. No need to start over.
|
||||
3. **Merge strategy is a project decision.** Squash for clean history, no-ff for detailed history. Both are valid.
|
||||
4. **Events + git = full observability.** Process events capture *what happened* (decisions, verdicts, timing). Git captures *what changed* (files, diffs). Together they provide complete run archaeology.
|
||||
5. **Fail-safe by default.** Every safety rule defaults to the conservative option. The user must explicitly opt in to destructive operations.
|
||||
- Never force-push
|
||||
- Never modify main history
|
||||
- Branch stays intact on failure
|
||||
- Clean merge or abort (no force-resolve on conflicts)
|
||||
- Worktree-compatible (Maker's worktree branch is sub-branch of run branch)
|
||||
|
||||
@@ -6,624 +6,138 @@ description: |
|
||||
and enforces a shared budget. Each sub-run uses the standard `run` skill internally.
|
||||
<example>User: "archeflow:multi-project" with a multi-run.yaml</example>
|
||||
<example>User: "Run this across archeflow, colette, and giesing"</example>
|
||||
<example>User: "archeflow:multi-project --dry-run"</example>
|
||||
---
|
||||
|
||||
# Multi-Project Orchestration
|
||||
|
||||
Coordinates ArcheFlow runs across multiple projects in a workspace. Each project gets its own
|
||||
PDCA run (via the standard `run` skill), but dependencies between projects are respected, artifacts
|
||||
are shared, and budget is tracked globally.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Load these skills (they are referenced throughout):
|
||||
- `archeflow:run` — single-project PDCA execution loop
|
||||
- `archeflow:process-log` — event schema and DAG parent rules
|
||||
- `archeflow:artifact-routing` — artifact naming, context injection, cycle archiving
|
||||
- `archeflow:cost-tracking` — cost aggregation and budget enforcement
|
||||
- `archeflow:domains` — domain detection per project
|
||||
|
||||
## Invocation
|
||||
|
||||
```
|
||||
archeflow:multi-project # Read from .archeflow/multi-run.yaml
|
||||
archeflow:multi-project --config path/to.yaml # Explicit config file
|
||||
archeflow:multi-project --dry-run # Plan phase only for all projects, show cost estimate
|
||||
archeflow:multi-project --resume <multi-run-id> # Resume a failed/paused multi-run
|
||||
```
|
||||
|
||||
---
|
||||
Coordinates ArcheFlow runs across multiple projects. Each project gets its own PDCA run (via `run` skill), but dependencies are respected, artifacts shared, and budget tracked globally.
|
||||
|
||||
## Multi-Run Definition
|
||||
|
||||
A multi-run is defined in YAML, either in `.archeflow/multi-run.yaml` or passed via `--config`.
|
||||
Defined in `.archeflow/multi-run.yaml` or passed via `--config`.
|
||||
|
||||
```yaml
|
||||
name: "giesing-gschichten-v2"
|
||||
description: "Write second story with improved ArcheFlow + Colette integration"
|
||||
|
||||
projects:
|
||||
- id: archeflow
|
||||
path: "../archeflow" # Relative to workspace root, or absolute
|
||||
path: "../archeflow"
|
||||
task: "Add memory injection to run skill"
|
||||
workflow: fast # fast | standard | thorough (optional, auto-select if omitted)
|
||||
domain: code # Optional, auto-detected if omitted
|
||||
depends_on: [] # No dependencies — can start immediately
|
||||
|
||||
workflow: fast
|
||||
depends_on: []
|
||||
- id: colette
|
||||
path: "../writing.colette"
|
||||
task: "Add story-specific voice validation command"
|
||||
workflow: standard
|
||||
domain: code
|
||||
depends_on: [] # Independent of archeflow — runs in parallel
|
||||
|
||||
task: "Add voice validation command"
|
||||
depends_on: []
|
||||
- id: giesing
|
||||
path: "."
|
||||
task: "Write story #2 using improved tools"
|
||||
task: "Write story #2"
|
||||
workflow: kurzgeschichte
|
||||
domain: writing
|
||||
depends_on: [archeflow, colette] # Waits for both to complete
|
||||
|
||||
depends_on: [archeflow, colette]
|
||||
budget:
|
||||
total_usd: 15.00 # Hard cap — stops all projects when exceeded
|
||||
per_project_usd: 10.00 # Soft cap — warns but does not stop
|
||||
|
||||
parallel: true # Run independent projects concurrently (default: true)
|
||||
total_usd: 15.00
|
||||
per_project_usd: 10.00
|
||||
```
|
||||
|
||||
### Definition Rules
|
||||
**Rules:** Each project must have a unique `id`. `depends_on` references other `id` values. Cycles are rejected at validation time. At least one project must have an empty `depends_on`. `workflow` and `domain` are auto-selected if omitted.
|
||||
|
||||
- `id` must be unique within the multi-run.
|
||||
- `path` is resolved relative to the directory containing the YAML file unless absolute.
|
||||
- `depends_on` references other project `id` values. Cycles are rejected at validation time.
|
||||
- `workflow` and `domain` are optional. If omitted, the `run` skill auto-selects per project.
|
||||
- At least one project must have an empty `depends_on` (otherwise the DAG has no entry point).
|
||||
## Dependency Resolution
|
||||
|
||||
---
|
||||
|
||||
## Workspace Registry Integration
|
||||
|
||||
If `docs/project-registry.md` exists at the workspace root, the multi-project skill can:
|
||||
|
||||
1. **Auto-discover paths:** When `path` is omitted from a project entry, look up the project `id` in the registry to find its directory.
|
||||
2. **Validate existence:** Before starting, verify that every project path exists on disk. Abort with a clear error if a path is missing.
|
||||
3. **Show registry status:** In the progress table, include the project's current sprint goal from the registry alongside the multi-run status.
|
||||
4. **Update registry:** After the multi-run completes, update each project's status in the registry if meaningful changes were made (new features, completed sprint goals).
|
||||
|
||||
---
|
||||
|
||||
## Execution Steps
|
||||
|
||||
### 0. Validate and Initialize
|
||||
|
||||
**0a. Parse and validate the multi-run definition:**
|
||||
Topological sort of the project DAG determines execution order.
|
||||
|
||||
```
|
||||
1. Read the YAML file.
|
||||
2. Validate all required fields (name, projects with id/path/task).
|
||||
3. Resolve all paths to absolute paths.
|
||||
4. Verify each path exists on disk.
|
||||
5. Build the dependency DAG.
|
||||
6. Check for cycles — abort if any detected.
|
||||
7. Identify the entry-point projects (depends_on is empty).
|
||||
8. Verify at least one entry-point exists.
|
||||
Layer 0 (immediate): [archeflow, colette] # No deps, start now
|
||||
Layer 1: [giesing] # Depends on Layer 0
|
||||
```
|
||||
|
||||
**0b. Generate multi-run ID and directory structure:**
|
||||
Independent projects in the same layer run in parallel. When a project completes, downstream projects with all deps met move to the ready queue.
|
||||
|
||||
```bash
|
||||
MULTI_RUN_ID="$(date -u +%Y-%m-%d)-${name}"
|
||||
Cycle detection via Kahn's algorithm. If sorted list is shorter than project list, report the cycle and abort.
|
||||
|
||||
# Master event file
|
||||
mkdir -p .archeflow/events
|
||||
touch .archeflow/events/${MULTI_RUN_ID}.jsonl
|
||||
## Parallel Execution
|
||||
|
||||
# Cross-project artifact directory
|
||||
mkdir -p .archeflow/artifacts/${MULTI_RUN_ID}
|
||||
for project in ${PROJECT_IDS}; do
|
||||
mkdir -p .archeflow/artifacts/${MULTI_RUN_ID}/${project}
|
||||
done
|
||||
For each ready project, start a sub-run as a parallel subagent with `isolation: "worktree"`. Each sub-run invokes `archeflow:run` with its own run_id, workflow, domain, and budget slice.
|
||||
|
||||
# Progress file
|
||||
touch .archeflow/multi-progress.md
|
||||
```
|
||||
When `parallel: false`, run sequentially in topological order.
|
||||
|
||||
**0c. Emit `multi.start`:**
|
||||
## Cross-Project Artifacts
|
||||
|
||||
```jsonl
|
||||
{"ts":"...","run_id":"<MULTI_RUN_ID>","seq":1,"parent":[],"type":"multi.start","phase":"init","agent":null,"data":{"name":"giesing-v2","description":"...","projects":["archeflow","colette","giesing"],"parallel":true,"budget_total_usd":15.00,"dag":{"archeflow":[],"colette":[],"giesing":["archeflow","colette"]}}}
|
||||
```
|
||||
When project B depends on A, B's Explorer receives upstream artifact summaries:
|
||||
- Only summaries injected (not full artifacts)
|
||||
- Large artifacts (>200 lines): extract summary section only
|
||||
- Cross-project injection happens only in Plan phase
|
||||
- Downstream Explorer has filesystem access to full artifacts if needed
|
||||
|
||||
**Track state throughout the multi-run:**
|
||||
- `MULTI_RUN_ID` — unique multi-run identifier
|
||||
- `MULTI_SEQ` — master event sequence counter
|
||||
- `PROJECT_STATUS` — map of project_id to status (`pending | running | completed | failed | blocked | skipped`)
|
||||
- `PROJECT_RUN_IDS` — map of project_id to its sub-run_id
|
||||
- `TOTAL_COST` — running cost total across all projects
|
||||
- `REMAINING_BUDGET` — budget minus total cost
|
||||
Artifact directory: `.archeflow/artifacts/<MULTI_RUN_ID>/<project_id>/`
|
||||
|
||||
---
|
||||
## Budget Coordination
|
||||
|
||||
### 1. Dependency Resolution
|
||||
|
||||
Build a topological sort of the project DAG. This determines execution order.
|
||||
|
||||
```
|
||||
Given:
|
||||
archeflow: depends_on=[]
|
||||
colette: depends_on=[]
|
||||
giesing: depends_on=[archeflow, colette]
|
||||
|
||||
Topological layers:
|
||||
Layer 0 (immediate): [archeflow, colette] # No deps, start now
|
||||
Layer 1: [giesing] # Depends on Layer 0
|
||||
```
|
||||
|
||||
**Algorithm:**
|
||||
1. Find all projects with zero unmet dependencies. These form the current layer.
|
||||
2. When a project completes, remove it from the dependency lists of all downstream projects.
|
||||
3. Any project whose dependency list becomes empty moves to the ready queue.
|
||||
4. Repeat until all projects are complete, failed, or blocked.
|
||||
|
||||
**Cycle detection:** Before starting, verify the DAG is acyclic. Use Kahn's algorithm — if after processing all nodes the sorted list is shorter than the project list, there is a cycle. Report which projects form the cycle and abort.
|
||||
|
||||
---
|
||||
|
||||
### 2. Parallel Execution
|
||||
|
||||
For each project in the ready queue, start a sub-run. Independent projects run concurrently.
|
||||
|
||||
**Starting a sub-run:**
|
||||
|
||||
```
|
||||
For each ready project:
|
||||
1. Set PROJECT_STATUS[project_id] = "running"
|
||||
2. Generate sub-run ID: MULTI_RUN_ID/project_id
|
||||
(e.g., "2026-04-03-giesing-v2/archeflow")
|
||||
3. Emit project.start to master event file
|
||||
4. cd into the project's path
|
||||
5. Invoke archeflow:run with:
|
||||
- run_id = MULTI_RUN_ID/project_id
|
||||
- workflow = project.workflow (or auto-select)
|
||||
- domain = project.domain (or auto-detect)
|
||||
- budget = min(per_project_budget, remaining_total_budget)
|
||||
- artifact_dir = .archeflow/artifacts/MULTI_RUN_ID/project_id/
|
||||
6. The sub-run emits its own events to its own JSONL file
|
||||
inside the project's directory (standard run behavior)
|
||||
```
|
||||
|
||||
**Concurrency model:**
|
||||
|
||||
When `parallel: true` (default), spawn independent projects as parallel subagents:
|
||||
|
||||
```
|
||||
Agent(
|
||||
description: "Multi-project sub-run: <project_id> — <task>",
|
||||
prompt: "Run archeflow:run in <path> with task: <task>.
|
||||
Run ID: <MULTI_RUN_ID>/<project_id>
|
||||
Workflow: <workflow>
|
||||
Domain: <domain>
|
||||
Budget: $<per_project_budget>
|
||||
Save artifacts to: .archeflow/artifacts/<MULTI_RUN_ID>/<project_id>/
|
||||
When complete, report: status, cost, artifact list, and any issues.",
|
||||
isolation: "worktree",
|
||||
mode: "bypassPermissions"
|
||||
)
|
||||
```
|
||||
|
||||
Launch all Layer 0 projects simultaneously. As each completes, check if any Layer 1+ projects become unblocked.
|
||||
|
||||
When `parallel: false`, run projects sequentially in topological order. Still respect dependencies — a project does not start until all its dependencies have completed.
|
||||
|
||||
---
|
||||
|
||||
### 3. Master Events
|
||||
|
||||
All multi-run-level events are written to `.archeflow/events/<MULTI_RUN_ID>.jsonl`. These track the overall orchestration, not individual PDCA phases (those go to each project's own event file).
|
||||
|
||||
#### Master Event Types
|
||||
|
||||
| Event | When | Key Data |
|
||||
| Level | Type | Behavior |
|
||||
|-------|------|----------|
|
||||
| `multi.start` | Multi-run begins | Project list, DAG, budget |
|
||||
| `project.start` | A sub-run launches | project_id, run_id, path |
|
||||
| `project.complete` | A sub-run finishes successfully | project_id, status, cost, artifacts |
|
||||
| `project.failed` | A sub-run fails | project_id, error, cost_so_far |
|
||||
| `project.blocked` | A dependency failed, blocking this project | project_id, blocked_by |
|
||||
| `project.unblocked` | All dependencies met, project can start | project_id, unblocked_by |
|
||||
| `project.skipped` | User chose to skip a blocked project | project_id, reason |
|
||||
| `budget.warning` | Budget threshold crossed | spent, budget, percent |
|
||||
| `budget.exceeded` | Hard budget cap hit | spent, budget, halted_projects |
|
||||
| `multi.complete` | All projects done (or halted) | status, projects_completed, total_cost |
|
||||
| `total_usd` | Hard cap | Stops ALL projects when exceeded |
|
||||
| `per_project_usd` | Soft cap | Warns but continues |
|
||||
|
||||
#### Example Master Event Stream
|
||||
**Enforcement points:**
|
||||
1. Before starting a sub-run: estimate cost, halt if > remaining budget
|
||||
2. After each sub-run: update total, emit `budget.warning` at threshold, emit `budget.exceeded` at cap
|
||||
|
||||
```jsonl
|
||||
{"seq":1,"type":"multi.start","phase":"init","data":{"name":"giesing-v2","projects":["archeflow","colette","giesing"],"parallel":true,"budget_total_usd":15.00}}
|
||||
{"seq":2,"type":"project.start","phase":"run","data":{"project":"archeflow","run_id":"2026-04-03-giesing-v2/archeflow","path":"/home/c/projects/archeflow"}}
|
||||
{"seq":3,"type":"project.start","phase":"run","data":{"project":"colette","run_id":"2026-04-03-giesing-v2/colette","path":"/home/c/projects/writing.colette"}}
|
||||
{"seq":4,"type":"project.complete","phase":"run","data":{"project":"archeflow","status":"completed","run_id":"2026-04-03-giesing-v2/archeflow","cost_usd":1.20,"artifacts":["plan-explorer.md","plan-creator.md","do-maker.md","check-guardian.md"]}}
|
||||
{"seq":5,"type":"project.complete","phase":"run","data":{"project":"colette","status":"completed","run_id":"2026-04-03-giesing-v2/colette","cost_usd":1.80,"artifacts":["plan-creator.md","do-maker.md","check-guardian.md","check-sage.md"]}}
|
||||
{"seq":6,"type":"project.unblocked","phase":"run","data":{"project":"giesing","unblocked_by":["archeflow","colette"]}}
|
||||
{"seq":7,"type":"project.start","phase":"run","data":{"project":"giesing","run_id":"2026-04-03-giesing-v2/giesing","path":"/home/c/projects/book.giesing-gschichten"}}
|
||||
{"seq":8,"type":"project.complete","phase":"run","data":{"project":"giesing","status":"completed","run_id":"2026-04-03-giesing-v2/giesing","cost_usd":3.50,"artifacts":["plan-explorer.md","plan-creator.md","do-maker.md","check-guardian.md","check-sage.md"]}}
|
||||
{"seq":9,"type":"multi.complete","phase":"done","data":{"status":"completed","projects_completed":3,"projects_failed":0,"total_cost_usd":6.50,"budget_remaining_usd":8.50}}
|
||||
```
|
||||
Each sub-run receives `min(per_project_usd, remaining_total_budget)` as its budget.
|
||||
|
||||
---
|
||||
|
||||
### 4. Cross-Project Artifacts
|
||||
|
||||
When project B depends on project A, B's agents can access A's artifacts. This is the primary mechanism for cross-project information flow.
|
||||
|
||||
#### Artifact Directory Layout
|
||||
|
||||
```
|
||||
.archeflow/artifacts/<MULTI_RUN_ID>/
|
||||
├── archeflow/ # Sub-run artifacts from archeflow
|
||||
│ ├── plan-explorer.md
|
||||
│ ├── plan-creator.md
|
||||
│ ├── do-maker.md
|
||||
│ ├── do-maker-files.txt
|
||||
│ └── check-guardian.md
|
||||
├── colette/ # Sub-run artifacts from colette
|
||||
│ ├── plan-creator.md
|
||||
│ ├── do-maker.md
|
||||
│ └── check-sage.md
|
||||
└── giesing/ # Sub-run artifacts from giesing (depends on both)
|
||||
├── plan-explorer.md # Explorer can reference upstream artifacts
|
||||
├── plan-creator.md
|
||||
├── do-maker.md
|
||||
└── check-guardian.md
|
||||
```
|
||||
|
||||
#### Cross-Project Context Injection
|
||||
|
||||
When a dependent project's sub-run starts, inject upstream artifact summaries into the Explorer's prompt:
|
||||
|
||||
```markdown
|
||||
## Upstream Project Results
|
||||
|
||||
### archeflow (completed)
|
||||
Summary: Added memory injection to run skill.
|
||||
Key artifacts:
|
||||
- plan-creator.md: <first 20 lines or summary section>
|
||||
- do-maker.md: <implementation summary>
|
||||
|
||||
### colette (completed)
|
||||
Summary: Added story-specific voice validation command.
|
||||
Key artifacts:
|
||||
- plan-creator.md: <first 20 lines or summary section>
|
||||
- do-maker.md: <implementation summary>
|
||||
|
||||
Use these results as context. The changes from these projects are available in their
|
||||
respective directories and have been committed to their branches.
|
||||
```
|
||||
|
||||
**Rules for cross-project injection:**
|
||||
- Only inject summaries, not full artifacts (keep context small).
|
||||
- If an upstream artifact is large (>200 lines), extract the summary/overview section only.
|
||||
- The dependent project's Explorer has filesystem access to read full upstream artifacts if needed.
|
||||
- Cross-project injection happens ONLY in the Plan phase (Explorer and Creator). The Maker works from the Creator's proposal, which already incorporates upstream context.
|
||||
|
||||
---
|
||||
|
||||
### 5. Budget Coordination
|
||||
|
||||
The multi-run has a shared budget across all projects.
|
||||
|
||||
#### Budget Hierarchy
|
||||
|
||||
```
|
||||
total_usd: 15.00 # Hard cap — stops ALL projects when exceeded
|
||||
per_project_usd: 10.00 # Soft cap — warns but does not stop individual project
|
||||
```
|
||||
|
||||
#### Budget Tracking
|
||||
|
||||
Maintain a running total across all sub-runs:
|
||||
|
||||
```
|
||||
TOTAL_COST = sum of all project costs reported in project.complete events
|
||||
REMAINING = total_usd - TOTAL_COST
|
||||
```
|
||||
|
||||
#### Budget Enforcement Points
|
||||
|
||||
1. **Before starting a sub-run:**
|
||||
- Estimate the sub-run cost (based on workflow and domain).
|
||||
- If estimated cost > REMAINING: warn and ask user (attended) or halt (autonomous).
|
||||
|
||||
2. **After each sub-run completes:**
|
||||
- Update TOTAL_COST with actual cost from the sub-run.
|
||||
- If TOTAL_COST > total_usd * warn_at_percent: emit `budget.warning`.
|
||||
- If TOTAL_COST > total_usd: emit `budget.exceeded`, halt remaining projects.
|
||||
|
||||
3. **Per-project soft cap:**
|
||||
- Each sub-run receives `min(per_project_usd, REMAINING)` as its budget.
|
||||
- The `run` skill's own budget enforcement handles the per-project cap.
|
||||
- If a project exceeds per_project_usd, it warns but continues (soft cap).
|
||||
|
||||
#### Budget Events
|
||||
|
||||
```jsonl
|
||||
{"seq":5,"type":"budget.warning","data":{"spent_usd":11.50,"budget_usd":15.00,"percent":77,"message":"Budget 77% consumed"}}
|
||||
{"seq":8,"type":"budget.exceeded","data":{"spent_usd":15.30,"budget_usd":15.00,"halted_projects":["giesing"],"message":"Hard budget cap exceeded. Halting remaining projects."}}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 6. Failure Handling
|
||||
|
||||
Failures in one project affect downstream projects but not independent ones.
|
||||
|
||||
#### Failure Scenarios
|
||||
## Failure Handling
|
||||
|
||||
| Scenario | Action |
|
||||
|----------|--------|
|
||||
| Project fails (run error, test failure, max cycles) | Mark as `failed` in master events. Independent projects continue. |
|
||||
| Dependency of project X failed | Mark X as `blocked`. Do not start X. |
|
||||
| Budget exceeded mid-run | Halt the current project. Mark remaining as `blocked`. |
|
||||
| All entry-point projects fail | Entire multi-run fails. No downstream projects can start. |
|
||||
| Project fails | Mark `failed`. Independent projects continue. |
|
||||
| Dependency failed | Mark downstream as `blocked`. Do not start. |
|
||||
| Budget exceeded | Halt current project. Skip downstream. |
|
||||
| All entry-points fail | Entire multi-run fails. |
|
||||
|
||||
#### Blocked Project Resolution
|
||||
**Blocked project resolution:**
|
||||
- Autonomous mode: skip blocked projects, continue independent ones
|
||||
- Attended mode: offer skip / retry / abort
|
||||
|
||||
When a project is blocked because a dependency failed, offer three options:
|
||||
## Progress Tracking
|
||||
|
||||
1. **Skip:** Mark the blocked project as `skipped`. Continue with other independent projects.
|
||||
2. **Retry:** Re-run the failed dependency. If it succeeds, unblock downstream projects.
|
||||
3. **Abort:** Stop the entire multi-run. Report what completed and what did not.
|
||||
|
||||
In **autonomous mode**, the default action is `skip` — blocked projects are skipped, independent projects continue, and the multi-run completes with partial results.
|
||||
|
||||
In **attended mode**, prompt the user with the options above.
|
||||
|
||||
#### Failure Events
|
||||
|
||||
```jsonl
|
||||
{"seq":4,"type":"project.failed","data":{"project":"archeflow","error":"Max cycles reached with unresolved CRITICAL findings","cost_usd":2.10}}
|
||||
{"seq":5,"type":"project.blocked","data":{"project":"giesing","blocked_by":["archeflow"],"reason":"Dependency 'archeflow' failed"}}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 7. Progress Tracking
|
||||
|
||||
Maintain a live progress file at `.archeflow/multi-progress.md`. Update it after every project state change.
|
||||
Live progress at `.archeflow/multi-progress.md`, updated after every project state change:
|
||||
|
||||
```markdown
|
||||
# Multi-Run: giesing-v2
|
||||
Started: 2026-04-03T14:00:00Z
|
||||
|
||||
| Project | Status | Domain | Phase | Detail |
|
||||
|---------|--------|--------|-------|--------|
|
||||
| archeflow | completed | code | -- | 1 cycle, $1.20 |
|
||||
| colette | running | code | DO | maker drafting |
|
||||
| giesing | blocked | writing | -- | waiting for colette |
|
||||
|
||||
## Budget
|
||||
| | Amount |
|
||||
|---|--------|
|
||||
| Spent | $3.00 |
|
||||
| Budget | $15.00 |
|
||||
| Remaining | $12.00 |
|
||||
| Utilization | 20% |
|
||||
|
||||
## Dependency Graph
|
||||
```
|
||||
archeflow ----\
|
||||
+---> giesing
|
||||
colette ------/
|
||||
Budget: $3.00 / $15.00 (20%)
|
||||
```
|
||||
|
||||
## Timeline
|
||||
- 14:00:00 — Started archeflow, colette (parallel)
|
||||
- 14:05:23 — archeflow completed ($1.20, 1 cycle)
|
||||
- 14:06:10 — colette DO phase, maker drafting
|
||||
```
|
||||
## Master Events
|
||||
|
||||
Update this file after:
|
||||
- A project starts
|
||||
- A project changes phase (via status polling or sub-agent reporting)
|
||||
- A project completes or fails
|
||||
- A project becomes unblocked
|
||||
- Budget threshold is crossed
|
||||
Written to `.archeflow/events/<MULTI_RUN_ID>.jsonl`:
|
||||
|
||||
---
|
||||
| Event | When |
|
||||
|-------|------|
|
||||
| `multi.start` | Multi-run begins |
|
||||
| `project.start` | Sub-run launches |
|
||||
| `project.complete` | Sub-run succeeds |
|
||||
| `project.failed` | Sub-run fails |
|
||||
| `project.blocked` | Dependency failed |
|
||||
| `project.unblocked` | All deps met |
|
||||
| `budget.warning` | Threshold crossed |
|
||||
| `budget.exceeded` | Hard cap hit |
|
||||
| `multi.complete` | All projects done |
|
||||
|
||||
### 8. Completion
|
||||
## Dry-Run and Resume
|
||||
|
||||
When all projects are complete (or blocked/skipped with no more actionable items):
|
||||
**`--dry-run`:** Validates DAG, runs `archeflow:run --dry-run` per project, shows cost estimate. Does not execute.
|
||||
|
||||
**8a. Emit `multi.complete`:**
|
||||
**`--resume <id>`:** Reconstructs state from the master event file. Retries failed projects and starts any pending projects whose dependencies are all met.
|
||||
|
||||
```jsonl
|
||||
{"seq":9,"type":"multi.complete","phase":"done","data":{"status":"completed","projects_completed":3,"projects_failed":0,"projects_skipped":0,"total_cost_usd":6.50,"budget_remaining_usd":8.50,"duration_ms":600000}}
|
||||
```
|
||||
## Workspace Registry
|
||||
|
||||
Status values:
|
||||
- `completed` — all projects finished successfully
|
||||
- `partial` — some projects completed, some failed/skipped
|
||||
- `failed` — no projects completed successfully
|
||||
- `halted` — stopped due to budget or user abort
|
||||
If `docs/project-registry.md` exists: auto-discover paths by project id, validate existence, update registry after meaningful changes.
|
||||
|
||||
**8b. Generate multi-run report:**
|
||||
## Completion
|
||||
|
||||
```markdown
|
||||
# Multi-Run Report: giesing-v2
|
||||
Status values: `completed` (all done), `partial` (some failed/skipped), `failed` (none completed), `halted` (budget/abort).
|
||||
|
||||
## Summary
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Projects | 3 |
|
||||
| Completed | 3 |
|
||||
| Failed | 0 |
|
||||
| Total cost | $6.50 / $15.00 |
|
||||
| Duration | 10m 00s |
|
||||
|
||||
## Per-Project Results
|
||||
### archeflow
|
||||
- **Status:** completed
|
||||
- **Task:** Add memory injection to run skill
|
||||
- **Workflow:** fast (1 cycle)
|
||||
- **Cost:** $1.20
|
||||
- **Key artifacts:** plan-creator.md, do-maker.md
|
||||
|
||||
### colette
|
||||
- **Status:** completed
|
||||
- **Task:** Add story-specific voice validation command
|
||||
- **Workflow:** standard (1 cycle)
|
||||
- **Cost:** $1.80
|
||||
- **Key artifacts:** plan-creator.md, do-maker.md, check-sage.md
|
||||
|
||||
### giesing
|
||||
- **Status:** completed
|
||||
- **Task:** Write story #2 using improved tools
|
||||
- **Workflow:** kurzgeschichte (2 cycles)
|
||||
- **Cost:** $3.50
|
||||
- **Key artifacts:** plan-explorer.md, do-maker.md, check-guardian.md
|
||||
|
||||
## Dependency Graph Execution
|
||||
archeflow (Layer 0) ----> completed
|
||||
colette (Layer 0) ----> completed
|
||||
giesing (Layer 1) ----> unblocked ----> completed
|
||||
|
||||
## Cost Breakdown
|
||||
| Project | Plan | Do | Check | Total |
|
||||
|---------|------|----|-------|-------|
|
||||
| archeflow | $0.20 | $0.60 | $0.40 | $1.20 |
|
||||
| colette | $0.30 | $0.80 | $0.70 | $1.80 |
|
||||
| giesing | $0.50 | $2.00 | $1.00 | $3.50 |
|
||||
| **Total** | **$1.00** | **$3.40** | **$2.10** | **$6.50** |
|
||||
```
|
||||
|
||||
**8c. Update master event index:**
|
||||
|
||||
Append to `.archeflow/events/index.jsonl`:
|
||||
|
||||
```jsonl
|
||||
{"run_id":"2026-04-03-giesing-v2","ts":"2026-04-03T14:10:00Z","type":"multi","task":"Write second story with improved ArcheFlow + Colette integration","status":"completed","projects":3,"total_cost_usd":6.50}
|
||||
```
|
||||
|
||||
**8d. Update workspace registry (if applicable):**
|
||||
|
||||
If `docs/project-registry.md` exists and project statuses changed meaningfully, update the registry entries for affected projects.
|
||||
|
||||
---
|
||||
|
||||
## Dry-Run Mode
|
||||
|
||||
When `--dry-run` is specified:
|
||||
|
||||
1. Validate the multi-run definition (DAG, paths, budget).
|
||||
2. For each project (in topological order), run `archeflow:run --dry-run` to get a cost estimate and plan preview.
|
||||
3. Display a summary:
|
||||
|
||||
```
|
||||
Multi-Run Dry Run: giesing-v2
|
||||
Projects: 3
|
||||
Dependency layers: 2
|
||||
Parallel execution: yes
|
||||
|
||||
Layer 0 (parallel):
|
||||
archeflow — fast workflow, code domain
|
||||
Estimated cost: $0.50-1.50
|
||||
colette — standard workflow, code domain
|
||||
Estimated cost: $1.00-3.00
|
||||
|
||||
Layer 1 (after Layer 0):
|
||||
giesing — kurzgeschichte workflow, writing domain
|
||||
Estimated cost: $2.00-5.00
|
||||
|
||||
Total estimated cost: $3.50-9.50
|
||||
Budget: $15.00 (sufficient)
|
||||
|
||||
Proceed? [y/n]
|
||||
```
|
||||
|
||||
4. Do NOT emit `multi.complete`. The multi-run is paused.
|
||||
5. If user says yes, start the full multi-run using the validated config.
|
||||
|
||||
---
|
||||
|
||||
## Resume Mode
|
||||
|
||||
When `--resume <multi-run-id>` is specified:
|
||||
|
||||
1. Read the master event file `.archeflow/events/<multi-run-id>.jsonl`.
|
||||
2. Reconstruct `PROJECT_STATUS` from events (which projects completed, failed, are pending).
|
||||
3. Identify resumable projects:
|
||||
- `failed` projects can be retried.
|
||||
- `blocked` projects whose blockers are now `completed` (e.g., after manual fix) can start.
|
||||
- `pending` projects that were never started can start if their deps are met.
|
||||
4. Display current state and ask for confirmation.
|
||||
5. Continue the multi-run from where it left off, appending to the existing master event file.
|
||||
|
||||
Resume emits a `multi.resume` event:
|
||||
|
||||
```jsonl
|
||||
{"seq":10,"type":"multi.resume","phase":"init","data":{"resumed_from":"2026-04-03-giesing-v2","projects_completed":["archeflow"],"projects_to_run":["colette","giesing"]}}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Integration with Existing Skills
|
||||
|
||||
| Skill | Integration Point |
|
||||
|-------|-------------------|
|
||||
| `run` | Each sub-run is a standard `archeflow:run` invocation. The multi-project skill wraps and coordinates multiple runs. |
|
||||
| `process-log` | Master events follow the same schema (ts, run_id, seq, parent, type, phase, agent, data). Sub-run events use the standard event types. |
|
||||
| `artifact-routing` | Each sub-run follows standard artifact routing internally. Cross-project artifacts follow the injection rules in Section 4. |
|
||||
| `cost-tracking` | Per-project costs come from sub-run `run.complete` events. The multi-project skill aggregates them and enforces the shared budget. |
|
||||
| `domains` | Each project auto-detects its domain independently. Different projects in the same multi-run can have different domains. |
|
||||
| `git-integration` | Each sub-run manages its own branch. The multi-project skill does not merge across repos — each project's Act phase handles its own merge. |
|
||||
| `autonomous-mode` | Multi-project runs are autonomous-mode-friendly. Budget enforcement is strict (halt, don't prompt). Blocked projects are skipped. |
|
||||
|
||||
---
|
||||
|
||||
## Progress Display
|
||||
|
||||
Throughout the multi-run, display live progress:
|
||||
|
||||
```
|
||||
━━━ ArcheFlow Multi-Run: giesing-v2 ━━━━━━━━━━━━━━━━━━━
|
||||
Projects: 3 | Budget: $15.00 | Parallel: yes
|
||||
|
||||
[archeflow] fast/code -> running (Plan: Creator designing...)
|
||||
[colette] standard/code -> running (Do: Maker implementing...)
|
||||
[giesing] kurzgeschichte/writing -> blocked (waiting: archeflow, colette)
|
||||
|
||||
Cost: $1.80 / $15.00 (12%)
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||
```
|
||||
|
||||
Update the display when:
|
||||
- A project changes state (start, phase change, complete, fail, unblock)
|
||||
- Budget thresholds are crossed
|
||||
|
||||
---
|
||||
|
||||
## Error Handling
|
||||
|
||||
| Error | Response |
|
||||
|-------|----------|
|
||||
| YAML parse error | Abort before starting. Report the parse error with line number. |
|
||||
| Dependency cycle detected | Abort. Report which projects form the cycle. |
|
||||
| Project path does not exist | Abort. Report the missing path. |
|
||||
| Sub-run agent fails to return | Mark project as failed (5-min timeout per the `run` skill). Continue independent projects. |
|
||||
| Master event write fails | Log warning. Continue orchestration. Events are observation, not control flow. |
|
||||
| Artifact directory creation fails | Abort the affected project. This is blocking for cross-project artifact sharing. |
|
||||
| Budget exceeded mid-project | Halt that project immediately. Emit `budget.exceeded`. Skip downstream dependents. |
|
||||
|
||||
---
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Each project is autonomous.** Sub-runs use the standard `run` skill without modification. The multi-project skill is a coordinator, not a replacement.
|
||||
2. **DAG over sequence.** Dependencies are declared, not implied by order. Independent projects always run in parallel when possible.
|
||||
3. **Shared budget, independent domains.** Budget is global, but each project detects its own domain, selects its own workflow, and manages its own artifacts.
|
||||
4. **Fail forward.** A failure in one project does not halt independent projects. Only downstream dependents are blocked.
|
||||
5. **Artifacts are the interface.** Projects communicate through saved artifacts, not shared memory or direct agent-to-agent messaging.
|
||||
6. **Resume over restart.** Multi-runs can be resumed from any point. Master events provide enough state to reconstruct progress.
|
||||
7. **Registry-aware.** When a workspace registry exists, use it for discovery and keep it updated. When it does not exist, everything still works.
|
||||
Final report includes per-project results, cost breakdown by phase, and dependency graph execution timeline.
|
||||
|
||||
@@ -1,160 +1,59 @@
|
||||
---
|
||||
name: presence
|
||||
description: |
|
||||
Defines how ArcheFlow communicates its activity to the user — visible but not noisy.
|
||||
Defines how ArcheFlow communicates its activity to the user -- visible but not noisy.
|
||||
Show value, not process. Auto-loaded by the run skill.
|
||||
---
|
||||
|
||||
# ArcheFlow Presence — Visible Value, Not Noise
|
||||
# ArcheFlow Presence -- Visible Value, Not Noise
|
||||
|
||||
ArcheFlow should feel like a skilled colleague working alongside you: you know they're there, you see results, but they don't narrate every keystroke.
|
||||
## Output Rules
|
||||
|
||||
## Principles
|
||||
|
||||
1. **Show outcomes, not mechanics.** "Guardian caught a timeline bug" — good. "Spawning Guardian agent with attention filters..." — noise.
|
||||
2. **One line per phase, not per agent.** The user sees phases complete, not individual agent lifecycle.
|
||||
3. **Numbers over words.** "2 fixes applied" beats "We have successfully applied two fixes to the codebase."
|
||||
4. **Silence is fine.** If a phase completes cleanly with no findings, don't announce it. Clean passes are the expected case.
|
||||
5. **Value at the end.** The completion summary is the most important output — what was built, what was caught, what was fixed.
|
||||
1. Show outcomes, not mechanics
|
||||
2. One line per phase, not per agent
|
||||
3. Numbers over words
|
||||
4. Silence on clean passes
|
||||
5. Value summary at the end
|
||||
|
||||
## Status Line Format
|
||||
|
||||
At key moments during a run, output a compact status line:
|
||||
|
||||
### Run Start
|
||||
**Run start:**
|
||||
```
|
||||
── archeflow ── <task> ── <workflow> (<max_cycles> cycles) ──
|
||||
```
|
||||
Example:
|
||||
```
|
||||
── archeflow ── Write story "Der Huster" ── kurzgeschichte (2 cycles) ──
|
||||
-- archeflow -- <task> -- <workflow> (<max_cycles> cycles) --
|
||||
```
|
||||
|
||||
### Phase Complete (only if something happened worth mentioning)
|
||||
**Phase complete (only if noteworthy):**
|
||||
```
|
||||
✓ plan explorer: 3 directions → chose C (Koffer) | creator: 6 scenes
|
||||
✓ do 6004 words drafted
|
||||
△ check guardian: 1 fix needed | sage: 5 voice adjustments
|
||||
✓ act 6 fixes applied
|
||||
V plan explorer: 3 directions -> chose C | creator: 6 scenes
|
||||
V do 6004 words drafted
|
||||
T check guardian: 1 fix needed | sage: 5 voice adjustments
|
||||
V act 6 fixes applied
|
||||
```
|
||||
Symbols: V = clean, T = issues found, X = failed/blocked.
|
||||
|
||||
Symbols:
|
||||
- `✓` — phase clean, no issues
|
||||
- `△` — phase found issues (fixes needed)
|
||||
- `✗` — phase failed (blocked, needs user input)
|
||||
|
||||
### Run Complete
|
||||
**Run complete:**
|
||||
```
|
||||
── done ── 1 cycle · 5 agents · 6 fixes · ~22 min ──
|
||||
```
|
||||
|
||||
If value was delivered, add a one-liner:
|
||||
```
|
||||
── done ── 1 cycle · 5 agents · 6 fixes · ~22 min ──
|
||||
-- done -- 1 cycle . 5 agents . 6 fixes . ~22 min --
|
||||
story drafted, reviewed, and polished. see stories/01-der-huster.md
|
||||
```
|
||||
|
||||
### Run Complete (with DAG, if terminal supports it)
|
||||
Only show if the user explicitly asks or if `progress.dag_on_complete: true` in config:
|
||||
**Activation indicator (session start, one line):**
|
||||
```
|
||||
── archeflow ── complete ──────────────────────
|
||||
#1 run.start
|
||||
├── #2 explorer → #3 decision (C) → #4 creator
|
||||
├── #6 maker (6004 words)
|
||||
├── #8 guardian △1 · #9 sage △5
|
||||
└── #12 complete [6 fixes]
|
||||
───────────────────────────────────────────────
|
||||
archeflow v0.9.0 . 24 skills . writing domain detected
|
||||
```
|
||||
|
||||
## When to Be Silent
|
||||
|
||||
- **Agent spawning/completion** — don't announce
|
||||
- **Event emission** — internal bookkeeping, never visible
|
||||
- **Artifact routing** — internal
|
||||
- **Clean review passes** — if Guardian says APPROVED with 0 findings, skip it
|
||||
- **Phase transitions** — only show if the phase produced visible output
|
||||
- Agent spawning/completion lifecycle
|
||||
- Event emission
|
||||
- Artifact routing
|
||||
- Clean review passes (0 findings)
|
||||
- Phase transitions with no visible output
|
||||
|
||||
## When to Speak
|
||||
|
||||
- **Run start** — always (user should know ArcheFlow activated)
|
||||
- **Findings found** — always (this is the value)
|
||||
- **Fixes applied** — always (this is the outcome)
|
||||
- **Run complete** — always (closure)
|
||||
- **Budget warnings** — always (user needs to know)
|
||||
- **Shadow detected** — always (something went wrong)
|
||||
- **User decision needed** — always (blocking)
|
||||
|
||||
## Activation Indicator
|
||||
|
||||
When ArcheFlow activates at session start (via the `using-archeflow` skill), show ONE line:
|
||||
|
||||
```
|
||||
archeflow v0.9.0 · 24 skills · writing domain detected
|
||||
```
|
||||
|
||||
Or for code projects:
|
||||
```
|
||||
archeflow v0.9.0 · 24 skills · code domain
|
||||
```
|
||||
|
||||
If ArcheFlow decides NOT to activate (simple task, single file):
|
||||
```
|
||||
(nothing — silence is correct for simple tasks)
|
||||
```
|
||||
|
||||
## Integration with Progress File
|
||||
|
||||
The `.archeflow/progress.md` file is the detailed view for users who want more. The status lines above are the default — brief, inline, part of the conversation flow.
|
||||
|
||||
Users who want the full picture: `archeflow-progress.sh <run_id> --watch` in a second terminal.
|
||||
|
||||
## Anti-Patterns (Don't Do This)
|
||||
|
||||
```
|
||||
❌ "I'm now activating the ArcheFlow orchestration framework..."
|
||||
❌ "Spawning Explorer agent with model haiku and attention filter..."
|
||||
❌ "The Guardian archetype has completed its security review and found..."
|
||||
❌ "Let me run the convergence detection algorithm to check..."
|
||||
❌ "According to the ArcheFlow process-log event schema..."
|
||||
```
|
||||
|
||||
These expose internal mechanics. The user doesn't care about archetypes, attention filters, or event schemas. They care about: what was done, what was found, what was fixed.
|
||||
|
||||
## Examples: Good Presence
|
||||
|
||||
### Example 1: Feature Implementation
|
||||
```
|
||||
── archeflow ── Add JWT auth ── standard (2 cycles) ──
|
||||
✓ plan 3 files affected, JWT + middleware approach
|
||||
✓ do implemented (auth.ts, middleware.ts, tests)
|
||||
△ check guardian: missing token expiry check
|
||||
✓ act 1 fix applied
|
||||
── done ── 1 cycle · 4 agents · 1 fix · ~8 min ──
|
||||
```
|
||||
|
||||
### Example 2: Story Writing
|
||||
```
|
||||
── archeflow ── Write "Der Huster" ── kurzgeschichte (2 cycles) ──
|
||||
✓ plan 3 plot directions → chose C (Mo krank + Koffer)
|
||||
✓ do 6004 words, 7 scenes
|
||||
△ check 1 timeline bug, 5 voice adjustments
|
||||
✓ act 6 fixes applied
|
||||
── done ── 1 cycle · 5 agents · 6 fixes · ~22 min ──
|
||||
stories/01-der-huster.md ready
|
||||
```
|
||||
|
||||
### Example 3: Quick Fix (minimal output)
|
||||
```
|
||||
── archeflow ── Fix pagination bug ── fast ──
|
||||
✓ fix applied, tests pass
|
||||
── done ── 1 cycle · 3 agents · ~4 min ──
|
||||
```
|
||||
|
||||
### Example 4: Multi-Project
|
||||
```
|
||||
── archeflow ── giesing-story-v2 ── 3 projects ──
|
||||
✓ archeflow artifact routing improved
|
||||
✓ colette voice validation added
|
||||
✓ giesing story #2 drafted (5800 words)
|
||||
── done ── 3 projects · 12 agents · ~35 min ──
|
||||
```
|
||||
- Run start and complete (always)
|
||||
- Findings found and fixes applied
|
||||
- Budget warnings
|
||||
- Shadow detected
|
||||
- User decision needed
|
||||
|
||||
@@ -3,37 +3,20 @@ name: progress
|
||||
description: |
|
||||
Live progress file for ArcheFlow orchestrations. Regenerates `.archeflow/progress.md`
|
||||
after every event emission, giving users real-time visibility into run status, budget
|
||||
usage, and DAG shape — watchable from a second terminal.
|
||||
usage, and DAG shape -- watchable from a second terminal.
|
||||
<example>User: "What's happening with my run?"</example>
|
||||
<example>watch -n 2 cat .archeflow/progress.md</example>
|
||||
---
|
||||
|
||||
# Live Progress — Real-Time Run Visibility
|
||||
# Live Progress -- Real-Time Run Visibility
|
||||
|
||||
During long-running orchestrations (Maker drafting, parallel reviews), users have no visibility into what is happening. This skill solves that by maintaining a live progress file that is regenerated after every event.
|
||||
|
||||
## Progress File
|
||||
|
||||
**Location:** `.archeflow/progress.md`
|
||||
|
||||
Updated after every event emission during a run. Users can watch it from a second terminal:
|
||||
|
||||
```bash
|
||||
# Simple polling
|
||||
watch -n 2 cat .archeflow/progress.md
|
||||
|
||||
# Continuous mode (built-in)
|
||||
./lib/archeflow-progress.sh <run_id> --watch
|
||||
|
||||
# Programmatic consumption
|
||||
./lib/archeflow-progress.sh <run_id> --json
|
||||
```
|
||||
Maintains `.archeflow/progress.md`, updated after every event during a run.
|
||||
|
||||
## Progress File Format
|
||||
|
||||
```markdown
|
||||
# ArcheFlow Run: 2026-04-03-der-huster
|
||||
**Status:** DO phase — maker running (3/6 scenes drafted)
|
||||
**Status:** DO phase -- maker running (3/6 scenes drafted)
|
||||
**Started:** 14:32 | **Elapsed:** 8 min
|
||||
**Budget:** $1.45 / $10.00 (14%)
|
||||
|
||||
@@ -47,145 +30,40 @@ watch -n 2 cat .archeflow/progress.md
|
||||
- [ ] ACT: Apply fixes
|
||||
|
||||
## Latest Event
|
||||
#6 agent.start — maker (do) — 14:40
|
||||
|
||||
## DAG (so far)
|
||||
#1 run.start
|
||||
├── #2 story-explorer ✓
|
||||
│ ├── #3 decision ✓
|
||||
│ └── #4 creator ✓
|
||||
├── #5 plan→do ✓
|
||||
└── #6 maker ← running
|
||||
#6 agent.start -- maker (do) -- 14:40
|
||||
```
|
||||
|
||||
## How to Use
|
||||
## Usage
|
||||
|
||||
### During Orchestration (run skill integration)
|
||||
|
||||
The `run` skill should call `archeflow-progress.sh` after each event emission. This keeps progress decoupled from the event emitter itself — no modification to `archeflow-event.sh` is needed.
|
||||
|
||||
Add this call after every `archeflow-event.sh` invocation in the run loop:
|
||||
|
||||
```bash
|
||||
# After emitting an event:
|
||||
./lib/archeflow-event.sh "$RUN_ID" agent.complete plan explorer '{"archetype":"explorer",...}'
|
||||
|
||||
# Update progress:
|
||||
./lib/archeflow-progress.sh "$RUN_ID"
|
||||
The `run` skill calls `archeflow-progress.sh` after each event emission:
|
||||
```
|
||||
|
||||
This is a fast operation (reads JSONL, writes one markdown file) and adds negligible overhead.
|
||||
|
||||
### From a Second Terminal
|
||||
|
||||
```bash
|
||||
# One-shot: see current state
|
||||
./lib/archeflow-progress.sh <run_id>
|
||||
cat .archeflow/progress.md
|
||||
|
||||
# Continuous: auto-refresh every 2 seconds
|
||||
./lib/archeflow-progress.sh <run_id> --watch
|
||||
|
||||
# JSON output for dashboards or scripts
|
||||
./lib/archeflow-progress.sh <run_id> --json
|
||||
```
|
||||
|
||||
### Reactive Mode (via JSONL tail)
|
||||
**From a second terminal:**
|
||||
- One-shot: `cat .archeflow/progress.md`
|
||||
- Continuous: `./lib/archeflow-progress.sh <run_id> --watch`
|
||||
- JSON output: `./lib/archeflow-progress.sh <run_id> --json`
|
||||
|
||||
```bash
|
||||
tail -f .archeflow/events/<run_id>.jsonl | while read line; do
|
||||
./lib/archeflow-progress.sh <run_id>
|
||||
done
|
||||
```
|
||||
## How the Script Works
|
||||
|
||||
## Progress Script
|
||||
|
||||
**Location:** `lib/archeflow-progress.sh`
|
||||
|
||||
```
|
||||
Usage:
|
||||
archeflow-progress.sh <run_id> # Generate/update progress.md
|
||||
archeflow-progress.sh <run_id> --watch # Continuous update mode (2s interval)
|
||||
archeflow-progress.sh <run_id> --json # Output as JSON (for dashboards)
|
||||
```
|
||||
|
||||
### What the Script Does
|
||||
|
||||
1. **Read** `.archeflow/events/<run_id>.jsonl` — the event stream for this run
|
||||
2. **Determine** current phase and active agent from the latest events
|
||||
3. **Build checklist** — mark completed agents with timing/cost data, show pending agents as unchecked
|
||||
4. **Show partial DAG** — completed nodes with checkmarks, running node with arrow indicator
|
||||
5. **Calculate budget** — sum `estimated_cost_usd` from `agent.complete` events, compare to budget from `run.start` config or `.archeflow/config.yaml`
|
||||
6. **Compute elapsed time** — difference between `run.start` timestamp and now
|
||||
7. **Write** to `.archeflow/progress.md`
|
||||
|
||||
### Output Modes
|
||||
|
||||
**Default (markdown):** Writes `.archeflow/progress.md` and prints the same content to stdout.
|
||||
|
||||
**`--watch`:** Clears the terminal every 2 seconds, re-reads the JSONL, and regenerates the display. Exits when a `run.complete` event is found.
|
||||
|
||||
**`--json`:** Outputs a structured JSON object to stdout (does not write progress.md):
|
||||
|
||||
```json
|
||||
{
|
||||
"run_id": "2026-04-03-der-huster",
|
||||
"status": "running",
|
||||
"phase": "do",
|
||||
"active_agent": "maker",
|
||||
"elapsed_seconds": 480,
|
||||
"budget_used_usd": 1.45,
|
||||
"budget_total_usd": 10.00,
|
||||
"budget_percent": 14,
|
||||
"completed": [
|
||||
{"agent": "explorer", "phase": "plan", "duration_s": 87, "tokens": 21000, "cost_usd": 0.02},
|
||||
{"agent": "creator", "phase": "plan", "duration_s": 167, "tokens": 26000, "cost_usd": 0.08}
|
||||
],
|
||||
"pending": ["guardian", "sage"],
|
||||
"latest_event": {"seq": 6, "type": "agent.start", "agent": "maker", "phase": "do"},
|
||||
"total_events": 6
|
||||
}
|
||||
```
|
||||
1. Read `.archeflow/events/<run_id>.jsonl`
|
||||
2. Determine current phase and active agent
|
||||
3. Build checklist from events (only started/completed agents shown)
|
||||
4. Calculate budget from `agent.complete` cost data
|
||||
5. Write `.archeflow/progress.md`
|
||||
|
||||
## Checklist Construction
|
||||
|
||||
The progress checklist is built from events, not from a predefined workflow definition. Each event type maps to a checklist entry:
|
||||
|
||||
| Event Type | Checklist Entry |
|
||||
|-----------|----------------|
|
||||
| Event Type | Entry |
|
||||
|-----------|-------|
|
||||
| `agent.complete` | `- [x] PHASE: archetype (duration, tokens, cost)` |
|
||||
| `agent.start` (no matching complete) | `- [ ] **PHASE: archetype** <- running (elapsed)` |
|
||||
| `agent.start` (no complete) | `- [ ] **PHASE: archetype** <- running` |
|
||||
| `phase.transition` | `- [x] PHASE -> PHASE transition` |
|
||||
| `review.verdict` | `- [x] CHECK: archetype -> VERDICT` |
|
||||
| `fix.applied` | `- [x] ACT: Fix (source)` |
|
||||
| `cycle.boundary` | `- [x] Cycle N complete` |
|
||||
|
||||
Pending agents (not yet started) are NOT shown in the checklist — only started or completed agents appear. This avoids guessing which agents will be spawned.
|
||||
Pending (not-yet-started) agents are NOT shown to avoid guessing.
|
||||
|
||||
## Budget Display
|
||||
|
||||
Budget information comes from two sources:
|
||||
|
||||
1. **`run.start` event** — may contain `config.budget_usd`
|
||||
2. **`.archeflow/config.yaml`** — global `budget.per_run_usd`
|
||||
|
||||
If no budget is configured, the budget line shows cost only (no percentage):
|
||||
|
||||
```
|
||||
**Cost:** $1.45 (no budget set)
|
||||
```
|
||||
|
||||
## Integration with Other Skills
|
||||
|
||||
- **`run`**: Should call `archeflow-progress.sh` after each event emission
|
||||
- **`process-log`**: Progress reads the same JSONL that process-log defines
|
||||
- **`cost-tracking`**: Budget data and cost calculations follow cost-tracking conventions
|
||||
- **`autonomous-mode`**: Progress file is useful for monitoring autonomous overnight runs
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Read-only on events.** Progress never modifies the JSONL. It is a derived view.
|
||||
2. **Fast.** One JSONL read + one markdown write. No jq streaming, no databases.
|
||||
3. **Decoupled.** No hooks in `archeflow-event.sh`. The `run` skill calls progress explicitly.
|
||||
4. **Optional.** If progress is never called, orchestration works fine. No side effects.
|
||||
5. **Terminal-friendly.** Output is plain markdown — renders well in `cat`, `bat`, `glow`, or any terminal.
|
||||
Source: `run.start` event or `.archeflow/config.yaml`. If no budget configured: show cost only.
|
||||
|
||||
@@ -352,10 +352,12 @@ Emit events via `./lib/archeflow-event.sh <run_id> <type> <phase> <agent> '<json
|
||||
| After agent returns | `agent.complete` | archetype, duration_ms, artifacts, summary |
|
||||
| Phase boundary | `phase.transition` | from, to, artifacts_so_far |
|
||||
| Alternative chosen | `decision` | what, chosen, alternatives, rationale |
|
||||
| Orchestrator decision (replay) | `decision.point` | archetype, input, decision, confidence — use `./lib/archeflow-decision.sh` |
|
||||
| Reviewer verdict | `review.verdict` | archetype, verdict, findings[] |
|
||||
| Fix addressing review | `fix.applied` | source, finding, file, line |
|
||||
| End of PDCA cycle | `cycle.boundary` | cycle, max_cycles, exit_condition, convergence |
|
||||
| Shadow triggered | `shadow.detected` | archetype, shadow, trigger, action |
|
||||
| Policy halt | `wiggum.break` | trigger, run_state, unresolved_findings, hard/soft |
|
||||
| Run ends | `run.complete` | status, cycles, agents_total, fixes_total |
|
||||
|
||||
Parent rules: `run.start` has `parent: []`. Agents parent to the event that triggered them. Phase transitions fan-in from all completing events. Parallel agents share the same parent.
|
||||
@@ -403,6 +405,12 @@ Scores stored in `.archeflow/memory/effectiveness.jsonl`. After 10+ runs, recomm
|
||||
|
||||
---
|
||||
|
||||
## Run replay (decision log + what-if)
|
||||
|
||||
After key choices (routing, fast-path skip, escalation), emit `decision.point` via `./lib/archeflow-decision.sh` so runs can be inspected with `./lib/archeflow-replay.sh timeline|whatif|compare <run_id>`. Weighted what-if helps estimate how much each review archetype swayed the effective ship/block outcome. See skill `af-replay`.
|
||||
|
||||
---
|
||||
|
||||
## Dry-Run Mode
|
||||
|
||||
When `--dry-run`: Run Plan phase only. Display workflow, agent counts, confidence scores, cost estimate. Ask user to proceed. If yes, continue with `--start-from do`.
|
||||
|
||||
@@ -1,66 +1,139 @@
|
||||
---
|
||||
name: shadow-detection
|
||||
description: Use when monitoring agent behavior for dysfunction, when an agent seems stuck, or when orchestration quality is degrading. Detects and corrects Jungian shadow activation in archetypes.
|
||||
description: |
|
||||
Corrective action framework for agent dysfunction, system health, and operational policy.
|
||||
Three layers — archetype shadows, system shadows, policy boundaries — one escalation protocol.
|
||||
---
|
||||
|
||||
# Shadow Detection
|
||||
# Corrective Action Framework
|
||||
|
||||
Every archetype has a virtue and a shadow (its destructive inversion). Shadow activates when the virtue is pushed too far.
|
||||
Detect dysfunction. Apply corrective action. Escalate if repeated.
|
||||
|
||||
| Archetype | Virtue | Shadow |
|
||||
|-----------|--------|--------|
|
||||
| Explorer | Contextual Clarity | Rabbit Hole |
|
||||
| Creator | Decisive Framing | Over-Architect |
|
||||
| Maker | Execution Discipline | Rogue |
|
||||
| Guardian | Threat Intuition | Paranoid |
|
||||
| Skeptic | Assumption Surfacing | Paralytic |
|
||||
| Trickster | Adversarial Creativity | False Alarm |
|
||||
| Sage | Maintainability Judgment | Bureaucrat |
|
||||
Three layers, one protocol:
|
||||
- **Archetype Shadows** — individual agent dysfunction (virtue pushed too far)
|
||||
- **System Shadows** — orchestration-level dysfunction (process going wrong)
|
||||
- **Policy Boundaries** — operational limits (time, cost, quality thresholds)
|
||||
|
||||
---
|
||||
|
||||
### Explorer -> Rabbit Hole
|
||||
**Detect** (any): output >2000w without Recommendation | >3 tangents | >15 files no patterns | no synthesis in final 25%
|
||||
**Correct**: "Summarize top 3 findings and one recommendation in under 300 words."
|
||||
## Archetype Shadows
|
||||
|
||||
### Creator -> Over-Architect
|
||||
**Detect** (any): >2 new abstractions for a single feature | "future-proof" in rationale | scope exceeds task by >50% | >1 new package for one feature
|
||||
**Correct**: "Design for the current order of magnitude. Remove abstractions that serve hypothetical requirements."
|
||||
| Archetype | Shadow | Detect (any) | Corrective Action |
|
||||
|-----------|--------|-------------|-------------------|
|
||||
| Explorer | Rabbit Hole | Output >2000w without Recommendation; >3 tangents; >15 files no patterns; no synthesis in final 25% | "Summarize top 3 findings and one recommendation in 300 words." |
|
||||
| Creator | Over-Architect | >2 new abstractions for one feature; "future-proof" in rationale; scope exceeds task >50%; >1 new package | "Design for the current order of magnitude. Remove abstractions for hypothetical requirements." |
|
||||
| Maker | Rogue | Zero test files with >=3 files changed; single monolithic commit; files outside proposal; no test run evidence | "Read the proposal. Write a test. Commit. Revert out-of-scope files." |
|
||||
| Guardian | Paranoid | CRITICAL:WARNING ratio >2:1 (min 3); zero APPROVED in 3+ reviews; <50% findings include fix; findings require compromised systems | "For each CRITICAL: would a senior engineer block a PR? If not, downgrade. Every rejection needs a specific fix." |
|
||||
| Skeptic | Paralytic | >7 challenges; <50% include alternatives; same concern 2+ times reworded; >3 findings outside scope | "Rank by impact. Keep top 3 with alternatives. Delete the rest." |
|
||||
| Trickster | False Alarm | Findings in untouched code; >10 findings for <5 files; impossible scenarios; >3 without repro steps | "Delete findings outside the diff. Rank by likelihood x impact. Keep top 3-5." |
|
||||
| Sage | Bureaucrat | Review words >2x diff lines; findings outside changeset; >2 "consider" without action; suggesting docs for trivial functions | "Limit to issues affecting maintainability in 6 months. Every finding needs a specific action." |
|
||||
|
||||
### Maker -> Rogue
|
||||
**Detect** (any): zero test files with >=3 files changed | single monolithic commit | diff contains files not in proposal | no evidence of running tests
|
||||
**Correct**: "Read the proposal. Write a test. Commit what you have. Revert changes to files not in the proposal."
|
||||
### Shadow Immunity
|
||||
|
||||
### Guardian -> Paranoid
|
||||
**Detect** (any): CRITICAL:WARNING ratio >2:1 (min 3 findings) | zero APPROVED in 3+ reviews | <50% findings include a fix | findings require already-compromised systems
|
||||
**Correct**: "For each CRITICAL: would a senior engineer block a PR for this? If not, downgrade. Every rejection must include a specific fix."
|
||||
Intensity alone is not a shadow. **Shadow = behavior disconnected from the goal.**
|
||||
|
||||
### Skeptic -> Paralytic
|
||||
**Detect** (any): >7 challenges in a single review | <50% include alternatives | same concern appears 2+ times reworded | >3 findings outside task scope
|
||||
**Correct**: "Rank challenges by impact. Keep top 3. Each must include a specific alternative. Delete the rest."
|
||||
|
||||
### Trickster -> False Alarm
|
||||
**Detect** (any): findings reference code untouched by diff | >10 findings for <5 files | impossible deployment scenarios | >3 findings without repro steps
|
||||
**Correct**: "Delete findings outside the diff. Rank remaining by likelihood x impact. Keep top 3-5."
|
||||
|
||||
### Sage -> Bureaucrat
|
||||
**Detect** (any): review words >2x diff lines | findings reference files not in changeset | >2 "consider" without concrete action | suggesting docs for <5-line functions
|
||||
**Correct**: "Limit to issues affecting maintainability in the next 6 months. Every finding must end with a specific action."
|
||||
|
||||
---
|
||||
|
||||
## Escalation Protocol
|
||||
|
||||
1. **1st detection:** Log the shadow, apply the correction prompt, let the agent continue
|
||||
2. **2nd detection (same agent, same shadow):** Replace the agent -- the shadow is entrenched
|
||||
3. **3+ agents shadowed in same cycle:** Escalate to user -- the task may need to be broken down
|
||||
|
||||
## Shadow Immunity
|
||||
|
||||
Some behaviors look like shadows but are not. **Rule of thumb:** shadow = behavior disconnected from the goal. Intensity alone is not a shadow.
|
||||
|
||||
- Explorer reading 20 files in a monorepo with scattered dependencies -- not a rabbit hole if each file is genuinely relevant
|
||||
- Creator adding an abstraction -- not over-architect if the current task genuinely needs it
|
||||
- Guardian blocking with 2 CRITICALs -- not paranoid if both are genuine security vulnerabilities
|
||||
- Explorer reading 20 files in a monorepo with scattered deps -- not rabbit hole if each is relevant
|
||||
- Guardian blocking with 2 CRITICALs -- not paranoid if both are genuine vulnerabilities
|
||||
- Trickster finding 5 edge cases -- not false alarm if all are in changed code with repro steps
|
||||
- Sage writing a long review -- not bureaucrat if the change is large and every finding is actionable
|
||||
|
||||
---
|
||||
|
||||
## System Shadows
|
||||
|
||||
Orchestration-level dysfunction that isn't tied to one archetype.
|
||||
|
||||
| Shadow | Detect | Corrective Action |
|
||||
|--------|--------|-------------------|
|
||||
| **Tunnel Vision** | All reviewers flag same category (e.g., 4 security findings, 0 quality/testing) | "Redistribute attention. Are we missing quality, testing, or design concerns?" |
|
||||
| **Echo Chamber** | Unanimous approval in <30s on standard/thorough workflow | "Suspicious fast consensus. Re-run Guardian with adversarial prompt." |
|
||||
| **Gold Plating** | Maker working on INFO fixes while CRITICALs remain open | "Fix CRITICALs first. Park INFO items." |
|
||||
| **Analysis Paralysis** | Plan phase >2x longer than Do phase; Explorer spawned 3+ times | "Stop researching. Ship a proposal with known gaps." |
|
||||
| **Cargo Cult** | Memory lesson injected but the same finding repeats anyway | "Lesson ineffective. Reword, strengthen, or remove it." |
|
||||
| **Broken Window** | 3+ WARNINGs deferred across consecutive runs in the same project | "Accumulated tech debt. Schedule a cleanup sprint." |
|
||||
| **Scope Creep** | Maker changes >2x files listed in proposal | "Revert to proposal scope. If more files needed, update the proposal first." |
|
||||
|
||||
---
|
||||
|
||||
## Policy Boundaries
|
||||
|
||||
Operational limits that protect session quality, cost, and resumability.
|
||||
|
||||
### Checkpoint Policy
|
||||
|
||||
Every **45 minutes** or **3 completed tasks** (whichever first):
|
||||
|
||||
1. Commit + push all work in progress
|
||||
2. Write handoff summary to `control-center.md`
|
||||
3. Log token spend so far
|
||||
4. Compare output quality: last task vs first task
|
||||
5. If quality degrading -> STOP with clean state
|
||||
6. If budget >80% spent -> STOP with clean state
|
||||
7. Otherwise -> continue
|
||||
|
||||
### Budget Gate
|
||||
|
||||
| Threshold | Action |
|
||||
|-----------|--------|
|
||||
| 50% budget spent | Log warning, continue |
|
||||
| 80% budget spent | Downgrade models (sonnet->haiku for reviewers) |
|
||||
| 95% budget spent | Complete current task, then STOP |
|
||||
| 100% budget | STOP immediately, commit WIP |
|
||||
|
||||
### Wiggum Break (Circuit Breaker)
|
||||
|
||||
Named after Chief Wiggum — policy enforcement AND the Ralph Loop's dad.
|
||||
When a Wiggum Break triggers, the system halts execution, saves state, and
|
||||
reports to the user. "Bake 'em away, toys."
|
||||
|
||||
**Hard breaks** (halt immediately, commit WIP):
|
||||
|
||||
| Trigger | Reason |
|
||||
|---------|--------|
|
||||
| 3 consecutive agent failures/timeouts | Infrastructure issue, not a code problem |
|
||||
| 3 consecutive task failures in sprint | Something systemic is wrong |
|
||||
| Same shadow detected 3+ times in one cycle | Task needs to be broken down or re-scoped |
|
||||
| Test suite broken after merge | Auto-revert, then halt |
|
||||
| 2+ oscillating findings (present→absent→present) | Fundamental tension in review criteria |
|
||||
|
||||
**Soft breaks** (finish current task, then halt):
|
||||
|
||||
| Signal | Reason |
|
||||
|--------|--------|
|
||||
| Cycle N findings identical to cycle N-1 | No progress — present best result |
|
||||
| Convergence score <0.5 for 2 consecutive cycles | "This needs a different approach" |
|
||||
| Reviewer finding count increases cycle over cycle | Implementation is diverging, not converging |
|
||||
|
||||
When a Wiggum Break fires, emit a `wiggum.break` event with trigger, run state, and unresolved findings.
|
||||
The event log makes it easy to audit why a run was halted and whether the break was warranted.
|
||||
|
||||
### Context Pollution
|
||||
|
||||
| Signal | Action |
|
||||
|--------|--------|
|
||||
| >15 memory lessons injected into one prompt | Prune to top 5 by frequency |
|
||||
| >20 findings tracked across cycles | Summarize into top 5 themes |
|
||||
| Agent prompt exceeds estimated 50% of context window | Strip examples, keep rules only |
|
||||
|
||||
---
|
||||
|
||||
## Unified Escalation Protocol
|
||||
|
||||
All three layers use the same escalation:
|
||||
|
||||
| Step | Archetype Shadows | System Shadows | Policy Boundaries |
|
||||
|------|-------------------|----------------|-------------------|
|
||||
| **1st** | Apply corrective action, let agent continue | Apply corrective action, continue run | Apply boundary action (downgrade, checkpoint) |
|
||||
| **2nd** (same issue) | Replace the agent -- shadow is entrenched | Pause run, report to user | Force stop with clean state |
|
||||
| **3rd** (pattern) | Escalate to user: "task needs re-scoping" | Escalate to user: "systemic issue" | Escalate to user: "resource limits reached" |
|
||||
|
||||
---
|
||||
|
||||
## Integration
|
||||
|
||||
Shadow checks run **after each agent completes** during orchestration. System shadow checks run **at phase boundaries**. Policy checks run **on a timer and at task boundaries**.
|
||||
|
||||
The `run` skill references this framework at:
|
||||
- Step 3 (Check phase): archetype shadow monitoring
|
||||
- Step 4 (Act phase): convergence/diminishing returns
|
||||
- Step 5 (Completion): effectiveness scoring
|
||||
- Sprint skill: checkpoint policy between batches
|
||||
|
||||
@@ -7,316 +7,79 @@ description: |
|
||||
<example>User: "archeflow init writing-short-story"</example>
|
||||
<example>User: "archeflow template save my-backend-setup"</example>
|
||||
<example>User: "archeflow template list"</example>
|
||||
<example>User: "archeflow init --from ../book.giesing-gschichten"</example>
|
||||
---
|
||||
|
||||
# Template Gallery — Shareable ArcheFlow Configurations
|
||||
# Template Gallery -- Shareable ArcheFlow Configurations
|
||||
|
||||
Workflows, team presets, custom archetypes, and domain configs should be reusable across projects. This skill defines the template system that makes ArcheFlow setups portable and shareable.
|
||||
Makes ArcheFlow setups portable and reusable across projects.
|
||||
|
||||
## Template Storage
|
||||
|
||||
Templates live in two locations, with project-local overriding global:
|
||||
|
||||
| Location | Scope | Precedence |
|
||||
|----------|-------|------------|
|
||||
| `.archeflow/templates/` | Project-local | Higher (checked first) |
|
||||
| `~/.archeflow/templates/` | Global (user-wide) | Lower (fallback) |
|
||||
|
||||
### Directory Structure
|
||||
Subdirectories: `workflows/`, `teams/`, `archetypes/`, `domains/`, `bundles/`.
|
||||
|
||||
```
|
||||
~/.archeflow/templates/
|
||||
├── workflows/
|
||||
│ ├── kurzgeschichte.yaml
|
||||
│ ├── feature-implementation.yaml
|
||||
│ └── security-review.yaml
|
||||
├── teams/
|
||||
│ ├── story-development.yaml
|
||||
│ ├── backend.yaml
|
||||
│ └── fullstack.yaml
|
||||
├── archetypes/
|
||||
│ ├── story-explorer.md
|
||||
│ ├── story-sage.md
|
||||
│ └── db-specialist.md
|
||||
├── domains/
|
||||
│ ├── writing.yaml
|
||||
│ ├── code.yaml
|
||||
│ └── research.yaml
|
||||
└── bundles/
|
||||
├── writing-short-story/
|
||||
│ ├── manifest.yaml
|
||||
│ ├── team.yaml
|
||||
│ ├── workflow.yaml
|
||||
│ ├── archetypes/
|
||||
│ │ ├── story-explorer.md
|
||||
│ │ └── story-sage.md
|
||||
│ └── domain.yaml
|
||||
└── backend-feature/
|
||||
├── manifest.yaml
|
||||
├── team.yaml
|
||||
├── workflow.yaml
|
||||
└── domain.yaml
|
||||
```
|
||||
## Bundles
|
||||
|
||||
Individual templates (workflows/, teams/, archetypes/, domains/) are single files that can be used standalone. Bundles are complete setups that include everything a project needs.
|
||||
A bundle is a complete setup (team + workflow + archetypes + domain) in one directory.
|
||||
|
||||
---
|
||||
|
||||
## Bundle Manifest
|
||||
|
||||
Every bundle has a `manifest.yaml` that declares what it contains, what it requires, and what variables it exposes.
|
||||
**Manifest (`manifest.yaml`):**
|
||||
|
||||
```yaml
|
||||
name: writing-short-story
|
||||
description: "Complete setup for short fiction writing with ArcheFlow"
|
||||
version: 1
|
||||
description: "Complete setup for short fiction writing"
|
||||
domain: writing
|
||||
includes:
|
||||
team: story-development.yaml
|
||||
workflow: kurzgeschichte.yaml
|
||||
archetypes:
|
||||
- story-explorer.md
|
||||
- story-sage.md
|
||||
archetypes: [story-explorer.md, story-sage.md]
|
||||
domain: writing.yaml
|
||||
requires:
|
||||
- colette.yaml # Project must have this file
|
||||
requires: [colette.yaml]
|
||||
variables:
|
||||
target_words: 6000 # Default, can be overridden at init time
|
||||
max_cycles: 2 # Default, can be overridden at init time
|
||||
target_words: 6000
|
||||
max_cycles: 2
|
||||
```
|
||||
|
||||
### Manifest Fields
|
||||
|
||||
| Field | Required | Description |
|
||||
|-------|----------|-------------|
|
||||
| `name` | Yes | Bundle identifier (used in `archeflow init <name>`) |
|
||||
| `name` | Yes | Bundle identifier for `archeflow init <name>` |
|
||||
| `description` | Yes | Human-readable description |
|
||||
| `version` | No | Bundle version (integer, default 1) |
|
||||
| `domain` | No | Domain this bundle is designed for |
|
||||
| `includes` | Yes | Map of file types to filenames within the bundle |
|
||||
| `requires` | No | List of files that must exist in the target project |
|
||||
| `variables` | No | Key-value pairs with defaults, overridable at init |
|
||||
|
||||
### Includes Types
|
||||
|
||||
| Key | Target location in `.archeflow/` | Accepts |
|
||||
|-----|----------------------------------|---------|
|
||||
| `team` | `teams/<filename>` | Single YAML file |
|
||||
| `workflow` | `workflows/<filename>` | Single YAML file |
|
||||
| `archetypes` | `archetypes/<filename>` | List of Markdown files |
|
||||
| `domain` | `domains/<filename>` | Single YAML file |
|
||||
| `hooks` | `hooks.yaml` | Single YAML file |
|
||||
|
||||
---
|
||||
| `includes` | Yes | File types to filenames within bundle |
|
||||
| `requires` | No | Files that must exist in target project |
|
||||
| `variables` | No | Key-value defaults, overridable at init |
|
||||
|
||||
## Operations
|
||||
|
||||
### `archeflow init <bundle-name>`
|
||||
**`archeflow init <bundle-name>`**
|
||||
1. Find bundle (project-local, then global)
|
||||
2. Check `requires` files exist
|
||||
3. Warn before overwriting existing `.archeflow/` config
|
||||
4. Copy files to `.archeflow/` (teams/, workflows/, archetypes/, domains/)
|
||||
5. Generate `.archeflow/config.yaml` with variables
|
||||
|
||||
Initialize a project's `.archeflow/` directory from a named bundle.
|
||||
**`archeflow init --from <project-path>`**
|
||||
- Copy teams/, workflows/, archetypes/, domains/, config.yaml, hooks.yaml
|
||||
- Skip run-specific data: events/, artifacts/, context/, templates/
|
||||
|
||||
**Procedure:**
|
||||
**`archeflow template save <name>`**
|
||||
- Package current `.archeflow/` into `~/.archeflow/templates/bundles/<name>/`
|
||||
- Auto-generate manifest.yaml
|
||||
|
||||
1. Search for the bundle:
|
||||
- `.archeflow/templates/bundles/<name>/manifest.yaml` (project-local)
|
||||
- `~/.archeflow/templates/bundles/<name>/manifest.yaml` (global)
|
||||
- If not found: error with list of available bundles
|
||||
2. Read `manifest.yaml`
|
||||
3. Check `requires`:
|
||||
- For each required file, verify it exists in the project root
|
||||
- If missing: error with `"Required file not found: <file>. This bundle requires it."`
|
||||
4. Check for existing `.archeflow/` setup:
|
||||
- If `.archeflow/teams/`, `.archeflow/workflows/`, etc. already contain files: warn and ask before overwriting
|
||||
- Never silently overwrite existing configuration
|
||||
5. Copy files from bundle to `.archeflow/`:
|
||||
- `team` → `.archeflow/teams/<filename>`
|
||||
- `workflow` → `.archeflow/workflows/<filename>`
|
||||
- `archetypes` → `.archeflow/archetypes/<filename>` (each file)
|
||||
- `domain` → `.archeflow/domains/<filename>`
|
||||
- `hooks` → `.archeflow/hooks.yaml`
|
||||
6. Create `.archeflow/config.yaml` with variables from manifest:
|
||||
```yaml
|
||||
# Generated by archeflow init from bundle: <name>
|
||||
bundle: <name>
|
||||
bundle_version: <version>
|
||||
initialized: <timestamp>
|
||||
variables:
|
||||
target_words: 6000
|
||||
max_cycles: 2
|
||||
```
|
||||
7. Print setup summary:
|
||||
```
|
||||
ArcheFlow initialized from bundle: <name>
|
||||
Team: <team filename> → .archeflow/teams/
|
||||
Workflow: <workflow filename> → .archeflow/workflows/
|
||||
Archetypes: <count> files → .archeflow/archetypes/
|
||||
Domain: <domain filename> → .archeflow/domains/
|
||||
Config: .archeflow/config.yaml (variables: target_words=6000, max_cycles=2)
|
||||
|
||||
Ready to run: archeflow:run
|
||||
```
|
||||
|
||||
### `archeflow init --from <project-path>`
|
||||
|
||||
Clone another project's ArcheFlow setup into the current project.
|
||||
|
||||
**Procedure:**
|
||||
|
||||
1. Verify `<project-path>/.archeflow/` exists
|
||||
2. Copy these subdirectories (if they exist):
|
||||
- `teams/`
|
||||
- `workflows/`
|
||||
- `archetypes/`
|
||||
- `domains/`
|
||||
- `config.yaml`
|
||||
- `hooks.yaml`
|
||||
3. Do NOT copy (run-specific data):
|
||||
- `events/`
|
||||
- `artifacts/`
|
||||
- `context/` (generated by colette-bridge, project-specific)
|
||||
- `templates/` (project-local templates stay local)
|
||||
4. Warn if target `.archeflow/` already has files
|
||||
5. Print summary of what was copied
|
||||
|
||||
### `archeflow template save <name>`
|
||||
|
||||
Save the current project's `.archeflow/` setup as a reusable template bundle.
|
||||
|
||||
**Procedure:**
|
||||
|
||||
1. Verify `.archeflow/` exists and has content
|
||||
2. Create bundle directory: `~/.archeflow/templates/bundles/<name>/`
|
||||
- If it already exists: warn and ask before overwriting
|
||||
3. Copy from `.archeflow/` to bundle:
|
||||
- `teams/*.yaml` → bundle `team` (first file, or prompt if multiple)
|
||||
- `workflows/*.yaml` → bundle `workflow` (first file, or prompt if multiple)
|
||||
- `archetypes/*.md` → bundle `archetypes/`
|
||||
- `domains/*.yaml` → bundle `domain` (first file, or prompt if multiple)
|
||||
- `hooks.yaml` → bundle (if exists)
|
||||
4. Generate `manifest.yaml`:
|
||||
```yaml
|
||||
name: <name>
|
||||
description: "Saved from <project directory name>"
|
||||
version: 1
|
||||
domain: <from domain yaml if present>
|
||||
includes:
|
||||
team: <filename>
|
||||
workflow: <filename>
|
||||
archetypes: [<filenames>]
|
||||
domain: <filename>
|
||||
requires: []
|
||||
variables: <from config.yaml variables section if present>
|
||||
```
|
||||
5. Print summary:
|
||||
```
|
||||
Template saved: <name>
|
||||
Location: ~/.archeflow/templates/bundles/<name>/
|
||||
Files: <count> files
|
||||
Use with: archeflow init <name>
|
||||
```
|
||||
|
||||
### `archeflow template list`
|
||||
|
||||
List all available templates — both individual files and bundles, from both global and project-local locations.
|
||||
|
||||
**Output format:**
|
||||
|
||||
```
|
||||
ArcheFlow Templates
|
||||
====================
|
||||
|
||||
Bundles:
|
||||
writing-short-story Complete setup for short fiction writing [global]
|
||||
backend-feature Backend feature implementation [global]
|
||||
my-project-setup Saved from book.giesing-gschichten [global]
|
||||
|
||||
Individual Templates:
|
||||
Workflows:
|
||||
kurzgeschichte.yaml [global]
|
||||
feature-implementation.yaml [global]
|
||||
Teams:
|
||||
story-development.yaml [global]
|
||||
backend.yaml [global]
|
||||
Archetypes:
|
||||
story-explorer.md [global]
|
||||
story-sage.md [global]
|
||||
Domains:
|
||||
writing.yaml [global]
|
||||
code.yaml [global]
|
||||
```
|
||||
|
||||
### `archeflow template share <name> <path>`
|
||||
|
||||
Export a template bundle to a directory for sharing (e.g., via git, email, file share).
|
||||
|
||||
**Procedure:**
|
||||
|
||||
1. Find the bundle (global or local)
|
||||
2. Copy the entire bundle directory to `<path>/<name>/`
|
||||
3. Print the path and a one-liner for importing:
|
||||
```
|
||||
Exported: <path>/<name>/
|
||||
To import: cp -r <path>/<name> ~/.archeflow/templates/bundles/
|
||||
```
|
||||
|
||||
---
|
||||
**`archeflow template list`**
|
||||
- Show all bundles and individual templates (global + project-local)
|
||||
|
||||
## Variable Substitution
|
||||
|
||||
Bundle manifests can define variables with defaults. These are stored in `.archeflow/config.yaml` after init and can be overridden:
|
||||
Variables in manifests are stored in `.archeflow/config.yaml` after init. Substitution happens at run time, not template time.
|
||||
|
||||
- At init time: `archeflow init writing-short-story --set target_words=8000`
|
||||
- After init: edit `.archeflow/config.yaml` directly
|
||||
Override at init: `archeflow init writing-short-story --set target_words=8000`
|
||||
|
||||
Variables are available to workflows and the run skill via config:
|
||||
## Individual Templates
|
||||
|
||||
```yaml
|
||||
# In a workflow, reference variables:
|
||||
phases:
|
||||
do:
|
||||
description: |
|
||||
Draft the story. Target: ${target_words} words.
|
||||
```
|
||||
|
||||
Variable substitution happens at run time, not at init time. The workflow file contains the `${variable}` placeholder; the run skill reads `.archeflow/config.yaml` and substitutes before passing to agents.
|
||||
|
||||
---
|
||||
|
||||
## Individual Template Usage
|
||||
|
||||
Not everything needs a bundle. Individual templates can be copied directly:
|
||||
|
||||
```bash
|
||||
# Copy a single workflow
|
||||
cp ~/.archeflow/templates/workflows/kurzgeschichte.yaml .archeflow/workflows/
|
||||
|
||||
# Copy a single archetype
|
||||
cp ~/.archeflow/templates/archetypes/story-explorer.md .archeflow/archetypes/
|
||||
|
||||
# Copy a team preset
|
||||
cp ~/.archeflow/templates/teams/story-development.yaml .archeflow/teams/
|
||||
```
|
||||
|
||||
The `archeflow init` command handles bundles. For individual files, manual copy or the helper script (`lib/archeflow-init.sh`) can be used.
|
||||
|
||||
---
|
||||
|
||||
## Integration with Other Skills
|
||||
|
||||
- **`archeflow:run`** — Reads `.archeflow/config.yaml` for variables, applies them during run initialization
|
||||
- **`archeflow:domains`** — Domain YAML from templates is loaded like any other domain config
|
||||
- **`archeflow:custom-archetypes`** — Archetype .md files from templates work identically to hand-written ones
|
||||
- **`archeflow:workflow-design`** — Workflow YAML from templates follows the same schema
|
||||
- **`archeflow:colette-bridge`** — Bundle `requires: [colette.yaml]` ensures the bridge has what it needs
|
||||
|
||||
---
|
||||
|
||||
## Design Principles
|
||||
|
||||
1. **Bundles are self-contained.** Everything needed to set up a project is in the bundle directory. No external dependencies beyond `requires`.
|
||||
2. **Never silently overwrite.** Init warns before replacing existing files. Templates are helpers, not bulldozers.
|
||||
3. **Global + local layering.** Project-local templates override global ones. This allows per-project customization without polluting the global registry.
|
||||
4. **Skip run data.** Events, artifacts, and context are run-specific. Templates carry only configuration.
|
||||
5. **Variables are late-bound.** Substitution happens at run time, not template time. This keeps templates generic.
|
||||
6. **Plain files, no magic.** Templates are just directories of YAML and Markdown files. No databases, no registries, no lock files.
|
||||
Single files can be copied directly without a bundle:
|
||||
- `~/.archeflow/templates/workflows/<name>.yaml`
|
||||
- `~/.archeflow/templates/archetypes/<name>.md`
|
||||
- `~/.archeflow/templates/teams/<name>.yaml`
|
||||
|
||||
22
skills/using-archeflow/ACTIVATION.md
Normal file
22
skills/using-archeflow/ACTIVATION.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# ArcheFlow -- Active
|
||||
|
||||
Multi-agent orchestration using archetypal roles and PDCA quality cycles.
|
||||
|
||||
## Session Start
|
||||
|
||||
On activation, print ONE line then proceed silently:
|
||||
```
|
||||
archeflow v0.9.0 · 24 skills · <domain> domain
|
||||
```
|
||||
Domain: `writing` if `colette.yaml` exists, `research` if paper/thesis files, `code` otherwise.
|
||||
|
||||
## When to Use
|
||||
|
||||
| Need | Command |
|
||||
|------|---------|
|
||||
| Work the queue | `/af-sprint` |
|
||||
| Deep orchestration | `/af-run <task>` |
|
||||
| Code review | `/af-review` |
|
||||
| Simple fix / question | Skip ArcheFlow — just do it directly |
|
||||
|
||||
Do NOT use ArcheFlow for: single-line fixes, questions, reading code, config tweaks, git ops.
|
||||
@@ -7,7 +7,7 @@ description: Use at session start when implementing features, reviewing code, de
|
||||
|
||||
On activation, print ONE line then proceed silently:
|
||||
```
|
||||
archeflow v0.7.0 · 25 skills · <domain> domain
|
||||
archeflow v0.9.0 · 24 skills · <domain> domain
|
||||
```
|
||||
Domain auto-detected: `writing` if `colette.yaml` exists, `research` if paper/thesis files, `code` otherwise.
|
||||
|
||||
@@ -46,6 +46,7 @@ Do NOT use for: single-line fixes, questions, reading/exploring, config tweaks,
|
||||
| `/af-memory` | Cross-run lesson memory |
|
||||
| `/af-fanout` | Colette book fanout via agents |
|
||||
| `/af-dag` | DAG of current/last run |
|
||||
| `/af-replay <run_id>` | Decision timeline + weighted what-if on recorded events |
|
||||
|
||||
## Mini-Reflect Fallback
|
||||
|
||||
|
||||
@@ -1,248 +1,70 @@
|
||||
---
|
||||
name: workflow-design
|
||||
description: Use when designing custom orchestration workflows — choosing which archetypes run in each PDCA phase, setting exit conditions, and configuring PDCA cycles.
|
||||
description: Use when designing custom orchestration workflows -- choosing which archetypes run in each PDCA phase, setting exit conditions, and configuring PDCA cycles.
|
||||
---
|
||||
|
||||
# Workflow Design — PDCA Cycles
|
||||
# Workflow Design -- PDCA Cycles
|
||||
|
||||
ArcheFlow's PDCA cycles spiral upward through iterations — each cycle incorporates feedback from the previous one, producing progressively better results.
|
||||
|
||||
```
|
||||
╱ Act ──────────── Done ✓
|
||||
╱ ↑
|
||||
╱ Check (review)
|
||||
╱ ↑
|
||||
╱ Do (implement)
|
||||
╱ ↑
|
||||
╱ Plan (design) ← Cycle 2 (with feedback from Cycle 1)
|
||||
╱ ↑
|
||||
╱ Act ─┘ (issues found → feed back)
|
||||
│ ↑
|
||||
│ Check (review)
|
||||
│ ↑
|
||||
│ Do (implement)
|
||||
│ ↑
|
||||
│ Plan (design) ← Cycle 1 (initial)
|
||||
```
|
||||
|
||||
## Strategy vs Workflow
|
||||
|
||||
A **strategy** defines the execution shape: PDCA is cyclic (Plan-Do-Check-Act with feedback loops), pipeline is linear (Plan-Implement-Review-Verify, no cycle-back). A **workflow** defines the depth: fast uses fewer agents and cycles, thorough uses more. Strategy and workflow are orthogonal — you can run a `fast` workflow with either strategy, though `thorough` always uses PDCA because linear flows cannot iterate on findings.
|
||||
PDCA cycles spiral upward: each cycle incorporates feedback from the previous one.
|
||||
|
||||
## Built-in Workflows
|
||||
|
||||
### `fast` — Single Turn
|
||||
```
|
||||
Plan: Creator designs
|
||||
Do: Maker implements (worktree)
|
||||
Check: Guardian reviews
|
||||
Act: Approve or reject (1 cycle max)
|
||||
```
|
||||
**Use for:** Bug fixes, small changes, low-risk tasks.
|
||||
|
||||
### `standard` — Two Cycles
|
||||
```
|
||||
Plan: Explorer researches → Creator designs
|
||||
Do: Maker implements (worktree)
|
||||
Check: Guardian + Skeptic + Sage review (parallel)
|
||||
Act: Approve or cycle (2 cycles max)
|
||||
```
|
||||
**Use for:** Features, refactors, moderate-risk changes.
|
||||
|
||||
### `thorough` — Three Cycles
|
||||
```
|
||||
Plan: Explorer researches → Creator designs
|
||||
Do: Maker implements (worktree)
|
||||
Check: Guardian + Skeptic + Sage + Trickster (parallel)
|
||||
Act: Approve or cycle (3 cycles max)
|
||||
```
|
||||
**Use for:** Security-critical, public APIs, infrastructure changes.
|
||||
| Workflow | Plan | Do | Check | Exit | Max Cycles |
|
||||
|----------|------|----|-------|------|------------|
|
||||
| `fast` | Creator | Maker | Guardian | approve/reject | 1 |
|
||||
| `standard` | Explorer + Creator | Maker | Guardian + Skeptic + Sage | all_approved | 2 |
|
||||
| `thorough` | Explorer + Creator | Maker | Guardian + Skeptic + Sage + Trickster | all_approved | 3 |
|
||||
|
||||
## Designing Custom Workflows
|
||||
|
||||
### Step 1: Identify the Concern
|
||||
**Step 1: Identify the concern**
|
||||
|
||||
What's the primary risk?
|
||||
| Risk | Emphasize in Check |
|
||||
|------|-------------------|
|
||||
| Security | Guardian + Trickster |
|
||||
| Correctness | Skeptic + Sage |
|
||||
| Performance | Custom `perf-tester` |
|
||||
| Compliance | Custom `compliance-auditor` |
|
||||
| Data integrity | Custom `db-specialist` |
|
||||
|
||||
| Primary Risk | Emphasize |
|
||||
|-------------|-----------|
|
||||
| Security | Guardian + Trickster in Check |
|
||||
| Correctness | Skeptic + Sage in Check |
|
||||
| Performance | Custom `perf-tester` archetype |
|
||||
| Compliance | Custom `compliance-auditor` archetype |
|
||||
| Data integrity | Custom `db-specialist` archetype |
|
||||
| User experience | Custom `ux-reviewer` archetype |
|
||||
**Step 2: Phase assignment rules**
|
||||
- Plan always includes Creator
|
||||
- Do always includes Maker
|
||||
- Check needs at least one reviewer
|
||||
- Max 3 archetypes per phase
|
||||
- Explorer goes in Plan only; Maker goes in Do only
|
||||
|
||||
### Step 2: Assign Phases
|
||||
**Step 3: Exit conditions**
|
||||
|
||||
Rules:
|
||||
- **Plan** always includes Creator (someone must propose)
|
||||
- **Do** always includes Maker (someone must build)
|
||||
- **Check** needs at least one reviewer
|
||||
- Max 3 archetypes per phase (diminishing returns beyond that)
|
||||
- Explorer goes in Plan only (research before design)
|
||||
- Maker goes in Do only (build from plan, not from scratch)
|
||||
| Condition | Cycle ends when |
|
||||
|-----------|----------------|
|
||||
| `all_approved` | Every reviewer says APPROVED |
|
||||
| `no_critical` | No CRITICAL findings |
|
||||
| `convergence` | No new issues vs previous cycle |
|
||||
| `always` | Runs all maxCycles unconditionally |
|
||||
|
||||
### Step 3: Set Exit Conditions
|
||||
|
||||
| Condition | When Cycle Ends | Best For |
|
||||
|-----------|----------------|----------|
|
||||
| `all_approved` | Every Check reviewer says APPROVED | Consensus-driven (default) |
|
||||
| `no_critical` | No CRITICAL findings in Check output | Speed with safety net |
|
||||
| `convergence` | No new issues vs. previous cycle | Diminishing returns detection |
|
||||
| `always` | Runs all maxCycles unconditionally | Research, exploration |
|
||||
|
||||
### Step 4: Set Max Cycles
|
||||
|
||||
- **1 cycle:** Fast, low-risk (fast workflow)
|
||||
- **2 cycles:** Balanced — one shot + one fix (standard workflow)
|
||||
- **3 cycles:** Thorough — usually converges by cycle 3
|
||||
- **4+ cycles:** Rarely useful. If 3 cycles don't converge, the task needs human input.
|
||||
|
||||
## Example Custom Workflows
|
||||
|
||||
### Security-First
|
||||
```
|
||||
Plan: Explorer (threat modeling) → Creator
|
||||
Do: Maker
|
||||
Check: Guardian + Trickster (parallel)
|
||||
Exit: all_approved, max 3 cycles
|
||||
```
|
||||
|
||||
### Research-Heavy
|
||||
```
|
||||
Plan: Explorer (deep research) → Creator
|
||||
Do: Maker
|
||||
Check: Skeptic + Sage (parallel)
|
||||
Exit: all_approved, max 2 cycles
|
||||
```
|
||||
|
||||
### Domain-Specific (with custom archetypes)
|
||||
```
|
||||
Plan: Explorer → Creator
|
||||
Do: Maker
|
||||
Check: Guardian + db-specialist + compliance-auditor (parallel)
|
||||
Exit: all_approved, max 2 cycles
|
||||
```
|
||||
|
||||
### Minimal Validation
|
||||
```
|
||||
Plan: Creator (no research)
|
||||
Do: Maker
|
||||
Check: Guardian
|
||||
Exit: no_critical, max 1 cycle
|
||||
```
|
||||
**Step 4: Max cycles** -- 1 (fast), 2 (balanced), 3 (thorough). 4+ rarely useful.
|
||||
|
||||
## Hook Points
|
||||
|
||||
Add project-specific validation at key moments in the PDCA cycle. Define hooks in `.archeflow/hooks.yaml`:
|
||||
Define in `.archeflow/hooks.yaml`:
|
||||
|
||||
```yaml
|
||||
# .archeflow/hooks.yaml
|
||||
pre-plan:
|
||||
- command: "npm run lint"
|
||||
description: "Ensure clean baseline before planning"
|
||||
fail_action: abort # abort | warn | ignore
|
||||
|
||||
post-check:
|
||||
- command: "npm test"
|
||||
description: "Run tests after review to verify reviewer suggestions"
|
||||
fail_action: cycle_back
|
||||
|
||||
pre-merge:
|
||||
- command: "./scripts/check-migrations.sh"
|
||||
description: "Verify migration safety before merging"
|
||||
fail_action: abort
|
||||
|
||||
post-merge:
|
||||
- command: "npm run integration-test"
|
||||
description: "Full integration test after merge"
|
||||
fail_action: revert
|
||||
```
|
||||
|
||||
**Available hook points:**
|
||||
| Hook | When | Typical Use |
|
||||
| Hook | When | Typical use |
|
||||
|------|------|-------------|
|
||||
| `pre-plan` | Before Explorer/Creator start | Lint, ensure clean baseline |
|
||||
| `post-plan` | After Creator's proposal | Validate proposal against constraints |
|
||||
| `pre-do` | Before Maker starts | Check worktree setup |
|
||||
| `post-do` | After Maker commits | Quick smoke test |
|
||||
| `post-check` | After reviewers finish | Run test suite |
|
||||
| `pre-merge` | Before merging to main | Migration safety, API compatibility |
|
||||
| `post-merge` | After merge completes | Integration tests, deploy checks |
|
||||
| `pre-plan` | Before Explorer/Creator | Lint, clean baseline |
|
||||
| `post-plan` | After Creator's proposal | Validate constraints |
|
||||
| `pre-do` | Before Maker | Check worktree |
|
||||
| `post-do` | After Maker commits | Smoke test |
|
||||
| `post-check` | After reviewers | Run test suite |
|
||||
| `pre-merge` | Before merge | Migration safety |
|
||||
| `post-merge` | After merge | Integration tests |
|
||||
|
||||
## Workflow Template Library
|
||||
|
||||
Pre-built workflows for common scenarios. Use as-is or as starting points for custom workflows.
|
||||
|
||||
### API Design
|
||||
```yaml
|
||||
name: api-design
|
||||
description: New or changed API endpoints
|
||||
plan: [explorer, creator]
|
||||
do: [maker]
|
||||
check: [guardian, skeptic] # Guardian for security, Skeptic for API design assumptions
|
||||
exit: all_approved
|
||||
max_cycles: 2
|
||||
hooks:
|
||||
post-check: "npm run api-compatibility-check"
|
||||
```
|
||||
|
||||
### Database Migration
|
||||
```yaml
|
||||
name: migration
|
||||
description: Schema changes and data migrations
|
||||
plan: [explorer, creator]
|
||||
do: [maker]
|
||||
check: [guardian, db-specialist] # Requires custom db-specialist archetype
|
||||
exit: all_approved
|
||||
max_cycles: 2
|
||||
hooks:
|
||||
pre-merge: "./scripts/check-migration-reversibility.sh"
|
||||
```
|
||||
|
||||
### Dependency Upgrade
|
||||
```yaml
|
||||
name: dep-upgrade
|
||||
description: Upgrading dependencies (major versions, security patches)
|
||||
plan: [creator] # No Explorer needed — changelog is the research
|
||||
do: [maker]
|
||||
check: [guardian]
|
||||
exit: no_critical
|
||||
max_cycles: 1
|
||||
hooks:
|
||||
post-do: "npm audit"
|
||||
post-merge: "npm test && npm run e2e"
|
||||
```
|
||||
|
||||
### Documentation Rewrite
|
||||
```yaml
|
||||
name: docs-rewrite
|
||||
description: Major documentation changes
|
||||
plan: [explorer, creator]
|
||||
do: [maker]
|
||||
check: [sage] # Quality/consistency only — no security review needed
|
||||
exit: all_approved
|
||||
max_cycles: 1
|
||||
```
|
||||
|
||||
### Hotfix
|
||||
```yaml
|
||||
name: hotfix
|
||||
description: Emergency production fix
|
||||
plan: [creator]
|
||||
do: [maker]
|
||||
check: [guardian]
|
||||
exit: no_critical
|
||||
max_cycles: 1
|
||||
hooks:
|
||||
post-merge: "npm test"
|
||||
```
|
||||
Each hook has `command`, `description`, and `fail_action` (abort / warn / ignore / cycle_back / revert).
|
||||
|
||||
## Anti-Patterns
|
||||
|
||||
- **Kitchen sink:** Putting all 7 archetypes in Check. Most can't add value simultaneously.
|
||||
- **Runaway cycles:** maxCycles > 4 burns tokens without convergence.
|
||||
- **Reviewerless Do:** Skipping Check phase "to save time." You'll pay in bugs.
|
||||
- **Maker in Plan:** Maker should implement from a proposal, not design on the fly.
|
||||
- **Solo orchestration:** One archetype in every phase. That's just a single agent with extra steps.
|
||||
- All 7 archetypes in Check (diminishing returns)
|
||||
- maxCycles > 4 (burns tokens without convergence)
|
||||
- Skipping Check phase
|
||||
- Maker in Plan phase
|
||||
- One archetype in every phase (just a single agent with overhead)
|
||||
|
||||
71
tests/archeflow-dag.bats
Normal file
71
tests/archeflow-dag.bats
Normal file
@@ -0,0 +1,71 @@
|
||||
# Tests for archeflow-dag.sh — ASCII DAG rendering from JSONL events.
|
||||
#
|
||||
# Validates: basic rendering, parent relationships, color flags, missing file handling.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
|
||||
# Create a standard events file with parent relationships
|
||||
cat > "$BATS_TEST_TMPDIR/dag-events.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"dag-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"DAG test"}}
|
||||
{"ts":"2026-04-03T10:01:00Z","run_id":"dag-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500}}
|
||||
{"ts":"2026-04-03T10:02:00Z","run_id":"dag-run","seq":3,"parent":[2],"type":"phase.transition","phase":"do","agent":null,"data":{"from":"plan","to":"do"}}
|
||||
{"ts":"2026-04-03T10:03:00Z","run_id":"dag-run","seq":4,"parent":[3],"type":"agent.complete","phase":"do","agent":"maker","data":{"archetype":"maker","duration_ms":120000,"tokens":3000}}
|
||||
{"ts":"2026-04-03T10:04:00Z","run_id":"dag-run","seq":5,"parent":[4],"type":"run.complete","phase":"act","agent":null,"data":{"agents_total":2,"fixes_total":0}}
|
||||
EVENTS
|
||||
}
|
||||
|
||||
@test "dag: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-dag.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "dag: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-dag.sh" nonexistent.jsonl
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "dag: renders run.start as root node" {
|
||||
run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"#1"* ]]
|
||||
[[ "$output" == *"run.start"* ]]
|
||||
}
|
||||
|
||||
@test "dag: renders agent.complete events with archetype name" {
|
||||
run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"creator"* ]]
|
||||
[[ "$output" == *"maker"* ]]
|
||||
}
|
||||
|
||||
@test "dag: renders phase transitions" {
|
||||
run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"plan"* ]]
|
||||
[[ "$output" == *"do"* ]]
|
||||
}
|
||||
|
||||
@test "dag: renders run.complete with agent/fix counts" {
|
||||
run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"run.complete"* ]]
|
||||
[[ "$output" == *"2 agents"* ]]
|
||||
}
|
||||
|
||||
@test "dag: --no-color suppresses ANSI codes" {
|
||||
run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
|
||||
[ "$status" -eq 0 ]
|
||||
# Should not contain escape sequences
|
||||
[[ "$output" != *$'\033'* ]]
|
||||
}
|
||||
|
||||
@test "dag: uses tree-drawing characters for hierarchy" {
|
||||
run "$LIB_DIR/archeflow-dag.sh" "$BATS_TEST_TMPDIR/dag-events.jsonl" --no-color
|
||||
[ "$status" -eq 0 ]
|
||||
# Should contain box-drawing characters (either unicode or ASCII connectors)
|
||||
[[ "$output" == *"├"* ]] || [[ "$output" == *"└"* ]]
|
||||
}
|
||||
127
tests/archeflow-event.bats
Normal file
127
tests/archeflow-event.bats
Normal file
@@ -0,0 +1,127 @@
|
||||
# Tests for archeflow-event.sh — structured JSONL event logging.
#
# Validates: JSONL output format, sequence numbering, parent field handling,
# input validation, file/directory creation.
#
# Convention in this file: `run` is used when the exit status / combined
# output matters; direct invocations with `2>/dev/null` are used when only
# the on-disk JSONL result is being inspected (the script writes a
# confirmation message to stderr — see the last test).

setup() {
  load test_helper
  _common_setup
}

teardown() {
  _common_teardown
}

@test "event: exits 1 with usage when called with fewer than 4 args" {
  # Only 3 args: run_id, type, phase — the 4th (agent) is missing.
  run "$LIB_DIR/archeflow-event.sh" run1 type1 plan
  [ "$status" -eq 1 ]
  [[ "$output" == *"Usage"* ]]
}

@test "event: creates events directory and file on first call" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"test"}'
  [ "$status" -eq 0 ]
  # Both the directory and the per-run JSONL file must exist afterwards.
  [ -d ".archeflow/events" ]
  [ -f ".archeflow/events/test-run.jsonl" ]
}

@test "event: first event has seq=1" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"test"}'
  [ "$status" -eq 0 ]
  local seq
  seq=$(head -1 ".archeflow/events/test-run.jsonl" | jq -r '.seq')
  [ "$seq" -eq 1 ]
}

@test "event: second event has seq=2" {
  # Log two events; the 6th argument "1" is the parent seq of the second.
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"test"}' 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run agent.complete plan creator '{"dur":100}' "1" 2>/dev/null
  local count
  count=$(wc -l < ".archeflow/events/test-run.jsonl")
  [ "$count" -eq 2 ]
  local seq2
  seq2=$(tail -1 ".archeflow/events/test-run.jsonl" | jq -r '.seq')
  [ "$seq2" -eq 2 ]
}

@test "event: output is valid JSONL" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{"task":"hello"}' 2>/dev/null
  # jq will fail if the line is not valid JSON
  jq empty ".archeflow/events/test-run.jsonl"
}

@test "event: fields are correctly populated" {
  "$LIB_DIR/archeflow-event.sh" test-run agent.complete do maker '{"tokens":500}' 2>/dev/null
  local event
  event=$(head -1 ".archeflow/events/test-run.jsonl")
  # Every positional argument must land in its corresponding JSON field.
  [ "$(echo "$event" | jq -r '.run_id')" = "test-run" ]
  [ "$(echo "$event" | jq -r '.type')" = "agent.complete" ]
  [ "$(echo "$event" | jq -r '.phase')" = "do" ]
  [ "$(echo "$event" | jq -r '.agent')" = "maker" ]
  [ "$(echo "$event" | jq -r '.data.tokens')" = "500" ]
}

@test "event: empty agent becomes null in JSON" {
  "$LIB_DIR/archeflow-event.sh" test-run phase.transition do "" '{"from":"plan","to":"do"}' 2>/dev/null
  local agent
  agent=$(head -1 ".archeflow/events/test-run.jsonl" | jq -r '.agent')
  # jq -r prints JSON null as the literal string "null".
  [ "$agent" = "null" ]
}

@test "event: parent field is empty array for root events" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  local parent
  parent=$(head -1 ".archeflow/events/test-run.jsonl" | jq -c '.parent')
  [ "$parent" = "[]" ]
}

@test "event: single parent is parsed correctly" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run agent.complete plan creator '{}' "1" 2>/dev/null
  local parent
  parent=$(tail -1 ".archeflow/events/test-run.jsonl" | jq -c '.parent')
  [ "$parent" = "[1]" ]
}

@test "event: multiple parents (fan-in) are parsed correctly" {
  # Events 2 and 3 branch from 1; event 4 fans them back in via "2,3".
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run a plan "" '{}' "1" 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run b plan "" '{}' "1" 2>/dev/null
  "$LIB_DIR/archeflow-event.sh" test-run merge plan "" '{}' "2,3" 2>/dev/null
  local parent
  parent=$(tail -1 ".archeflow/events/test-run.jsonl" | jq -c '.parent')
  [ "$parent" = "[2,3]" ]
}

@test "event: rejects invalid JSON data" {
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" 'not-json'
  [ "$status" -eq 1 ]
  [[ "$output" == *"invalid JSON"* ]]
}

@test "event: rejects invalid parent format" {
  # Parent must be comma-separated integers; "abc" is not.
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' "abc"
  [ "$status" -eq 1 ]
  [[ "$output" == *"invalid parent format"* ]]
}

@test "event: timestamp is ISO 8601 UTC format" {
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' 2>/dev/null
  local ts
  ts=$(head -1 ".archeflow/events/test-run.jsonl" | jq -r '.ts')
  # Matches YYYY-MM-DDTHH:MM:SSZ
  [[ "$ts" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$ ]]
}

@test "event: default data is empty object when omitted" {
  # 5th argument (data) omitted entirely — the script must default it.
  "$LIB_DIR/archeflow-event.sh" test-run run.start plan agent 2>/dev/null
  local data
  data=$(head -1 ".archeflow/events/test-run.jsonl" | jq -c '.data')
  [ "$data" = "{}" ]
}

@test "event: confirmation message goes to stderr" {
  # NOTE(review): `run` already captures stderr into $output, so the
  # explicit 2>&1 is redundant here — this asserts the message appears
  # in the combined stream, not that it is on stderr specifically.
  run "$LIB_DIR/archeflow-event.sh" test-run run.start plan "" '{}' "" 2>&1
  [[ "$output" == *"[archeflow-event]"* ]]
  [[ "$output" == *"#1"* ]]
}
|
||||
212
tests/archeflow-git.bats
Normal file
212
tests/archeflow-git.bats
Normal file
@@ -0,0 +1,212 @@
|
||||
# Tests for archeflow-git.sh — git branch/commit strategy for ArcheFlow runs.
#
# Validates: branch creation with correct naming, commit formatting,
# merge strategies, input validation, and safety guards.
#
# Most tests first call `init test-run` directly (stderr discarded to keep
# the test log quiet) and then assert on the resulting git state via
# plumbing commands (`git branch --show-current`, `git log`, `git
# diff-tree`, `git cat-file`).

setup() {
  load test_helper
  _common_setup
}

teardown() {
  _common_teardown
}

# --- Usage ---

@test "git: exits 1 with usage when called with fewer than 2 args" {
  run "$LIB_DIR/archeflow-git.sh"
  [ "$status" -eq 1 ]
  [[ "$output" == *"Usage"* ]]
}

@test "git: exits 1 for unknown command" {
  run "$LIB_DIR/archeflow-git.sh" nonexistent test-run
  [ "$status" -ne 0 ]
  [[ "$output" == *"Unknown command"* ]]
}

# --- init ---

@test "git init: creates branch with archeflow/ prefix" {
  run "$LIB_DIR/archeflow-git.sh" init test-run
  [ "$status" -eq 0 ]
  # init is expected to leave us checked out on the new run branch.
  local current
  current=$(git branch --show-current)
  [ "$current" = "archeflow/test-run" ]
}

@test "git init: stores base branch in .archeflow/runs/<run_id>/base-branch" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  [ -f ".archeflow/runs/test-run/base-branch" ]
  local base
  base=$(cat ".archeflow/runs/test-run/base-branch")
  [ "$base" = "main" ]
}

@test "git init: fails if branch already exists" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  # Move off the run branch so the second init attempts re-creation.
  git checkout main --quiet
  run "$LIB_DIR/archeflow-git.sh" init test-run
  [ "$status" -ne 0 ]
  [[ "$output" == *"already exists"* ]]
}

# --- commit ---

@test "git commit: uses conventional commit format by default" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  # Create a file to commit
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" commit test-run plan "initial plan" 2>/dev/null
  local msg
  msg=$(git log -1 --format=%s)
  [[ "$msg" == "archeflow(plan): initial plan" ]]
}

@test "git commit: stages event file automatically" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" commit test-run plan "test commit" 2>/dev/null

  # Verify the event file was committed
  local committed_files
  committed_files=$(git diff-tree --no-commit-id --name-only -r HEAD)
  [[ "$committed_files" == *"test-run.jsonl"* ]]
}

@test "git commit: stages extra files passed as arguments" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  echo "extra content" > extra.txt
  "$LIB_DIR/archeflow-git.sh" commit test-run do "with extras" extra.txt 2>/dev/null
  local committed_files
  committed_files=$(git diff-tree --no-commit-id --name-only -r HEAD)
  [[ "$committed_files" == *"extra.txt"* ]]
}

@test "git commit: reports nothing to commit when no changes" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  # Commit the init artifacts first so there's a clean state
  git add -A && git commit -m "init artifacts" --quiet 2>/dev/null || true
  # Run through `bash -c` so both stderr and stdout are merged (2>&1)
  # and the working directory is pinned explicitly.
  run bash -c "cd '$BATS_TEST_TMPDIR' && '$LIB_DIR/archeflow-git.sh' commit test-run plan 'empty' 2>&1"
  [ "$status" -eq 0 ]
  [[ "$output" == *"Nothing to commit"* ]]
}

@test "git commit: fails if not on the run branch" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  git checkout main --quiet
  run "$LIB_DIR/archeflow-git.sh" commit test-run plan "wrong branch"
  [ "$status" -ne 0 ]
  [[ "$output" == *"Expected to be on branch"* ]]
}

# --- phase-commit ---

@test "git phase-commit: creates commit with phase transition message" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" phase-commit test-run plan 2>/dev/null
  local msg
  msg=$(git log -1 --format=%s)
  # Should contain the phase transition arrow
  [[ "$msg" == *"plan"* ]]
  [[ "$msg" == *"do"* ]]
}

# --- merge ---

@test "git merge: squash merge is the default strategy" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" commit test-run plan "test" 2>/dev/null
  "$LIB_DIR/archeflow-git.sh" merge test-run 2>/dev/null

  # Merge is expected to land us back on the base branch.
  local current
  current=$(git branch --show-current)
  [ "$current" = "main" ]

  local msg
  msg=$(git log -1 --format=%s)
  [[ "$msg" == *"archeflow run test-run"* ]]
}

@test "git merge: --no-ff creates a merge commit" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" commit test-run plan "test" 2>/dev/null
  "$LIB_DIR/archeflow-git.sh" merge test-run --no-ff 2>/dev/null

  local current
  current=$(git branch --show-current)
  [ "$current" = "main" ]

  # no-ff merge commit should have 2 parents
  local parent_count
  parent_count=$(git cat-file -p HEAD | grep -c '^parent')
  [ "$parent_count" -eq 2 ]
}

@test "git merge: rejects unknown merge strategy" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" commit test-run plan "test" 2>/dev/null
  run "$LIB_DIR/archeflow-git.sh" merge test-run --fast-forward
  [ "$status" -ne 0 ]
  [[ "$output" == *"Unknown merge strategy"* ]]
}

@test "git merge: fails with uncommitted changes" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  # Staged-but-uncommitted file must block the merge.
  echo "dirty" > dirty.txt
  git add dirty.txt
  run "$LIB_DIR/archeflow-git.sh" merge test-run
  [ "$status" -ne 0 ]
  [[ "$output" == *"Uncommitted changes"* ]]
}

# --- format_message ---

@test "git commit: simple style uses 'phase: msg' format" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  # Create config with simple style
  mkdir -p .archeflow
  echo "commit_style: simple" > .archeflow/config.yaml
  mkdir -p .archeflow/events
  echo '{"test":true}' > .archeflow/events/test-run.jsonl
  "$LIB_DIR/archeflow-git.sh" commit test-run plan "simple test" 2>/dev/null
  local msg
  msg=$(git log -1 --format=%s)
  [ "$msg" = "plan: simple test" ]
}

# --- status ---

@test "git status: shows branch info for existing run" {
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  run "$LIB_DIR/archeflow-git.sh" status test-run
  [ "$status" -eq 0 ]
  [[ "$output" == *"Branch: archeflow/test-run"* ]]
  [[ "$output" == *"Base: main"* ]]
}

@test "git status: fails for nonexistent branch" {
  run "$LIB_DIR/archeflow-git.sh" status nonexistent
  [ "$status" -ne 0 ]
  [[ "$output" == *"does not exist"* ]]
}

# --- cleanup ---

@test "git cleanup: fails if currently on the run branch" {
  # init leaves us on archeflow/test-run, so deleting it must refuse.
  "$LIB_DIR/archeflow-git.sh" init test-run 2>/dev/null
  run "$LIB_DIR/archeflow-git.sh" cleanup test-run
  [ "$status" -ne 0 ]
  [[ "$output" == *"Cannot delete"* ]]
}
|
||||
81
tests/archeflow-init.bats
Normal file
81
tests/archeflow-init.bats
Normal file
@@ -0,0 +1,81 @@
|
||||
# Tests for archeflow-init.sh — project initialization from templates.
|
||||
#
|
||||
# Validates: usage output, --list, --from (clone), and argument parsing.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
}
|
||||
|
||||
teardown() {
|
||||
_common_teardown
|
||||
}
|
||||
|
||||
@test "init: shows usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-init.sh"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
[[ "$output" == *"bundle-name"* ]]
|
||||
}
|
||||
|
||||
@test "init: --list shows template listing without errors" {
|
||||
run "$LIB_DIR/archeflow-init.sh" --list
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Templates"* ]]
|
||||
[[ "$output" == *"Bundles"* ]]
|
||||
}
|
||||
|
||||
@test "init: --from fails when source has no .archeflow dir" {
|
||||
local source_dir
|
||||
source_dir=$(mktemp -d)
|
||||
run "$LIB_DIR/archeflow-init.sh" --from "$source_dir"
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"No .archeflow/"* ]]
|
||||
rm -rf "$source_dir"
|
||||
}
|
||||
|
||||
@test "init: --from clones setup from another project" {
|
||||
# Create a source project with .archeflow structure
|
||||
local source_dir
|
||||
source_dir=$(mktemp -d)
|
||||
mkdir -p "$source_dir/.archeflow/teams" "$source_dir/.archeflow/workflows"
|
||||
echo "name: test-team" > "$source_dir/.archeflow/teams/test.yaml"
|
||||
echo "name: test-workflow" > "$source_dir/.archeflow/workflows/test.yaml"
|
||||
echo "bundle: test" > "$source_dir/.archeflow/config.yaml"
|
||||
|
||||
run "$LIB_DIR/archeflow-init.sh" --from "$source_dir"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/teams/test.yaml" ]
|
||||
[ -f ".archeflow/workflows/test.yaml" ]
|
||||
[ -f ".archeflow/config.yaml" ]
|
||||
rm -rf "$source_dir"
|
||||
}
|
||||
|
||||
@test "init: --from skips events and artifacts directories" {
|
||||
local source_dir
|
||||
source_dir=$(mktemp -d)
|
||||
mkdir -p "$source_dir/.archeflow/events" "$source_dir/.archeflow/artifacts"
|
||||
mkdir -p "$source_dir/.archeflow/teams"
|
||||
echo "name: test" > "$source_dir/.archeflow/teams/t.yaml"
|
||||
echo '{"test":true}' > "$source_dir/.archeflow/events/run.jsonl"
|
||||
echo "artifact" > "$source_dir/.archeflow/artifacts/test.txt"
|
||||
|
||||
run "$LIB_DIR/archeflow-init.sh" --from "$source_dir"
|
||||
[ "$status" -eq 0 ]
|
||||
[ ! -f ".archeflow/events/run.jsonl" ]
|
||||
[ ! -f ".archeflow/artifacts/test.txt" ]
|
||||
[[ "$output" == *"skipped events"* ]]
|
||||
rm -rf "$source_dir"
|
||||
}
|
||||
|
||||
@test "init: rejects unknown options" {
|
||||
run "$LIB_DIR/archeflow-init.sh" --nonexistent
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Unknown option"* ]]
|
||||
}
|
||||
|
||||
@test "init: --save fails with no .archeflow directory" {
|
||||
run "$LIB_DIR/archeflow-init.sh" --save test-save
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"No .archeflow/"* ]]
|
||||
}
|
||||
227
tests/archeflow-memory.bats
Normal file
227
tests/archeflow-memory.bats
Normal file
@@ -0,0 +1,227 @@
|
||||
# Tests for archeflow-memory.sh — cross-run lesson memory management.
|
||||
#
|
||||
# Validates: add, list, decay, forget, inject filtering, and JSONL format.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
}
|
||||
|
||||
teardown() {
|
||||
_common_teardown
|
||||
}
|
||||
|
||||
# --- Usage / error handling ---
|
||||
|
||||
@test "memory: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-memory.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "memory: exits 1 for unknown command" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" nonexistent
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Unknown command"* ]]
|
||||
}
|
||||
|
||||
# --- add ---
|
||||
|
||||
@test "memory add: creates lessons.jsonl and appends a valid JSONL line" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" add preference "Always validate inputs"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/memory/lessons.jsonl" ]
|
||||
jq empty ".archeflow/memory/lessons.jsonl"
|
||||
}
|
||||
|
||||
@test "memory add: lesson has correct fields" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Guardian misses SQL injection" 2>/dev/null
|
||||
[ "$(jq -r '.type' .archeflow/memory/lessons.jsonl)" = "pattern" ]
|
||||
[ "$(jq -r '.description' .archeflow/memory/lessons.jsonl)" = "Guardian misses SQL injection" ]
|
||||
[ "$(jq -r '.source' .archeflow/memory/lessons.jsonl)" = "user_feedback" ]
|
||||
[ "$(jq -r '.frequency' .archeflow/memory/lessons.jsonl)" = "1" ]
|
||||
[ "$(jq -r '.run_id' .archeflow/memory/lessons.jsonl)" = "manual" ]
|
||||
[ "$(jq -r '.domain' .archeflow/memory/lessons.jsonl)" = "general" ]
|
||||
}
|
||||
|
||||
@test "memory add: generates sequential IDs" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "first lesson" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "second lesson" 2>/dev/null
|
||||
local id1 id2
|
||||
id1=$(head -1 ".archeflow/memory/lessons.jsonl" | jq -r '.id')
|
||||
id2=$(tail -1 ".archeflow/memory/lessons.jsonl" | jq -r '.id')
|
||||
[ "$id1" = "m-001" ]
|
||||
[ "$id2" = "m-002" ]
|
||||
}
|
||||
|
||||
@test "memory add: generates tags from description" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Guardian misses SQL injection attacks" 2>/dev/null
|
||||
local tags_count
|
||||
tags_count=$(head -1 ".archeflow/memory/lessons.jsonl" | jq '.tags | length')
|
||||
[ "$tags_count" -gt 0 ]
|
||||
}
|
||||
|
||||
@test "memory add: exits 1 when description is missing" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" add pattern
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
# --- list ---
|
||||
|
||||
@test "memory list: shows message when no lessons exist" {
|
||||
run bash -c "'$LIB_DIR/archeflow-memory.sh' list 2>&1"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"No lessons"* ]]
|
||||
}
|
||||
|
||||
@test "memory list: shows table header and lesson data" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Test lesson for listing" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-memory.sh" list
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"ID"* ]]
|
||||
[[ "$output" == *"Freq"* ]]
|
||||
[[ "$output" == *"m-001"* ]]
|
||||
[[ "$output" == *"Test lesson for listing"* ]]
|
||||
}
|
||||
|
||||
# --- decay ---
|
||||
|
||||
@test "memory decay: increments runs_since_last_seen" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Decay test lesson" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-memory.sh" decay 2>/dev/null
|
||||
local runs_since
|
||||
runs_since=$(head -1 ".archeflow/memory/lessons.jsonl" | jq '.runs_since_last_seen')
|
||||
[ "$runs_since" -eq 1 ]
|
||||
}
|
||||
|
||||
@test "memory decay: decrements frequency after 10 runs" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Decay frequency test" 2>/dev/null
|
||||
# Set frequency=3 and runs_since=9 to trigger decay on next call
|
||||
local tmp=".archeflow/memory/lessons.jsonl.tmp"
|
||||
head -1 ".archeflow/memory/lessons.jsonl" | jq -c '.frequency = 3 | .runs_since_last_seen = 9' > "$tmp"
|
||||
mv "$tmp" ".archeflow/memory/lessons.jsonl"
|
||||
|
||||
"$LIB_DIR/archeflow-memory.sh" decay 2>/dev/null
|
||||
local freq
|
||||
freq=$(head -1 ".archeflow/memory/lessons.jsonl" | jq '.frequency')
|
||||
[ "$freq" -eq 2 ]
|
||||
}
|
||||
|
||||
@test "memory decay: archives lesson when frequency reaches 0" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Will be archived" 2>/dev/null
|
||||
# Set frequency=1 and runs_since=9 to trigger archival
|
||||
local tmp=".archeflow/memory/lessons.jsonl.tmp"
|
||||
head -1 ".archeflow/memory/lessons.jsonl" | jq -c '.frequency = 1 | .runs_since_last_seen = 9' > "$tmp"
|
||||
mv "$tmp" ".archeflow/memory/lessons.jsonl"
|
||||
|
||||
"$LIB_DIR/archeflow-memory.sh" decay 2>/dev/null
|
||||
|
||||
# Lesson should be gone from lessons file (file should be empty)
|
||||
local remaining
|
||||
remaining=$(wc -l < ".archeflow/memory/lessons.jsonl" | tr -d ' ')
|
||||
[ "$remaining" -eq 0 ]
|
||||
|
||||
# And present in archive
|
||||
[ -f ".archeflow/memory/archive.jsonl" ]
|
||||
local archived_count
|
||||
archived_count=$(wc -l < ".archeflow/memory/archive.jsonl" | tr -d ' ')
|
||||
[ "$archived_count" -eq 1 ]
|
||||
}
|
||||
|
||||
@test "memory decay: does nothing when no lessons exist" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" decay
|
||||
[ "$status" -eq 0 ]
|
||||
}
|
||||
|
||||
# --- forget ---
|
||||
|
||||
@test "memory forget: moves lesson to archive" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Will forget this" 2>/dev/null
|
||||
"$LIB_DIR/archeflow-memory.sh" forget m-001 2>/dev/null
|
||||
|
||||
# Lessons file should be empty
|
||||
local remaining
|
||||
remaining=$(wc -l < ".archeflow/memory/lessons.jsonl" | tr -d ' ')
|
||||
[ "$remaining" -eq 0 ]
|
||||
|
||||
# Archive should have it
|
||||
[ -f ".archeflow/memory/archive.jsonl" ]
|
||||
local archived_id
|
||||
archived_id=$(head -1 ".archeflow/memory/archive.jsonl" | jq -r '.id')
|
||||
[ "$archived_id" = "m-001" ]
|
||||
}
|
||||
|
||||
@test "memory forget: exits 1 for nonexistent ID" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "test" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-memory.sh" forget m-999
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "memory forget: exits 1 when no lessons file exists" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" forget m-001
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"No lessons file"* ]]
|
||||
}
|
||||
|
||||
# --- inject ---
|
||||
|
||||
@test "memory inject: outputs nothing when no lessons file exists" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject code guardian
|
||||
[ "$status" -eq 0 ]
|
||||
[ -z "$output" ]
|
||||
}
|
||||
|
||||
@test "memory inject: outputs relevant lessons with frequency >= 2" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Test injection lesson" 2>/dev/null
|
||||
# Bump frequency to 2
|
||||
local tmp=".archeflow/memory/lessons.jsonl.tmp"
|
||||
jq -c '.frequency = 2' ".archeflow/memory/lessons.jsonl" > "$tmp"
|
||||
mv "$tmp" ".archeflow/memory/lessons.jsonl"
|
||||
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject "" ""
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Known Issues"* ]]
|
||||
[[ "$output" == *"Test injection lesson"* ]]
|
||||
}
|
||||
|
||||
@test "memory inject: skips lessons with frequency < 2 (except preferences)" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add pattern "Low frequency lesson" 2>/dev/null
|
||||
# frequency is 1 by default, type is pattern -> should NOT be injected
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject "" ""
|
||||
[ "$status" -eq 0 ]
|
||||
[ -z "$output" ]
|
||||
}
|
||||
|
||||
@test "memory inject: always injects preferences regardless of frequency" {
|
||||
"$LIB_DIR/archeflow-memory.sh" add preference "User prefers explicit error messages" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-memory.sh" inject "" ""
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"User prefers explicit error messages"* ]]
|
||||
}
|
||||
|
||||
# --- extract ---
|
||||
|
||||
@test "memory extract: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-memory.sh" extract nonexistent.jsonl
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "memory extract: extracts findings from review.verdict events" {
|
||||
# Create a mock events file with a review.verdict
|
||||
mkdir -p .archeflow/events
|
||||
cat > /tmp/test-events.jsonl <<'EOF'
|
||||
{"run_id":"test-run","seq":1,"type":"run.start","phase":"plan","data":{"task":"test"}}
|
||||
{"run_id":"test-run","seq":2,"type":"review.verdict","phase":"check","data":{"archetype":"guardian","verdict":"needs_changes","findings":[{"severity":"warning","description":"Missing input validation on user endpoint","category":"code"}]}}
|
||||
EOF
|
||||
|
||||
run "$LIB_DIR/archeflow-memory.sh" extract /tmp/test-events.jsonl
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/memory/lessons.jsonl" ]
|
||||
local desc
|
||||
desc=$(jq -r '.description' ".archeflow/memory/lessons.jsonl")
|
||||
[[ "$desc" == *"Missing input validation"* ]]
|
||||
rm -f /tmp/test-events.jsonl
|
||||
}
|
||||
78
tests/archeflow-progress.bats
Normal file
78
tests/archeflow-progress.bats
Normal file
@@ -0,0 +1,78 @@
|
||||
# Tests for archeflow-progress.sh — live progress file generation.
#
# Validates: markdown output structure, JSON mode, missing events handling, exit codes.
#
# NOTE(review): unlike the sibling test files, this one defines no
# teardown calling _common_teardown — confirm whether that is intentional.

setup() {
  load test_helper
  _common_setup

  # Create standard events for progress tests
  # (one run.start plus one completed agent, both in the plan phase).
  mkdir -p .archeflow/events
  cat > ".archeflow/events/test-run.jsonl" <<'EVENTS'
{"ts":"2026-04-03T10:00:00Z","run_id":"test-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Build feature","workflow":"standard","team":"default"}}
{"ts":"2026-04-03T10:01:00Z","run_id":"test-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500,"estimated_cost_usd":0.02,"summary":"Planned"}}
EVENTS
}

@test "progress: exits 1 with usage when called with no args" {
  run "$LIB_DIR/archeflow-progress.sh"
  [ "$status" -eq 1 ]
  [[ "$output" == *"Usage"* ]]
}

@test "progress: exits 1 when events file not found" {
  run "$LIB_DIR/archeflow-progress.sh" nonexistent-run
  [ "$status" -eq 1 ]
  [[ "$output" == *"not found"* ]]
}

@test "progress: default mode generates progress.md" {
  run "$LIB_DIR/archeflow-progress.sh" test-run
  [ "$status" -eq 0 ]
  # The markdown is both written to disk and echoed to stdout.
  [ -f ".archeflow/progress.md" ]
  [[ "$output" == *"# ArcheFlow Run: test-run"* ]]
  [[ "$output" == *"Status:"* ]]
  [[ "$output" == *"Progress"* ]]
}

@test "progress: json mode outputs valid JSON" {
  run "$LIB_DIR/archeflow-progress.sh" test-run --json
  [ "$status" -eq 0 ]
  echo "$output" | jq empty
  local run_id
  run_id=$(echo "$output" | jq -r '.run_id')
  [ "$run_id" = "test-run" ]
}

@test "progress: json mode includes completed agents" {
  run "$LIB_DIR/archeflow-progress.sh" test-run --json
  [ "$status" -eq 0 ]
  # The fixture contains exactly one agent.complete event (creator).
  local completed_count
  completed_count=$(echo "$output" | jq '.completed | length')
  [ "$completed_count" -eq 1 ]
  local agent
  agent=$(echo "$output" | jq -r '.completed[0].agent')
  [ "$agent" = "creator" ]
}

@test "progress: json mode shows correct phase" {
  run "$LIB_DIR/archeflow-progress.sh" test-run --json
  [ "$status" -eq 0 ]
  local phase
  phase=$(echo "$output" | jq -r '.phase')
  [ "$phase" = "plan" ]
}

@test "progress: reports error in json when events file missing" {
  run "$LIB_DIR/archeflow-progress.sh" missing-run --json
  # JSON mode returns the JSON even on error
  local error
  error=$(echo "$output" | jq -r '.error // empty')
  [[ "$error" == *"not found"* ]]
}

@test "progress: rejects unknown flags" {
  run "$LIB_DIR/archeflow-progress.sh" test-run --invalid
  [ "$status" -eq 1 ]
  [[ "$output" == *"Unknown flag"* ]]
}
|
||||
62
tests/archeflow-replay.bats
Normal file
62
tests/archeflow-replay.bats
Normal file
@@ -0,0 +1,62 @@
|
||||
# Tests for archeflow-replay.sh — timeline, what-if, and compare modes.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
|
||||
mkdir -p .archeflow/events
|
||||
cat > ".archeflow/events/replay-run.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"replay-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"replay test"}}
|
||||
{"ts":"2026-04-03T10:05:00Z","run_id":"replay-run","seq":2,"parent":[1],"type":"decision.point","phase":"check","agent":"guardian","data":{"archetype":"guardian","input":"diff","decision":"needs_changes","confidence":0.88}}
|
||||
{"ts":"2026-04-03T10:06:00Z","run_id":"replay-run","seq":3,"parent":[1],"type":"review.verdict","phase":"check","agent":"guardian","data":{"archetype":"guardian","verdict":"needs_changes","findings":[]}}
|
||||
{"ts":"2026-04-03T10:07:00Z","run_id":"replay-run","seq":4,"parent":[1],"type":"review.verdict","phase":"check","agent":"sage","data":{"archetype":"sage","verdict":"approved","findings":[]}}
|
||||
{"ts":"2026-04-03T10:08:00Z","run_id":"replay-run","seq":5,"parent":[1],"type":"run.complete","phase":"act","agent":null,"data":{"agents_total":2,"fixes_total":0}}
|
||||
EVENTS
|
||||
}
|
||||
|
||||
@test "replay: usage without args" {
|
||||
run "$LIB_DIR/archeflow-replay.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "replay: timeline shows decision.point" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" timeline replay-run
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"decision.point"* ]]
|
||||
[[ "$output" == *"guardian"* ]]
|
||||
[[ "$output" == *"needs_changes"* ]]
|
||||
}
|
||||
|
||||
@test "replay: whatif strict blocks when any reviewer blocks" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" whatif replay-run
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"BLOCK"* ]]
|
||||
}
|
||||
|
||||
@test "replay: whatif weighted can ship when blocker is down-weighted" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" whatif replay-run --weights guardian=0.2,sage=3
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Weighted replay"* ]] || [[ "$output" == *"SHIP"* ]]
|
||||
[[ "$output" == *"SHIP"* ]]
|
||||
}
|
||||
|
||||
@test "replay: whatif --json is valid JSON" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" whatif replay-run --json
|
||||
[ "$status" -eq 0 ]
|
||||
echo "$output" | jq -e '.run_id == "replay-run"' >/dev/null
|
||||
}
|
||||
|
||||
@test "replay: compare includes timeline and whatif" {
|
||||
run "$LIB_DIR/archeflow-replay.sh" compare replay-run
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Decision timeline"* ]]
|
||||
[[ "$output" == *"What-if replay"* ]]
|
||||
}
|
||||
|
||||
@test "decision: logs decision.point via wrapper" {
|
||||
run "$LIB_DIR/archeflow-decision.sh" replay-run check trickster 'diff only' 'edge_case' 0.61 1
|
||||
[ "$status" -eq 0 ]
|
||||
last=$(jq -r 'select(.type=="decision.point") | .data.decision' ".archeflow/events/replay-run.jsonl" | tail -1)
|
||||
[ "$last" = "edge_case" ]
|
||||
}
|
||||
80
tests/archeflow-report.bats
Normal file
80
tests/archeflow-report.bats
Normal file
@@ -0,0 +1,80 @@
|
||||
# Tests for archeflow-report.sh — Markdown process report generation from JSONL events.
|
||||
#
|
||||
# Validates: report output format, summary mode, missing file handling, jq dependency check.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
|
||||
# Create a standard events file used by multiple tests
|
||||
mkdir -p .archeflow/events
|
||||
cat > "$BATS_TEST_TMPDIR/events.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"test-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Write unit tests","workflow":"standard","team":"default"}}
|
||||
{"ts":"2026-04-03T10:01:00Z","run_id":"test-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500,"summary":"Designed test structure"}}
|
||||
{"ts":"2026-04-03T10:02:00Z","run_id":"test-run","seq":3,"parent":[2],"type":"phase.transition","phase":"do","agent":null,"data":{"from":"plan","to":"do"}}
|
||||
{"ts":"2026-04-03T10:05:00Z","run_id":"test-run","seq":4,"parent":[3],"type":"agent.complete","phase":"do","agent":"maker","data":{"archetype":"maker","duration_ms":180000,"tokens":3000,"summary":"Implemented tests"}}
|
||||
{"ts":"2026-04-03T10:06:00Z","run_id":"test-run","seq":5,"parent":[4],"type":"phase.transition","phase":"check","agent":null,"data":{"from":"do","to":"check"}}
|
||||
{"ts":"2026-04-03T10:07:00Z","run_id":"test-run","seq":6,"parent":[5],"type":"review.verdict","phase":"check","agent":"guardian","data":{"archetype":"guardian","verdict":"approved","findings":[]}}
|
||||
{"ts":"2026-04-03T10:08:00Z","run_id":"test-run","seq":7,"parent":[6],"type":"run.complete","phase":"act","agent":null,"data":{"status":"completed","cycles":1,"agents_total":3,"fixes_total":0,"duration_ms":480000}}
|
||||
EVENTS
|
||||
}
|
||||
|
||||
@test "report: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-report.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "report: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-report.sh" nonexistent.jsonl
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "report: full mode produces markdown with header and overview" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"# Process Report: Write unit tests"* ]]
|
||||
[[ "$output" == *"test-run"* ]]
|
||||
[[ "$output" == *"Overview"* ]]
|
||||
[[ "$output" == *"Status"* ]]
|
||||
[[ "$output" == *"completed"* ]]
|
||||
}
|
||||
|
||||
@test "report: full mode includes phase sections" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"PLAN"* ]]
|
||||
[[ "$output" == *"DO"* ]]
|
||||
[[ "$output" == *"CHECK"* ]]
|
||||
}
|
||||
|
||||
@test "report: summary mode outputs one-line summary" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl" --summary
|
||||
[ "$status" -eq 0 ]
|
||||
# Should be a single logical line with key stats
|
||||
[[ "$output" == *"[completed]"* ]]
|
||||
[[ "$output" == *"Write unit tests"* ]]
|
||||
[[ "$output" == *"1 cycles"* ]]
|
||||
[[ "$output" == *"test-run"* ]]
|
||||
}
|
||||
|
||||
@test "report: --output writes to file instead of stdout" {
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/events.jsonl" --output "$BATS_TEST_TMPDIR/report.md"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f "$BATS_TEST_TMPDIR/report.md" ]
|
||||
local content
|
||||
content=$(cat "$BATS_TEST_TMPDIR/report.md")
|
||||
[[ "$content" == *"# Process Report"* ]]
|
||||
}
|
||||
|
||||
@test "report: summary for in-progress run shows [in-progress]" {
|
||||
# Events file without run.complete
|
||||
cat > "$BATS_TEST_TMPDIR/in-progress.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"wip-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"WIP task","workflow":"fast","team":"default"}}
|
||||
EVENTS
|
||||
run "$LIB_DIR/archeflow-report.sh" "$BATS_TEST_TMPDIR/in-progress.jsonl" --summary
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"[in-progress]"* ]]
|
||||
[[ "$output" == *"WIP task"* ]]
|
||||
}
|
||||
82
tests/archeflow-review.bats
Normal file
82
tests/archeflow-review.bats
Normal file
@@ -0,0 +1,82 @@
|
||||
# Tests for archeflow-review.sh — git diff extraction for code review.
|
||||
#
|
||||
# Validates: argument parsing, diff modes, stats output, empty diff handling.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
}
|
||||
|
||||
teardown() {
|
||||
_common_teardown
|
||||
}
|
||||
|
||||
@test "review: --help shows usage" {
|
||||
run "$LIB_DIR/archeflow-review.sh" --help
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
[[ "$output" == *"--branch"* ]]
|
||||
[[ "$output" == *"--commit"* ]]
|
||||
}
|
||||
|
||||
@test "review: exits 1 when no changes to review" {
|
||||
run "$LIB_DIR/archeflow-review.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"No changes"* ]]
|
||||
}
|
||||
|
||||
@test "review: shows diff for uncommitted changes" {
|
||||
echo "new content" > testfile.txt
|
||||
git add testfile.txt
|
||||
run "$LIB_DIR/archeflow-review.sh"
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"testfile.txt"* ]]
|
||||
}
|
||||
|
||||
@test "review: --stat-only prints stats without diff content" {
|
||||
echo "stat content" > statfile.txt
|
||||
git add statfile.txt
|
||||
run "$LIB_DIR/archeflow-review.sh" --stat-only
|
||||
[ "$status" -eq 0 ]
|
||||
# stderr has stats, stdout should be empty (no diff)
|
||||
# But run captures both, so just check it ran ok
|
||||
[[ "$output" == *"Review Stats"* ]]
|
||||
}
|
||||
|
||||
@test "review: --branch fails for nonexistent branch" {
|
||||
run "$LIB_DIR/archeflow-review.sh" --branch nonexistent-branch-xyz
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "review: rejects unknown arguments" {
|
||||
run "$LIB_DIR/archeflow-review.sh" --unknown
|
||||
[ "$status" -ne 0 ]
|
||||
[[ "$output" == *"Unknown argument"* ]]
|
||||
}
|
||||
|
||||
@test "review: --branch shows diff against base" {
|
||||
# Create a feature branch with changes
|
||||
git checkout -b feat/test-review --quiet
|
||||
echo "feature" > feature.txt
|
||||
git add feature.txt
|
||||
git commit -m "feat: add feature" --quiet
|
||||
git checkout main --quiet
|
||||
|
||||
run "$LIB_DIR/archeflow-review.sh" --branch feat/test-review
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"feature.txt"* ]]
|
||||
}
|
||||
|
||||
@test "review: --commit shows diff for commit range" {
|
||||
echo "first" > first.txt
|
||||
git add first.txt
|
||||
git commit -m "first" --quiet
|
||||
echo "second" > second.txt
|
||||
git add second.txt
|
||||
git commit -m "second" --quiet
|
||||
|
||||
run "$LIB_DIR/archeflow-review.sh" --commit HEAD~1..HEAD
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"second.txt"* ]]
|
||||
}
|
||||
58
tests/archeflow-rollback.bats
Normal file
58
tests/archeflow-rollback.bats
Normal file
@@ -0,0 +1,58 @@
|
||||
# Tests for archeflow-rollback.sh — post-merge test and phase rollback.
|
||||
#
|
||||
# Validates: argument parsing, mutual exclusivity, phase validation, test-cmd config reading.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
}
|
||||
|
||||
teardown() {
|
||||
_common_teardown
|
||||
}
|
||||
|
||||
@test "rollback: exits with error when called with no args" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh"
|
||||
[ "$status" -ne 0 ]
|
||||
}
|
||||
|
||||
@test "rollback: rejects mutually exclusive --to and --test-cmd" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --to plan --test-cmd "true"
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"mutually exclusive"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: rejects invalid phase names" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --to invalid-phase
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"Invalid phase"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: accepts valid phase names (plan, do, check)" {
|
||||
# This will fail because no git branch exists, but should NOT fail on phase validation
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --to plan
|
||||
# Should fail later (archeflow-git.sh rollback) not on phase validation
|
||||
[[ "$output" != *"Invalid phase"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: exits 2 when no test command available" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"No test command"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: reads test_command from config.yaml" {
|
||||
mkdir -p .archeflow
|
||||
echo 'test_command: "echo ok"' > .archeflow/config.yaml
|
||||
# HEAD won't have archeflow in its message, but the script just warns and proceeds
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run
|
||||
# It should pick up the command and try to run it (test should pass -> exit 0)
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Tests passed"* ]]
|
||||
}
|
||||
|
||||
@test "rollback: rejects unknown options" {
|
||||
run "$LIB_DIR/archeflow-rollback.sh" test-run --unknown-flag
|
||||
[ "$status" -eq 2 ]
|
||||
[[ "$output" == *"Unknown option"* ]]
|
||||
}
|
||||
105
tests/archeflow-score.bats
Normal file
105
tests/archeflow-score.bats
Normal file
@@ -0,0 +1,105 @@
|
||||
# Tests for archeflow-score.sh — archetype effectiveness scoring.
|
||||
#
|
||||
# Validates: score extraction from events, report generation, input validation.
|
||||
|
||||
setup() {
|
||||
load test_helper
|
||||
_common_setup
|
||||
|
||||
# Create a complete run events file with review data
|
||||
mkdir -p .archeflow/events .archeflow/memory
|
||||
cat > "$BATS_TEST_TMPDIR/scored-events.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"score-run","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Score test"}}
|
||||
{"ts":"2026-04-03T10:01:00Z","run_id":"score-run","seq":2,"parent":[1],"type":"agent.complete","phase":"plan","agent":"creator","data":{"archetype":"creator","duration_ms":60000,"tokens":1500,"estimated_cost_usd":0.02}}
|
||||
{"ts":"2026-04-03T10:02:00Z","run_id":"score-run","seq":3,"parent":[2],"type":"agent.complete","phase":"do","agent":"maker","data":{"archetype":"maker","duration_ms":120000,"tokens":3000,"estimated_cost_usd":0.05}}
|
||||
{"ts":"2026-04-03T10:03:00Z","run_id":"score-run","seq":4,"parent":[3],"type":"review.verdict","phase":"check","agent":"guardian","data":{"archetype":"guardian","verdict":"needs_changes","findings":[{"severity":"warning","description":"Missing validation","fix_required":true},{"severity":"info","description":"Consider logging","fix_required":false}]}}
|
||||
{"ts":"2026-04-03T10:03:30Z","run_id":"score-run","seq":5,"parent":[3],"type":"review.verdict","phase":"check","agent":"sage","data":{"archetype":"sage","verdict":"approved","findings":[]}}
|
||||
{"ts":"2026-04-03T10:04:00Z","run_id":"score-run","seq":6,"parent":[4],"type":"fix.applied","phase":"act","agent":null,"data":{"source":"guardian","finding":"Missing validation"}}
|
||||
{"ts":"2026-04-03T10:05:00Z","run_id":"score-run","seq":7,"parent":[6],"type":"cycle.boundary","phase":"act","agent":null,"data":{"cycle":1,"max_cycles":3,"met":true,"next_action":"merge"}}
|
||||
{"ts":"2026-04-03T10:06:00Z","run_id":"score-run","seq":8,"parent":[7],"type":"run.complete","phase":"act","agent":null,"data":{"status":"completed","cycles":1,"agents_total":4,"fixes_total":1}}
|
||||
EVENTS
|
||||
}
|
||||
|
||||
@test "score: exits 1 with usage when called with no args" {
|
||||
run "$LIB_DIR/archeflow-score.sh"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Usage"* ]]
|
||||
}
|
||||
|
||||
@test "score: exits 1 for unknown command" {
|
||||
run "$LIB_DIR/archeflow-score.sh" nonexistent
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"Unknown command"* ]]
|
||||
}
|
||||
|
||||
@test "score extract: exits 1 when events file not found" {
|
||||
run "$LIB_DIR/archeflow-score.sh" extract nonexistent.jsonl
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"not found"* ]]
|
||||
}
|
||||
|
||||
@test "score extract: exits 1 for incomplete run (no run.complete)" {
|
||||
cat > "$BATS_TEST_TMPDIR/incomplete.jsonl" <<'EVENTS'
|
||||
{"ts":"2026-04-03T10:00:00Z","run_id":"incomplete","seq":1,"parent":[],"type":"run.start","phase":"plan","agent":null,"data":{"task":"Incomplete"}}
|
||||
EVENTS
|
||||
run "$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/incomplete.jsonl"
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"run.complete"* ]]
|
||||
}
|
||||
|
||||
@test "score extract: creates effectiveness.jsonl with archetype scores" {
|
||||
run "$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl"
|
||||
[ "$status" -eq 0 ]
|
||||
[ -f ".archeflow/memory/effectiveness.jsonl" ]
|
||||
|
||||
# Should have scores for guardian and sage (the reviewers)
|
||||
local guardian_score
|
||||
guardian_score=$(grep '"guardian"' ".archeflow/memory/effectiveness.jsonl" | head -1)
|
||||
[ -n "$guardian_score" ]
|
||||
|
||||
# Verify JSONL is valid
|
||||
while IFS= read -r line; do
|
||||
echo "$line" | jq empty
|
||||
done < ".archeflow/memory/effectiveness.jsonl"
|
||||
}
|
||||
|
||||
@test "score extract: guardian has correct finding counts" {
|
||||
"$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl" 2>/dev/null
|
||||
local guardian
|
||||
guardian=$(grep '"guardian"' ".archeflow/memory/effectiveness.jsonl" | head -1)
|
||||
local total_findings
|
||||
total_findings=$(echo "$guardian" | jq '.findings_total')
|
||||
[ "$total_findings" -eq 2 ]
|
||||
local useful_findings
|
||||
useful_findings=$(echo "$guardian" | jq '.findings_useful')
|
||||
[ "$useful_findings" -eq 1 ]
|
||||
local fixes
|
||||
fixes=$(echo "$guardian" | jq '.fixes_applied')
|
||||
[ "$fixes" -eq 1 ]
|
||||
}
|
||||
|
||||
@test "score extract: composite score is between 0 and 1" {
|
||||
"$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl" 2>/dev/null
|
||||
while IFS= read -r line; do
|
||||
local score
|
||||
score=$(echo "$line" | jq '.composite_score')
|
||||
# score >= 0 and score <= 1
|
||||
[ "$(echo "$score >= 0" | bc)" -eq 1 ]
|
||||
[ "$(echo "$score <= 1" | bc)" -eq 1 ]
|
||||
done < ".archeflow/memory/effectiveness.jsonl"
|
||||
}
|
||||
|
||||
@test "score report: exits 1 when no effectiveness data" {
|
||||
run "$LIB_DIR/archeflow-score.sh" report
|
||||
[ "$status" -eq 1 ]
|
||||
[[ "$output" == *"No effectiveness data"* ]]
|
||||
}
|
||||
|
||||
@test "score report: outputs markdown table with archetype data" {
|
||||
"$LIB_DIR/archeflow-score.sh" extract "$BATS_TEST_TMPDIR/scored-events.jsonl" 2>/dev/null
|
||||
run "$LIB_DIR/archeflow-score.sh" report
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"Archetype Effectiveness Report"* ]]
|
||||
[[ "$output" == *"Archetype"* ]]
|
||||
[[ "$output" == *"guardian"* ]]
|
||||
}
|
||||
40
tests/test_helper.bash
Normal file
40
tests/test_helper.bash
Normal file
@@ -0,0 +1,40 @@
|
||||
# test_helper.bash — Shared setup/teardown for ArcheFlow bats tests.
#
# Usage in .bats files:
#   setup() { load test_helper; _common_setup; }
#   teardown() { _common_teardown; }
#
# Provides:
#   - BATS_TEST_TMPDIR: unique temp directory per test
#   - Mock .archeflow/ structure via a git repo
#   - LIB_DIR: path to the lib/ scripts under test

LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../lib" && pwd)"

#######################################
# Create a fresh temp dir with an initialized git repo and cd into it.
# Globals:  BATS_TEST_TMPDIR (written, exported)
# Returns:  non-zero if the temp dir cannot be created/entered
#######################################
_common_setup() {
  # Create a unique temp directory for this test
  BATS_TEST_TMPDIR="$(mktemp -d)"
  export BATS_TEST_TMPDIR

  # Work inside the temp dir so scripts create .archeflow/ there
  cd "$BATS_TEST_TMPDIR" || return 1

  # Initialize a minimal git repo (many scripts need it)
  git init --quiet
  # Pin the initial branch name to "main" regardless of the host's git
  # version / init.defaultBranch (git init -b requires git >= 2.28;
  # symbolic-ref works everywhere, and some tests reference "main").
  git symbolic-ref HEAD refs/heads/main
  git config user.email "test@test.com"
  git config user.name "Test User"
  # Disable commit signing in tests (global config may have it enabled)
  git config commit.gpgsign false
  git config tag.gpgsign false

  # Create an initial commit so HEAD exists
  echo "init" > README.md
  git add README.md
  git commit -m "init" --quiet
}

#######################################
# Leave the temp dir and remove it.
# Globals:  BATS_TEST_TMPDIR (read; must be set and non-empty)
#######################################
_common_teardown() {
  # Return to a safe directory before cleanup
  cd /tmp || return 1
  # ${VAR:?} aborts if the variable is unset/empty, guarding against an
  # accidental `rm -rf` of the wrong path; `--` protects odd dir names.
  rm -rf -- "${BATS_TEST_TMPDIR:?}"
}
|
||||
Reference in New Issue
Block a user