feat: add progress and effectiveness scoring scripts
This commit is contained in:
333
lib/archeflow-progress.sh
Executable file
333
lib/archeflow-progress.sh
Executable file
@@ -0,0 +1,333 @@
|
|||||||
|
#!/usr/bin/env bash
# archeflow-progress.sh — Render a progress snapshot from ArcheFlow JSONL events.
#
# Usage:
#   archeflow-progress.sh <run_id>            Write .archeflow/progress.md and print it
#   archeflow-progress.sh <run_id> --watch    Redraw the snapshot every 2 seconds
#   archeflow-progress.sh <run_id> --json     Print the snapshot as JSON (for dashboards)
#
# Input is .archeflow/events/<run_id>.jsonl. The script is meant to be run after
# each archeflow-event.sh invocation, or left running in a second terminal.
#
# Requires: jq

set -euo pipefail

# The run id is mandatory and must be the first argument.
if (( $# < 1 )); then
  printf 'Usage: %s <run_id> [--watch] [--json]\n' "$0" >&2
  exit 1
fi

RUN_ID="$1"
shift

# Output mode: default | watch | json. When several flags are given the last wins.
MODE="default"
for flag in "$@"; do
  case "$flag" in
    --watch)
      MODE="watch"
      ;;
    --json)
      MODE="json"
      ;;
    *)
      echo "Unknown flag: $flag" >&2
      exit 1
      ;;
  esac
done
set --

# All paths are relative to the current working directory.
EVENTS_DIR=".archeflow/events"
EVENT_FILE="${EVENTS_DIR}/${RUN_ID}.jsonl"
PROGRESS_FILE=".archeflow/progress.md"

# Every mode parses the event log with jq, so fail early when it is missing.
command -v jq > /dev/null 2>&1 || {
  echo "Error: jq is required but not installed." >&2
  exit 1
}
# --- Core: generate progress from current JSONL state ---

#######################################
# Summarise the run's event stream as one JSON object on stdout.
# This object is the single source of truth: the markdown and watch views are
# derived from it.
# Globals:
#   EVENT_FILE (read) - path of the JSONL event log to summarise
#   RUN_ID     (read) - echoed back in the error object when the log is missing
# Outputs:
#   The progress object, or {"error":...,"run_id":...} when EVENT_FILE is absent.
# Returns:
#   1 when EVENT_FILE does not exist, otherwise the exit status of jq.
#######################################
generate_progress_json() {
  if [[ ! -f "$EVENT_FILE" ]]; then
    # Let jq do the quoting so a run id containing quotes or backslashes
    # still produces valid JSON.
    jq -cn --arg run_id "$RUN_ID" \
      '{error: "Event file not found", run_id: $run_id}'
    return 1
  fi

  # NOTE: the jq program lives inside single quotes, so its comments must not
  # contain apostrophes.
  jq -s '
    # Run metadata: first event plus the first run.start / run.complete events
    (.[0] // {}) as $first |
    ([.[] | select(.type == "run.start")] | first // {}) as $run_start_evt |
    ($run_start_evt.data // {}) as $run_data |
    ($run_start_evt.ts // "") as $start_ts |
    ([.[] | select(.type == "run.complete")] | first // null) as $run_complete |

    # Current phase: last non-empty phase seen on any event
    (map(.phase) | map(select(. != null and . != "")) | last // "unknown") as $current_phase |

    length as $total_events |
    (last // {}) as $latest |

    # Completed agents: one entry per agent.complete event
    [.[] | select(.type == "agent.complete") | {
      agent: (.data.archetype // .agent // "unknown"),
      phase: .phase,
      duration_s: ((.data.duration_ms // 0) / 1000 | floor),
      tokens: (.data.tokens // ((.data.tokens_input // 0) + (.data.tokens_output // 0))),
      cost_usd: (.data.estimated_cost_usd // .data.cost_usd // 0),
      seq: .seq
    }] as $completed |

    # Started agents, in event order
    [.[] | select(.type == "agent.start") | {
      agent: (.data.archetype // .agent // "unknown"),
      phase: .phase,
      start_ts: .ts,
      seq: .seq
    }] as $starts |

    # Running agents: each agent.start consumes one agent.complete of the same
    # agent; starts left without a completion are still running. Pairing by
    # count (not by name alone) keeps an agent that is started again in a later
    # cycle visible as running.
    (reduce $starts[] as $s (
        {pending: ($completed | map(.agent)), running: []};
        (.pending | index($s.agent)) as $hit |
        if $hit == null then .running += [$s]
        else .pending |= (.[:$hit] + .[($hit + 1):])
        end
      ) | .running) as $running |

    # Phase transitions
    [.[] | select(.type == "phase.transition") | {
      from: (.data.from // "?"),
      to: (.data.to // "?"),
      seq: .seq
    }] as $transitions |

    # Review verdicts
    [.[] | select(.type == "review.verdict") | {
      agent: (.data.archetype // .agent // "unknown"),
      verdict: (.data.verdict // "unknown"),
      findings_count: ((.data.findings // []) | length),
      seq: .seq
    }] as $verdicts |

    # Fixes (parenthesised so the binding does not change the input of the
    # remaining program)
    ([.[] | select(.type == "fix.applied")] | length) as $fixes_count |

    # Budget used: sum of the costs reported by agent.complete events
    ($completed | map(.cost_usd) | add // 0) as $budget_used |

    # Budget limit, if the run.start event carries one
    ($run_data.config.budget_usd // $run_data.budget_usd // null) as $budget_total |

    # Status: completed > running > idle
    (if $run_complete != null then "completed"
     elif ($running | length) > 0 then "running"
     else "idle"
     end) as $status |

    # Active agent: the earliest start that is still running
    (if ($running | length) > 0 then ($running[0].agent) else null end) as $active_agent |

    {
      run_id: ($first.run_id // "unknown"),
      task: ($run_data.task // "unknown"),
      workflow: ($run_data.workflow // "unknown"),
      status: $status,
      phase: $current_phase,
      active_agent: $active_agent,
      start_ts: $start_ts,
      budget_used_usd: $budget_used,
      budget_total_usd: $budget_total,
      budget_percent: (if $budget_total != null and $budget_total > 0 then
                         (($budget_used / $budget_total * 100) | floor)
                       else null end),
      completed: $completed,
      running: $running,
      transitions: $transitions,
      verdicts: $verdicts,
      fixes_count: $fixes_count,
      latest_event: {
        seq: ($latest.seq // 0),
        type: ($latest.type // "unknown"),
        agent: ($latest.agent // null),
        phase: ($latest.phase // "unknown"),
        ts: ($latest.ts // "")
      },
      total_events: $total_events
    }
  ' "$EVENT_FILE"
}
generate_progress_markdown() {
|
||||||
|
local progress_json
|
||||||
|
progress_json=$(generate_progress_json)
|
||||||
|
|
||||||
|
if echo "$progress_json" | jq -e '.error' > /dev/null 2>&1; then
|
||||||
|
echo "Error: $(echo "$progress_json" | jq -r '.error')"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Extract fields for the markdown template
|
||||||
|
local run_id task workflow status phase active_agent start_ts
|
||||||
|
local budget_used budget_total budget_percent total_events
|
||||||
|
|
||||||
|
run_id=$(echo "$progress_json" | jq -r '.run_id')
|
||||||
|
task=$(echo "$progress_json" | jq -r '.task')
|
||||||
|
workflow=$(echo "$progress_json" | jq -r '.workflow')
|
||||||
|
status=$(echo "$progress_json" | jq -r '.status')
|
||||||
|
phase=$(echo "$progress_json" | jq -r '.phase')
|
||||||
|
active_agent=$(echo "$progress_json" | jq -r '.active_agent // "none"')
|
||||||
|
start_ts=$(echo "$progress_json" | jq -r '.start_ts')
|
||||||
|
budget_used=$(echo "$progress_json" | jq -r '.budget_used_usd')
|
||||||
|
budget_total=$(echo "$progress_json" | jq -r '.budget_total_usd')
|
||||||
|
budget_percent=$(echo "$progress_json" | jq -r '.budget_percent')
|
||||||
|
total_events=$(echo "$progress_json" | jq -r '.total_events')
|
||||||
|
|
||||||
|
# Calculate elapsed time
|
||||||
|
local elapsed_display="n/a"
|
||||||
|
if [[ -n "$start_ts" && "$start_ts" != "null" ]]; then
|
||||||
|
local start_epoch now_epoch elapsed_s elapsed_min
|
||||||
|
start_epoch=$(date -d "$start_ts" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$start_ts" +%s 2>/dev/null || echo 0)
|
||||||
|
now_epoch=$(date +%s)
|
||||||
|
if [[ "$start_epoch" -gt 0 ]]; then
|
||||||
|
elapsed_s=$(( now_epoch - start_epoch ))
|
||||||
|
elapsed_min=$(( elapsed_s / 60 ))
|
||||||
|
if [[ $elapsed_min -gt 0 ]]; then
|
||||||
|
elapsed_display="${elapsed_min} min"
|
||||||
|
else
|
||||||
|
elapsed_display="${elapsed_s}s"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Status line
|
||||||
|
local phase_upper
|
||||||
|
phase_upper=$(echo "$phase" | tr '[:lower:]' '[:upper:]')
|
||||||
|
local status_line="${phase_upper} phase"
|
||||||
|
if [[ "$active_agent" != "none" && "$active_agent" != "null" ]]; then
|
||||||
|
status_line="${status_line} — ${active_agent} running"
|
||||||
|
fi
|
||||||
|
if [[ "$status" == "completed" ]]; then
|
||||||
|
status_line="Completed"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Budget line
|
||||||
|
local budget_line
|
||||||
|
if [[ "$budget_total" != "null" && "$budget_total" != "0" ]]; then
|
||||||
|
budget_line="\$${budget_used} / \$${budget_total} (${budget_percent}%)"
|
||||||
|
else
|
||||||
|
budget_line="\$${budget_used} (no budget set)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Start time display (HH:MM)
|
||||||
|
local start_display="n/a"
|
||||||
|
if [[ -n "$start_ts" && "$start_ts" != "null" ]]; then
|
||||||
|
start_display=$(echo "$start_ts" | grep -oP '\d{2}:\d{2}' | head -1 || echo "$start_ts")
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Header
|
||||||
|
cat <<EOF
|
||||||
|
# ArcheFlow Run: ${run_id}
|
||||||
|
**Status:** ${status_line}
|
||||||
|
**Started:** ${start_display} | **Elapsed:** ${elapsed_display}
|
||||||
|
**Budget:** ${budget_line}
|
||||||
|
|
||||||
|
## Progress
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Build checklist from completed agents, transitions, verdicts, and running agents
|
||||||
|
# Order: by seq number (chronological)
|
||||||
|
|
||||||
|
# Completed agents
|
||||||
|
echo "$progress_json" | jq -r '
|
||||||
|
# Build sorted event list for the checklist
|
||||||
|
(
|
||||||
|
[.completed[] | {
|
||||||
|
seq: .seq,
|
||||||
|
line: ("- [x] " + (.phase | ascii_upcase) + ": " + .agent +
|
||||||
|
" (" + (.duration_s | tostring) + "s, " +
|
||||||
|
(if .tokens > 0 then ((.tokens / 1000 | floor | tostring) + "k tok, ") else "" end) +
|
||||||
|
"$" + (.cost_usd | tostring) + ")")
|
||||||
|
}] +
|
||||||
|
[.transitions[] | {
|
||||||
|
seq: .seq,
|
||||||
|
line: ("- [x] " + (.from | ascii_upcase) + " -> " + (.to | ascii_upcase) + " transition")
|
||||||
|
}] +
|
||||||
|
[.verdicts[] | {
|
||||||
|
seq: .seq,
|
||||||
|
line: ("- [x] CHECK: " + .agent + " -> " + (.verdict | ascii_upcase | gsub("_"; " ")) +
|
||||||
|
(if .findings_count > 0 then " (" + (.findings_count | tostring) + " findings)" else "" end))
|
||||||
|
}] +
|
||||||
|
[.running[] | {
|
||||||
|
seq: .seq,
|
||||||
|
line: ("- [ ] **" + (.phase | ascii_upcase) + ": " + .agent + "** <- running")
|
||||||
|
}]
|
||||||
|
) | sort_by(.seq) | .[].line
|
||||||
|
'
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Latest event
|
||||||
|
local latest_seq latest_type latest_agent latest_phase latest_ts
|
||||||
|
latest_seq=$(echo "$progress_json" | jq -r '.latest_event.seq')
|
||||||
|
latest_type=$(echo "$progress_json" | jq -r '.latest_event.type')
|
||||||
|
latest_agent=$(echo "$progress_json" | jq -r '.latest_event.agent // "_"')
|
||||||
|
latest_phase=$(echo "$progress_json" | jq -r '.latest_event.phase')
|
||||||
|
latest_ts=$(echo "$progress_json" | jq -r '.latest_event.ts')
|
||||||
|
local latest_time
|
||||||
|
latest_time=$(echo "$latest_ts" | grep -oP '\d{2}:\d{2}' | head -1 || echo "$latest_ts")
|
||||||
|
|
||||||
|
echo "## Latest Event"
|
||||||
|
if [[ "$latest_agent" != "null" && "$latest_agent" != "_" ]]; then
|
||||||
|
echo "#${latest_seq} ${latest_type} — ${latest_agent} (${latest_phase}) — ${latest_time}"
|
||||||
|
else
|
||||||
|
echo "#${latest_seq} ${latest_type} (${latest_phase}) — ${latest_time}"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# DAG (delegate to archeflow-dag.sh if available)
|
||||||
|
local script_dir
|
||||||
|
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
if [[ -x "${script_dir}/archeflow-dag.sh" && -f "$EVENT_FILE" ]]; then
|
||||||
|
echo "## DAG"
|
||||||
|
"${script_dir}/archeflow-dag.sh" "$EVENT_FILE" --no-color
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Mode dispatch ---
#
# json    - print the progress object and exit with generate_progress_json's status
# watch   - redraw the markdown view every 2 seconds until a run.complete event appears
# default - write the markdown view to PROGRESS_FILE and echo it to stdout

case "$MODE" in
  json)
    generate_progress_json
    ;;

  watch)
    while true; do
      # clear fails when there is no usable TERM; under set -e that must not
      # end the watch.
      clear 2> /dev/null || true
      if [[ -f "$EVENT_FILE" ]]; then
        # The log can be read while a writer is mid-append, which makes jq
        # fail on the partial line. Treat a failed render as transient and try
        # again on the next tick instead of letting set -e kill the loop.
        generate_progress_markdown \
          || echo "[archeflow-progress] Could not render progress; retrying..." >&2
        # Check if run is complete
        if jq -e 'select(.type == "run.complete")' "$EVENT_FILE" > /dev/null 2>&1; then
          echo ""
          echo "--- Run complete. Exiting watch mode. ---"
          exit 0
        fi
      else
        echo "Waiting for events: ${EVENT_FILE}"
      fi
      sleep 2
    done
    ;;

  default)
    if [[ ! -f "$EVENT_FILE" ]]; then
      echo "Error: Event file not found: $EVENT_FILE" >&2
      exit 1
    fi
    mkdir -p "$(dirname "$PROGRESS_FILE")"
    # Render once, then write the same text to the file and to stdout. A failed
    # render aborts here (set -e) before PROGRESS_FILE is touched.
    output=$(generate_progress_markdown)
    printf '%s\n' "$output" > "$PROGRESS_FILE"
    printf '%s\n' "$output"
    echo "[archeflow-progress] Updated ${PROGRESS_FILE}" >&2
    ;;
esac
368
lib/archeflow-score.sh
Executable file
368
lib/archeflow-score.sh
Executable file
@@ -0,0 +1,368 @@
|
|||||||
|
#!/usr/bin/env bash
# archeflow-score.sh — Archetype effectiveness scoring for ArcheFlow orchestrations.
#
# Usage:
#   archeflow-score.sh extract <events.jsonl>   Score archetypes from a completed run
#   archeflow-score.sh report                   Show aggregate effectiveness report
#   archeflow-score.sh recommend <team.yaml>    Recommend model tiers for a team
#
# Review archetypes (Guardian, Sage, Skeptic, Trickster, etc.) are scored on
# signal-to-noise, fix rate, cost efficiency, accuracy, and cycle impact.
# Per-run scores are appended to .archeflow/memory/effectiveness.jsonl; the
# report and recommend commands aggregate that file.
#
# Requires: jq

set -euo pipefail

# A command word is mandatory; print the command summary when it is missing.
if (( $# < 1 )); then
  cat >&2 <<EOF
Usage: $0 <command> [args...]

Commands:
 extract <events.jsonl> Score archetypes from a completed run
 report Show aggregate effectiveness report
 recommend <team.yaml> Recommend model tiers for a team
EOF
  exit 1
fi

COMMAND="$1"
shift

# Every command relies on jq, so bail out before doing any work without it.
command -v jq > /dev/null 2>&1 || {
  echo "Error: jq is required but not installed." >&2
  exit 1
}

# Score storage, relative to the current working directory.
MEMORY_DIR=".archeflow/memory"
EFFECTIVENESS_FILE="${MEMORY_DIR}/effectiveness.jsonl"
# --- extract: score archetypes from a completed run ---
|
||||||
|
|
||||||
|
cmd_extract() {
|
||||||
|
local event_file="${1:?Usage: $0 extract <events.jsonl>}"
|
||||||
|
|
||||||
|
if [[ ! -f "$event_file" ]]; then
|
||||||
|
echo "Error: Event file not found: $event_file" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Verify run is complete
|
||||||
|
if ! jq -e 'select(.type == "run.complete")' "$event_file" > /dev/null 2>&1; then
|
||||||
|
echo "Error: No run.complete event found. Scoring incomplete runs is unreliable." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p "$MEMORY_DIR"
|
||||||
|
|
||||||
|
# Extract run metadata
|
||||||
|
local run_id
|
||||||
|
run_id=$(jq -r 'select(.type == "run.start") | .run_id' "$event_file" | head -1)
|
||||||
|
local ts
|
||||||
|
ts=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||||
|
|
||||||
|
# Score each review archetype using jq
|
||||||
|
# This processes all events in a single jq pass for efficiency
|
||||||
|
jq -sc --arg run_id "$run_id" --arg ts "$ts" '
|
||||||
|
|
||||||
|
# Collect review verdicts
|
||||||
|
[.[] | select(.type == "review.verdict")] as $verdicts |
|
||||||
|
|
||||||
|
# Collect fixes
|
||||||
|
[.[] | select(.type == "fix.applied")] as $fixes |
|
||||||
|
|
||||||
|
# Collect agent.complete for cost data
|
||||||
|
[.[] | select(.type == "agent.complete")] as $completions |
|
||||||
|
|
||||||
|
# Collect cycle boundaries
|
||||||
|
[.[] | select(.type == "cycle.boundary")] as $cycles |
|
||||||
|
|
||||||
|
# Final cycle exit status
|
||||||
|
($cycles | last // {data:{}}) as $final_cycle |
|
||||||
|
($final_cycle.data.met // false) as $cycle_exited |
|
||||||
|
|
||||||
|
# Get unique review archetypes
|
||||||
|
[$verdicts[] | (.data.archetype // .agent // "unknown")] | unique | .[] |
|
||||||
|
|
||||||
|
. as $arch |
|
||||||
|
|
||||||
|
# This archetype verdicts
|
||||||
|
[$verdicts[] | select((.data.archetype // .agent) == $arch)] as $arch_verdicts |
|
||||||
|
|
||||||
|
# All findings from this archetype
|
||||||
|
[$arch_verdicts[] | .data.findings // [] | .[]] as $all_findings |
|
||||||
|
($all_findings | length) as $total_findings |
|
||||||
|
|
||||||
|
# Useful findings: severity >= WARNING and fix_required
|
||||||
|
[$all_findings[] | select(
|
||||||
|
(.severity == "warning" or .severity == "bug" or .severity == "critical") and
|
||||||
|
(.fix_required == true)
|
||||||
|
)] as $useful_findings |
|
||||||
|
($useful_findings | length) as $useful_count |
|
||||||
|
|
||||||
|
# Signal-to-noise
|
||||||
|
(if $total_findings > 0 then ($useful_count / $total_findings) else 0 end) as $signal_noise |
|
||||||
|
|
||||||
|
# Fixes applied from this archetype
|
||||||
|
[$fixes[] | select(.data.source == $arch)] as $arch_fixes |
|
||||||
|
($arch_fixes | length) as $fix_count |
|
||||||
|
|
||||||
|
# Fix rate
|
||||||
|
(if $total_findings > 0 then ($fix_count / $total_findings) else 0 end) as $fix_rate |
|
||||||
|
|
||||||
|
# Cost from agent.complete
|
||||||
|
([$completions[] | select((.data.archetype // .agent) == $arch)] | last // {data:{}}) as $completion |
|
||||||
|
($completion.data.estimated_cost_usd // $completion.data.cost_usd // 0) as $cost_usd |
|
||||||
|
($completion.data.tokens // (($completion.data.tokens_input // 0) + ($completion.data.tokens_output // 0))) as $tokens |
|
||||||
|
($completion.data.model // "unknown") as $model |
|
||||||
|
|
||||||
|
# Cost efficiency: useful findings per dollar (normalized to 0-1 via /100 cap)
|
||||||
|
(if $cost_usd > 0 then ($useful_count / $cost_usd) else 0 end) as $raw_cost_eff |
|
||||||
|
([1.0, ($raw_cost_eff / 100)] | min) as $cost_eff_norm |
|
||||||
|
|
||||||
|
# Accuracy: 1 - (contradicted / total)
|
||||||
|
# Approximation: count other archetypes that approved with 0 findings
|
||||||
|
([$verdicts[] | select(
|
||||||
|
((.data.archetype // .agent) != $arch) and
|
||||||
|
(.data.verdict == "approved") and
|
||||||
|
((.data.findings // []) | length == 0)
|
||||||
|
)] | length) as $contradictors |
|
||||||
|
(if $total_findings > 0 and $contradictors > 0 then
|
||||||
|
(1 - ([1.0, ($contradictors / ($verdicts | length))] | min) * 0.5)
|
||||||
|
else 1.0 end) as $accuracy |
|
||||||
|
|
||||||
|
# Cycle impact: did fixes from this archetype contribute to cycle exit?
|
||||||
|
(if $cycle_exited and $fix_count > 0 then true else false end) as $cycle_impact |
|
||||||
|
(if $cycle_impact then 1.0 else 0.0 end) as $cycle_impact_score |
|
||||||
|
|
||||||
|
# Composite score
|
||||||
|
(
|
||||||
|
($signal_noise * 0.30) +
|
||||||
|
($fix_rate * 0.25) +
|
||||||
|
($cost_eff_norm * 0.20) +
|
||||||
|
($accuracy * 0.15) +
|
||||||
|
($cycle_impact_score * 0.10)
|
||||||
|
) as $composite |
|
||||||
|
|
||||||
|
{
|
||||||
|
ts: $ts,
|
||||||
|
run_id: $run_id,
|
||||||
|
archetype: $arch,
|
||||||
|
signal_to_noise: ($signal_noise * 100 | round / 100),
|
||||||
|
fix_rate: ($fix_rate * 100 | round / 100),
|
||||||
|
cost_efficiency: ($raw_cost_eff * 10 | round / 10),
|
||||||
|
accuracy: ($accuracy * 100 | round / 100),
|
||||||
|
cycle_impact: $cycle_impact,
|
||||||
|
composite_score: ($composite * 100 | round / 100),
|
||||||
|
tokens: $tokens,
|
||||||
|
cost_usd: $cost_usd,
|
||||||
|
model: $model,
|
||||||
|
findings_total: $total_findings,
|
||||||
|
findings_useful: $useful_count,
|
||||||
|
fixes_applied: $fix_count
|
||||||
|
}
|
||||||
|
' "$event_file" | while IFS= read -r score_line; do
|
||||||
|
# Append each score as a single JSONL line
|
||||||
|
echo "$score_line" >> "$EFFECTIVENESS_FILE"
|
||||||
|
local arch
|
||||||
|
arch=$(echo "$score_line" | jq -r '.archetype')
|
||||||
|
local composite
|
||||||
|
composite=$(echo "$score_line" | jq -r '.composite_score')
|
||||||
|
echo "[archeflow-score] Scored ${arch}: composite=${composite}" >&2
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "[archeflow-score] Scores appended to ${EFFECTIVENESS_FILE}" >&2
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- report: show aggregate effectiveness report ---

#######################################
# Print a markdown effectiveness report aggregated per archetype.
# For each archetype the averages cover its most recent 10 records (file
# order); "Runs" is the total record count. Trend compares the last 5 records
# with the 5 before them and is "n/a" with fewer than 10 records. Rec is "keep"
# at an average composite >= 0.70, "optimize" at >= 0.40, otherwise
# "consider_removing". A list of per-archetype model suggestions follows the
# table.
# Globals:
#   EFFECTIVENESS_FILE (read) - JSONL file of per-run scores
# Outputs:
#   The report on stdout; errors on stderr.
# Returns:
#   Exits 1 when EFFECTIVENESS_FILE is missing or cannot be aggregated.
#######################################
cmd_report() {
  if [[ ! -f "$EFFECTIVENESS_FILE" ]]; then
    echo "No effectiveness data found at ${EFFECTIVENESS_FILE}" >&2
    echo "Run 'archeflow-score.sh extract <events.jsonl>' after completing runs." >&2
    exit 1
  fi

  # Aggregate once and derive both the table rows and the model suggestions
  # from the same per-archetype statistics. The result is captured before
  # anything is printed so a malformed file cannot leave a half-written report.
  # NOTE: the jq program lives inside single quotes, so its comments must not
  # contain apostrophes.
  local body
  body=$(jq -rs '
    # Mean of a field over the input array; a missing value counts as 0 so one
    # incomplete record cannot abort the whole report.
    def mean(f): (map(f // 0) | add / length);
    def round_to($scale): (. * $scale | round / $scale);

    [ group_by(.archetype)[] |
      # Last 10 runs
      (if length > 10 then .[-10:] else . end) as $recent |
      ($recent | mean(.composite_score) | round_to(100)) as $avg |
      {
        archetype: (.[0].archetype),
        runs: (length),
        avg_composite: $avg,
        avg_sn: ($recent | mean(.signal_to_noise) | round_to(100)),
        avg_fix: ($recent | mean(.fix_rate) | round_to(100)),
        avg_cost_eff: ($recent | mean(.cost_efficiency) | round_to(10)),
        avg_acc: ($recent | mean(.accuracy) | round_to(100)),
        # Trend: last 5 vs prior 5
        trend: (if ($recent | length) >= 10 then
                  (($recent[-5:] | mean(.composite_score)) -
                   ($recent[-10:-5] | mean(.composite_score))) as $delta |
                  if $delta > 0.05 then "improving"
                  elif $delta < -0.05 then "declining"
                  else "stable"
                  end
                else "n/a"
                end),
        rec: (if $avg >= 0.70 then "keep"
              elif $avg >= 0.40 then "optimize"
              else "consider_removing"
              end),
        # Most common model among the recent runs
        model: ($recent | group_by(.model) | sort_by(-length) | .[0][0].model // "unknown")
      }
    ] as $stats |

    ($stats[] |
      "| \(.archetype) | \(.runs) | \(.avg_composite) | \(.avg_sn) | \(.avg_fix) | \(.avg_cost_eff) | \(.avg_acc) | \(.trend) | \(.rec) |"),
    "",
    "**Model suggestions:**",
    ($stats[] |
      .archetype as $arch | .model as $model | .avg_composite as $avg |
      if $avg >= 0.70 and ($model == "haiku") then
        "- \($arch) (\($model), score \($avg)): Keep \($model) — high effectiveness at low cost"
      elif $avg < 0.50 and ($model == "haiku") then
        "- \($arch) (\($model), score \($avg)): Consider upgrading to sonnet or tightening review lens"
      elif $avg >= 0.70 and ($model == "sonnet") then
        "- \($arch) (\($model), score \($avg)): Try downgrading to haiku — may maintain quality at lower cost"
      elif $avg < 0.50 and ($model == "sonnet") then
        "- \($arch) (\($model), score \($avg)): Consider removing — expensive and not contributing"
      else
        "- \($arch) (\($model), score \($avg)): No change recommended"
      end)
  ' "$EFFECTIVENESS_FILE") || {
    echo "Error: Failed to aggregate ${EFFECTIVENESS_FILE}" >&2
    exit 1
  }

  echo "# Archetype Effectiveness Report"
  echo ""
  echo "| Archetype | Runs | Avg Score | S/N | Fix Rate | Cost Eff | Accuracy | Trend | Rec |"
  echo "|-----------|------|-----------|-----|----------|----------|----------|-------|-----|"
  printf '%s\n' "$body"
}
# --- recommend: suggest model tiers for a team ---

# Best-effort archetype extraction from the team YAML in $1, one name per line,
# for use when yq is unavailable or yields nothing. Recognises
# "archetype: <name>" anywhere on a line and bare "- <name>" list items.
# This is a line scan, not a YAML parser: bare items of unrelated lists are
# picked up as well.
_af_score_team_archetypes() {
  local line
  local -r name_re='[A-Za-z0-9_][A-Za-z0-9_-]*'
  local -r keyed_re="(^|[^A-Za-z0-9_])archetype:[[:space:]]*(${name_re})"
  local -r item_re="^[[:space:]]*-[[:space:]]+(${name_re})[[:space:]]*(#.*)?\$"
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Check the keyed form first so "- archetype: guardian" yields "guardian"
    # rather than the key name.
    if [[ "$line" =~ $keyed_re ]]; then
      printf '%s\n' "${BASH_REMATCH[2]}"
    elif [[ "$line" =~ $item_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
    fi
  done < "$1"
}

#######################################
# Print a markdown table recommending a model tier for every archetype of a team.
# For each archetype the average composite score of its most recent 10 records
# and their most common model are looked up in EFFECTIVENESS_FILE:
#   avg >= 0.70 - keep haiku / try haiku instead of sonnet / keep current model
#   avg >= 0.40 - try sonnet instead of haiku / optimize the review lens
#   otherwise   - consider removing the archetype from the team
# Globals:
#   EFFECTIVENESS_FILE (read) - JSONL file of per-run scores
# Arguments:
#   $1 - path to the team YAML file
# Outputs:
#   The table on stdout; errors on stderr.
# Returns:
#   Exits 1 when an input file is missing, no archetypes can be extracted, or
#   EFFECTIVENESS_FILE cannot be read by jq.
#######################################
cmd_recommend() {
  local team_file="${1:?Usage: $0 recommend <team.yaml>}"

  if [[ ! -f "$team_file" ]]; then
    echo "Error: Team file not found: $team_file" >&2
    exit 1
  fi

  if [[ ! -f "$EFFECTIVENESS_FILE" ]]; then
    echo "No effectiveness data found. Cannot make recommendations without historical data." >&2
    exit 1
  fi

  # Extract archetypes from the team YAML: yq when it works, line scan otherwise.
  local archetypes=""
  if command -v yq &> /dev/null; then
    # Best effort: a yq that is incompatible with this expression falls through
    # to the line scan below.
    archetypes=$(yq -r '.agents[].archetype // .archetypes[] // empty' "$team_file" 2>/dev/null || true)
  fi
  if [[ -z "$archetypes" ]]; then
    archetypes=$(_af_score_team_archetypes "$team_file")
  fi

  if [[ -z "$archetypes" ]]; then
    echo "Error: Could not extract archetypes from ${team_file}" >&2
    exit 1
  fi

  # Team name: value of the first top-level "name:" line, else "unknown".
  local team_name="unknown" line
  local -r team_name_re='^name:[[:space:]]*(.+)$'
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $team_name_re ]]; then
      team_name="${BASH_REMATCH[1]}"
      break
    fi
  done < "$team_file"

  echo "# Model Recommendations for team: ${team_name}"
  echo ""
  echo "| Archetype | Current Model | Score | Suggestion |"
  echo "|-----------|--------------|-------|------------|"

  local arch score_data avg runs tier model suggestion
  while IFS= read -r arch; do
    [[ -n "$arch" ]] || continue

    # Look up effectiveness for this archetype. The score tier is decided
    # inside jq, so no external calculator is needed for the float comparison.
    # Output is one TSV line "avg runs tier model", or nothing without data.
    score_data=$(jq -rs --arg arch "$arch" '
      [.[] | select(.archetype == $arch)] |
      if length == 0 then empty
      else
        length as $runs |
        (if length > 10 then .[-10:] else . end) as $recent |
        ($recent | map(.composite_score // 0) | add / length * 100 | round / 100) as $avg |
        ($recent | group_by(.model) | sort_by(-length) | .[0][0].model // "unknown") as $model |
        (if $avg >= 0.70 then "high" elif $avg >= 0.40 then "mid" else "low" end) as $tier |
        [$avg, $runs, $tier, $model] | @tsv
      end
    ' "$EFFECTIVENESS_FILE") || {
      echo "Error: Failed to read ${EFFECTIVENESS_FILE}" >&2
      exit 1
    }

    if [[ -z "$score_data" ]]; then
      echo "| ${arch} | unknown | n/a | No data — run more orchestrations first |"
      continue
    fi

    # model is read last so it receives the remainder of the line.
    IFS=$'\t' read -r avg runs tier model <<<"$score_data"

    # Generate suggestion
    case "${tier}:${model}" in
      high:haiku)
        suggestion="Keep haiku — high effectiveness at low cost"
        ;;
      high:sonnet)
        suggestion="Try haiku — may maintain quality cheaper"
        ;;
      high:*)
        suggestion="Keep current model — performing well"
        ;;
      mid:haiku)
        suggestion="Try sonnet — may improve signal quality"
        ;;
      mid:*)
        suggestion="Optimize review lens — moderate effectiveness"
        ;;
      *)
        suggestion="Consider removing from team — low effectiveness"
        ;;
    esac

    echo "| ${arch} | ${model} | ${avg} (${runs} runs) | ${suggestion} |"
  done <<<"$archetypes"
}
# --- Dispatch ---
#
# Route the command word to its handler, forwarding the remaining arguments.
# Anything else prints a usage hint on stderr and exits 1.

if [[ "$COMMAND" == "extract" ]]; then
  cmd_extract "$@"
elif [[ "$COMMAND" == "report" ]]; then
  cmd_report "$@"
elif [[ "$COMMAND" == "recommend" ]]; then
  cmd_recommend "$@"
else
  echo "Unknown command: $COMMAND" >&2
  echo "Usage: $0 {extract|report|recommend} [args...]" >&2
  exit 1
fi
Reference in New Issue
Block a user