transcripts
This commit is contained in:
319
claude-transcripts-setup/scripts/save-transcript.py
Executable file
319
claude-transcripts-setup/scripts/save-transcript.py
Executable file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Save a Claude Code chat session to {project}/transcripts/ as:
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.json — all JSONL records (100% fidelity)
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.md — human-readable markdown
|
||||
|
||||
Usage:
|
||||
python3 save-transcript.py <jsonl_path> [output_dir] # manual
|
||||
python3 save-transcript.py --verify <json_path> # verify saved JSON vs source
|
||||
(stdin) # called from Stop hook
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def slugify(text: str, max_len: int = 40) -> str:
    """Convert *text* into a filesystem-friendly slug of at most *max_len* chars.

    Lowercases, drops punctuation, collapses runs of whitespace/underscores
    into single hyphens, and trims stray hyphens after truncation.
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^\w\s-]", "", lowered)
    hyphenated = re.sub(r"[\s_]+", "-", cleaned).strip("-")
    return hyphenated[:max_len].rstrip("-")
|
||||
|
||||
|
||||
def load_jsonl(path: str) -> list:
    """Parse a JSONL file into a list of records, skipping malformed lines.

    A bad line is reported on stderr rather than raised, so one corrupt
    record does not discard the rest of the session.
    """
    parsed = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                parsed.append(json.loads(raw))
            except json.JSONDecodeError as exc:
                print(f" [warn] bad JSON line: {exc}", file=sys.stderr)
    return parsed
|
||||
|
||||
|
||||
def find_jsonl_by_session(session_id: str) -> str | None:
|
||||
projects_dir = Path.home() / ".claude" / "projects"
|
||||
for p in projects_dir.rglob(f"{session_id}.jsonl"):
|
||||
return str(p)
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Content extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def content_to_text(content) -> str:
    """Flatten message content (str or list of blocks) to plain text / md."""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    rendered = []
    for item in content:
        if not isinstance(item, dict):
            continue
        kind = item.get("type", "")
        if kind == "text":
            piece = item.get("text", "")
        elif kind == "tool_use":
            tool = item.get("name", "tool")
            args = json.dumps(item.get("input", {}), indent=2, ensure_ascii=False)
            piece = f"**Tool call:** `{tool}`\n```json\n{args}\n```"
        elif kind == "tool_result":
            body = item.get("content", "")
            if isinstance(body, list):
                # Result blocks may themselves be a list of text blocks.
                body = "\n".join(
                    b.get("text", "")
                    for b in body
                    if isinstance(b, dict) and b.get("type") == "text"
                )
            flag = " *(error)*" if item.get("is_error") else ""
            piece = f"**Tool result**{flag}\n```\n{body}\n```"
        else:
            continue
        if piece:
            rendered.append(piece)
    return "\n\n".join(rendered)
|
||||
|
||||
|
||||
def strip_system_tags(text: str) -> str:
    """Remove <system-reminder>, <ide_*>, and similar injected tags.

    Paired tags are removed together with their contents; a stray opening
    or self-closing <system-reminder ...> tag is removed on its own.
    """
    # Paired <system-reminder>...</system-reminder> blocks, contents included.
    text = re.sub(r"<system-reminder>.*?</system-reminder>", "", text, flags=re.DOTALL)
    # Paired IDE-injected tags, e.g. <ide_selection>...</ide_selection>.
    text = re.sub(r"<ide_[^>]*>.*?</ide_[^>]*>", "", text, flags=re.DOTALL)
    # Leftover unpaired or self-closing tag. BUGFIX: the previous pattern
    # used [^/]*, which can run past the tag's '>' (since '>' is not '/')
    # and swallow following text up to a later '>'; [^>]* stops at the
    # closing bracket and still matches self-closing "<system-reminder/>".
    text = re.sub(r"<system-reminder[^>]*>", "", text)
    return text.strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metadata derivation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def derive_project_name(cwd: str) -> str:
    """Slug of the working directory's final path component ("project" if empty)."""
    basename = Path(cwd.rstrip("/")).name
    return slugify(basename or "project")
|
||||
|
||||
|
||||
def derive_task_name(records: list) -> str:
    """Slug of the first substantive user message; "session" as a fallback."""
    for record in records:
        if record.get("type") != "user":
            continue
        raw = record.get("message", {}).get("content", "")
        cleaned = strip_system_tags(content_to_text(raw))
        cleaned = re.sub(r"\[Request interrupted[^\]]*\]", "", cleaned).strip()
        # Very short messages ("ok", "yes") make meaningless task names.
        if len(cleaned) > 5:
            first_line = cleaned.split("\n")[0].strip()
            return slugify(first_line) or "session"
    return "session"
|
||||
|
||||
|
||||
def derive_timestamp(records: list) -> datetime:
    """First parseable record timestamp, else the current UTC time."""
    for record in records:
        stamp = record.get("timestamp", "")
        if not stamp:
            continue
        try:
            # Records use trailing-Z ISO timestamps; fromisoformat needs
            # an explicit offset on older Pythons.
            return datetime.fromisoformat(stamp.replace("Z", "+00:00"))
        except ValueError:
            continue
    return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown rendering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def render_markdown(records: list, session_id: str, cwd: str) -> str:
    """Render the user/assistant messages as a human-readable markdown transcript."""
    messages = [r for r in records if r.get("type") in ("user", "assistant")]
    out = [
        f"# Transcript: {session_id}",
        "",
        f"**Project:** {cwd} ",
        f"**Messages:** {len(messages)}",
        "",
        "---",
        "",
    ]
    for msg in messages:
        is_user = msg.get("type") == "user"
        role = "User" if is_user else "Assistant"
        body = content_to_text(msg.get("message", {}).get("content", ""))
        if is_user:
            # System-injected tags are noise in the human-readable view.
            body = strip_system_tags(body)
        body = body.strip()
        if not body:
            continue
        stamp = msg.get("timestamp", "")
        heading = f"### {role}" + (f" *({stamp})*" if stamp else "")
        out.extend([heading, "", body, "", "---", ""])
    return "\n".join(out)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Save
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def save(jsonl_path: str, output_dir: str | None) -> tuple[str, str]:
    """Save a session JSONL as <base>.json (full fidelity) plus <base>.md.

    When *output_dir* is None, files go to <session cwd>/transcripts.
    Returns (md_path, json_path). Raises ValueError on an empty source and
    RuntimeError if the saved JSON does not hash-match the source JSONL.
    """
    records = load_jsonl(jsonl_path)
    if not records:
        raise ValueError(f"No records found in {jsonl_path}")

    # Session metadata lives on most records; use the first record that
    # carries it, falling back to the very first record.
    meta = next((r for r in records if r.get("sessionId") or r.get("cwd")), records[0])
    session_id = meta.get("sessionId") or Path(jsonl_path).stem
    cwd = meta.get("cwd") or os.getcwd()

    stamp = derive_timestamp(records).strftime("%Y%m%d-%H%M%S")
    base = f"{derive_project_name(cwd)}-{derive_task_name(records)}-{stamp}"

    if output_dir is None:
        output_dir = os.path.join(cwd, "transcripts")
    os.makedirs(output_dir, exist_ok=True)

    json_path = os.path.join(output_dir, f"{base}.json")
    md_path = os.path.join(output_dir, f"{base}.md")

    # JSON: all records (100% fidelity — identical content to source JSONL)
    with open(json_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)

    # Paranoia: re-read both sides and compare canonical hashes.
    source_hash = _jsonl_hash(jsonl_path)
    saved_hash = _json_array_hash(json_path)
    if source_hash != saved_hash:
        raise RuntimeError(
            f"Fidelity check FAILED: source={source_hash} saved={saved_hash}\n"
            f" Source: {jsonl_path}\n Saved: {json_path}"
        )

    # Markdown: human-readable rendering
    with open(md_path, "w", encoding="utf-8") as fh:
        fh.write(render_markdown(records, session_id, cwd))

    return md_path, json_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fidelity verification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _jsonl_hash(path: str) -> str:
    """Hash the ordered sequence of parsed records from a JSONL file."""
    # Canonical JSON (sorted keys) so key-order differences don't matter.
    canonical = json.dumps(load_jsonl(path), sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
def _json_array_hash(path: str) -> str:
    """Hash the record array stored in a .json transcript file."""
    records = json.loads(Path(path).read_text(encoding="utf-8"))
    # Same canonical form as _jsonl_hash so the two are comparable.
    canonical = json.dumps(records, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
def verify(json_path: str) -> bool:
    """Verify a saved .json transcript against its source JSONL.

    Looks the source up by sessionId under ~/.claude/projects and compares
    canonical hashes. Prints a report; returns True when they match.
    """
    with open(json_path, "r", encoding="utf-8") as fh:
        records = json.load(fh)

    meta = next((r for r in records if r.get("sessionId")), None)
    if meta is None:
        print("ERROR: no sessionId found in saved transcript", file=sys.stderr)
        return False

    session_id = meta["sessionId"]
    jsonl_path = find_jsonl_by_session(session_id)
    if not jsonl_path:
        print(f"ERROR: source JSONL not found for session {session_id}", file=sys.stderr)
        return False

    source_hash = _jsonl_hash(jsonl_path)
    saved_hash = _json_array_hash(json_path)
    ok = source_hash == saved_hash

    status = "OK" if ok else "MISMATCH"
    print(f"[{status}] {json_path}")
    print(f" source: {jsonl_path}")
    print(f" source hash: {source_hash}")
    print(f" saved hash: {saved_hash}")
    return ok
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _run_hook():
    """Handle the Stop-hook path: read hook JSON from stdin and save.

    Exits non-zero on bad hook JSON, a missing source JSONL, or a save
    failure; otherwise prints the saved paths.
    """
    try:
        hook_data = json.load(sys.stdin)
    except json.JSONDecodeError as exc:
        print(f"save-transcript: bad hook JSON: {exc}", file=sys.stderr)
        sys.exit(1)

    # Prefer the transcript_path the hook hands us; fall back to a search
    # by session_id when it is missing or stale.
    jsonl_path = hook_data.get("transcript_path")
    if not jsonl_path or not os.path.exists(jsonl_path):
        session_id = hook_data.get("session_id", "")
        jsonl_path = find_jsonl_by_session(session_id) if session_id else None

    if not jsonl_path or not os.path.exists(jsonl_path):
        print("save-transcript: cannot locate JSONL for this session", file=sys.stderr)
        sys.exit(1)

    cwd = hook_data.get("cwd") or os.getcwd()
    output_dir = os.path.join(cwd, "transcripts")
    try:
        md_path, json_path = save(jsonl_path, output_dir)
        print(f"Transcript saved:\n {md_path}\n {json_path}")
    except Exception as exc:
        # Best-effort from a hook: report and exit non-zero, never traceback.
        print(f"save-transcript ERROR: {exc}", file=sys.stderr)
        sys.exit(1)


def main():
    """CLI / hook entry point; see the module docstring for usage."""
    # --verify mode
    if len(sys.argv) >= 2 and sys.argv[1] == "--verify":
        # BUGFIX: "--verify" with no path previously fell through to manual
        # mode and was treated as a JSONL filename (uncaught ValueError);
        # fail with a clear message instead.
        if len(sys.argv) < 3:
            print("save-transcript: --verify requires a <json_path> argument", file=sys.stderr)
            sys.exit(1)
        ok = verify(sys.argv[2])
        sys.exit(0 if ok else 1)

    # Manual invocation: <jsonl_path> [output_dir]
    if len(sys.argv) >= 2:
        jsonl_path = sys.argv[1]
        output_dir = sys.argv[2] if len(sys.argv) >= 3 else None
        md_path, json_path = save(jsonl_path, output_dir)
        print(f"Saved:\n {md_path}\n {json_path}")
        return

    # Called from Stop hook (JSON on stdin)
    if not sys.stdin.isatty():
        _run_hook()
        return

    # No args, interactive stdin: show usage and signal misuse.
    print(__doc__)
    sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user