Files
claude-utils/claude-transcripts-setup/scripts/save-transcript.py
2026-02-22 12:25:10 +01:00

320 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Save a Claude Code chat session to {project}/transcripts/ as:
<project>-<task>-<yyyymmdd>-<hhmmss>.json — all JSONL records (100% fidelity)
<project>-<task>-<yyyymmdd>-<hhmmss>.md — human-readable markdown
Usage:
python3 save-transcript.py <jsonl_path> [output_dir] # manual
python3 save-transcript.py --verify <json_path> # verify saved JSON vs source
(stdin) # called from Stop hook
"""
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def slugify(text: str, max_len: int = 40) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped; runs of whitespace/underscores become a single hyphen. The
    result is cut to at most ``max_len`` characters and never starts or
    ends with a hyphen.
    """
    lowered = text.lower()
    kept = re.sub(r"[^\w\s-]", "", lowered)
    dashed = re.sub(r"[\s_]+", "-", kept).strip("-")
    # Truncation can leave a dangling hyphen at the cut point; trim it.
    clipped = dashed[:max_len]
    return clipped.rstrip("-")
def load_jsonl(path: str) -> list:
    """Parse a JSONL file into a list of decoded records, in file order.

    Blank lines are skipped. A line that is not valid JSON is reported on
    stderr and left out of the result; it does not abort the load.
    """
    parsed = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            try:
                record = json.loads(payload)
            except json.JSONDecodeError as exc:
                print(f" [warn] bad JSON line: {exc}", file=sys.stderr)
            else:
                parsed.append(record)
    return parsed
def find_jsonl_by_session(session_id: str) -> str | None:
projects_dir = Path.home() / ".claude" / "projects"
for p in projects_dir.rglob(f"{session_id}.jsonl"):
return str(p)
return None
# ---------------------------------------------------------------------------
# Content extraction
# ---------------------------------------------------------------------------
def content_to_text(content) -> str:
"""Flatten message content (str or list of blocks) to plain text / md."""
if isinstance(content, str):
return content
if not isinstance(content, list):
return ""
parts = []
for block in content:
if not isinstance(block, dict):
continue
t = block.get("type", "")
if t == "text":
parts.append(block.get("text", ""))
elif t == "tool_use":
name = block.get("name", "tool")
inp = json.dumps(block.get("input", {}), indent=2, ensure_ascii=False)
parts.append(f"**Tool call:** `{name}`\n```json\n{inp}\n```")
elif t == "tool_result":
inner = block.get("content", "")
if isinstance(inner, list):
inner = "\n".join(
b.get("text", "")
for b in inner
if isinstance(b, dict) and b.get("type") == "text"
)
status = " *(error)*" if block.get("is_error") else ""
parts.append(f"**Tool result**{status}\n```\n{inner}\n```")
return "\n\n".join(p for p in parts if p)
def strip_system_tags(text: str) -> str:
    """Remove injected ``<system-reminder>`` and ``<ide_*>`` markup from text.

    Paired ``<system-reminder>...</system-reminder>`` and
    ``<ide_x>...</ide_x>`` blocks are removed together with their content.
    A leftover opening or self-closing ``<system-reminder ...>`` tag with no
    matching closer is removed on its own. The result is whitespace-stripped.
    """
    text = re.sub(r"<system-reminder>.*?</system-reminder>", "", text, flags=re.DOTALL)
    text = re.sub(r"<ide_[^>]*>.*?</ide_[^>]*>", "", text, flags=re.DOTALL)
    # Bound the orphan tag at its own closing '>' so that only the tag is
    # deleted; a class that excludes '/' instead of '>' would run on through
    # later '>' characters (and newlines) and swallow real message text.
    text = re.sub(r"<system-reminder[^>]*>", "", text)
    return text.strip()
# ---------------------------------------------------------------------------
# Metadata derivation
# ---------------------------------------------------------------------------
def derive_project_name(cwd: str) -> str:
    """Return a slug for the last path component of ``cwd``.

    Falls back to ``"project"`` when there is no usable name — either the
    path has no final component, or the component slugifies to an empty
    string (e.g. it consists only of punctuation). Applying the fallback
    after slugifying keeps the generated filename from starting with "-".
    """
    folder = Path(cwd.rstrip("/")).name
    return slugify(folder) or "project"
def derive_task_name(records: list) -> str:
    """Derive a task slug from the first substantial user message.

    Walks the records in order and takes the first ``user`` record whose
    text — after removing injected tags and "[Request interrupted ...]"
    markers — is longer than five characters. The slug of that text's first
    line is returned, or ``"session"`` if the slug is empty or no such
    record exists.
    """
    for record in records:
        if record.get("type") == "user":
            raw = content_to_text(record.get("message", {}).get("content", ""))
            without_tags = strip_system_tags(raw)
            cleaned = re.sub(r"\[Request interrupted[^\]]*\]", "", without_tags).strip()
            if len(cleaned) > 5:
                headline = cleaned.partition("\n")[0].strip()
                return slugify(headline) or "session"
    return "session"
def derive_timestamp(records: list) -> datetime:
    """Return the first parseable ``timestamp`` found in ``records``.

    Timestamps are read as ISO-8601, with a trailing ``Z`` accepted as UTC.
    Records without a timestamp, or with one that fails to parse, are
    skipped. When nothing parses, the current UTC time is returned.
    """
    stamps = (record.get("timestamp", "") for record in records)
    for raw in filter(None, stamps):
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            continue
    return datetime.now(timezone.utc)
# ---------------------------------------------------------------------------
# Markdown rendering
# ---------------------------------------------------------------------------
def render_markdown(records: list, session_id: str, cwd: str) -> str:
    """Render the user/assistant records of a session as a markdown document.

    The document starts with a title carrying ``session_id`` and a short
    header (project path, message count), followed by one section per
    message separated by horizontal rules. Injected system tags are removed
    from user messages; messages whose text ends up empty are omitted from
    the body (the header count still includes them).
    """
    messages = [r for r in records if r.get("type") in ("user", "assistant")]
    lines = [
        f"# Transcript: {session_id}",
        "",
        # Two trailing spaces are a markdown hard line break; a single space
        # would let Project and Messages collapse onto one rendered line.
        f"**Project:** {cwd}  ",
        f"**Messages:** {len(messages)}",
        "",
        "---",
        "",
    ]
    for msg in messages:
        role = "User" if msg.get("type") == "user" else "Assistant"
        ts = msg.get("timestamp", "")
        # Tolerate records whose "message" is missing or not an object.
        message = msg.get("message")
        content = message.get("content", "") if isinstance(message, dict) else ""
        text = content_to_text(content)
        if role == "User":
            text = strip_system_tags(text)
        text = text.strip()
        if not text:
            continue
        header = f"### {role}"
        if ts:
            header += f" *({ts})*"
        lines.extend([header, "", text, "", "---", ""])
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# Save
# ---------------------------------------------------------------------------
def save(jsonl_path: str, output_dir: str | None) -> tuple[str, str]:
    """Write a session's JSONL records out as a ``.json`` and a ``.md`` file.

    Both files are named ``<project>-<task>-<yyyymmdd>-<hhmmss>`` and placed
    in ``output_dir`` (created if needed), which defaults to
    ``<cwd>/transcripts`` where ``cwd`` comes from the session records.

    Returns:
        ``(md_path, json_path)`` — note the markdown path comes first.

    Raises:
        ValueError: the source file contains no parseable records.
        RuntimeError: the JSON file read back from disk does not hash to the
            same value as the records that were written.
    """
    records = load_jsonl(jsonl_path)
    if not records:
        raise ValueError(f"No records found in {jsonl_path}")

    # Session metadata comes from the first record that carries any of it.
    meta = next(
        (r for r in records if r.get("sessionId") or r.get("cwd")),
        records[0],
    )
    session_id = meta.get("sessionId") or Path(jsonl_path).stem
    cwd = meta.get("cwd") or os.getcwd()

    project = derive_project_name(cwd)
    task = derive_task_name(records)
    ts_str = derive_timestamp(records).strftime("%Y%m%d-%H%M%S")
    base = f"{project}-{task}-{ts_str}"

    if output_dir is None:
        output_dir = os.path.join(cwd, "transcripts")
    os.makedirs(output_dir, exist_ok=True)
    json_path = os.path.join(output_dir, f"{base}.json")
    md_path = os.path.join(output_dir, f"{base}.md")

    # JSON: every parsed record, unmodified.
    with open(json_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)

    # Verify the round trip against the records already in memory rather
    # than re-reading the source file: the session JSONL may be appended to
    # while this runs, which would make a second read disagree with the
    # snapshot just written (and would repeat any bad-line warnings).
    canonical = json.dumps(records, sort_keys=True, ensure_ascii=False)
    source_hash = hashlib.sha256(canonical.encode()).hexdigest()
    saved_hash = _json_array_hash(json_path)
    if source_hash != saved_hash:
        raise RuntimeError(
            f"Fidelity check FAILED: source={source_hash} saved={saved_hash}\n"
            f" Source: {jsonl_path}\n Saved: {json_path}"
        )

    # Markdown: human-readable rendering of the same records.
    md = render_markdown(records, session_id, cwd)
    with open(md_path, "w", encoding="utf-8") as fh:
        fh.write(md)
    return md_path, json_path
# ---------------------------------------------------------------------------
# Fidelity verification
# ---------------------------------------------------------------------------
def _jsonl_hash(path: str) -> str:
    """Return the SHA-256 hex digest of a JSONL file's parsed records.

    The records are serialised as one JSON array with sorted keys before
    hashing, so key order within a record does not affect the digest.
    """
    serialised = json.dumps(load_jsonl(path), sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(serialised.encode())
    return digest.hexdigest()
def _json_array_hash(path: str) -> str:
    """Return the SHA-256 hex digest of the data stored in a ``.json`` file.

    The file is parsed and re-serialised with sorted keys before hashing, so
    formatting and key order in the file do not affect the digest.
    """
    with open(path, "r", encoding="utf-8") as source:
        payload = json.load(source)
    serialised = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(serialised.encode())
    return digest.hexdigest()
def verify(json_path: str) -> bool:
    """Check a saved ``.json`` transcript against its source JSONL.

    The session id is taken from the first saved record that has one and is
    used to look the source file up. A report with both hashes is printed to
    stdout. Returns ``True`` when the hashes match; returns ``False`` (after
    an error on stderr) when no session id or no source file can be found,
    or when the hashes differ.
    """
    with open(json_path, "r", encoding="utf-8") as fh:
        saved_records = json.load(fh)

    session_id = next(
        (rec["sessionId"] for rec in saved_records if rec.get("sessionId")),
        None,
    )
    if session_id is None:
        print("ERROR: no sessionId found in saved transcript", file=sys.stderr)
        return False

    jsonl_path = find_jsonl_by_session(session_id)
    if not jsonl_path:
        print(f"ERROR: source JSONL not found for session {session_id}", file=sys.stderr)
        return False

    source_hash = _jsonl_hash(jsonl_path)
    saved_hash = _json_array_hash(json_path)
    matches = source_hash == saved_hash
    label = "OK" if matches else "MISMATCH"
    for report_line in (
        f"[{label}] {json_path}",
        f" source: {jsonl_path}",
        f" source hash: {source_hash}",
        f" saved hash: {saved_hash}",
    ):
        print(report_line)
    return matches
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point.

    Modes, checked in this order:

    * ``--verify <json_path>``: verify a saved transcript; exit 0 on a
      match, 1 otherwise. ``--verify`` without a path is a usage error.
    * ``<jsonl_path> [output_dir]``: save the given session file.
    * no arguments, stdin not a terminal: read a hook payload (a JSON
      object) from stdin and save the session it refers to into
      ``<cwd>/transcripts``. Any failure is reported on stderr with exit 1.
    * otherwise: print the module usage text and exit 1.
    """
    args = sys.argv[1:]

    # --verify mode
    if args and args[0] == "--verify":
        if len(args) < 2:
            # Without this guard the flag itself would be taken for a JSONL
            # path by the manual mode below.
            print("save-transcript: --verify requires a <json_path>", file=sys.stderr)
            print(__doc__)
            sys.exit(1)
        ok = verify(args[1])
        sys.exit(0 if ok else 1)

    # Manual invocation
    if args:
        jsonl_path = args[0]
        output_dir = args[1] if len(args) >= 2 else None
        md_path, json_path = save(jsonl_path, output_dir)
        print(f"Saved:\n {md_path}\n {json_path}")
        return

    # Called from Stop hook (JSON on stdin)
    if not sys.stdin.isatty():
        try:
            hook_data = json.load(sys.stdin)
        except json.JSONDecodeError as exc:
            print(f"save-transcript: bad hook JSON: {exc}", file=sys.stderr)
            sys.exit(1)
        if not isinstance(hook_data, dict):
            print("save-transcript: bad hook JSON: expected an object", file=sys.stderr)
            sys.exit(1)

        jsonl_path = hook_data.get("transcript_path")
        if isinstance(jsonl_path, str):
            # The path may be given relative to the home directory ("~/...").
            jsonl_path = os.path.expanduser(jsonl_path)
        else:
            jsonl_path = None
        if not jsonl_path or not os.path.exists(jsonl_path):
            # Fall back to searching for the session file by id.
            session_id = hook_data.get("session_id", "")
            jsonl_path = find_jsonl_by_session(session_id) if session_id else None
        if not jsonl_path or not os.path.exists(jsonl_path):
            print("save-transcript: cannot locate JSONL for this session", file=sys.stderr)
            sys.exit(1)

        cwd = hook_data.get("cwd") or os.getcwd()
        output_dir = os.path.join(cwd, "transcripts")
        try:
            md_path, json_path = save(jsonl_path, output_dir)
            print(f"Transcript saved:\n {md_path}\n {json_path}")
        except Exception as exc:
            # Top-level boundary for the hook: report and signal failure
            # instead of dumping a traceback.
            print(f"save-transcript ERROR: {exc}", file=sys.stderr)
            sys.exit(1)
        return

    print(__doc__)
    sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()