transcripts
This commit is contained in:
319
claude-transcripts-setup/scripts/save-transcript.py
Executable file
319
claude-transcripts-setup/scripts/save-transcript.py
Executable file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Save a Claude Code chat session to {project}/transcripts/ as:
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.json — all JSONL records (100% fidelity)
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.md — human-readable markdown
|
||||
|
||||
Usage:
|
||||
python3 save-transcript.py <jsonl_path> [output_dir] # manual
|
||||
python3 save-transcript.py --verify <json_path> # verify saved JSON vs source
|
||||
(stdin) # called from Stop hook
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def slugify(text: str, max_len: int = 40) -> str:
    """Convert *text* into a filesystem-friendly slug of at most *max_len* chars.

    Lowercases, drops punctuation, collapses runs of whitespace/underscores
    into single hyphens, and trims stray hyphens after truncation.
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^\w\s-]", "", lowered)
    hyphenated = re.sub(r"[\s_]+", "-", cleaned).strip("-")
    return hyphenated[:max_len].rstrip("-")
|
||||
|
||||
|
||||
def load_jsonl(path: str) -> list:
    """Parse a JSONL file into a list of records, skipping malformed lines.

    A bad line is reported on stderr rather than raised, so one corrupt
    record does not discard the rest of the session.
    """
    parsed = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                parsed.append(json.loads(raw))
            except json.JSONDecodeError as exc:
                print(f" [warn] bad JSON line: {exc}", file=sys.stderr)
    return parsed
|
||||
|
||||
|
||||
def find_jsonl_by_session(session_id: str) -> str | None:
|
||||
projects_dir = Path.home() / ".claude" / "projects"
|
||||
for p in projects_dir.rglob(f"{session_id}.jsonl"):
|
||||
return str(p)
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Content extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def content_to_text(content) -> str:
    """Flatten message content (str or list of blocks) to plain text / md."""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    rendered = []
    for item in content:
        if not isinstance(item, dict):
            continue
        kind = item.get("type", "")
        if kind == "text":
            piece = item.get("text", "")
        elif kind == "tool_use":
            tool = item.get("name", "tool")
            args = json.dumps(item.get("input", {}), indent=2, ensure_ascii=False)
            piece = f"**Tool call:** `{tool}`\n```json\n{args}\n```"
        elif kind == "tool_result":
            body = item.get("content", "")
            if isinstance(body, list):
                # Result blocks may themselves be a list of text blocks.
                body = "\n".join(
                    b.get("text", "")
                    for b in body
                    if isinstance(b, dict) and b.get("type") == "text"
                )
            flag = " *(error)*" if item.get("is_error") else ""
            piece = f"**Tool result**{flag}\n```\n{body}\n```"
        else:
            continue
        if piece:
            rendered.append(piece)
    return "\n\n".join(rendered)
|
||||
|
||||
|
||||
def strip_system_tags(text: str) -> str:
    """Remove <system-reminder>, <ide_*>, and similar injected tags.

    Paired tags are removed together with their contents; a stray opening
    or self-closing <system-reminder ...> tag is removed on its own.
    """
    # Paired <system-reminder>...</system-reminder> blocks, contents included.
    text = re.sub(r"<system-reminder>.*?</system-reminder>", "", text, flags=re.DOTALL)
    # Paired IDE-injected tags, e.g. <ide_selection>...</ide_selection>.
    text = re.sub(r"<ide_[^>]*>.*?</ide_[^>]*>", "", text, flags=re.DOTALL)
    # Leftover unpaired or self-closing tag. BUGFIX: the previous pattern
    # used [^/]*, which can run past the tag's '>' (since '>' is not '/')
    # and swallow following text up to a later '>'; [^>]* stops at the
    # closing bracket and still matches self-closing "<system-reminder/>".
    text = re.sub(r"<system-reminder[^>]*>", "", text)
    return text.strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metadata derivation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def derive_project_name(cwd: str) -> str:
    """Slug of the working directory's final path component ("project" if empty)."""
    basename = Path(cwd.rstrip("/")).name
    return slugify(basename or "project")
|
||||
|
||||
|
||||
def derive_task_name(records: list) -> str:
    """Slug of the first substantive user message; "session" as a fallback."""
    for record in records:
        if record.get("type") != "user":
            continue
        raw = record.get("message", {}).get("content", "")
        cleaned = strip_system_tags(content_to_text(raw))
        cleaned = re.sub(r"\[Request interrupted[^\]]*\]", "", cleaned).strip()
        # Very short messages ("ok", "yes") make meaningless task names.
        if len(cleaned) > 5:
            first_line = cleaned.split("\n")[0].strip()
            return slugify(first_line) or "session"
    return "session"
|
||||
|
||||
|
||||
def derive_timestamp(records: list) -> datetime:
    """First parseable record timestamp, else the current UTC time."""
    for record in records:
        stamp = record.get("timestamp", "")
        if not stamp:
            continue
        try:
            # Records use trailing-Z ISO timestamps; fromisoformat needs
            # an explicit offset on older Pythons.
            return datetime.fromisoformat(stamp.replace("Z", "+00:00"))
        except ValueError:
            continue
    return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown rendering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def render_markdown(records: list, session_id: str, cwd: str) -> str:
    """Render the user/assistant messages as a human-readable markdown transcript."""
    messages = [r for r in records if r.get("type") in ("user", "assistant")]
    out = [
        f"# Transcript: {session_id}",
        "",
        f"**Project:** {cwd} ",
        f"**Messages:** {len(messages)}",
        "",
        "---",
        "",
    ]
    for msg in messages:
        is_user = msg.get("type") == "user"
        role = "User" if is_user else "Assistant"
        body = content_to_text(msg.get("message", {}).get("content", ""))
        if is_user:
            # System-injected tags are noise in the human-readable view.
            body = strip_system_tags(body)
        body = body.strip()
        if not body:
            continue
        stamp = msg.get("timestamp", "")
        heading = f"### {role}" + (f" *({stamp})*" if stamp else "")
        out.extend([heading, "", body, "", "---", ""])
    return "\n".join(out)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Save
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def save(jsonl_path: str, output_dir: str | None) -> tuple[str, str]:
    """Save a session JSONL as <base>.json (full fidelity) plus <base>.md.

    When *output_dir* is None, files go to <session cwd>/transcripts.
    Returns (md_path, json_path). Raises ValueError on an empty source and
    RuntimeError if the saved JSON does not hash-match the source JSONL.
    """
    records = load_jsonl(jsonl_path)
    if not records:
        raise ValueError(f"No records found in {jsonl_path}")

    # Session metadata lives on most records; use the first record that
    # carries it, falling back to the very first record.
    meta = next((r for r in records if r.get("sessionId") or r.get("cwd")), records[0])
    session_id = meta.get("sessionId") or Path(jsonl_path).stem
    cwd = meta.get("cwd") or os.getcwd()

    stamp = derive_timestamp(records).strftime("%Y%m%d-%H%M%S")
    base = f"{derive_project_name(cwd)}-{derive_task_name(records)}-{stamp}"

    if output_dir is None:
        output_dir = os.path.join(cwd, "transcripts")
    os.makedirs(output_dir, exist_ok=True)

    json_path = os.path.join(output_dir, f"{base}.json")
    md_path = os.path.join(output_dir, f"{base}.md")

    # JSON: all records (100% fidelity — identical content to source JSONL)
    with open(json_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)

    # Paranoia: re-read both sides and compare canonical hashes.
    source_hash = _jsonl_hash(jsonl_path)
    saved_hash = _json_array_hash(json_path)
    if source_hash != saved_hash:
        raise RuntimeError(
            f"Fidelity check FAILED: source={source_hash} saved={saved_hash}\n"
            f" Source: {jsonl_path}\n Saved: {json_path}"
        )

    # Markdown: human-readable rendering
    with open(md_path, "w", encoding="utf-8") as fh:
        fh.write(render_markdown(records, session_id, cwd))

    return md_path, json_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fidelity verification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _jsonl_hash(path: str) -> str:
    """Hash the ordered sequence of parsed records from a JSONL file."""
    # Canonical JSON (sorted keys) so key-order differences don't matter.
    canonical = json.dumps(load_jsonl(path), sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
def _json_array_hash(path: str) -> str:
    """Hash the record array stored in a .json transcript file."""
    records = json.loads(Path(path).read_text(encoding="utf-8"))
    # Same canonical form as _jsonl_hash so the two are comparable.
    canonical = json.dumps(records, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
def verify(json_path: str) -> bool:
    """Verify a saved .json transcript against its source JSONL.

    Looks the source up by sessionId under ~/.claude/projects and compares
    canonical hashes. Prints a report; returns True when they match.
    """
    with open(json_path, "r", encoding="utf-8") as fh:
        records = json.load(fh)

    meta = next((r for r in records if r.get("sessionId")), None)
    if meta is None:
        print("ERROR: no sessionId found in saved transcript", file=sys.stderr)
        return False

    session_id = meta["sessionId"]
    jsonl_path = find_jsonl_by_session(session_id)
    if not jsonl_path:
        print(f"ERROR: source JSONL not found for session {session_id}", file=sys.stderr)
        return False

    source_hash = _jsonl_hash(jsonl_path)
    saved_hash = _json_array_hash(json_path)
    ok = source_hash == saved_hash

    status = "OK" if ok else "MISMATCH"
    print(f"[{status}] {json_path}")
    print(f" source: {jsonl_path}")
    print(f" source hash: {source_hash}")
    print(f" saved hash: {saved_hash}")
    return ok
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _run_hook():
    """Handle the Stop-hook path: read hook JSON from stdin and save.

    Exits non-zero on bad hook JSON, a missing source JSONL, or a save
    failure; otherwise prints the saved paths.
    """
    try:
        hook_data = json.load(sys.stdin)
    except json.JSONDecodeError as exc:
        print(f"save-transcript: bad hook JSON: {exc}", file=sys.stderr)
        sys.exit(1)

    # Prefer the transcript_path the hook hands us; fall back to a search
    # by session_id when it is missing or stale.
    jsonl_path = hook_data.get("transcript_path")
    if not jsonl_path or not os.path.exists(jsonl_path):
        session_id = hook_data.get("session_id", "")
        jsonl_path = find_jsonl_by_session(session_id) if session_id else None

    if not jsonl_path or not os.path.exists(jsonl_path):
        print("save-transcript: cannot locate JSONL for this session", file=sys.stderr)
        sys.exit(1)

    cwd = hook_data.get("cwd") or os.getcwd()
    output_dir = os.path.join(cwd, "transcripts")
    try:
        md_path, json_path = save(jsonl_path, output_dir)
        print(f"Transcript saved:\n {md_path}\n {json_path}")
    except Exception as exc:
        # Best-effort from a hook: report and exit non-zero, never traceback.
        print(f"save-transcript ERROR: {exc}", file=sys.stderr)
        sys.exit(1)


def main():
    """CLI / hook entry point; see the module docstring for usage."""
    # --verify mode
    if len(sys.argv) >= 2 and sys.argv[1] == "--verify":
        # BUGFIX: "--verify" with no path previously fell through to manual
        # mode and was treated as a JSONL filename (uncaught ValueError);
        # fail with a clear message instead.
        if len(sys.argv) < 3:
            print("save-transcript: --verify requires a <json_path> argument", file=sys.stderr)
            sys.exit(1)
        ok = verify(sys.argv[2])
        sys.exit(0 if ok else 1)

    # Manual invocation: <jsonl_path> [output_dir]
    if len(sys.argv) >= 2:
        jsonl_path = sys.argv[1]
        output_dir = sys.argv[2] if len(sys.argv) >= 3 else None
        md_path, json_path = save(jsonl_path, output_dir)
        print(f"Saved:\n {md_path}\n {json_path}")
        return

    # Called from Stop hook (JSON on stdin)
    if not sys.stdin.isatty():
        _run_hook()
        return

    # No args, interactive stdin: show usage and signal misuse.
    print(__doc__)
    sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user