transcripts
This commit is contained in:
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# Add below lines to exclude OS settings and caches
|
||||
.trash/
|
||||
.DS_Store
|
||||
460
claude-transcripts-setup/claude-transcripts.md
Normal file
460
claude-transcripts-setup/claude-transcripts.md
Normal file
@@ -0,0 +1,460 @@
|
||||
# Auto-saving Claude Code Session Transcripts
|
||||
|
||||
Automatically saves every Claude Code chat session to a `transcripts/` folder in your project directory, in both machine-readable JSON and human-readable Markdown.
|
||||
|
||||
## What gets saved
|
||||
|
||||
For each session, two files are written to `<project-dir>/transcripts/`:
|
||||
|
||||
| File | Contents |
|
||||
|---|---|
|
||||
| `<project>-<task>-<yyyymmdd>-<hhmmss>.json` | All raw JSONL records as a JSON array — 100% fidelity, SHA-256 verified |
|
||||
| `<project>-<task>-<yyyymmdd>-<hhmmss>.md` | Human-readable rendering: user + assistant turns, tool calls in code blocks |
|
||||
|
||||
The project name is derived from the working directory basename; the task name from the first user message (slugified).
|
||||
|
||||
---
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Create the script
|
||||
|
||||
Create `~/.claude/scripts/save-transcript.py`:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.claude/scripts
|
||||
```
|
||||
|
||||
Paste the full script from the source below, or copy it directly:
|
||||
|
||||
```bash
|
||||
cp /path/to/save-transcript.py ~/.claude/scripts/save-transcript.py
|
||||
chmod +x ~/.claude/scripts/save-transcript.py
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary>Full script — <code>~/.claude/scripts/save-transcript.py</code></summary>
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Save a Claude Code chat session to {project}/transcripts/ as:
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.json — all JSONL records (100% fidelity)
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.md — human-readable markdown
|
||||
|
||||
Usage:
|
||||
python3 save-transcript.py <jsonl_path> [output_dir] # manual
|
||||
python3 save-transcript.py --verify <json_path> # verify saved JSON vs source
|
||||
(stdin) # called from Stop hook
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def slugify(text: str, max_len: int = 40) -> str:
|
||||
text = text.lower()
|
||||
text = re.sub(r"[^\w\s-]", "", text)
|
||||
text = re.sub(r"[\s_]+", "-", text)
|
||||
text = text.strip("-")
|
||||
return text[:max_len].rstrip("-")
|
||||
|
||||
|
||||
def load_jsonl(path: str) -> list:
|
||||
records = []
|
||||
with open(path, "r", encoding="utf-8") as fh:
|
||||
for line in fh:
|
||||
line = line.strip()
|
||||
if line:
|
||||
try:
|
||||
records.append(json.loads(line))
|
||||
except json.JSONDecodeError as exc:
|
||||
print(f" [warn] bad JSON line: {exc}", file=sys.stderr)
|
||||
return records
|
||||
|
||||
|
||||
def find_jsonl_by_session(session_id: str) -> str | None:
|
||||
projects_dir = Path.home() / ".claude" / "projects"
|
||||
for p in projects_dir.rglob(f"{session_id}.jsonl"):
|
||||
return str(p)
|
||||
return None
|
||||
|
||||
|
||||
def content_to_text(content) -> str:
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if not isinstance(content, list):
|
||||
return ""
|
||||
parts = []
|
||||
for block in content:
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
t = block.get("type", "")
|
||||
if t == "text":
|
||||
parts.append(block.get("text", ""))
|
||||
elif t == "tool_use":
|
||||
name = block.get("name", "tool")
|
||||
inp = json.dumps(block.get("input", {}), indent=2, ensure_ascii=False)
|
||||
parts.append(f"**Tool call:** `{name}`\n```json\n{inp}\n```")
|
||||
elif t == "tool_result":
|
||||
inner = block.get("content", "")
|
||||
if isinstance(inner, list):
|
||||
inner = "\n".join(
|
||||
b.get("text", "")
|
||||
for b in inner
|
||||
if isinstance(b, dict) and b.get("type") == "text"
|
||||
)
|
||||
status = " *(error)*" if block.get("is_error") else ""
|
||||
parts.append(f"**Tool result**{status}\n```\n{inner}\n```")
|
||||
return "\n\n".join(p for p in parts if p)
|
||||
|
||||
|
||||
def strip_system_tags(text: str) -> str:
|
||||
text = re.sub(r"<system-reminder>.*?</system-reminder>", "", text, flags=re.DOTALL)
|
||||
text = re.sub(r"<ide_[^>]*>.*?</ide_[^>]*>", "", text, flags=re.DOTALL)
|
||||
text = re.sub(r"<system-reminder[^/]*/?>", "", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def derive_project_name(cwd: str) -> str:
|
||||
return slugify(Path(cwd.rstrip("/")).name or "project")
|
||||
|
||||
|
||||
def derive_task_name(records: list) -> str:
|
||||
for r in records:
|
||||
if r.get("type") != "user":
|
||||
continue
|
||||
content = r.get("message", {}).get("content", "")
|
||||
text = content_to_text(content)
|
||||
text = strip_system_tags(text)
|
||||
text = re.sub(r"\[Request interrupted[^\]]*\]", "", text)
|
||||
text = text.strip()
|
||||
if len(text) > 5:
|
||||
first_line = text.split("\n")[0].strip()
|
||||
return slugify(first_line) or "session"
|
||||
return "session"
|
||||
|
||||
|
||||
def derive_timestamp(records: list) -> datetime:
|
||||
for r in records:
|
||||
ts = r.get("timestamp", "")
|
||||
if ts:
|
||||
try:
|
||||
return datetime.fromisoformat(ts.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
pass
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
def render_markdown(records: list, session_id: str, cwd: str) -> str:
|
||||
messages = [r for r in records if r.get("type") in ("user", "assistant")]
|
||||
lines = [
|
||||
f"# Transcript: {session_id}",
|
||||
"",
|
||||
f"**Project:** {cwd} ",
|
||||
f"**Messages:** {len(messages)}",
|
||||
"",
|
||||
"---",
|
||||
"",
|
||||
]
|
||||
for msg in messages:
|
||||
role = "User" if msg.get("type") == "user" else "Assistant"
|
||||
ts = msg.get("timestamp", "")
|
||||
content = msg.get("message", {}).get("content", "")
|
||||
text = content_to_text(content)
|
||||
if role == "User":
|
||||
text = strip_system_tags(text)
|
||||
text = text.strip()
|
||||
if not text:
|
||||
continue
|
||||
header = f"### {role}"
|
||||
if ts:
|
||||
header += f" *({ts})*"
|
||||
lines.append(header)
|
||||
lines.append("")
|
||||
lines.append(text)
|
||||
lines.append("")
|
||||
lines.append("---")
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _jsonl_hash(path: str) -> str:
|
||||
records = load_jsonl(path)
|
||||
canonical = json.dumps(records, sort_keys=True, ensure_ascii=False)
|
||||
return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
def _json_array_hash(path: str) -> str:
|
||||
with open(path, "r", encoding="utf-8") as fh:
|
||||
records = json.load(fh)
|
||||
canonical = json.dumps(records, sort_keys=True, ensure_ascii=False)
|
||||
return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
def save(jsonl_path: str, output_dir: str | None) -> tuple[str, str]:
|
||||
records = load_jsonl(jsonl_path)
|
||||
if not records:
|
||||
raise ValueError(f"No records found in {jsonl_path}")
|
||||
|
||||
meta = next(
|
||||
(r for r in records if r.get("sessionId") or r.get("cwd")),
|
||||
records[0],
|
||||
)
|
||||
session_id = meta.get("sessionId") or Path(jsonl_path).stem
|
||||
cwd = meta.get("cwd") or os.getcwd()
|
||||
|
||||
project = derive_project_name(cwd)
|
||||
task = derive_task_name(records)
|
||||
dt = derive_timestamp(records)
|
||||
ts_str = dt.strftime("%Y%m%d-%H%M%S")
|
||||
base = f"{project}-{task}-{ts_str}"
|
||||
|
||||
if output_dir is None:
|
||||
output_dir = os.path.join(cwd, "transcripts")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
json_path = os.path.join(output_dir, f"{base}.json")
|
||||
md_path = os.path.join(output_dir, f"{base}.md")
|
||||
|
||||
with open(json_path, "w", encoding="utf-8") as fh:
|
||||
json.dump(records, fh, indent=2, ensure_ascii=False)
|
||||
|
||||
source_hash = _jsonl_hash(jsonl_path)
|
||||
saved_hash = _json_array_hash(json_path)
|
||||
if source_hash != saved_hash:
|
||||
raise RuntimeError(
|
||||
f"Fidelity check FAILED: source={source_hash} saved={saved_hash}\n"
|
||||
f" Source: {jsonl_path}\n Saved: {json_path}"
|
||||
)
|
||||
|
||||
md = render_markdown(records, session_id, cwd)
|
||||
with open(md_path, "w", encoding="utf-8") as fh:
|
||||
fh.write(md)
|
||||
|
||||
return md_path, json_path
|
||||
|
||||
|
||||
def verify(json_path: str) -> bool:
|
||||
with open(json_path, "r", encoding="utf-8") as fh:
|
||||
records = json.load(fh)
|
||||
|
||||
meta = next((r for r in records if r.get("sessionId")), None)
|
||||
if not meta:
|
||||
print("ERROR: no sessionId found in saved transcript", file=sys.stderr)
|
||||
return False
|
||||
|
||||
session_id = meta["sessionId"]
|
||||
jsonl_path = find_jsonl_by_session(session_id)
|
||||
if not jsonl_path:
|
||||
print(f"ERROR: source JSONL not found for session {session_id}", file=sys.stderr)
|
||||
return False
|
||||
|
||||
source_hash = _jsonl_hash(jsonl_path)
|
||||
saved_hash = _json_array_hash(json_path)
|
||||
ok = source_hash == saved_hash
|
||||
|
||||
status = "OK" if ok else "MISMATCH"
|
||||
print(f"[{status}] {json_path}")
|
||||
print(f" source: {jsonl_path}")
|
||||
print(f" source hash: {source_hash}")
|
||||
print(f" saved hash: {saved_hash}")
|
||||
return ok
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) >= 3 and sys.argv[1] == "--verify":
|
||||
ok = verify(sys.argv[2])
|
||||
sys.exit(0 if ok else 1)
|
||||
|
||||
if len(sys.argv) >= 2:
|
||||
jsonl_path = sys.argv[1]
|
||||
output_dir = sys.argv[2] if len(sys.argv) >= 3 else None
|
||||
md_path, json_path = save(jsonl_path, output_dir)
|
||||
print(f"Saved:\n {md_path}\n {json_path}")
|
||||
return
|
||||
|
||||
if not sys.stdin.isatty():
|
||||
try:
|
||||
hook_data = json.load(sys.stdin)
|
||||
except json.JSONDecodeError as exc:
|
||||
print(f"save-transcript: bad hook JSON: {exc}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
jsonl_path = hook_data.get("transcript_path")
|
||||
if not jsonl_path or not os.path.exists(jsonl_path):
|
||||
session_id = hook_data.get("session_id", "")
|
||||
jsonl_path = find_jsonl_by_session(session_id) if session_id else None
|
||||
|
||||
if not jsonl_path or not os.path.exists(jsonl_path):
|
||||
print("save-transcript: cannot locate JSONL for this session", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
cwd = hook_data.get("cwd") or os.getcwd()
|
||||
output_dir = os.path.join(cwd, "transcripts")
|
||||
try:
|
||||
md_path, json_path = save(jsonl_path, output_dir)
|
||||
print(f"Transcript saved:\n {md_path}\n {json_path}")
|
||||
except Exception as exc:
|
||||
print(f"save-transcript ERROR: {exc}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
return
|
||||
|
||||
print(__doc__)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
### 2. Register the Stop hook
|
||||
|
||||
Claude Code runs hooks defined in `~/.claude/settings.json`. The `Stop` event fires each time Claude finishes a response turn.
|
||||
|
||||
Edit (or create) `~/.claude/settings.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"hooks": {
|
||||
"Stop": [
|
||||
{
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "python3 /YOUR_HOME/.claude/scripts/save-transcript.py"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Replace `/YOUR_HOME` with your actual home directory (e.g. `/Users/alice`). Using `~` in the command path does not expand reliably in hooks.
|
||||
|
||||
If `settings.json` already has content, merge the `"hooks"` key into the existing object — don't replace the whole file.
|
||||
|
||||
---
|
||||
|
||||
### 3. (Optional) Add a reminder to CLAUDE.md
|
||||
|
||||
Add this section to `~/.claude/CLAUDE.md` so Claude knows transcripts are being saved:
|
||||
|
||||
```markdown
|
||||
## Transcripts
|
||||
|
||||
Every session is auto-saved to `transcripts/` in the project's working directory via the `Stop` hook.
|
||||
|
||||
File naming: `<project>-<task>-<yyyymmdd>-<hhmmss>.<ext>`
|
||||
- `.json` — all JSONL records as a JSON array (100% fidelity, SHA-256 verified)
|
||||
- `.md` — human-readable markdown (user + assistant turns, tool calls shown)
|
||||
|
||||
Manual save:
|
||||
python3 ~/.claude/scripts/save-transcript.py <jsonl_path> [output_dir]
|
||||
|
||||
Verify saved transcript against source:
|
||||
python3 ~/.claude/scripts/save-transcript.py --verify <transcript.json>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Manual usage
|
||||
|
||||
**Save any session by JSONL path:**
|
||||
```bash
|
||||
python3 ~/.claude/scripts/save-transcript.py \
|
||||
~/.claude/projects/-Users-alice-myproject/abc123.jsonl
|
||||
```
|
||||
|
||||
**Save to a custom directory:**
|
||||
```bash
|
||||
python3 ~/.claude/scripts/save-transcript.py \
|
||||
~/.claude/projects/.../abc123.jsonl \
|
||||
/path/to/output/dir
|
||||
```
|
||||
|
||||
**Verify a saved transcript has not drifted from its source:**
|
||||
```bash
|
||||
python3 ~/.claude/scripts/save-transcript.py --verify \
|
||||
myproject/transcripts/myproject-some-task-20260222-114800.json
|
||||
# → [OK] ... or [MISMATCH] ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## How it works
|
||||
|
||||
Claude Code stores every session as a `.jsonl` file at:
|
||||
```
|
||||
~/.claude/projects/<encoded-cwd>/<session-uuid>.jsonl
|
||||
```
|
||||
|
||||
where `<encoded-cwd>` is the project path with `/` replaced by `-`.
|
||||
|
||||
Each line is a JSON record of type `user`, `assistant`, `queue-operation`, `file-history-snapshot`, etc. The script:
|
||||
|
||||
1. Reads all records from the source JSONL.
|
||||
2. Writes them as a JSON array to `<project>/transcripts/<name>.json`.
|
||||
3. Computes a SHA-256 hash of the canonical (sort-keys) JSON for both source and saved file, and raises an error if they differ.
|
||||
4. Renders a `.md` file from the `user` and `assistant` records, stripping injected system tags and formatting tool calls as fenced code blocks.
|
||||
|
||||
The `Stop` hook passes a JSON object on stdin with at minimum `session_id` and (in recent Claude Code versions) `transcript_path`. The script falls back to scanning `~/.claude/projects/` by session ID if `transcript_path` is absent.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- **One pair of files per session, overwritten each turn.** The timestamp in the filename comes from the first timestamped record in the session, so all saves within a session share the same base name. The final state after the last turn is what remains.
|
||||
- **Python 3.10+ required** (uses the `str | None` union syntax in annotations). Check with `python3 --version`.
|
||||
- **No third-party dependencies** — stdlib only.
|
||||
- The hook fires on every assistant turn, not just at explicit session close, so transcripts are incrementally up to date even if a session is force-quit.
|
||||
|
||||
---
|
||||
|
||||
## TODO
|
||||
|
||||
### 1. Security — encryption at rest
|
||||
|
||||
Transcripts are currently saved as plain text JSON and Markdown. For sessions that include sensitive context (credentials, private code, personal data), this is a risk on shared or unencrypted disks.
|
||||
|
||||
Options to explore:
|
||||
- **Symmetric encryption at save time** — encrypt the `.json` and `.md` files with a key derived from a passphrase (e.g. via `cryptography` / `Fernet`, or `age`). Requires a decryption step before viewing.
|
||||
- **Selective redaction** — pre-process records before saving: strip tool results that match patterns (API keys, tokens, file paths outside the project), log a redaction marker in their place.
|
||||
- **Disk-level encryption** — rely on macOS FileVault or a dedicated encrypted volume for the `transcripts/` directory. Simpler operationally, but doesn't protect against an already-unlocked session.
|
||||
- **Opt-out per project** — check for a `.no-transcript` marker file in the project root and skip saving if found. Useful for projects where you never want anything persisted.
|
||||
|
||||
Key question to answer: is the threat model "protect files if the laptop is stolen" (disk encryption is enough) or "protect against an attacker with filesystem access to a running system" (requires key management)?
|
||||
|
||||
### 2. Interval saves and crash recovery
|
||||
|
||||
The current design saves on every `Stop` event (each assistant turn), which gives reasonable durability — but a very long single turn (e.g. a multi-minute agentic task) won't be checkpointed mid-flight, and force-quitting before Claude responds at all will leave the last user message unsaved.
|
||||
|
||||
Options to explore:
|
||||
- **Periodic background save** — a `launchd` plist (macOS) or `cron` job that runs the save script every N minutes, scanning `~/.claude/projects/` for JSONL files newer than the last saved transcript. This is the most robust recovery path for power interruptions.
|
||||
- **`PreToolUse` hook checkpoint** — fire a lightweight checkpoint save before each tool call. Already partially covered by the current approach (tool results appear in subsequent turns), but worth testing for long agentic sessions.
|
||||
- **Append-only saves** — instead of overwriting the same filename, write a new file each save cycle with an incrementing suffix or current timestamp. Lets you reconstruct sessions from partial saves. Trade-off: many small files.
|
||||
- **Signal handler in the script** — trap `SIGTERM`/`SIGINT` and flush a partial save before exit. Only helps if the process is killed cleanly.
|
||||
|
||||
Worth testing: how large do JSONL files get in a 2-hour agentic session? That determines whether the "overwrite on each turn" approach is fast enough not to lag behind the session.
|
||||
|
||||
### 3. Automatic git commit of transcripts
|
||||
|
||||
Persisting transcripts in a git repository gives versioned history, remote backup, and a searchable audit trail.
|
||||
|
||||
Options to explore:
|
||||
- **Dedicated transcripts repo** — a separate bare repo (local or remote) that only holds transcripts. The save script commits and pushes after each save. Clean separation from project code; no risk of transcript noise in project history.
|
||||
- **Subdirectory in the project repo** — commit `transcripts/` as part of the existing project. Simpler, but means transcript commits appear in the project log. Mitigate by keeping them on a dedicated branch (`transcripts`), or by filtering them out when reading history with a pathspec exclusion, e.g. `git log -- . ':(exclude)transcripts'`.
|
||||
- **Post-save hook extension** — extend `save-transcript.py` to optionally run `git -C <transcripts_dir> add . && git commit -m "transcript: <base_name>"` after a successful save. Add a `--git-push` flag to also push to a remote.
|
||||
- **Rclone / cloud sync** — if the transcripts directory is synced to S3, Backblaze, or similar via `rclone`, git may be unnecessary. Simpler for backup; loses diff history.
|
||||
|
||||
Key decisions: (a) same repo or separate, (b) commit on every turn or only at session end, (c) whether to push automatically (requires network access and auth on every save, which will slow the hook).
|
||||
319
claude-transcripts-setup/scripts/save-transcript.py
Executable file
319
claude-transcripts-setup/scripts/save-transcript.py
Executable file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Save a Claude Code chat session to {project}/transcripts/ as:
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.json — all JSONL records (100% fidelity)
|
||||
<project>-<task>-<yyyymmdd>-<hhmmss>.md — human-readable markdown
|
||||
|
||||
Usage:
|
||||
python3 save-transcript.py <jsonl_path> [output_dir] # manual
|
||||
python3 save-transcript.py --verify <json_path> # verify saved JSON vs source
|
||||
(stdin) # called from Stop hook
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def slugify(text: str, max_len: int = 40) -> str:
    """Turn free text into a lowercase, hyphen-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped; each run of whitespace/underscores becomes a single hyphen.
    The result is cut to ``max_len`` characters and never starts or ends
    with a hyphen. May return an empty string.
    """
    kept = re.sub(r"[^\w\s-]", "", text.lower())
    hyphenated = re.sub(r"[\s_]+", "-", kept).strip("-")
    truncated = hyphenated[:max_len]
    # Truncation can land right after a separator, so trim again.
    return truncated.rstrip("-")
|
||||
|
||||
|
||||
def load_jsonl(path: str) -> list:
    """Parse a JSONL file into a list of decoded records, in file order.

    Blank lines are ignored. A line that is not valid JSON is reported on
    stderr and skipped rather than aborting the load.
    """
    parsed = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as exc:
                print(f" [warn] bad JSON line: {exc}", file=sys.stderr)
                continue
            parsed.append(record)
    return parsed
|
||||
|
||||
|
||||
def find_jsonl_by_session(session_id: str) -> str | None:
|
||||
projects_dir = Path.home() / ".claude" / "projects"
|
||||
for p in projects_dir.rglob(f"{session_id}.jsonl"):
|
||||
return str(p)
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Content extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def content_to_text(content) -> str:
    """Flatten message content (str or list of blocks) to plain text / md.

    A string is returned unchanged; anything that is neither a string nor
    a list yields ``""``. In a list, only dict blocks are considered:

    * ``text`` blocks contribute their text,
    * ``tool_use`` blocks become a "Tool call" heading plus the input as
      pretty-printed JSON in a fenced block,
    * ``tool_result`` blocks become a "Tool result" heading (flagged when
      ``is_error`` is truthy) plus the result in a fenced block.

    Other block types are ignored. Non-empty pieces are joined with a
    blank line between them.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    rendered = []
    for item in content:
        if not isinstance(item, dict):
            continue
        kind = item.get("type", "")

        if kind == "text":
            rendered.append(item.get("text", ""))
            continue

        if kind == "tool_use":
            tool_name = item.get("name", "tool")
            payload = json.dumps(item.get("input", {}), indent=2, ensure_ascii=False)
            rendered.append(f"**Tool call:** `{tool_name}`\n```json\n{payload}\n```")
            continue

        if kind == "tool_result":
            body = item.get("content", "")
            if isinstance(body, list):
                # Keep only the textual parts of a multi-part result.
                texts = [
                    part.get("text", "")
                    for part in body
                    if isinstance(part, dict) and part.get("type") == "text"
                ]
                body = "\n".join(texts)
            suffix = " *(error)*" if item.get("is_error") else ""
            rendered.append(f"**Tool result**{suffix}\n```\n{body}\n```")

    return "\n\n".join(filter(None, rendered))
|
||||
|
||||
|
||||
def strip_system_tags(text: str) -> str:
    """Remove <system-reminder>, <ide_*>, and similar injected tags.

    Paired ``<system-reminder>...</system-reminder>`` and
    ``<ide_*>...</ide_*>`` elements are removed together with their
    content. Any leftover opening or self-closing ``<system-reminder ...>``
    tag is removed on its own, leaving the surrounding text intact.
    Returns the remaining text with leading/trailing whitespace stripped.
    """
    text = re.sub(r"<system-reminder>.*?</system-reminder>", "", text, flags=re.DOTALL)
    text = re.sub(r"<ide_[^>]*>.*?</ide_[^>]*>", "", text, flags=re.DOTALL)
    # Stop at the tag's own closing '>' so that a later '>' in the user's
    # text (e.g. "a > b") is not swallowed along with everything before it.
    text = re.sub(r"<system-reminder[^>]*>", "", text)
    return text.strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metadata derivation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def derive_project_name(cwd: str) -> str:
    """Return a slug for the project, taken from the basename of ``cwd``.

    Falls back to ``"project"`` when the basename is empty (e.g. ``/``) or
    slugifies to nothing (e.g. a directory named ``___``), so the result
    is never an empty string.
    """
    # Apply the fallback after slugifying: a non-empty basename can still
    # produce an empty slug.
    return slugify(Path(cwd.rstrip("/")).name) or "project"
|
||||
|
||||
|
||||
def derive_task_name(records: list) -> str:
    """Derive a task slug from the first substantive user message.

    Each ``user`` record is flattened to text, stripped of injected system
    tags and ``[Request interrupted ...]`` markers, and used only if more
    than 5 characters remain. The slug is built from the first line of
    that text. Returns ``"session"`` when no record qualifies or the slug
    comes out empty.
    """
    user_records = (rec for rec in records if rec.get("type") == "user")
    for rec in user_records:
        raw = content_to_text(rec.get("message", {}).get("content", ""))
        without_tags = strip_system_tags(raw)
        cleaned = re.sub(r"\[Request interrupted[^\]]*\]", "", without_tags).strip()
        if len(cleaned) <= 5:
            # Too short to be a meaningful task description; keep looking.
            continue
        headline = cleaned.split("\n")[0].strip()
        return slugify(headline) or "session"
    return "session"
|
||||
|
||||
|
||||
def derive_timestamp(records: list) -> datetime:
    """Return the first parseable ``timestamp`` found in ``records``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing as ISO 8601.
    Records with a missing, empty or unparseable timestamp are skipped; if
    none parses, the current UTC time is returned.
    """
    for rec in records:
        stamp = rec.get("timestamp", "")
        if not stamp:
            continue
        try:
            return datetime.fromisoformat(stamp.replace("Z", "+00:00"))
        except ValueError:
            continue
    return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown rendering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def render_markdown(records: list, session_id: str, cwd: str) -> str:
    """Render the user/assistant records of a session as Markdown.

    The document starts with a title carrying ``session_id`` and a short
    header giving ``cwd`` and the number of user/assistant records. Each
    record with non-empty text then gets a ``### User`` / ``### Assistant``
    section (with its timestamp when present) followed by a horizontal
    rule. Injected system tags are stripped from user messages only.
    """
    # A Markdown hard line break needs two trailing spaces; with fewer the
    # Project and Messages lines are rendered as one paragraph. Built from
    # a named constant so the whitespace cannot be lost silently.
    hard_break = "  "

    messages = [r for r in records if r.get("type") in ("user", "assistant")]
    lines = [
        f"# Transcript: {session_id}",
        "",
        f"**Project:** {cwd}{hard_break}",
        f"**Messages:** {len(messages)}",
        "",
        "---",
        "",
    ]
    for msg in messages:
        is_user = msg.get("type") == "user"
        text = content_to_text(msg.get("message", {}).get("content", ""))
        if is_user:
            text = strip_system_tags(text)
        text = text.strip()
        if not text:
            # Nothing left to show for this record.
            continue
        header = f"### {'User' if is_user else 'Assistant'}"
        ts = msg.get("timestamp", "")
        if ts:
            header += f" *({ts})*"
        lines.extend([header, "", text, "", "---", ""])
    return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Save
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def save(jsonl_path: str, output_dir: str | None) -> tuple[str, str]:
    """Write the JSON and Markdown transcripts for one session JSONL file.

    Args:
        jsonl_path: Path to the source session ``.jsonl`` file.
        output_dir: Directory to write into. When ``None`` the files go to
            ``<cwd>/transcripts``, where ``cwd`` is read from the records
            (falling back to the process working directory).

    Returns:
        ``(md_path, json_path)`` for the two files written.

    Raises:
        ValueError: If no record could be parsed from ``jsonl_path``.
        RuntimeError: If the saved JSON does not hash to the same value as
            the records that were loaded.
    """
    records = load_jsonl(jsonl_path)
    if not records:
        raise ValueError(f"No records found in {jsonl_path}")

    # Look the two metadata fields up independently: the first record that
    # carries a sessionId is not guaranteed to carry a cwd as well.
    session_id = next(
        (r["sessionId"] for r in records if r.get("sessionId")),
        None,
    ) or Path(jsonl_path).stem
    cwd = next((r["cwd"] for r in records if r.get("cwd")), None) or os.getcwd()

    project = derive_project_name(cwd)
    task = derive_task_name(records)
    dt = derive_timestamp(records)
    ts_str = dt.strftime("%Y%m%d-%H%M%S")
    base = f"{project}-{task}-{ts_str}"

    if output_dir is None:
        output_dir = os.path.join(cwd, "transcripts")
    os.makedirs(output_dir, exist_ok=True)

    json_path = os.path.join(output_dir, f"{base}.json")
    md_path = os.path.join(output_dir, f"{base}.md")

    # JSON: all records (100% fidelity — identical content to source JSONL)
    with open(json_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)

    # Verify: re-read what was written and compare it with the records that
    # were loaded. The source hash is taken from the in-memory records, not
    # from a second read of the JSONL, so a source file that has grown in
    # the meantime cannot cause a false failure. Same canonical form as
    # _jsonl_hash / _json_array_hash.
    canonical = json.dumps(records, sort_keys=True, ensure_ascii=False)
    source_hash = hashlib.sha256(canonical.encode()).hexdigest()
    saved_hash = _json_array_hash(json_path)
    if source_hash != saved_hash:
        raise RuntimeError(
            f"Fidelity check FAILED: source={source_hash} saved={saved_hash}\n"
            f" Source: {jsonl_path}\n Saved: {json_path}"
        )

    # Markdown: human-readable rendering
    md = render_markdown(records, session_id, cwd)
    with open(md_path, "w", encoding="utf-8") as fh:
        fh.write(md)

    return md_path, json_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fidelity verification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _jsonl_hash(path: str) -> str:
    """Return the SHA-256 hex digest of the records parsed from a JSONL file.

    The records are serialised as one JSON array with sorted keys before
    hashing, so the digest depends on record order and content but not on
    the key order or whitespace of the source lines.
    """
    serialized = json.dumps(load_jsonl(path), sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(serialized.encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _json_array_hash(path: str) -> str:
    """Return the SHA-256 hex digest of the data stored in a .json file.

    The file is parsed and re-serialised with sorted keys before hashing,
    so indentation and key order in the file do not affect the digest.
    """
    with open(path, "r", encoding="utf-8") as fh:
        payload = json.loads(fh.read())
    serialized = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(serialized.encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
def verify(json_path: str) -> bool:
    """Verify a saved .json transcript against its source JSONL.

    The session id is read from the first saved record that has one and
    used to locate the source file under ``~/.claude/projects``. Both
    hashes are printed together with an ``[OK]`` / ``[MISMATCH]`` line.

    Returns:
        ``True`` when the hashes match; ``False`` on a mismatch, or when
        the session id or the source file cannot be found (an error is
        written to stderr in those cases).
    """
    with open(json_path, "r", encoding="utf-8") as fh:
        saved_records = json.load(fh)

    session_id = None
    for rec in saved_records:
        if rec.get("sessionId"):
            session_id = rec["sessionId"]
            break
    if session_id is None:
        print("ERROR: no sessionId found in saved transcript", file=sys.stderr)
        return False

    source_path = find_jsonl_by_session(session_id)
    if not source_path:
        print(f"ERROR: source JSONL not found for session {session_id}", file=sys.stderr)
        return False

    source_hash = _jsonl_hash(source_path)
    saved_hash = _json_array_hash(json_path)
    matches = source_hash == saved_hash

    label = "OK" if matches else "MISMATCH"
    report = [
        f"[{label}] {json_path}",
        f" source: {source_path}",
        f" source hash: {source_hash}",
        f" saved hash: {saved_hash}",
    ]
    print("\n".join(report))
    return matches
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Command-line entry point.

    Modes, checked in this order:

    * ``-h`` / ``--help``: print the usage text and exit 0.
    * ``--verify <json_path>``: compare a saved transcript with its source
      and exit 0 on a match, 1 otherwise.
    * ``<jsonl_path> [output_dir]``: save that session manually.
    * no arguments, stdin not a TTY: read the hook's JSON object from
      stdin and save the session it describes, exiting 1 on any failure.
    * otherwise: print the usage text and exit 1.
    """
    args = sys.argv[1:]

    if args and args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    # --verify mode
    if args and args[0] == "--verify":
        if len(args) < 2:
            # Without this guard "--verify" would fall through to manual
            # mode and be opened as if it were a JSONL path.
            print("save-transcript: --verify requires a <json_path> argument", file=sys.stderr)
            print(__doc__, file=sys.stderr)
            sys.exit(1)
        ok = verify(args[1])
        sys.exit(0 if ok else 1)

    # Manual invocation
    if args:
        jsonl_path = args[0]
        output_dir = args[1] if len(args) >= 2 else None
        md_path, json_path = save(jsonl_path, output_dir)
        print(f"Saved:\n {md_path}\n {json_path}")
        return

    # Called from Stop hook (JSON on stdin)
    if not sys.stdin.isatty():
        try:
            hook_data = json.load(sys.stdin)
        except json.JSONDecodeError as exc:
            print(f"save-transcript: bad hook JSON: {exc}", file=sys.stderr)
            sys.exit(1)
        if not isinstance(hook_data, dict):
            # Valid JSON that is not an object has no fields to read.
            print("save-transcript: bad hook JSON: expected an object", file=sys.stderr)
            sys.exit(1)

        # Prefer the path given by the hook; otherwise search by session id.
        jsonl_path = hook_data.get("transcript_path")
        if not jsonl_path or not os.path.exists(jsonl_path):
            session_id = hook_data.get("session_id", "")
            jsonl_path = find_jsonl_by_session(session_id) if session_id else None

        if not jsonl_path or not os.path.exists(jsonl_path):
            print("save-transcript: cannot locate JSONL for this session", file=sys.stderr)
            sys.exit(1)

        cwd = hook_data.get("cwd") or os.getcwd()
        output_dir = os.path.join(cwd, "transcripts")
        try:
            md_path, json_path = save(jsonl_path, output_dir)
            print(f"Transcript saved:\n {md_path}\n {json_path}")
        except Exception as exc:
            # Top-level boundary for the hook: report and exit non-zero
            # instead of dumping a traceback.
            print(f"save-transcript ERROR: {exc}", file=sys.stderr)
            sys.exit(1)
        return

    print(__doc__)
    sys.exit(1)
|
||||
|
||||
|
||||
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user