From 85c5b0e01dbd2eeabcc6ad9fd37e98a7ffa91448 Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Fri, 3 Apr 2026 04:00:33 +0000 Subject: [PATCH] Expand parity harness coverage before behavioral drift lands The landed mock Anthropic harness now covers multi-tool turns, bash flows, permission prompt approve/deny paths, and an external plugin tool path. A machine-readable scenario manifest plus a diff/checklist runner keep the new scenarios tied back to PARITY.md so future additions stay honest. Constraint: Must build on the deterministic mock service and clean-environment CLI harness Rejected: Add an MCP tool scenario now | current MCP tool surface is still stubbed, so plugin coverage is the real executable path Confidence: high Scope-risk: moderate Reversibility: clean Directive: Keep rust/mock_parity_scenarios.json, mock_parity_harness.rs, and PARITY.md refs in lockstep Tested: cargo fmt --all Tested: cargo clippy --workspace --all-targets -- -D warnings Tested: cargo test --workspace Tested: python3 rust/scripts/run_mock_parity_diff.py Not-tested: Real MCP lifecycle handshakes; remote plugin marketplace install flows --- PARITY.md | 28 +- rust/MOCK_PARITY_HARNESS.md | 14 + rust/README.md | 7 + rust/crates/mock-anthropic-service/src/lib.rs | 357 +++++++++- .../tests/mock_parity_harness.rs | 626 ++++++++++++++++-- rust/mock_parity_scenarios.json | 92 +++ rust/scripts/run_mock_parity_diff.py | 130 ++++ 7 files changed, 1154 insertions(+), 100 deletions(-) create mode 100644 rust/mock_parity_scenarios.json create mode 100755 rust/scripts/run_mock_parity_diff.py diff --git a/PARITY.md b/PARITY.md index c8c2f1c..b573509 100644 --- a/PARITY.md +++ b/PARITY.md @@ -1,6 +1,6 @@ # Parity Status — claw-code Rust Port -Last updated: 2026-04-03 (`03bd7f0`) +Last updated: 2026-04-03 ## Mock parity harness — milestone 1 @@ -8,6 +8,24 @@ Last updated: 2026-04-03 (`03bd7f0`) - [x] Reproducible clean-environment CLI harness (`rust/crates/rusty-claude-cli/tests/mock_parity_harness.rs`) 
- [x] Scripted scenarios: `streaming_text`, `read_file_roundtrip`, `grep_chunk_assembly`, `write_file_allowed`, `write_file_denied` +## Mock parity harness — milestone 2 (behavioral expansion) + +- [x] Scripted multi-tool turn coverage: `multi_tool_turn_roundtrip` +- [x] Scripted bash coverage: `bash_stdout_roundtrip` +- [x] Scripted permission prompt coverage: `bash_permission_prompt_approved`, `bash_permission_prompt_denied` +- [x] Scripted plugin-path coverage: `plugin_tool_roundtrip` +- [x] Behavioral diff/checklist runner: `rust/scripts/run_mock_parity_diff.py` + +## Harness v2 behavioral checklist + +Canonical scenario map: `rust/mock_parity_scenarios.json` + +- Multi-tool assistant turns +- Bash flow roundtrips +- Permission enforcement across tool paths +- Plugin tool execution path +- File tools — harness-validated flows + ## Tool Surface: 40/40 (spec parity) ### Real Implementations (behavioral parity — varying depth) @@ -79,17 +97,23 @@ Last updated: 2026-04-03 (`03bd7f0`) - [ ] `modeValidation` — validate against current permission mode - [ ] `shouldUseSandbox` — sandbox decision logic +Harness note: milestone 2 validates bash success plus workspace-write escalation approve/deny flows, but the deeper validation/security submodules above are still open. + **File tools — need verification:** - [ ] Path traversal prevention (symlink following, ../ escapes) - [ ] Size limits on read/write - [ ] Binary file detection - [ ] Permission mode enforcement (read-only vs workspace-write) +Harness note: read_file, grep_search, write_file allow/deny, and multi-tool same-turn assembly are now covered by the mock parity harness. 
+ **Config/Plugin/MCP flows:** - [ ] Full MCP server lifecycle (connect, list tools, call tool, disconnect) - [ ] Plugin install/enable/disable/uninstall full flow - [ ] Config merge precedence (user > project > local) +Harness note: external plugin discovery + execution is now covered via `plugin_tool_roundtrip`; full lifecycle and MCP behavior remain open. + ## Runtime Behavioral Gaps - [ ] Permission enforcement across all tools (read-only, workspace-write, danger-full-access) @@ -98,6 +122,8 @@ Last updated: 2026-04-03 (`03bd7f0`) - [ ] Token counting / cost tracking accuracy - [x] Streaming response support validated by the mock parity harness +Harness note: current coverage now includes write-file denial, bash escalation approve/deny, and plugin workspace-write execution paths. + ## Migration Readiness - [ ] `PARITY.md` maintained and honest diff --git a/rust/MOCK_PARITY_HARNESS.md b/rust/MOCK_PARITY_HARNESS.md index 1d5cd05..bc38466 100644 --- a/rust/MOCK_PARITY_HARNESS.md +++ b/rust/MOCK_PARITY_HARNESS.md @@ -17,6 +17,11 @@ The harness runs these scripted scenarios against a fresh workspace and isolated 3. `grep_chunk_assembly` 4. `write_file_allowed` 5. `write_file_denied` +6. `multi_tool_turn_roundtrip` +7. `bash_stdout_roundtrip` +8. `bash_permission_prompt_approved` +9. `bash_permission_prompt_denied` +10. `plugin_tool_roundtrip` ## Run @@ -25,6 +30,15 @@ cd rust/ ./scripts/run_mock_parity_harness.sh ``` +Behavioral checklist / parity diff: + +```bash +cd rust/ +python3 scripts/run_mock_parity_diff.py +``` + +Scenario-to-PARITY mappings live in `mock_parity_scenarios.json`. 
+ ## Manual mock server ```bash diff --git a/rust/README.md b/rust/README.md index f13998b..2ddbf5c 100644 --- a/rust/README.md +++ b/rust/README.md @@ -56,12 +56,19 @@ Harness coverage: - `grep_chunk_assembly` - `write_file_allowed` - `write_file_denied` +- `multi_tool_turn_roundtrip` +- `bash_stdout_roundtrip` +- `bash_permission_prompt_approved` +- `bash_permission_prompt_denied` +- `plugin_tool_roundtrip` Primary artifacts: - `crates/mock-anthropic-service/` — reusable mock Anthropic-compatible service - `crates/rusty-claude-cli/tests/mock_parity_harness.rs` — clean-env CLI harness - `scripts/run_mock_parity_harness.sh` — reproducible wrapper +- `scripts/run_mock_parity_diff.py` — scenario checklist + PARITY mapping runner +- `mock_parity_scenarios.json` — scenario-to-PARITY manifest ## Features diff --git a/rust/crates/mock-anthropic-service/src/lib.rs b/rust/crates/mock-anthropic-service/src/lib.rs index b327c26..232417e 100644 --- a/rust/crates/mock-anthropic-service/src/lib.rs +++ b/rust/crates/mock-anthropic-service/src/lib.rs @@ -93,6 +93,11 @@ enum Scenario { GrepChunkAssembly, WriteFileAllowed, WriteFileDenied, + MultiToolTurnRoundtrip, + BashStdoutRoundtrip, + BashPermissionPromptApproved, + BashPermissionPromptDenied, + PluginToolRoundtrip, } impl Scenario { @@ -103,6 +108,11 @@ impl Scenario { "grep_chunk_assembly" => Some(Self::GrepChunkAssembly), "write_file_allowed" => Some(Self::WriteFileAllowed), "write_file_denied" => Some(Self::WriteFileDenied), + "multi_tool_turn_roundtrip" => Some(Self::MultiToolTurnRoundtrip), + "bash_stdout_roundtrip" => Some(Self::BashStdoutRoundtrip), + "bash_permission_prompt_approved" => Some(Self::BashPermissionPromptApproved), + "bash_permission_prompt_denied" => Some(Self::BashPermissionPromptDenied), + "plugin_tool_roundtrip" => Some(Self::PluginToolRoundtrip), _ => None, } } @@ -114,6 +124,11 @@ impl Scenario { Self::GrepChunkAssembly => "grep_chunk_assembly", Self::WriteFileAllowed => "write_file_allowed", 
Self::WriteFileDenied => "write_file_denied", + Self::MultiToolTurnRoundtrip => "multi_tool_turn_roundtrip", + Self::BashStdoutRoundtrip => "bash_stdout_roundtrip", + Self::BashPermissionPromptApproved => "bash_permission_prompt_approved", + Self::BashPermissionPromptDenied => "bash_permission_prompt_denied", + Self::PluginToolRoundtrip => "plugin_tool_roundtrip", } } } @@ -243,6 +258,38 @@ fn latest_tool_result(request: &MessageRequest) -> Option<(String, bool)> { }) } +fn tool_results_by_name(request: &MessageRequest) -> HashMap { + let mut tool_names_by_id = HashMap::new(); + for message in &request.messages { + for block in &message.content { + if let InputContentBlock::ToolUse { id, name, .. } = block { + tool_names_by_id.insert(id.clone(), name.clone()); + } + } + } + + let mut results = HashMap::new(); + for message in request.messages.iter().rev() { + for block in message.content.iter().rev() { + if let InputContentBlock::ToolResult { + tool_use_id, + content, + is_error, + } = block + { + let tool_name = tool_names_by_id + .get(tool_use_id) + .cloned() + .unwrap_or_else(|| tool_use_id.clone()); + results + .entry(tool_name) + .or_insert_with(|| (flatten_tool_result_content(content), *is_error)); + } + } + } + results +} + fn flatten_tool_result_content(content: &[api::ToolResultContentBlock]) -> String { content .iter() @@ -276,6 +323,7 @@ fn build_http_response(request: &MessageRequest, scenario: Scenario) -> String { ) } +#[allow(clippy::too_many_lines)] fn build_stream_body(request: &MessageRequest, scenario: Scenario) -> String { match scenario { Scenario::StreamingText => streaming_text_sse(), @@ -326,9 +374,88 @@ fn build_stream_body(request: &MessageRequest, scenario: Scenario) -> String { &[r#"{"path":"generated/denied.txt","content":"should not exist\n"}"#], ), }, + Scenario::MultiToolTurnRoundtrip => { + let tool_results = tool_results_by_name(request); + match ( + tool_results.get("read_file"), + tool_results.get("grep_search"), + ) { + 
(Some((read_output, _)), Some((grep_output, _))) => final_text_sse(&format!( + "multi-tool roundtrip complete: {} / {} occurrences", + extract_read_content(read_output), + extract_num_matches(grep_output) + )), + _ => tool_uses_sse(&[ + ToolUseSse { + tool_id: "toolu_multi_read", + tool_name: "read_file", + partial_json_chunks: &[r#"{"path":"fixture.txt"}"#], + }, + ToolUseSse { + tool_id: "toolu_multi_grep", + tool_name: "grep_search", + partial_json_chunks: &[ + "{\"pattern\":\"par", + "ity\",\"path\":\"fixture.txt\"", + ",\"output_mode\":\"count\"}", + ], + }, + ]), + } + } + Scenario::BashStdoutRoundtrip => match latest_tool_result(request) { + Some((tool_output, _)) => final_text_sse(&format!( + "bash completed: {}", + extract_bash_stdout(&tool_output) + )), + None => tool_use_sse( + "toolu_bash_stdout", + "bash", + &[r#"{"command":"printf 'alpha from bash'","timeout":1000}"#], + ), + }, + Scenario::BashPermissionPromptApproved => match latest_tool_result(request) { + Some((tool_output, is_error)) => { + if is_error { + final_text_sse(&format!("bash approval unexpectedly failed: {tool_output}")) + } else { + final_text_sse(&format!( + "bash approved and executed: {}", + extract_bash_stdout(&tool_output) + )) + } + } + None => tool_use_sse( + "toolu_bash_prompt_allow", + "bash", + &[r#"{"command":"printf 'approved via prompt'","timeout":1000}"#], + ), + }, + Scenario::BashPermissionPromptDenied => match latest_tool_result(request) { + Some((tool_output, _)) => { + final_text_sse(&format!("bash denied as expected: {tool_output}")) + } + None => tool_use_sse( + "toolu_bash_prompt_deny", + "bash", + &[r#"{"command":"printf 'should not run'","timeout":1000}"#], + ), + }, + Scenario::PluginToolRoundtrip => match latest_tool_result(request) { + Some((tool_output, _)) => final_text_sse(&format!( + "plugin tool completed: {}", + extract_plugin_message(&tool_output) + )), + None => tool_use_sse( + "toolu_plugin_echo", + "plugin_echo", + &[r#"{"message":"hello from 
plugin parity"}"#], + ), + }, } } +#[allow(clippy::too_many_lines)] fn build_message_response(request: &MessageRequest, scenario: Scenario) -> MessageResponse { match scenario { Scenario::StreamingText => text_message_response( @@ -389,6 +516,100 @@ fn build_message_response(request: &MessageRequest, scenario: Scenario) -> Messa json!({"path": "generated/denied.txt", "content": "should not exist\n"}), ), }, + Scenario::MultiToolTurnRoundtrip => { + let tool_results = tool_results_by_name(request); + match ( + tool_results.get("read_file"), + tool_results.get("grep_search"), + ) { + (Some((read_output, _)), Some((grep_output, _))) => text_message_response( + "msg_multi_tool_final", + &format!( + "multi-tool roundtrip complete: {} / {} occurrences", + extract_read_content(read_output), + extract_num_matches(grep_output) + ), + ), + _ => tool_message_response_many( + "msg_multi_tool_start", + &[ + ToolUseMessage { + tool_id: "toolu_multi_read", + tool_name: "read_file", + input: json!({"path": "fixture.txt"}), + }, + ToolUseMessage { + tool_id: "toolu_multi_grep", + tool_name: "grep_search", + input: json!({"pattern": "parity", "path": "fixture.txt", "output_mode": "count"}), + }, + ], + ), + } + } + Scenario::BashStdoutRoundtrip => match latest_tool_result(request) { + Some((tool_output, _)) => text_message_response( + "msg_bash_stdout_final", + &format!("bash completed: {}", extract_bash_stdout(&tool_output)), + ), + None => tool_message_response( + "msg_bash_stdout_tool", + "toolu_bash_stdout", + "bash", + json!({"command": "printf 'alpha from bash'", "timeout": 1000}), + ), + }, + Scenario::BashPermissionPromptApproved => match latest_tool_result(request) { + Some((tool_output, is_error)) => { + if is_error { + text_message_response( + "msg_bash_prompt_allow_error", + &format!("bash approval unexpectedly failed: {tool_output}"), + ) + } else { + text_message_response( + "msg_bash_prompt_allow_final", + &format!( + "bash approved and executed: {}", + 
extract_bash_stdout(&tool_output) + ), + ) + } + } + None => tool_message_response( + "msg_bash_prompt_allow_tool", + "toolu_bash_prompt_allow", + "bash", + json!({"command": "printf 'approved via prompt'", "timeout": 1000}), + ), + }, + Scenario::BashPermissionPromptDenied => match latest_tool_result(request) { + Some((tool_output, _)) => text_message_response( + "msg_bash_prompt_deny_final", + &format!("bash denied as expected: {tool_output}"), + ), + None => tool_message_response( + "msg_bash_prompt_deny_tool", + "toolu_bash_prompt_deny", + "bash", + json!({"command": "printf 'should not run'", "timeout": 1000}), + ), + }, + Scenario::PluginToolRoundtrip => match latest_tool_result(request) { + Some((tool_output, _)) => text_message_response( + "msg_plugin_tool_final", + &format!( + "plugin tool completed: {}", + extract_plugin_message(&tool_output) + ), + ), + None => tool_message_response( + "msg_plugin_tool_start", + "toolu_plugin_echo", + "plugin_echo", + json!({"message": "hello from plugin parity"}), + ), + }, } } @@ -399,6 +620,11 @@ fn request_id_for(scenario: Scenario) -> &'static str { Scenario::GrepChunkAssembly => "req_grep_chunk_assembly", Scenario::WriteFileAllowed => "req_write_file_allowed", Scenario::WriteFileDenied => "req_write_file_denied", + Scenario::MultiToolTurnRoundtrip => "req_multi_tool_turn_roundtrip", + Scenario::BashStdoutRoundtrip => "req_bash_stdout_roundtrip", + Scenario::BashPermissionPromptApproved => "req_bash_permission_prompt_approved", + Scenario::BashPermissionPromptDenied => "req_bash_permission_prompt_denied", + Scenario::PluginToolRoundtrip => "req_plugin_tool_roundtrip", } } @@ -441,15 +667,35 @@ fn tool_message_response( tool_name: &str, input: Value, ) -> MessageResponse { + tool_message_response_many( + id, + &[ToolUseMessage { + tool_id, + tool_name, + input, + }], + ) +} + +struct ToolUseMessage<'a> { + tool_id: &'a str, + tool_name: &'a str, + input: Value, +} + +fn tool_message_response_many(id: &str, tool_uses: 
&[ToolUseMessage<'_>]) -> MessageResponse { MessageResponse { id: id.to_string(), kind: "message".to_string(), role: "assistant".to_string(), - content: vec![OutputContentBlock::ToolUse { - id: tool_id.to_string(), - name: tool_name.to_string(), - input, - }], + content: tool_uses + .iter() + .map(|tool_use| OutputContentBlock::ToolUse { + id: tool_use.tool_id.to_string(), + name: tool_use.tool_name.to_string(), + input: tool_use.input.clone(), + }) + .collect(), model: DEFAULT_MODEL.to_string(), stop_reason: Some("tool_use".to_string()), stop_sequence: None, @@ -531,14 +777,32 @@ fn streaming_text_sse() -> String { } fn tool_use_sse(tool_id: &str, tool_name: &str, partial_json_chunks: &[&str]) -> String { + tool_uses_sse(&[ToolUseSse { + tool_id, + tool_name, + partial_json_chunks, + }]) +} + +struct ToolUseSse<'a> { + tool_id: &'a str, + tool_name: &'a str, + partial_json_chunks: &'a [&'a str], +} + +fn tool_uses_sse(tool_uses: &[ToolUseSse<'_>]) -> String { let mut body = String::new(); + let message_id = tool_uses.first().map_or_else( + || "msg_tool_use".to_string(), + |tool_use| format!("msg_{}", tool_use.tool_id), + ); append_sse( &mut body, "message_start", json!({ "type": "message_start", "message": { - "id": format!("msg_{tool_id}"), + "id": message_id, "type": "message", "role": "assistant", "content": [], @@ -549,39 +813,41 @@ fn tool_use_sse(tool_id: &str, tool_name: &str, partial_json_chunks: &[&str]) -> } }), ); - append_sse( - &mut body, - "content_block_start", - json!({ - "type": "content_block_start", - "index": 0, - "content_block": { - "type": "tool_use", - "id": tool_id, - "name": tool_name, - "input": {} - } - }), - ); - for chunk in partial_json_chunks { + for (index, tool_use) in tool_uses.iter().enumerate() { append_sse( &mut body, - "content_block_delta", + "content_block_start", json!({ - "type": "content_block_delta", - "index": 0, - "delta": {"type": "input_json_delta", "partial_json": chunk} + "type": "content_block_start", + "index": 
index, + "content_block": { + "type": "tool_use", + "id": tool_use.tool_id, + "name": tool_use.tool_name, + "input": {} + } + }), + ); + for chunk in tool_use.partial_json_chunks { + append_sse( + &mut body, + "content_block_delta", + json!({ + "type": "content_block_delta", + "index": index, + "delta": {"type": "input_json_delta", "partial_json": chunk} + }), + ); + } + append_sse( + &mut body, + "content_block_stop", + json!({ + "type": "content_block_stop", + "index": index }), ); } - append_sse( - &mut body, - "content_block_stop", - json!({ - "type": "content_block_stop", - "index": 0 - }), - ); append_sse( &mut body, "message_delta", @@ -710,3 +976,28 @@ fn extract_file_path(tool_output: &str) -> String { }) .unwrap_or_else(|| tool_output.trim().to_string()) } + +fn extract_bash_stdout(tool_output: &str) -> String { + serde_json::from_str::(tool_output) + .ok() + .and_then(|value| { + value + .get("stdout") + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .unwrap_or_else(|| tool_output.trim().to_string()) +} + +fn extract_plugin_message(tool_output: &str) -> String { + serde_json::from_str::(tool_output) + .ok() + .and_then(|value| { + value + .get("input") + .and_then(|input| input.get("message")) + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .unwrap_or_else(|| tool_output.trim().to_string()) +} diff --git a/rust/crates/rusty-claude-cli/tests/mock_parity_harness.rs b/rust/crates/rusty-claude-cli/tests/mock_parity_harness.rs index 9dcfc47..e70667a 100644 --- a/rust/crates/rusty-claude-cli/tests/mock_parity_harness.rs +++ b/rust/crates/rusty-claude-cli/tests/mock_parity_harness.rs @@ -1,16 +1,26 @@ +use std::collections::BTreeMap; use std::fs; +use std::io::Write; +use std::os::unix::fs::PermissionsExt; use std::path::{Path, PathBuf}; -use std::process::{Command, Output}; +use std::process::{Command, Output, Stdio}; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{SystemTime, UNIX_EPOCH}; use 
mock_anthropic_service::{MockAnthropicService, SCENARIO_PREFIX}; -use serde_json::Value; +use serde_json::{json, Value}; static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0); #[test] +#[allow(clippy::too_many_lines)] fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios() { + let manifest_entries = load_scenario_manifest(); + let manifest = manifest_entries + .iter() + .cloned() + .map(|entry| (entry.name.clone(), entry)) + .collect::>(); let runtime = tokio::runtime::Runtime::new().expect("tokio runtime should build"); let server = runtime .block_on(MockAnthropicService::spawn()) @@ -22,53 +32,121 @@ fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios name: "streaming_text", permission_mode: "read-only", allowed_tools: None, - seed: seed_noop, + stdin: None, + prepare: prepare_noop, assert: assert_streaming_text, }, ScenarioCase { name: "read_file_roundtrip", permission_mode: "read-only", allowed_tools: Some("read_file"), - seed: seed_read_fixture, + stdin: None, + prepare: prepare_read_fixture, assert: assert_read_file_roundtrip, }, ScenarioCase { name: "grep_chunk_assembly", permission_mode: "read-only", allowed_tools: Some("grep_search"), - seed: seed_grep_fixture, + stdin: None, + prepare: prepare_grep_fixture, assert: assert_grep_chunk_assembly, }, ScenarioCase { name: "write_file_allowed", permission_mode: "workspace-write", allowed_tools: Some("write_file"), - seed: seed_noop, + stdin: None, + prepare: prepare_noop, assert: assert_write_file_allowed, }, ScenarioCase { name: "write_file_denied", permission_mode: "read-only", allowed_tools: Some("write_file"), - seed: seed_noop, + stdin: None, + prepare: prepare_noop, assert: assert_write_file_denied, }, + ScenarioCase { + name: "multi_tool_turn_roundtrip", + permission_mode: "read-only", + allowed_tools: Some("read_file,grep_search"), + stdin: None, + prepare: prepare_multi_tool_fixture, + assert: assert_multi_tool_turn_roundtrip, + }, + ScenarioCase { + 
name: "bash_stdout_roundtrip", + permission_mode: "danger-full-access", + allowed_tools: Some("bash"), + stdin: None, + prepare: prepare_noop, + assert: assert_bash_stdout_roundtrip, + }, + ScenarioCase { + name: "bash_permission_prompt_approved", + permission_mode: "workspace-write", + allowed_tools: Some("bash"), + stdin: Some("y\n"), + prepare: prepare_noop, + assert: assert_bash_permission_prompt_approved, + }, + ScenarioCase { + name: "bash_permission_prompt_denied", + permission_mode: "workspace-write", + allowed_tools: Some("bash"), + stdin: Some("n\n"), + prepare: prepare_noop, + assert: assert_bash_permission_prompt_denied, + }, + ScenarioCase { + name: "plugin_tool_roundtrip", + permission_mode: "workspace-write", + allowed_tools: None, + stdin: None, + prepare: prepare_plugin_fixture, + assert: assert_plugin_tool_roundtrip, + }, ]; + let case_names = cases.iter().map(|case| case.name).collect::>(); + let manifest_names = manifest_entries + .iter() + .map(|entry| entry.name.as_str()) + .collect::>(); + assert_eq!( + case_names, manifest_names, + "manifest and harness cases must stay aligned" + ); + + let mut scenario_reports = Vec::new(); + for case in cases { - let workspace = unique_temp_dir(case.name); - fs::create_dir_all(&workspace).expect("workspace should exist"); - (case.seed)(&workspace); - let response = run_case(case, &workspace, &base_url); - (case.assert)(&workspace, &response); - fs::remove_dir_all(&workspace).expect("workspace cleanup should succeed"); + let workspace = HarnessWorkspace::new(unique_temp_dir(case.name)); + workspace.create().expect("workspace should exist"); + (case.prepare)(&workspace); + + let run = run_case(case, &workspace, &base_url); + (case.assert)(&workspace, &run); + + let manifest_entry = manifest + .get(case.name) + .unwrap_or_else(|| panic!("missing manifest entry for {}", case.name)); + scenario_reports.push(build_scenario_report( + case.name, + manifest_entry, + &run.response, + )); + + 
fs::remove_dir_all(&workspace.root).expect("workspace cleanup should succeed"); } let captured = runtime.block_on(server.captured_requests()); assert_eq!( captured.len(), - 9, - "five scenarios should produce nine requests" + 19, + "ten scenarios should produce nineteen requests" ); assert!(captured .iter() @@ -91,8 +169,32 @@ fn clean_env_cli_reaches_mock_anthropic_service_across_scripted_parity_scenarios "write_file_allowed", "write_file_denied", "write_file_denied", + "multi_tool_turn_roundtrip", + "multi_tool_turn_roundtrip", + "bash_stdout_roundtrip", + "bash_stdout_roundtrip", + "bash_permission_prompt_approved", + "bash_permission_prompt_approved", + "bash_permission_prompt_denied", + "bash_permission_prompt_denied", + "plugin_tool_roundtrip", + "plugin_tool_roundtrip", ] ); + + let mut request_counts = BTreeMap::new(); + for request in &captured { + *request_counts + .entry(request.scenario.as_str()) + .or_insert(0_usize) += 1; + } + for report in &mut scenario_reports { + report.request_count = *request_counts + .get(report.name.as_str()) + .unwrap_or_else(|| panic!("missing request count for {}", report.name)); + } + + maybe_write_report(&scenario_reports); } #[derive(Clone, Copy)] @@ -100,25 +202,71 @@ struct ScenarioCase { name: &'static str, permission_mode: &'static str, allowed_tools: Option<&'static str>, - seed: fn(&Path), - assert: fn(&Path, &Value), + stdin: Option<&'static str>, + prepare: fn(&HarnessWorkspace), + assert: fn(&HarnessWorkspace, &ScenarioRun), } -fn run_case(case: ScenarioCase, workspace: &Path, base_url: &str) -> Value { - let config_home = workspace.join("config-home"); - let home = workspace.join("home"); - fs::create_dir_all(config_home.join(".claw")).expect("config home should exist"); - fs::create_dir_all(&home).expect("home should exist"); +struct HarnessWorkspace { + root: PathBuf, + config_home: PathBuf, + home: PathBuf, +} +impl HarnessWorkspace { + fn new(root: PathBuf) -> Self { + Self { + config_home: 
root.join("config-home"), + home: root.join("home"), + root, + } + } + + fn create(&self) -> std::io::Result<()> { + fs::create_dir_all(&self.root)?; + fs::create_dir_all(&self.config_home)?; + fs::create_dir_all(&self.home)?; + Ok(()) + } +} + +struct ScenarioRun { + response: Value, + stdout: String, +} + +#[derive(Debug, Clone)] +struct ScenarioManifestEntry { + name: String, + category: String, + description: String, + parity_refs: Vec, +} + +#[derive(Debug)] +struct ScenarioReport { + name: String, + category: String, + description: String, + parity_refs: Vec, + iterations: u64, + request_count: usize, + tool_uses: Vec, + tool_error_count: usize, + final_message: String, +} + +fn run_case(case: ScenarioCase, workspace: &HarnessWorkspace, base_url: &str) -> ScenarioRun { let mut command = Command::new(env!("CARGO_BIN_EXE_claw")); command - .current_dir(workspace) + .current_dir(&workspace.root) .env_clear() .env("ANTHROPIC_API_KEY", "test-parity-key") .env("ANTHROPIC_BASE_URL", base_url) - .env("CLAW_CONFIG_HOME", &config_home) - .env("HOME", &home) + .env("CLAW_CONFIG_HOME", &workspace.config_home) + .env("HOME", &workspace.home) .env("NO_COLOR", "1") + .env("PATH", "/usr/bin:/bin") .args([ "--model", "sonnet", @@ -132,107 +280,453 @@ fn run_case(case: ScenarioCase, workspace: &Path, base_url: &str) -> Value { } let prompt = format!("{SCENARIO_PREFIX}{}", case.name); - let output = command.arg(prompt).output().expect("claw should launch"); + command.arg(prompt); + + let output = if let Some(stdin) = case.stdin { + let mut child = command + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .expect("claw should launch"); + child + .stdin + .as_mut() + .expect("stdin should be piped") + .write_all(stdin.as_bytes()) + .expect("stdin should write"); + child.wait_with_output().expect("claw should finish") + } else { + command.output().expect("claw should launch") + }; + assert_success(&output); - 
serde_json::from_slice(&output.stdout).expect("prompt output should be valid json") + let stdout = String::from_utf8_lossy(&output.stdout).into_owned(); + ScenarioRun { + response: parse_json_output(&stdout), + stdout, + } } -fn seed_noop(_: &Path) {} +fn prepare_noop(_: &HarnessWorkspace) {} -fn seed_read_fixture(workspace: &Path) { - fs::write(workspace.join("fixture.txt"), "alpha parity line\n").expect("fixture should write"); +fn prepare_read_fixture(workspace: &HarnessWorkspace) { + fs::write(workspace.root.join("fixture.txt"), "alpha parity line\n") + .expect("fixture should write"); } -fn seed_grep_fixture(workspace: &Path) { +fn prepare_grep_fixture(workspace: &HarnessWorkspace) { fs::write( - workspace.join("fixture.txt"), + workspace.root.join("fixture.txt"), "alpha parity line\nbeta line\ngamma parity line\n", ) .expect("grep fixture should write"); } -fn assert_streaming_text(_: &Path, response: &Value) { - assert_eq!( - response["message"], - Value::String("Mock streaming says hello from the parity harness.".to_string()) - ); - assert_eq!(response["iterations"], Value::from(1)); - assert_eq!(response["tool_uses"], Value::Array(Vec::new())); - assert_eq!(response["tool_results"], Value::Array(Vec::new())); +fn prepare_multi_tool_fixture(workspace: &HarnessWorkspace) { + fs::write( + workspace.root.join("fixture.txt"), + "alpha parity line\nbeta line\ngamma parity line\n", + ) + .expect("multi tool fixture should write"); } -fn assert_read_file_roundtrip(workspace: &Path, response: &Value) { - assert_eq!(response["iterations"], Value::from(2)); +fn prepare_plugin_fixture(workspace: &HarnessWorkspace) { + let plugin_root = workspace + .root + .join("external-plugins") + .join("parity-plugin"); + let tool_dir = plugin_root.join("tools"); + let manifest_dir = plugin_root.join(".claude-plugin"); + fs::create_dir_all(&tool_dir).expect("plugin tools dir"); + fs::create_dir_all(&manifest_dir).expect("plugin manifest dir"); + + let script_path = 
tool_dir.join("echo-json.sh"); + fs::write( + &script_path, + "#!/bin/sh\nINPUT=$(cat)\nprintf '{\"plugin\":\"%s\",\"tool\":\"%s\",\"input\":%s}\\n' \"$CLAWD_PLUGIN_ID\" \"$CLAWD_TOOL_NAME\" \"$INPUT\"\n", + ) + .expect("plugin script should write"); + let mut permissions = fs::metadata(&script_path) + .expect("plugin script metadata") + .permissions(); + permissions.set_mode(0o755); + fs::set_permissions(&script_path, permissions).expect("plugin script should be executable"); + + fs::write( + manifest_dir.join("plugin.json"), + r#"{ + "name": "parity-plugin", + "version": "1.0.0", + "description": "mock parity plugin", + "tools": [ + { + "name": "plugin_echo", + "description": "Echo JSON input", + "inputSchema": { + "type": "object", + "properties": { + "message": { "type": "string" } + }, + "required": ["message"], + "additionalProperties": false + }, + "command": "./tools/echo-json.sh", + "requiredPermission": "workspace-write" + } + ] +}"#, + ) + .expect("plugin manifest should write"); + + fs::write( + workspace.config_home.join("settings.json"), + json!({ + "enabledPlugins": { + "parity-plugin@external": true + }, + "plugins": { + "externalDirectories": [plugin_root.parent().expect("plugin parent").display().to_string()] + } + }) + .to_string(), + ) + .expect("plugin settings should write"); +} + +fn assert_streaming_text(_: &HarnessWorkspace, run: &ScenarioRun) { assert_eq!( - response["tool_uses"][0]["name"], + run.response["message"], + Value::String("Mock streaming says hello from the parity harness.".to_string()) + ); + assert_eq!(run.response["iterations"], Value::from(1)); + assert_eq!(run.response["tool_uses"], Value::Array(Vec::new())); + assert_eq!(run.response["tool_results"], Value::Array(Vec::new())); +} + +fn assert_read_file_roundtrip(workspace: &HarnessWorkspace, run: &ScenarioRun) { + assert_eq!(run.response["iterations"], Value::from(2)); + assert_eq!( + run.response["tool_uses"][0]["name"], Value::String("read_file".to_string()) ); 
assert_eq!( - response["tool_uses"][0]["input"], + run.response["tool_uses"][0]["input"], Value::String(r#"{"path":"fixture.txt"}"#.to_string()) ); - assert!(response["message"] + assert!(run.response["message"] .as_str() .expect("message text") .contains("alpha parity line")); - let output = response["tool_results"][0]["output"] + let output = run.response["tool_results"][0]["output"] .as_str() .expect("tool output"); - assert!(output.contains(&workspace.join("fixture.txt").display().to_string())); + assert!(output.contains(&workspace.root.join("fixture.txt").display().to_string())); assert!(output.contains("alpha parity line")); } -fn assert_grep_chunk_assembly(_: &Path, response: &Value) { - assert_eq!(response["iterations"], Value::from(2)); +fn assert_grep_chunk_assembly(_: &HarnessWorkspace, run: &ScenarioRun) { + assert_eq!(run.response["iterations"], Value::from(2)); assert_eq!( - response["tool_uses"][0]["name"], + run.response["tool_uses"][0]["name"], Value::String("grep_search".to_string()) ); assert_eq!( - response["tool_uses"][0]["input"], + run.response["tool_uses"][0]["input"], Value::String( r#"{"pattern":"parity","path":"fixture.txt","output_mode":"count"}"#.to_string() ) ); - assert!(response["message"] + assert!(run.response["message"] .as_str() .expect("message text") .contains("2 occurrences")); - assert_eq!(response["tool_results"][0]["is_error"], Value::Bool(false)); + assert_eq!( + run.response["tool_results"][0]["is_error"], + Value::Bool(false) + ); } -fn assert_write_file_allowed(workspace: &Path, response: &Value) { - assert_eq!(response["iterations"], Value::from(2)); +fn assert_write_file_allowed(workspace: &HarnessWorkspace, run: &ScenarioRun) { + assert_eq!(run.response["iterations"], Value::from(2)); assert_eq!( - response["tool_uses"][0]["name"], + run.response["tool_uses"][0]["name"], Value::String("write_file".to_string()) ); - assert!(response["message"] + assert!(run.response["message"] .as_str() .expect("message text") 
.contains("generated/output.txt")); - let generated = workspace.join("generated").join("output.txt"); + let generated = workspace.root.join("generated").join("output.txt"); let contents = fs::read_to_string(&generated).expect("generated file should exist"); assert_eq!(contents, "created by mock service\n"); - assert_eq!(response["tool_results"][0]["is_error"], Value::Bool(false)); + assert_eq!( + run.response["tool_results"][0]["is_error"], + Value::Bool(false) + ); } -fn assert_write_file_denied(workspace: &Path, response: &Value) { - assert_eq!(response["iterations"], Value::from(2)); +fn assert_write_file_denied(workspace: &HarnessWorkspace, run: &ScenarioRun) { + assert_eq!(run.response["iterations"], Value::from(2)); assert_eq!( - response["tool_uses"][0]["name"], + run.response["tool_uses"][0]["name"], Value::String("write_file".to_string()) ); - let tool_output = response["tool_results"][0]["output"] + let tool_output = run.response["tool_results"][0]["output"] .as_str() .expect("tool output"); assert!(tool_output.contains("requires workspace-write permission")); - assert_eq!(response["tool_results"][0]["is_error"], Value::Bool(true)); - assert!(response["message"] + assert_eq!( + run.response["tool_results"][0]["is_error"], + Value::Bool(true) + ); + assert!(run.response["message"] .as_str() .expect("message text") .contains("denied as expected")); - assert!(!workspace.join("generated").join("denied.txt").exists()); + assert!(!workspace.root.join("generated").join("denied.txt").exists()); +} + +fn assert_multi_tool_turn_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) { + assert_eq!(run.response["iterations"], Value::from(2)); + let tool_uses = run.response["tool_uses"] + .as_array() + .expect("tool uses array"); + assert_eq!( + tool_uses.len(), + 2, + "expected two tool uses in a single turn" + ); + assert_eq!(tool_uses[0]["name"], Value::String("read_file".to_string())); + assert_eq!( + tool_uses[1]["name"], + Value::String("grep_search".to_string()) + 
); + let tool_results = run.response["tool_results"] + .as_array() + .expect("tool results array"); + assert_eq!( + tool_results.len(), + 2, + "expected two tool results in a single turn" + ); + assert!(run.response["message"] + .as_str() + .expect("message text") + .contains("alpha parity line")); + assert!(run.response["message"] + .as_str() + .expect("message text") + .contains("2 occurrences")); +} + +fn assert_bash_stdout_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) { + assert_eq!(run.response["iterations"], Value::from(2)); + assert_eq!( + run.response["tool_uses"][0]["name"], + Value::String("bash".to_string()) + ); + let tool_output = run.response["tool_results"][0]["output"] + .as_str() + .expect("tool output"); + let parsed: Value = serde_json::from_str(tool_output).expect("bash output json"); + assert_eq!( + parsed["stdout"], + Value::String("alpha from bash".to_string()) + ); + assert_eq!( + run.response["tool_results"][0]["is_error"], + Value::Bool(false) + ); + assert!(run.response["message"] + .as_str() + .expect("message text") + .contains("alpha from bash")); +} + +fn assert_bash_permission_prompt_approved(_: &HarnessWorkspace, run: &ScenarioRun) { + assert!(run.stdout.contains("Permission approval required")); + assert!(run.stdout.contains("Approve this tool call? 
[y/N]:")); + assert_eq!(run.response["iterations"], Value::from(2)); + assert_eq!( + run.response["tool_results"][0]["is_error"], + Value::Bool(false) + ); + let tool_output = run.response["tool_results"][0]["output"] + .as_str() + .expect("tool output"); + let parsed: Value = serde_json::from_str(tool_output).expect("bash output json"); + assert_eq!( + parsed["stdout"], + Value::String("approved via prompt".to_string()) + ); + assert!(run.response["message"] + .as_str() + .expect("message text") + .contains("approved and executed")); +} + +fn assert_bash_permission_prompt_denied(_: &HarnessWorkspace, run: &ScenarioRun) { + assert!(run.stdout.contains("Permission approval required")); + assert!(run.stdout.contains("Approve this tool call? [y/N]:")); + assert_eq!(run.response["iterations"], Value::from(2)); + let tool_output = run.response["tool_results"][0]["output"] + .as_str() + .expect("tool output"); + assert!(tool_output.contains("denied by user approval prompt")); + assert_eq!( + run.response["tool_results"][0]["is_error"], + Value::Bool(true) + ); + assert!(run.response["message"] + .as_str() + .expect("message text") + .contains("denied as expected")); +} + +fn assert_plugin_tool_roundtrip(_: &HarnessWorkspace, run: &ScenarioRun) { + assert_eq!(run.response["iterations"], Value::from(2)); + assert_eq!( + run.response["tool_uses"][0]["name"], + Value::String("plugin_echo".to_string()) + ); + let tool_output = run.response["tool_results"][0]["output"] + .as_str() + .expect("tool output"); + let parsed: Value = serde_json::from_str(tool_output).expect("plugin output json"); + assert_eq!( + parsed["plugin"], + Value::String("parity-plugin@external".to_string()) + ); + assert_eq!(parsed["tool"], Value::String("plugin_echo".to_string())); + assert_eq!( + parsed["input"]["message"], + Value::String("hello from plugin parity".to_string()) + ); + assert!(run.response["message"] + .as_str() + .expect("message text") + .contains("hello from plugin parity")); +} + +fn 
parse_json_output(stdout: &str) -> Value { + if let Some(index) = stdout.rfind("{\"auto_compaction\"") { + return serde_json::from_str(&stdout[index..]).unwrap_or_else(|error| { + panic!("failed to parse JSON response from stdout: {error}\n{stdout}") + }); + } + + stdout + .lines() + .rev() + .find_map(|line| { + let trimmed = line.trim(); + if trimmed.starts_with('{') && trimmed.ends_with('}') { + serde_json::from_str(trimmed).ok() + } else { + None + } + }) + .unwrap_or_else(|| panic!("no JSON response line found in stdout:\n{stdout}")) +} + +fn build_scenario_report( + name: &str, + manifest_entry: &ScenarioManifestEntry, + response: &Value, +) -> ScenarioReport { + ScenarioReport { + name: name.to_string(), + category: manifest_entry.category.clone(), + description: manifest_entry.description.clone(), + parity_refs: manifest_entry.parity_refs.clone(), + iterations: response["iterations"] + .as_u64() + .expect("iterations should exist"), + request_count: 0, + tool_uses: response["tool_uses"] + .as_array() + .expect("tool uses array") + .iter() + .filter_map(|value| value["name"].as_str().map(ToOwned::to_owned)) + .collect(), + tool_error_count: response["tool_results"] + .as_array() + .expect("tool results array") + .iter() + .filter(|value| value["is_error"].as_bool().unwrap_or(false)) + .count(), + final_message: response["message"] + .as_str() + .expect("message text") + .to_string(), + } +} + +fn maybe_write_report(reports: &[ScenarioReport]) { + let Some(path) = std::env::var_os("MOCK_PARITY_REPORT_PATH") else { + return; + }; + + let payload = json!({ + "scenario_count": reports.len(), + "request_count": reports.iter().map(|report| report.request_count).sum::<usize>(), + "scenarios": reports.iter().map(scenario_report_json).collect::<Vec<_>>(), + }); + fs::write( + path, + serde_json::to_vec_pretty(&payload).expect("report json should serialize"), + ) + .expect("report should write"); +} + +fn load_scenario_manifest() -> Vec<ScenarioManifestEntry> { + let manifest_path = + 
Path::new(env!("CARGO_MANIFEST_DIR")).join("../../mock_parity_scenarios.json"); + let manifest = fs::read_to_string(&manifest_path).expect("scenario manifest should exist"); + serde_json::from_str::<Vec<Value>>(&manifest) + .expect("scenario manifest should parse") + .into_iter() + .map(|entry| ScenarioManifestEntry { + name: entry["name"] + .as_str() + .expect("scenario name should be a string") + .to_string(), + category: entry["category"] + .as_str() + .expect("scenario category should be a string") + .to_string(), + description: entry["description"] + .as_str() + .expect("scenario description should be a string") + .to_string(), + parity_refs: entry["parity_refs"] + .as_array() + .expect("parity refs should be an array") + .iter() + .map(|value| { + value + .as_str() + .expect("parity ref should be a string") + .to_string() + }) + .collect(), + }) + .collect() +} + +fn scenario_report_json(report: &ScenarioReport) -> Value { + json!({ + "name": report.name, + "category": report.category, + "description": report.description, + "parity_refs": report.parity_refs, + "iterations": report.iterations, + "request_count": report.request_count, + "tool_uses": report.tool_uses, + "tool_error_count": report.tool_error_count, + "final_message": report.final_message, + }) } fn assert_success(output: &Output) { diff --git a/rust/mock_parity_scenarios.json b/rust/mock_parity_scenarios.json new file mode 100644 index 0000000..063e50a --- /dev/null +++ b/rust/mock_parity_scenarios.json @@ -0,0 +1,92 @@ +[ + { + "name": "streaming_text", + "category": "baseline", + "description": "Validates streamed assistant text with no tool calls.", + "parity_refs": [ + "Mock parity harness — milestone 1", + "Streaming response support validated by the mock parity harness" + ] + }, + { + "name": "read_file_roundtrip", + "category": "file-tools", + "description": "Exercises read_file tool execution and final assistant synthesis.", + "parity_refs": [ + "Mock parity harness — milestone 1", + "File tools — 
harness-validated flows" + ] + }, + { + "name": "grep_chunk_assembly", + "category": "file-tools", + "description": "Validates grep_search partial JSON chunk assembly and follow-up synthesis.", + "parity_refs": [ + "Mock parity harness — milestone 1", + "File tools — harness-validated flows" + ] + }, + { + "name": "write_file_allowed", + "category": "file-tools", + "description": "Confirms workspace-write write_file success and filesystem side effects.", + "parity_refs": [ + "Mock parity harness — milestone 1", + "File tools — harness-validated flows" + ] + }, + { + "name": "write_file_denied", + "category": "permissions", + "description": "Confirms read-only mode blocks write_file with an error result.", + "parity_refs": [ + "Mock parity harness — milestone 1", + "Permission enforcement across tool paths" + ] + }, + { + "name": "multi_tool_turn_roundtrip", + "category": "multi-tool-turns", + "description": "Executes read_file and grep_search in the same assistant turn before the final reply.", + "parity_refs": [ + "Mock parity harness — milestone 2 (behavioral expansion)", + "Multi-tool assistant turns" + ] + }, + { + "name": "bash_stdout_roundtrip", + "category": "bash", + "description": "Validates bash execution and stdout roundtrip in danger-full-access mode.", + "parity_refs": [ + "Mock parity harness — milestone 2 (behavioral expansion)", + "Bash tool — upstream has 18 submodules, Rust has 1:" + ] + }, + { + "name": "bash_permission_prompt_approved", + "category": "permissions", + "description": "Exercises workspace-write to bash escalation with a positive approval response.", + "parity_refs": [ + "Mock parity harness — milestone 2 (behavioral expansion)", + "Permission enforcement across tool paths" + ] + }, + { + "name": "bash_permission_prompt_denied", + "category": "permissions", + "description": "Exercises workspace-write to bash escalation with a denied approval response.", + "parity_refs": [ + "Mock parity harness — milestone 2 (behavioral expansion)", 
+ "Permission enforcement across tool paths" + ] + }, + { + "name": "plugin_tool_roundtrip", + "category": "plugin-paths", + "description": "Loads an external plugin tool and executes it through the runtime tool registry.", + "parity_refs": [ + "Mock parity harness — milestone 2 (behavioral expansion)", + "Plugin tool execution path" + ] + } +] diff --git a/rust/scripts/run_mock_parity_diff.py b/rust/scripts/run_mock_parity_diff.py new file mode 100755 index 0000000..0ac8d09 --- /dev/null +++ b/rust/scripts/run_mock_parity_diff.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import json +import os +import subprocess +import sys +import tempfile +from collections import defaultdict +from pathlib import Path + + +def load_manifest(path: Path) -> list[dict]: + return json.loads(path.read_text()) + + +def load_parity_text(path: Path) -> str: + return path.read_text() + + +def ensure_refs_exist(manifest: list[dict], parity_text: str) -> list[tuple[str, str]]: + missing: list[tuple[str, str]] = [] + for entry in manifest: + for ref in entry.get("parity_refs", []): + if ref not in parity_text: + missing.append((entry["name"], ref)) + return missing + + +def run_harness(rust_root: Path) -> dict: + with tempfile.TemporaryDirectory(prefix="mock-parity-report-") as temp_dir: + report_path = Path(temp_dir) / "report.json" + env = os.environ.copy() + env["MOCK_PARITY_REPORT_PATH"] = str(report_path) + subprocess.run( + [ + "cargo", + "test", + "-p", + "rusty-claude-cli", + "--test", + "mock_parity_harness", + "--", + "--nocapture", + ], + cwd=rust_root, + check=True, + env=env, + ) + return json.loads(report_path.read_text()) + + +def main() -> int: + script_path = Path(__file__).resolve() + rust_root = script_path.parent.parent + repo_root = rust_root.parent + manifest = load_manifest(rust_root / "mock_parity_scenarios.json") + parity_text = load_parity_text(repo_root / "PARITY.md") + + missing_refs = ensure_refs_exist(manifest, parity_text) 
+ if missing_refs: + print("Missing PARITY.md references:", file=sys.stderr) + for scenario_name, ref in missing_refs: + print(f" - {scenario_name}: {ref}", file=sys.stderr) + return 1 + + should_run = "--no-run" not in sys.argv[1:] + report = run_harness(rust_root) if should_run else None + report_by_name = { + entry["name"]: entry for entry in report.get("scenarios", []) + } if report else {} + + print("Mock parity diff checklist") + print(f"Repo root: {repo_root}") + print(f"Scenario manifest: {rust_root / 'mock_parity_scenarios.json'}") + print(f"PARITY source: {repo_root / 'PARITY.md'}") + print() + + for entry in manifest: + scenario_name = entry["name"] + scenario_report = report_by_name.get(scenario_name) + status = "PASS" if scenario_report else ("MAPPED" if not should_run else "MISSING") + print(f"[{status}] {scenario_name} ({entry['category']})") + print(f" description: {entry['description']}") + print(f" parity refs: {' | '.join(entry['parity_refs'])}") + if scenario_report: + print( + " result: iterations={iterations} requests={requests} tool_uses={tool_uses} tool_errors={tool_errors}".format( + iterations=scenario_report["iterations"], + requests=scenario_report["request_count"], + tool_uses=", ".join(scenario_report["tool_uses"]) or "none", + tool_errors=scenario_report["tool_error_count"], + ) + ) + print(f" final: {scenario_report['final_message']}") + print() + + coverage = defaultdict(list) + for entry in manifest: + for ref in entry["parity_refs"]: + coverage[ref].append(entry["name"]) + + print("PARITY coverage map") + for ref, scenarios in coverage.items(): + print(f"- {ref}") + print(f" scenarios: {', '.join(scenarios)}") + + if report and report.get("scenarios"): + first = report["scenarios"][0] + print() + print("First scenario result") + print(f"- name: {first['name']}") + print(f"- iterations: {first['iterations']}") + print(f"- requests: {first['request_count']}") + print(f"- tool_uses: {', '.join(first['tool_uses']) or 'none'}") + 
print(f"- tool_errors: {first['tool_error_count']}") + print(f"- final_message: {first['final_message']}") + print() + print( + "Harness summary: {scenario_count} scenarios, {request_count} requests".format( + scenario_count=report["scenario_count"], + request_count=report["request_count"], + ) + ) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())