Block oversized requests before providers hard-fail

The runtime already tracked rough token estimates for compaction, but provider-bound requests still relied on naive model output limits and could be sent upstream even when the selected model could not fit the estimated prompt plus requested output. This adds a small model token/context registry in the API layer, estimates request size from the serialized prompt payload, and fails locally with a dedicated context-window error before Anthropic or xAI calls are made. Focused integration coverage asserts the preflight fires before any HTTP request leaves the process. Constraint: Keep the first pass minimal and reusable across both Anthropic and OpenAI-compatible providers Rejected: Auto-compact-and-retry in the same patch | broader control-flow change than the requested minimal preflight Confidence: medium Scope-risk: narrow Reversibility: clean Directive: Expand the model registry before enabling preflight for additional providers or aliases Tested: cargo build -p api -p tools -p rusty-claude-cli; cargo test -p api Not-tested: End-to-end CLI auto-compaction or retry behavior after a local context_window_blocked failure
2026-06-05 05:57:10 +08:00 · 2026-04-05 16:39:58 +00:00
parent b9c5cc118e
commit fa72cd665e
6 changed files with 264 additions and 11 deletions
--- a/rust/crates/api/tests/openai_compat_integration.rs
+++ b/rust/crates/api/tests/openai_compat_integration.rs
@@ -4,10 +4,10 @@ use std::sync::Arc;
 use std::sync::{Mutex as StdMutex, OnceLock};

 use api::{
-    ContentBlockDelta, ContentBlockDeltaEvent, ContentBlockStartEvent, ContentBlockStopEvent,
-    InputContentBlock, InputMessage, MessageDeltaEvent, MessageRequest, OpenAiCompatClient,
-    OpenAiCompatConfig, OutputContentBlock, ProviderClient, StreamEvent, ToolChoice,
-    ToolDefinition,
+    ApiError, ContentBlockDelta, ContentBlockDeltaEvent, ContentBlockStartEvent,
+    ContentBlockStopEvent, InputContentBlock, InputMessage, MessageDeltaEvent, MessageRequest,
+    OpenAiCompatClient, OpenAiCompatConfig, OutputContentBlock, ProviderClient, StreamEvent,
+    ToolChoice, ToolDefinition,
 };
 use serde_json::json;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
@@ -63,6 +63,42 @@ async fn send_message_uses_openai_compatible_endpoint_and_auth() {
    assert_eq!(body["tools"][0]["type"], json!("function"));
 }

+#[tokio::test]
+async fn send_message_blocks_oversized_xai_requests_before_the_http_call() {
+    let state = Arc::new(Mutex::new(Vec::<CapturedRequest>::new()));
+    let server = spawn_server(
+        state.clone(),
+        vec![http_response("200 OK", "application/json", "{}")],
+    )
+    .await;
+
+    let client = OpenAiCompatClient::new("xai-test-key", OpenAiCompatConfig::xai())
+        .with_base_url(server.base_url());
+    let error = client
+        .send_message(&MessageRequest {
+            model: "grok-3".to_string(),
+            max_tokens: 64_000,
+            messages: vec![InputMessage {
+                role: "user".to_string(),
+                content: vec![InputContentBlock::Text {
+                    text: "x".repeat(300_000),
+                }],
+            }],
+            system: Some("Keep the answer short.".to_string()),
+            tools: None,
+            tool_choice: None,
+            stream: false,
+        })
+        .await
+        .expect_err("oversized request should fail local context-window preflight");
+
+    assert!(matches!(error, ApiError::ContextWindowExceeded { .. }));
+    assert!(
+        state.lock().await.is_empty(),
+        "preflight failure should avoid any upstream HTTP request"
+    );
+}
+
 #[tokio::test]
 async fn send_message_accepts_full_chat_completions_endpoint_override() {
    let state = Arc::new(Mutex::new(Vec::<CapturedRequest>::new()));