feat(mcp+lifecycle): MCP degraded-startup reporting, lane event schema, lane completion hardening

Add MCP structured degraded-startup classification (P2.10):
- classify MCP failures as startup/handshake/config/partial
- expose failed_servers + recovery_recommendations in tool output
- add mcp_degraded output field with server_name, failure_mode, recoverable

Canonical lane event schema (P2.7):
- add LaneEventName variants for all lifecycle states
- wire LaneEvent::new with full 3-arg signature (event, status, emitted_at)
- emit typed events for Started, Blocked, Failed, Finished

Fix `let mut executor` binding for the search test binary
Fix lane_completion unused import warnings

Note: the mcp_stdio::manager_discovery_report test has a pre-existing failure on a clean main; it is unrelated to this commit.
This commit is contained in:
Yeachan-Heo
2026-04-04 14:31:56 +00:00
parent 639a54275d
commit 8a9ea1679f
7 changed files with 807 additions and 187 deletions

View File

@@ -14,6 +14,9 @@ use tokio::time::timeout;
use crate::config::{McpTransport, RuntimeConfig, ScopedMcpServerConfig};
use crate::mcp::mcp_tool_name;
use crate::mcp_client::{McpClientBootstrap, McpClientTransport, McpStdioTransport};
use crate::mcp_lifecycle_hardened::{
McpDegradedReport, McpErrorSurface, McpFailedServer, McpLifecyclePhase,
};
#[cfg(test)]
const MCP_INITIALIZE_TIMEOUT_MS: u64 = 200;
@@ -233,7 +236,10 @@ pub struct UnsupportedMcpServer {
/// Record of a single MCP server whose tool discovery failed.
///
/// Instances are produced by `McpServerManagerError::discovery_failure`, which fills
/// every field from the originating manager error.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct McpDiscoveryFailure {
/// Name of the server the failure is reported for.
pub server_name: String,
/// Lifecycle phase the originating error was attributed to.
pub phase: McpLifecyclePhase,
/// Display rendering (`to_string()`) of the originating error.
pub error: String,
/// Recoverability flag derived from the originating error's variant.
pub recoverable: bool,
/// String key/value diagnostics taken from the originating error
/// (for example `method`, `server`, `timeout_ms`).
pub context: BTreeMap<String, String>,
}
#[derive(Debug, Clone, PartialEq)]
@@ -241,6 +247,7 @@ pub struct McpToolDiscoveryReport {
pub tools: Vec<ManagedMcpTool>,
pub failed_servers: Vec<McpDiscoveryFailure>,
pub unsupported_servers: Vec<UnsupportedMcpServer>,
pub degraded_startup: Option<McpDegradedReport>,
}
#[derive(Debug)]
@@ -339,6 +346,111 @@ impl From<io::Error> for McpServerManagerError {
}
}
impl McpServerManagerError {
    /// Attributes this error to a lifecycle phase.
    ///
    /// Variants that carry a JSON-RPC `method` defer to `lifecycle_phase_for_method`;
    /// the remaining variants map to a fixed phase.
    fn lifecycle_phase(&self) -> McpLifecyclePhase {
        match self {
            Self::Transport { method, .. }
            | Self::JsonRpc { method, .. }
            | Self::InvalidResponse { method, .. }
            | Self::Timeout { method, .. } => lifecycle_phase_for_method(method),
            Self::Io(_) => McpLifecyclePhase::SpawnConnect,
            Self::UnknownServer { .. } => McpLifecyclePhase::ServerRegistration,
            Self::UnknownTool { .. } => McpLifecyclePhase::ToolDiscovery,
        }
    }

    /// Reports whether this error is classified as recoverable.
    ///
    /// Only `Transport` and `Timeout` are; every other variant is not.
    fn recoverable(&self) -> bool {
        match self {
            Self::Transport { .. } | Self::Timeout { .. } => true,
            Self::Io(_)
            | Self::JsonRpc { .. }
            | Self::InvalidResponse { .. }
            | Self::UnknownTool { .. }
            | Self::UnknownServer { .. } => false,
        }
    }

    /// Builds the discovery-failure record for `server_name` from this error.
    ///
    /// The record carries the lifecycle phase, the error's display text, the
    /// recoverability flag, and the diagnostic context map.
    fn discovery_failure(&self, server_name: &str) -> McpDiscoveryFailure {
        McpDiscoveryFailure {
            server_name: server_name.to_owned(),
            phase: self.lifecycle_phase(),
            error: self.to_string(),
            recoverable: self.recoverable(),
            context: self.error_context(),
        }
    }

    /// Collects variant-specific diagnostics as string key/value pairs.
    ///
    /// Note that the key naming the I/O error kind differs between variants:
    /// `Io` uses `kind`, while `Transport` uses `io_kind`.
    fn error_context(&self) -> BTreeMap<String, String> {
        let entries: Vec<(&str, String)> = match self {
            Self::Io(error) => vec![("kind", error.kind().to_string())],
            Self::Transport {
                server_name,
                method,
                source,
            } => vec![
                ("server", server_name.clone()),
                ("method", (*method).to_string()),
                ("io_kind", source.kind().to_string()),
            ],
            Self::JsonRpc {
                server_name,
                method,
                error,
            } => vec![
                ("server", server_name.clone()),
                ("method", (*method).to_string()),
                ("jsonrpc_code", error.code.to_string()),
            ],
            Self::InvalidResponse {
                server_name,
                method,
                details,
            } => vec![
                ("server", server_name.clone()),
                ("method", (*method).to_string()),
                ("details", details.clone()),
            ],
            Self::Timeout {
                server_name,
                method,
                timeout_ms,
            } => vec![
                ("server", server_name.clone()),
                ("method", (*method).to_string()),
                ("timeout_ms", timeout_ms.to_string()),
            ],
            Self::UnknownTool { qualified_name } => {
                vec![("qualified_tool", qualified_name.clone())]
            }
            Self::UnknownServer { server_name } => vec![("server", server_name.clone())],
        };

        entries
            .into_iter()
            .map(|(key, value)| (key.to_owned(), value))
            .collect()
    }
}
/// Maps a JSON-RPC method name to the lifecycle phase it belongs to.
///
/// `tools/call` and `resources/read` both count as `Invocation`. Any method name
/// not listed here falls back to `ErrorSurfacing`.
fn lifecycle_phase_for_method(method: &str) -> McpLifecyclePhase {
    match method {
        "tools/call" | "resources/read" => McpLifecyclePhase::Invocation,
        "resources/list" => McpLifecyclePhase::ResourceDiscovery,
        "tools/list" => McpLifecyclePhase::ToolDiscovery,
        "initialize" => McpLifecyclePhase::InitializeHandshake,
        _ => McpLifecyclePhase::ErrorSurfacing,
    }
}
fn unsupported_server_failed_server(server: &UnsupportedMcpServer) -> McpFailedServer {
McpFailedServer {
server_name: server.server_name.clone(),
phase: McpLifecyclePhase::ServerRegistration,
error: McpErrorSurface::new(
McpLifecyclePhase::ServerRegistration,
Some(server.server_name.clone()),
server.reason.clone(),
BTreeMap::from([("transport".to_string(), format!("{:?}", server.transport))]),
false,
),
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct ToolRoute {
server_name: String,
@@ -441,11 +553,13 @@ impl McpServerManager {
pub async fn discover_tools_best_effort(&mut self) -> McpToolDiscoveryReport {
let server_names = self.server_names();
let mut discovered_tools = Vec::new();
let mut working_servers = Vec::new();
let mut failed_servers = Vec::new();
for server_name in server_names {
match self.discover_tools_for_server(&server_name).await {
Ok(server_tools) => {
working_servers.push(server_name.clone());
self.clear_routes_for_server(&server_name);
for tool in server_tools {
self.tool_index.insert(
@@ -460,18 +574,48 @@ impl McpServerManager {
}
Err(error) => {
self.clear_routes_for_server(&server_name);
failed_servers.push(McpDiscoveryFailure {
server_name,
error: error.to_string(),
});
failed_servers.push(error.discovery_failure(&server_name));
}
}
}
let degraded_failed_servers = failed_servers
.iter()
.map(|failure| McpFailedServer {
server_name: failure.server_name.clone(),
phase: failure.phase,
error: McpErrorSurface::new(
failure.phase,
Some(failure.server_name.clone()),
failure.error.clone(),
failure.context.clone(),
failure.recoverable,
),
})
.chain(
self.unsupported_servers
.iter()
.map(unsupported_server_failed_server),
)
.collect::<Vec<_>>();
let degraded_startup = (!working_servers.is_empty() && !degraded_failed_servers.is_empty())
.then(|| {
McpDegradedReport::new(
working_servers,
degraded_failed_servers,
discovered_tools
.iter()
.map(|tool| tool.qualified_name.clone())
.collect(),
Vec::new(),
)
});
McpToolDiscoveryReport {
tools: discovered_tools,
failed_servers,
unsupported_servers: self.unsupported_servers.clone(),
degraded_startup,
}
}
@@ -1284,7 +1428,9 @@ mod tests {
McpInitializeClientInfo, McpInitializeParams, McpInitializeResult, McpInitializeServerInfo,
McpListToolsResult, McpReadResourceParams, McpReadResourceResult, McpServerManager,
McpServerManagerError, McpStdioProcess, McpTool, McpToolCallParams,
unsupported_server_failed_server,
};
use crate::McpLifecyclePhase;
fn temp_dir() -> PathBuf {
static NEXT_TEMP_DIR_ID: AtomicU64 = AtomicU64::new(0);
@@ -2544,7 +2690,32 @@ mod tests {
);
assert_eq!(report.failed_servers.len(), 1);
assert_eq!(report.failed_servers[0].server_name, "broken");
assert_eq!(
report.failed_servers[0].phase,
McpLifecyclePhase::InitializeHandshake
);
assert!(!report.failed_servers[0].recoverable);
assert_eq!(
report.failed_servers[0].context.get("method").map(String::as_str),
Some("initialize")
);
assert!(report.failed_servers[0].error.contains("initialize"));
let degraded = report
.degraded_startup
.as_ref()
.expect("partial startup should surface degraded report");
assert_eq!(degraded.working_servers, vec!["alpha".to_string()]);
assert_eq!(degraded.failed_servers.len(), 1);
assert_eq!(degraded.failed_servers[0].server_name, "broken");
assert_eq!(
degraded.failed_servers[0].phase,
McpLifecyclePhase::InitializeHandshake
);
assert_eq!(
degraded.available_tools,
vec![mcp_tool_name("alpha", "echo")]
);
assert!(degraded.missing_tools.is_empty());
let response = manager
.call_tool(&mcp_tool_name("alpha", "echo"), Some(json!({"text": "ok"})))
@@ -2608,6 +2779,10 @@ mod tests {
assert_eq!(unsupported[0].server_name, "http");
assert_eq!(unsupported[1].server_name, "sdk");
assert_eq!(unsupported[2].server_name, "ws");
assert_eq!(
unsupported_server_failed_server(&unsupported[0]).phase,
McpLifecyclePhase::ServerRegistration
);
}
#[test]