Add extracted source directory and README navigation

This commit is contained in:
Shawn Bot
2026-03-31 14:56:06 +00:00
parent 6252bb6eb5
commit 91e01d755b
4757 changed files with 984951 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,546 @@
/**
 * Declarative catalogue of the browser-automation tools.
 *
 * Each entry carries a `name`, a human/model-facing `description`, and an
 * `inputSchema` written as a JSON-Schema-style object (`type`, `properties`,
 * `required`). Two entries (`tabs_context_mcp`, `tabs_create_mcp`) also carry
 * a `title`. The description strings are runtime data read by the model, so
 * edits to them change behaviour and should be treated like code changes.
 */
export const BROWSER_TOOLS = [
{
name: "javascript_tool",
description:
"Execute JavaScript code in the context of the current page. The code runs in the page's context and can interact with the DOM, window object, and page variables. Returns the result of the last expression or any thrown errors. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.",
inputSchema: {
type: "object",
properties: {
action: {
type: "string",
description: "Must be set to 'javascript_exec'",
},
text: {
type: "string",
description:
"The JavaScript code to execute. The code will be evaluated in the page context. The result of the last expression will be returned automatically. Do NOT use 'return' statements - just write the expression you want to evaluate (e.g., 'window.myData.value' not 'return window.myData.value'). You can access and modify the DOM, call page functions, and interact with page variables.",
},
tabId: {
type: "number",
description:
"Tab ID to execute the code in. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["action", "text", "tabId"],
},
},
{
name: "read_page",
description:
"Get an accessibility tree representation of elements on the page. By default returns all elements including non-visible ones. Output is limited to 50000 characters by default. If the output exceeds this limit, you will receive an error asking you to specify a smaller depth or focus on a specific element using ref_id. Optionally filter for only interactive elements. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.",
inputSchema: {
type: "object",
properties: {
filter: {
type: "string",
enum: ["interactive", "all"],
description:
'Filter elements: "interactive" for buttons/links/inputs only, "all" for all elements including non-visible ones (default: all elements)',
},
tabId: {
type: "number",
description:
"Tab ID to read from. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
depth: {
type: "number",
description:
"Maximum depth of the tree to traverse (default: 15). Use a smaller depth if output is too large.",
},
ref_id: {
type: "string",
description:
"Reference ID of a parent element to read. Will return the specified element and all its children. Use this to focus on a specific part of the page when output is too large.",
},
max_chars: {
type: "number",
description:
"Maximum characters for output (default: 50000). Set to a higher value if your client can handle large outputs.",
},
},
required: ["tabId"],
},
},
{
name: "find",
description:
'Find elements on the page using natural language. Can search for elements by their purpose (e.g., "search bar", "login button") or by text content (e.g., "organic mango product"). Returns up to 20 matching elements with references that can be used with other tools. If more than 20 matches exist, you\'ll be notified to use a more specific query. If you don\'t have a valid tab ID, use tabs_context_mcp first to get available tabs.',
inputSchema: {
type: "object",
properties: {
query: {
type: "string",
description:
'Natural language description of what to find (e.g., "search bar", "add to cart button", "product title containing organic")',
},
tabId: {
type: "number",
description:
"Tab ID to search in. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["query", "tabId"],
},
},
{
name: "form_input",
description:
"Set values in form elements using element reference ID from the read_page tool. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.",
inputSchema: {
type: "object",
properties: {
ref: {
type: "string",
description:
'Element reference ID from the read_page tool (e.g., "ref_1", "ref_2")',
},
value: {
type: ["string", "boolean", "number"],
description:
"The value to set. For checkboxes use boolean, for selects use option value or text, for other inputs use appropriate string/number",
},
tabId: {
type: "number",
description:
"Tab ID to set form value in. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["ref", "value", "tabId"],
},
},
{
name: "computer",
description: `Use a mouse and keyboard to interact with a web browser, and take screenshots. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.\n* Whenever you intend to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your click location so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.`,
inputSchema: {
type: "object",
properties: {
// Only `action` and `tabId` are required by the schema; the per-action
// requirements (coordinate, text, duration, ...) are stated in the
// descriptions only and are not expressed as schema constraints here.
action: {
type: "string",
enum: [
"left_click",
"right_click",
"type",
"screenshot",
"wait",
"scroll",
"key",
"left_click_drag",
"double_click",
"triple_click",
"zoom",
"scroll_to",
"hover",
],
description:
"The action to perform:\n* `left_click`: Click the left mouse button at the specified coordinates.\n* `right_click`: Click the right mouse button at the specified coordinates to open context menus.\n* `double_click`: Double-click the left mouse button at the specified coordinates.\n* `triple_click`: Triple-click the left mouse button at the specified coordinates.\n* `type`: Type a string of text.\n* `screenshot`: Take a screenshot of the screen.\n* `wait`: Wait for a specified number of seconds.\n* `scroll`: Scroll up, down, left, or right at the specified coordinates.\n* `key`: Press a specific keyboard key.\n* `left_click_drag`: Drag from start_coordinate to coordinate.\n* `zoom`: Take a screenshot of a specific region for closer inspection.\n* `scroll_to`: Scroll an element into view using its element reference ID from read_page or find tools.\n* `hover`: Move the mouse cursor to the specified coordinates or element without clicking. Useful for revealing tooltips, dropdown menus, or triggering hover states.",
},
coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description:
"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates. Required for `left_click`, `right_click`, `double_click`, `triple_click`, and `scroll`. For `left_click_drag`, this is the end position.",
},
text: {
type: "string",
description:
'The text to type (for `type` action) or the key(s) to press (for `key` action). For `key` action: Provide space-separated keys (e.g., "Backspace Backspace Delete"). Supports keyboard shortcuts using the platform\'s modifier key (use "cmd" on Mac, "ctrl" on Windows/Linux, e.g., "cmd+a" or "ctrl+a" for select all).',
},
duration: {
type: "number",
minimum: 0,
maximum: 30,
description:
"The number of seconds to wait. Required for `wait`. Maximum 30 seconds.",
},
scroll_direction: {
type: "string",
enum: ["up", "down", "left", "right"],
description: "The direction to scroll. Required for `scroll`.",
},
scroll_amount: {
type: "number",
minimum: 1,
maximum: 10,
description:
"The number of scroll wheel ticks. Optional for `scroll`, defaults to 3.",
},
start_coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description:
"(x, y): The starting coordinates for `left_click_drag`.",
},
region: {
type: "array",
items: { type: "number" },
minItems: 4,
maxItems: 4,
description:
"(x0, y0, x1, y1): The rectangular region to capture for `zoom`. Coordinates define a rectangle from top-left (x0, y0) to bottom-right (x1, y1) in pixels from the viewport origin. Required for `zoom` action. Useful for inspecting small UI elements like icons, buttons, or text.",
},
// NOTE(review): the description says "positive integer" but the schema
// type is "number", so fractional values pass schema validation.
repeat: {
type: "number",
minimum: 1,
maximum: 100,
description:
"Number of times to repeat the key sequence. Only applicable for `key` action. Must be a positive integer between 1 and 100. Default is 1. Useful for navigation tasks like pressing arrow keys multiple times.",
},
ref: {
type: "string",
description:
'Element reference ID from read_page or find tools (e.g., "ref_1", "ref_2"). Required for `scroll_to` action. Can be used as alternative to `coordinate` for click actions.',
},
modifiers: {
type: "string",
description:
'Modifier keys for click actions. Supports: "ctrl", "shift", "alt", "cmd" (or "meta"), "win" (or "windows"). Can be combined with "+" (e.g., "ctrl+shift", "cmd+alt"). Optional.',
},
tabId: {
type: "number",
description:
"Tab ID to execute the action on. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["action", "tabId"],
},
},
{
name: "navigate",
description:
"Navigate to a URL, or go forward/back in browser history. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.",
inputSchema: {
type: "object",
properties: {
url: {
type: "string",
description:
'The URL to navigate to. Can be provided with or without protocol (defaults to https://). Use "forward" to go forward in history or "back" to go back in history.',
},
tabId: {
type: "number",
description:
"Tab ID to navigate. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["url", "tabId"],
},
},
{
name: "resize_window",
description:
"Resize the current browser window to specified dimensions. Useful for testing responsive designs or setting up specific screen sizes. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.",
inputSchema: {
type: "object",
properties: {
width: {
type: "number",
description: "Target window width in pixels",
},
height: {
type: "number",
description: "Target window height in pixels",
},
tabId: {
type: "number",
description:
"Tab ID to get the window for. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["width", "height", "tabId"],
},
},
{
name: "gif_creator",
// NOTE(review): the description below tells the caller to "provide
// 'coordinate'" for export, but no `coordinate` property is declared in this
// schema — confirm whether the property is missing or the text is stale.
description:
"Manage GIF recording and export for browser automation sessions. Control when to start/stop recording browser actions (clicks, scrolls, navigation), then export as an animated GIF with visual overlays (click indicators, action labels, progress bar, watermark). All operations are scoped to the tab's group. When starting recording, take a screenshot immediately after to capture the initial state as the first frame. When stopping recording, take a screenshot immediately before to capture the final state as the last frame. For export, either provide 'coordinate' to drag/drop upload to a page element, or set 'download: true' to download the GIF.",
inputSchema: {
type: "object",
properties: {
action: {
type: "string",
enum: ["start_recording", "stop_recording", "export", "clear"],
description:
"Action to perform: 'start_recording' (begin capturing), 'stop_recording' (stop capturing but keep frames), 'export' (generate and export GIF), 'clear' (discard frames)",
},
tabId: {
type: "number",
description:
"Tab ID to identify which tab group this operation applies to",
},
download: {
type: "boolean",
description:
"Always set this to true for the 'export' action only. This causes the gif to be downloaded in the browser.",
},
filename: {
type: "string",
description:
"Optional filename for exported GIF (default: 'recording-[timestamp].gif'). For 'export' action only.",
},
options: {
type: "object",
description:
"Optional GIF enhancement options for 'export' action. Properties: showClickIndicators (bool), showDragPaths (bool), showActionLabels (bool), showProgressBar (bool), showWatermark (bool), quality (number 1-30). All default to true except quality (default: 10).",
properties: {
showClickIndicators: {
type: "boolean",
description:
"Show orange circles at click locations (default: true)",
},
showDragPaths: {
type: "boolean",
description: "Show red arrows for drag actions (default: true)",
},
showActionLabels: {
type: "boolean",
description:
"Show black labels describing actions (default: true)",
},
showProgressBar: {
type: "boolean",
description: "Show orange progress bar at bottom (default: true)",
},
showWatermark: {
type: "boolean",
description: "Show Claude logo watermark (default: true)",
},
// The 1-30 range is documented in the description only; the schema
// declares no minimum/maximum for this property.
quality: {
type: "number",
description:
"GIF compression quality, 1-30 (lower = better quality, slower encoding). Default: 10",
},
},
},
},
required: ["action", "tabId"],
},
},
{
name: "upload_image",
description:
"Upload a previously captured screenshot or user-uploaded image to a file input or drag & drop target. Supports two approaches: (1) ref - for targeting specific elements, especially hidden file inputs, (2) coordinate - for drag & drop to visible locations like Google Docs. Provide either ref or coordinate, not both.",
inputSchema: {
type: "object",
properties: {
imageId: {
type: "string",
description:
"ID of a previously captured screenshot (from the computer tool's screenshot action) or a user-uploaded image",
},
ref: {
type: "string",
description:
'Element reference ID from read_page or find tools (e.g., "ref_1", "ref_2"). Use this for file inputs (especially hidden ones) or specific elements. Provide either ref or coordinate, not both.',
},
// NOTE(review): unlike the `computer` tool's `coordinate`, this array has
// no minItems/maxItems, so the [x, y] length is not enforced by the schema.
coordinate: {
type: "array",
items: {
type: "number",
},
description:
"Viewport coordinates [x, y] for drag & drop to a visible location. Use this for drag & drop targets like Google Docs. Provide either ref or coordinate, not both.",
},
tabId: {
type: "number",
description:
"Tab ID where the target element is located. This is where the image will be uploaded to.",
},
filename: {
type: "string",
description:
'Optional filename for the uploaded file (default: "image.png")',
},
},
required: ["imageId", "tabId"],
},
},
{
name: "get_page_text",
description:
"Extract raw text content from the page, prioritizing article content. Ideal for reading articles, blog posts, or other text-heavy pages. Returns plain text without HTML formatting. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.",
inputSchema: {
type: "object",
properties: {
tabId: {
type: "number",
description:
"Tab ID to extract text from. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["tabId"],
},
},
{
name: "tabs_context_mcp",
title: "Tabs Context",
description:
"Get context information about the current MCP tab group. Returns all tab IDs inside the group if it exists. CRITICAL: You must get the context at least once before using other browser automation tools so you know what tabs exist. Each new conversation should create its own new tab (using tabs_create_mcp) rather than reusing existing tabs, unless the user explicitly asks to use an existing tab.",
inputSchema: {
type: "object",
properties: {
createIfEmpty: {
type: "boolean",
description:
"Creates a new MCP tab group if none exists, creates a new Window with a new tab group containing an empty tab (which can be used for this conversation). If a MCP tab group already exists, this parameter has no effect.",
},
},
required: [],
},
},
{
name: "tabs_create_mcp",
title: "Tabs Create",
description:
"Creates a new empty tab in the MCP tab group. CRITICAL: You must get the context using tabs_context_mcp at least once before using other browser automation tools so you know what tabs exist.",
inputSchema: {
type: "object",
properties: {},
required: [],
},
},
{
name: "update_plan",
description:
"Present a plan to the user for approval before taking actions. The user will see the domains you intend to visit and your approach. Once approved, you can proceed with actions on the approved domains without additional permission prompts.",
// This is the only entry that pins its `type` fields with `as const`; the
// other entries let them widen to `string`.
inputSchema: {
type: "object" as const,
properties: {
domains: {
type: "array" as const,
items: { type: "string" as const },
description:
"List of domains you will visit (e.g., ['github.com', 'stackoverflow.com']). These domains will be approved for the session when the user accepts the plan.",
},
approach: {
type: "array" as const,
items: { type: "string" as const },
description:
"High-level description of what you will do. Focus on outcomes and key actions, not implementation details. Be concise - aim for 3-7 items.",
},
},
required: ["domains", "approach"],
},
},
{
name: "read_console_messages",
description:
"Read browser console messages (console.log, console.error, console.warn, etc.) from a specific tab. Useful for debugging JavaScript errors, viewing application logs, or understanding what's happening in the browser console. Returns console messages from the current domain only. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs. IMPORTANT: Always provide a pattern to filter messages - without a pattern, you may get too many irrelevant messages.",
inputSchema: {
type: "object",
properties: {
tabId: {
type: "number",
description:
"Tab ID to read console messages from. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
onlyErrors: {
type: "boolean",
description:
"If true, only return error and exception messages. Default is false (return all message types).",
},
clear: {
type: "boolean",
description:
"If true, clear the console messages after reading to avoid duplicates on subsequent calls. Default is false.",
},
// The description urges callers to always pass a pattern, but it is not
// listed in `required` below.
pattern: {
type: "string",
description:
"Regex pattern to filter console messages. Only messages matching this pattern will be returned (e.g., 'error|warning' to find errors and warnings, 'MyApp' to filter app-specific logs). You should always provide a pattern to avoid getting too many irrelevant messages.",
},
limit: {
type: "number",
description:
"Maximum number of messages to return. Defaults to 100. Increase only if you need more results.",
},
},
required: ["tabId"],
},
},
{
name: "read_network_requests",
description:
"Read HTTP network requests (XHR, Fetch, documents, images, etc.) from a specific tab. Useful for debugging API calls, monitoring network activity, or understanding what requests a page is making. Returns all network requests made by the current page, including cross-origin requests. Requests are automatically cleared when the page navigates to a different domain. If you don't have a valid tab ID, use tabs_context_mcp first to get available tabs.",
inputSchema: {
type: "object",
properties: {
tabId: {
type: "number",
description:
"Tab ID to read network requests from. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
urlPattern: {
type: "string",
description:
"Optional URL pattern to filter requests. Only requests whose URL contains this string will be returned (e.g., '/api/' to filter API calls, 'example.com' to filter by domain).",
},
clear: {
type: "boolean",
description:
"If true, clear the network requests after reading to avoid duplicates on subsequent calls. Default is false.",
},
limit: {
type: "number",
description:
"Maximum number of requests to return. Defaults to 100. Increase only if you need more results.",
},
},
required: ["tabId"],
},
},
{
name: "shortcuts_list",
description:
"List all available shortcuts and workflows (shortcuts and workflows are interchangeable). Returns shortcuts with their commands, descriptions, and whether they are workflows. Use shortcuts_execute to run a shortcut or workflow.",
inputSchema: {
type: "object",
properties: {
tabId: {
type: "number",
description:
"Tab ID to list shortcuts from. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
},
required: ["tabId"],
},
},
{
name: "shortcuts_execute",
description:
"Execute a shortcut or workflow by running it in a new sidepanel window using the current tab (shortcuts and workflows are interchangeable). Use shortcuts_list first to see available shortcuts. This starts the execution and returns immediately - it does not wait for completion.",
inputSchema: {
type: "object",
properties: {
tabId: {
type: "number",
description:
"Tab ID to execute the shortcut on. Must be a tab in the current group. Use tabs_context_mcp first if you don't have a valid tab ID.",
},
// NOTE(review): neither `shortcutId` nor `command` is required by the
// schema; presumably at least one must be supplied — confirm with the handler.
shortcutId: {
type: "string",
description: "The ID of the shortcut to execute",
},
command: {
type: "string",
description:
"The command name of the shortcut to execute (e.g., 'debug', 'summarize'). Do not include the leading slash.",
},
},
required: ["tabId"],
},
},
{
name: "switch_browser",
description:
"Switch which Chrome browser is used for browser automation. Call this when the user wants to connect to a different Chrome browser. Broadcasts a connection request to all Chrome browsers with the extension installed — the user clicks 'Connect' in the desired browser.",
inputSchema: {
type: "object",
properties: {},
required: [],
},
},
];

View File

@@ -0,0 +1,15 @@
export { BridgeClient, createBridgeClient } from "./bridgeClient.js";
export { BROWSER_TOOLS } from "./browserTools.js";
export {
createChromeSocketClient,
createClaudeForChromeMcpServer,
} from "./mcpServer.js";
export { localPlatformLabel } from "./types.js";
export type {
BridgeConfig,
ChromeExtensionInfo,
ClaudeForChromeContext,
Logger,
PermissionMode,
SocketClient,
} from "./types.js";

View File

@@ -0,0 +1,96 @@
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import { createBridgeClient } from "./bridgeClient.js";
import { BROWSER_TOOLS } from "./browserTools.js";
import { createMcpSocketClient } from "./mcpSocketClient.js";
import { createMcpSocketPool } from "./mcpSocketPool.js";
import { handleToolCall } from "./toolCalls.js";
import type { ClaudeForChromeContext, SocketClient } from "./types.js";
/**
 * Build the client used to talk to the Chrome extension for this context.
 *
 * The first matching option wins:
 *   1. `context.bridgeConfig` is set     -> bridge client
 *   2. `context.getSocketPaths` is set   -> socket pool
 *   3. otherwise                         -> single socket client
 *
 * Exported so Desktop can share a single instance between the registered
 * MCP server and the InternalMcpServerManager (CCD sessions).
 */
export function createChromeSocketClient(
  context: ClaudeForChromeContext,
): SocketClient {
  if (context.bridgeConfig) {
    return createBridgeClient(context);
  }
  if (context.getSocketPaths) {
    return createMcpSocketPool(context);
  }
  return createMcpSocketClient(context);
}
/**
 * Create the MCP `Server` that exposes the browser tools and relays
 * notifications coming from the Chrome-side client.
 *
 * @param context Host-provided configuration (server name, logger, optional
 *   `bridgeConfig`, optional `isDisabled` probe).
 * @param existingSocketClient Optional pre-built client to reuse; when
 *   omitted a new one is created with `createChromeSocketClient(context)`
 *   (bridge > socket pool > single socket).
 * @returns The configured, not-yet-connected server instance.
 */
export function createClaudeForChromeMcpServer(
  context: ClaudeForChromeContext,
  existingSocketClient?: SocketClient,
): Server {
  const { serverName, logger } = context;

  // Reuse the caller's client when supplied, otherwise pick a transport.
  const socketClient =
    existingSocketClient ?? createChromeSocketClient(context);

  const server = new Server(
    { name: serverName, version: "1.0.0" },
    { capabilities: { tools: {}, logging: {} } },
  );

  // tools/list: nothing while disabled; `switch_browser` is only advertised
  // when a bridge configuration is present.
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    if (context.isDisabled?.()) {
      return { tools: [] };
    }
    const tools = context.bridgeConfig
      ? BROWSER_TOOLS
      : BROWSER_TOOLS.filter((tool) => tool.name !== "switch_browser");
    return { tools };
  });

  // tools/call: log the tool name and delegate to the shared dispatcher.
  server.setRequestHandler(
    CallToolRequestSchema,
    async (request): Promise<CallToolResult> => {
      const toolName = request.params.name;
      logger.info(`[${serverName}] Executing tool: ${toolName}`);
      return handleToolCall(
        context,
        socketClient,
        toolName,
        request.params.arguments || {},
      );
    },
  );

  // Relay notifications from the client to the MCP peer (fire-and-forget).
  socketClient.setNotificationHandler((notification) => {
    const { method, params } = notification;
    logger.info(`[${serverName}] Forwarding MCP notification: ${method}`);
    server.notification({ method, params }).catch((error) => {
      // Server may not be connected yet (e.g., during startup or after disconnect)
      logger.info(
        `[${serverName}] Failed to forward MCP notification: ${error.message}`,
      );
    });
  });

  return server;
}

View File

@@ -0,0 +1,493 @@
import { promises as fsPromises } from "fs";
import { createConnection } from "net";
import type { Socket } from "net";
import { platform } from "os";
import { dirname } from "path";
import type {
ClaudeForChromeContext,
PermissionMode,
PermissionOverrides,
} from "./types.js";
/**
 * Error type for failures of the socket transport itself (not connected,
 * connection timed out, request timed out).
 *
 * Within this file it is thrown by `ensureConnected()` and `sendRequest()`,
 * and `sendRequestWithRetry()` uses `instanceof SocketConnectionError` to
 * decide whether a failed request should trigger a reconnect and one retry.
 */
export class SocketConnectionError extends Error {
constructor(message: string) {
super(message);
// Give the error a stable name instead of the inherited "Error".
this.name = "SocketConnectionError";
}
}
/** Outgoing message written to the socket by `sendRequest()`. */
interface ToolRequest {
method: string; // "execute_tool"
params?: {
client_id?: string; // "desktop" | "claude-code"
tool?: string;
args?: Record<string, unknown>;
};
}
/**
 * Reply to a `ToolRequest`. Carries no request identifier, so replies can
 * only be matched to requests by arrival order.
 */
interface ToolResponse {
result?: unknown;
error?: string;
}
/** Unsolicited message from the peer; told apart from replies by its string `method`. */
interface Notification {
method: string;
params?: Record<string, unknown>;
}
/** Any JSON message that can arrive on the socket after frame decoding. */
type SocketMessage = ToolResponse | Notification;
/**
 * Type guard: does `message` look like a reply to a tool request?
 *
 * A value qualifies when it is a non-null object that has a `result` or an
 * `error` key. Messages arrive from `JSON.parse`, which can also yield
 * `null`, numbers, strings or booleans; the `in` operator throws a TypeError
 * on those, so they are rejected up front and the guard simply returns false.
 *
 * @param message Decoded socket message (possibly not an object at runtime).
 * @returns `true` when the message has a `result` or `error` key.
 */
function isToolResponse(message: SocketMessage): message is ToolResponse {
  if (typeof message !== "object" || message === null) {
    return false;
  }
  return "result" in message || "error" in message;
}
/**
 * Type guard: does `message` look like a notification?
 *
 * A value qualifies when it is a non-null object whose `method` property is
 * a string. Messages arrive from `JSON.parse`, which can also yield `null`
 * or primitives; the `in` operator throws a TypeError on those, so they are
 * rejected up front and the guard simply returns false.
 *
 * @param message Decoded socket message (possibly not an object at runtime).
 * @returns `true` when the message has a string `method`.
 */
function isNotification(message: SocketMessage): message is Notification {
  if (typeof message !== "object" || message === null) {
    return false;
  }
  return "method" in message && typeof message.method === "string";
}
class McpSocketClient {
// Current connection; null until connect() creates one and after closeSocket().
private socket: Socket | null = null;
// True between the socket's "connect" event and the next error/close/closeSocket().
private connected = false;
// True while a connect() attempt is in flight; guards against duplicate attempts.
private connecting = false;
// Resolver for the single in-flight request (set by sendRequest, consumed by
// handleResponse). There is only one slot, so overlapping requests would
// overwrite each other's callback.
private responseCallback: ((response: ToolResponse) => void) | null = null;
// Receiver for messages classified as notifications.
private notificationHandler: ((notification: Notification) => void) | null =
null;
// Accumulates received bytes until a complete length-prefixed frame is available.
private responseBuffer = Buffer.alloc(0);
// Count of reconnects scheduled since the last successful connect.
private reconnectAttempts = 0;
// Threshold used by scheduleReconnect() only to switch from per-attempt
// logging to every-10th-attempt logging; the give-up limit is a separate
// constant inside scheduleReconnect().
private maxReconnectAttempts = 10;
// Base reconnect delay in milliseconds (multiplied by the backoff factor).
private reconnectDelay = 1000;
// Pending reconnect timer, or null when none is scheduled.
private reconnectTimer: NodeJS.Timeout | null = null;
// Host-provided configuration (server name, logger, socket path, client id).
private context: ClaudeForChromeContext;
// When true, disables automatic reconnection. Used by McpSocketPool which
// manages reconnection externally by rescanning available sockets.
public disableAutoReconnect = false;
/**
 * Store the context; no connection is opened until connect() runs
 * (triggered from ensureConnected() or a scheduled reconnect).
 */
constructor(context: ClaudeForChromeContext) {
this.context = context;
}
/**
 * Open a new connection to the native-host socket and install its handlers.
 *
 * Steps: skip if an attempt is already in flight, close any previous socket,
 * resolve the socket path (`getSocketPath()` if provided, else `socketPath`),
 * run `validateSocketSecurity`, then connect with a 5000ms connect timeout.
 *
 * Incoming data is framed as a 4-byte little-endian length followed by that
 * many bytes of UTF-8 JSON. Each decoded message is dispatched either to the
 * notification handler or to the pending response callback.
 *
 * The returned promise resolves once the handlers are installed (or
 * validation failed); it does NOT wait for the connection to be established.
 * Failures are logged and, where appropriate, handed to scheduleReconnect().
 */
private async connect(): Promise<void> {
  const { serverName, logger } = this.context;
  if (this.connecting) {
    logger.info(
      `[${serverName}] Already connecting, skipping duplicate attempt`,
    );
    return;
  }
  this.closeSocket();
  this.connecting = true;
  const socketPath =
    this.context.getSocketPath?.() ?? this.context.socketPath;
  logger.info(`[${serverName}] Attempting to connect to: ${socketPath}`);
  try {
    await this.validateSocketSecurity(socketPath);
  } catch (error) {
    this.connecting = false;
    logger.info(`[${serverName}] Security validation failed:`, error);
    // Don't retry on security failures (wrong perms/owner) - those won't
    // self-resolve. Only the error handler retries on transient errors.
    return;
  }

  // A new connection is a new byte stream: bytes left over from a previous
  // connection (e.g. half a frame) would corrupt the length-prefix framing.
  this.responseBuffer = Buffer.alloc(0);

  // Keep a local reference so every handler below is tied to THIS attempt.
  const socket = createConnection(socketPath);
  this.socket = socket;

  // Log-safe rendering of a decoded message. Interpolating the object
  // directly would print "[object Object]"; payloads can be large, so the
  // JSON text is truncated.
  const preview = (value: unknown): string => {
    const text = JSON.stringify(value) ?? String(value);
    return text.length > 200 ? `${text.slice(0, 200)}...` : text;
  };

  // Timeout the initial connection attempt - if socket file exists but native
  // host is dead, the connect can hang indefinitely
  const connectTimeout = setTimeout(() => {
    // closeSocket() strips the listeners that would normally clear this
    // timer, so it can outlive its socket. Ignore it if the socket was
    // closed or replaced in the meantime (e.g. disconnect() or a retry);
    // otherwise it would tear down a newer connection or reconnect after an
    // explicit disconnect.
    if (this.socket !== socket) {
      return;
    }
    if (!this.connected) {
      logger.info(
        `[${serverName}] Connection attempt timed out after 5000ms`,
      );
      this.closeSocket();
      this.scheduleReconnect();
    }
  }, 5000);

  socket.on("connect", () => {
    clearTimeout(connectTimeout);
    this.connected = true;
    this.connecting = false;
    this.reconnectAttempts = 0;
    logger.info(`[${serverName}] Successfully connected to bridge server`);
  });

  socket.on("data", (data: Buffer) => {
    this.responseBuffer = Buffer.concat([this.responseBuffer, data]);
    // Drain every complete frame currently in the buffer.
    while (this.responseBuffer.length >= 4) {
      const length = this.responseBuffer.readUInt32LE(0);
      if (this.responseBuffer.length < 4 + length) {
        break; // frame body not fully received yet
      }
      const messageBytes = this.responseBuffer.subarray(4, 4 + length);
      this.responseBuffer = this.responseBuffer.subarray(4 + length);
      try {
        const message = JSON.parse(
          messageBytes.toString("utf-8"),
        ) as SocketMessage;
        // Notifications are checked first, so a message carrying a string
        // `method` is never treated as a tool response.
        if (isNotification(message)) {
          logger.info(
            `[${serverName}] Received notification: ${message.method}`,
          );
          if (this.notificationHandler) {
            this.notificationHandler(message);
          }
        } else if (isToolResponse(message)) {
          logger.info(
            `[${serverName}] Received tool response: ${preview(message)}`,
          );
          this.handleResponse(message);
        } else {
          logger.info(
            `[${serverName}] Received unknown message: ${preview(message)}`,
          );
        }
      } catch (error) {
        // Also reached if a handler above throws, not only on bad JSON.
        logger.info(`[${serverName}] Failed to parse message:`, error);
      }
    }
  });

  socket.on("error", (error: Error & { code?: string }) => {
    clearTimeout(connectTimeout);
    logger.info(`[${serverName}] Socket error (code: ${error.code}):`, error);
    this.connected = false;
    this.connecting = false;
    if (
      error.code &&
      [
        "ECONNREFUSED", // Native host not listening (stale socket)
        "ECONNRESET", // Connection reset by peer
        "EPIPE", // Broken pipe (native host died mid-write)
        "ENOENT", // Socket file was deleted
        "EOPNOTSUPP", // Socket file exists but is not a valid socket
        "ECONNABORTED", // Connection aborted
      ].includes(error.code)
    ) {
      this.scheduleReconnect();
    }
  });

  socket.on("close", () => {
    clearTimeout(connectTimeout);
    this.connected = false;
    this.connecting = false;
    this.scheduleReconnect();
  });
}
/**
 * Arm a timer that will call connect() again, unless auto-reconnect is
 * disabled or a reconnect is already pending.
 *
 * Delay is `reconnectDelay * 1.5^(attempt - 1)`, capped at 30000ms. After 100
 * scheduled attempts the counter is reset and no further timer is armed.
 */
private scheduleReconnect(): void {
  const { serverName, logger } = this.context;

  if (this.disableAutoReconnect) {
    return;
  }
  if (this.reconnectTimer) {
    logger.info(`[${serverName}] Reconnect already scheduled, skipping`);
    return;
  }

  this.reconnectAttempts++;
  const attempt = this.reconnectAttempts;

  // Give up after extended polling (~50 min). A new ensureConnected() call
  // from a tool request will restart the cycle if needed.
  const maxTotalAttempts = 100;
  if (attempt > maxTotalAttempts) {
    logger.info(
      `[${serverName}] Giving up after ${maxTotalAttempts} attempts. Will retry on next tool call.`,
    );
    this.reconnectAttempts = 0;
    return;
  }

  // Use aggressive backoff for first 10 attempts, then slow poll every 30s.
  const uncappedDelay = this.reconnectDelay * Math.pow(1.5, attempt - 1);
  const delay = Math.min(uncappedDelay, 30000);

  const inVerbosePhase = attempt <= this.maxReconnectAttempts;
  if (inVerbosePhase) {
    logger.info(
      `[${serverName}] Reconnecting in ${Math.round(delay)}ms (attempt ${attempt})`,
    );
  } else if (attempt % 10 === 0) {
    // Log every 10th slow-poll attempt to avoid log spam
    logger.info(
      `[${serverName}] Still polling for native host (attempt ${attempt})`,
    );
  }

  this.reconnectTimer = setTimeout(() => {
    // Clear the handle first so connect() failures can schedule the next try.
    this.reconnectTimer = null;
    void this.connect();
  }, delay);
}
/**
 * Deliver a tool response to the pending request, if there is one.
 *
 * The callback slot is emptied before the callback runs, so each registered
 * callback is invoked at most once. Responses arriving with no pending
 * request are dropped silently.
 */
private handleResponse(response: ToolResponse): void {
  const pending = this.responseCallback;
  if (!pending) {
    return;
  }
  this.responseCallback = null;
  pending(response);
}
/**
 * Register the function that receives every message classified as a
 * notification by the socket's data handler. Replaces any handler set
 * earlier; only one handler is kept.
 */
public setNotificationHandler(
handler: (notification: Notification) => void,
): void {
this.notificationHandler = handler;
}
/**
 * Make sure a live connection exists, starting one if necessary.
 *
 * Resolves `true` as soon as the socket is connected. If that does not
 * happen within 5000ms the promise rejects with `SocketConnectionError`
 * (it never resolves `false`). Connection state is polled every 500ms.
 *
 * A connection attempt is started when there is no socket at all, or when
 * the current socket has been destroyed and no reconnect timer is pending.
 * The second case matters after the socket's error/close handlers have run:
 * they leave the dead socket object in place, so with auto-reconnect
 * disabled or after scheduleReconnect() has given up, nothing else would
 * ever reconnect and every call would just time out.
 *
 * @returns `true` once connected.
 * @throws SocketConnectionError when not connected within 5000ms.
 */
public async ensureConnected(): Promise<boolean> {
  const { serverName } = this.context;
  if (this.connected && this.socket) {
    return true;
  }

  // A destroyed socket with a reconnect timer pending is left to the
  // backoff cycle, which will call connect() itself.
  const needsNewSocket =
    !this.socket || (this.socket.destroyed && !this.reconnectTimer);
  if (needsNewSocket && !this.connecting) {
    await this.connect();
  }

  // Wait for connection with timeout
  return new Promise((resolve, reject) => {
    let checkTimeoutId: NodeJS.Timeout | null = null;
    const timeout = setTimeout(() => {
      // Stop polling before reporting the failure.
      if (checkTimeoutId) {
        clearTimeout(checkTimeoutId);
      }
      reject(
        new SocketConnectionError(
          `[${serverName}] Connection attempt timed out after 5000ms`,
        ),
      );
    }, 5000);
    const checkConnection = () => {
      if (this.connected) {
        clearTimeout(timeout);
        resolve(true);
      } else {
        checkTimeoutId = setTimeout(checkConnection, 500);
      }
    };
    checkConnection();
  });
}
/**
 * Write one request to the socket and wait for the next tool response.
 *
 * Wire format: 4-byte little-endian payload length followed by the UTF-8
 * JSON payload. Responses carry no request id, so the reply is matched
 * through the single `responseCallback` slot; callers must not overlap
 * requests on the same client.
 *
 * @param request Message to send.
 * @param timeoutMs How long to wait for the response (default 30000).
 * @returns The response passed to handleResponse().
 * @throws SocketConnectionError when there is no socket, when the write
 *   fails, or when no response arrives within `timeoutMs`.
 */
private async sendRequest(
  request: ToolRequest,
  timeoutMs = 30000,
): Promise<ToolResponse> {
  const { serverName } = this.context;
  if (!this.socket) {
    throw new SocketConnectionError(
      `[${serverName}] Cannot send request: not connected`,
    );
  }
  const socket = this.socket;
  return new Promise((resolve, reject) => {
    const onResponse = (response: ToolResponse): void => {
      clearTimeout(timeout);
      resolve(response);
    };
    // Release the slot only if it still belongs to this request, so a late
    // timeout or write failure cannot wipe out a newer request's callback.
    const releaseSlot = (): void => {
      if (this.responseCallback === onResponse) {
        this.responseCallback = null;
      }
    };
    const timeout = setTimeout(() => {
      releaseSlot();
      reject(
        new SocketConnectionError(
          `[${serverName}] Tool request timed out after ${timeoutMs}ms`,
        ),
      );
    }, timeoutMs);
    this.responseCallback = onResponse;

    const requestJson = JSON.stringify(request);
    const requestBytes = Buffer.from(requestJson, "utf-8");
    // allocUnsafe is fine here: all 4 bytes are overwritten immediately.
    const lengthPrefix = Buffer.allocUnsafe(4);
    lengthPrefix.writeUInt32LE(requestBytes.length, 0);
    const message = Buffer.concat([lengthPrefix, requestBytes]);

    socket.write(message, (writeError) => {
      if (!writeError) {
        return;
      }
      // Fail fast instead of waiting out the full timeout when the bytes
      // never left the process (e.g. the socket is already destroyed).
      clearTimeout(timeout);
      releaseSlot();
      reject(
        new SocketConnectionError(
          `[${serverName}] Failed to send tool request: ${writeError.message}`,
        ),
      );
    });
  });
}
/**
 * Ask the native host to execute a tool.
 *
 * Builds an `execute_tool` request tagged with this client's `clientTypeId`
 * and sends it through `sendRequestWithRetry`.
 *
 * @param name Tool name, sent as `params.tool`.
 * @param args Tool arguments, sent as `params.args`.
 * @param _permissionOverrides Accepted for interface compatibility; not used
 *   by this method.
 * @returns Whatever `sendRequestWithRetry` resolves with.
 */
public async callTool(
  name: string,
  args: Record<string, unknown>,
  _permissionOverrides?: PermissionOverrides,
): Promise<unknown> {
  // Key order is kept (client_id, tool, args) so the serialized form is unchanged
  const params: ToolRequest["params"] = {
    client_id: this.context.clientTypeId,
    tool: name,
    args,
  };
  const request: ToolRequest = { method: "execute_tool", params };
  return this.sendRequestWithRetry(request);
}
/**
 * Send a request, retrying exactly once after a forced reconnect.
 *
 * A connection error or timeout can mean the native host is a zombie (still
 * attached to a Chrome that has died). In that case the socket is closed, a
 * fresh connection is established, and the same request is sent one more time.
 * Errors that are not `SocketConnectionError` are rethrown untouched, as is
 * any error raised by the reconnect or by the second attempt.
 */
private async sendRequestWithRetry(request: ToolRequest): Promise<unknown> {
  const { serverName, logger } = this.context;
  try {
    return await this.sendRequest(request);
  } catch (error) {
    if (error instanceof SocketConnectionError) {
      logger.info(
        `[${serverName}] Connection error, forcing reconnect and retrying: ${error.message}`,
      );
      this.closeSocket();
      await this.ensureConnected();
      return await this.sendRequest(request);
    }
    throw error;
  }
}
/**
 * Intentionally does nothing on the socket transport.
 *
 * Both parameters are ignored; the returned promise resolves immediately.
 */
public async setPermissionMode(
_mode: PermissionMode,
_allowedDomains?: string[],
): Promise<void> {
// No-op: permission mode is only supported over the bridge (WebSocket) transport
}
/** Report the current value of the `connected` flag. */
public isConnected(): boolean {
return this.connected;
}
/**
 * Tear down the current socket, if there is one, and reset the connection
 * flags.
 *
 * Listeners are removed before the socket is ended and destroyed. The
 * `connected` and `connecting` flags are cleared even when no socket exists.
 */
private closeSocket(): void {
  const current = this.socket;
  if (current) {
    current.removeAllListeners();
    current.end();
    current.destroy();
    this.socket = null;
  }
  this.connecting = false;
  this.connected = false;
}
/**
 * Reset the client to its idle state: cancel a scheduled reconnect, close the
 * socket, and clear the reconnect counter, the response buffer and the
 * pending response callback.
 */
private cleanup(): void {
  const scheduledReconnect = this.reconnectTimer;
  if (scheduledReconnect) {
    clearTimeout(scheduledReconnect);
    this.reconnectTimer = null;
  }
  this.closeSocket();
  this.responseCallback = null;
  this.responseBuffer = Buffer.alloc(0);
  this.reconnectAttempts = 0;
}
/** Public entry point for shutting the client down; delegates to `cleanup()`. */
public disconnect(): void {
this.cleanup();
}
/**
 * Check ownership and permissions of the socket (and, when applicable, its
 * parent directory) before it is used.
 *
 * - On win32 the check is skipped entirely.
 * - When the parent directory's name starts with `claude-mcp-browser-bridge-`
 *   and it is a directory, it must have mode 0700 and be owned by the current
 *   user. A missing directory is tolerated.
 * - The socket path must be a socket with mode 0600 owned by the current user.
 *   A missing socket is tolerated (logged, then the method returns).
 * - Ownership checks are skipped when `process.getuid` is unavailable.
 *
 * @param socketPath Filesystem path of the socket to validate.
 * @throws Error when a permission, ownership or file-type check fails, or
 *   when `stat` fails with anything other than ENOENT.
 */
private async validateSocketSecurity(socketPath: string): Promise<void> {
const { serverName, logger } = this.context;
if (platform() === "win32") {
return;
}
try {
// Validate the parent directory permissions if it's the socket directory
// (not /tmp itself, which has mode 1777 for legacy single-socket paths)
const dirPath = dirname(socketPath);
const dirBasename = dirPath.split("/").pop() || "";
const isSocketDir = dirBasename.startsWith("claude-mcp-browser-bridge-");
if (isSocketDir) {
try {
const dirStats = await fsPromises.stat(dirPath);
if (dirStats.isDirectory()) {
// Keep only the permission bits (owner/group/other rwx)
const dirMode = dirStats.mode & 0o777;
if (dirMode !== 0o700) {
throw new Error(
`[${serverName}] Insecure socket directory permissions: ${dirMode.toString(
8,
)} (expected 0700). Directory may have been tampered with.`,
);
}
const currentUid = process.getuid?.();
if (currentUid !== undefined && dirStats.uid !== currentUid) {
throw new Error(
`Socket directory not owned by current user (uid: ${currentUid}, dir uid: ${dirStats.uid}). ` +
`Potential security risk.`,
);
}
}
} catch (dirError) {
// Only a missing directory is swallowed here; the validation errors
// thrown above carry no ENOENT code and are rethrown.
if ((dirError as NodeJS.ErrnoException).code !== "ENOENT") {
throw dirError;
}
// Directory doesn't exist yet - native host will create it
}
}
const stats = await fsPromises.stat(socketPath);
if (!stats.isSocket()) {
throw new Error(
`[${serverName}] Path exists but it's not a socket: ${socketPath}`,
);
}
// Keep only the permission bits (owner/group/other rwx)
const mode = stats.mode & 0o777;
if (mode !== 0o600) {
throw new Error(
`[${serverName}] Insecure socket permissions: ${mode.toString(
8,
)} (expected 0600). Socket may have been tampered with.`,
);
}
const currentUid = process.getuid?.();
if (currentUid !== undefined && stats.uid !== currentUid) {
throw new Error(
`Socket not owned by current user (uid: ${currentUid}, socket uid: ${stats.uid}). ` +
`Potential security risk.`,
);
}
logger.info(`[${serverName}] Socket security validation passed`);
} catch (error) {
// A missing socket file is not a failure: log it and return normally.
if ((error as NodeJS.ErrnoException).code === "ENOENT") {
logger.info(
`[${serverName}] Socket not found, will be created by server`,
);
return;
}
throw error;
}
}
}
/**
 * Factory for `McpSocketClient`.
 *
 * @param context Context object passed straight to the client constructor.
 * @returns A newly constructed client.
 */
export function createMcpSocketClient(
context: ClaudeForChromeContext,
): McpSocketClient {
return new McpSocketClient(context);
}
export type { McpSocketClient };

View File

@@ -0,0 +1,327 @@
import {
createMcpSocketClient,
SocketConnectionError,
} from "./mcpSocketClient.js";
import type { McpSocketClient } from "./mcpSocketClient.js";
import type {
ClaudeForChromeContext,
PermissionMode,
PermissionOverrides,
} from "./types.js";
/**
 * Pool of native-host socket clients, one per discovered socket path (one
 * socket per Chrome profile).
 *
 * Dispatch rules:
 * - `tabs_context_mcp` is sent to every connected client; the tab lists are
 *   merged and the responses rebuild the tabId → socket routing table.
 * - Every other tool goes to the client recorded for `args.tabId` when that
 *   client is connected, otherwise to the first connected client.
 */
export class McpSocketPool {
  /** Clients keyed by socket path. */
  private clients: Map<string, McpSocketClient> = new Map();
  /** Routing table: tab ID → socket path of the client that reported it. */
  private tabRoutes: Map<number, string> = new Map();
  private context: ClaudeForChromeContext;
  /** Handler applied to every client, including ones created later. */
  private notificationHandler:
    | ((notification: { method: string; params?: Record<string, unknown> }) => void)
    | null = null;

  constructor(context: ClaudeForChromeContext) {
    this.context = context;
  }

  /**
   * Remember the handler and install it on every client currently in the pool.
   */
  public setNotificationHandler(
    handler: (notification: {
      method: string;
      params?: Record<string, unknown>;
    }) => void,
  ): void {
    this.notificationHandler = handler;
    this.clients.forEach((client) => {
      client.setNotificationHandler(handler);
    });
  }

  /**
   * Rescan for sockets, try to connect every client that is not connected,
   * and report whether at least one client ended up connected.
   */
  public async ensureConnected(): Promise<boolean> {
    const { logger, serverName } = this.context;
    this.refreshClients();
    // One connection attempt per disconnected client; failures count as false
    const attempts: Promise<boolean>[] = [...this.clients.values()].flatMap(
      (client) =>
        client.isConnected()
          ? []
          : [client.ensureConnected().catch(() => false)],
    );
    if (attempts.length > 0) {
      await Promise.all(attempts);
    }
    const connectedCount = this.getConnectedClients().length;
    if (connectedCount !== 0) {
      logger.info(`[${serverName}] Socket pool: ${connectedCount} connected`);
      return true;
    }
    logger.info(`[${serverName}] No connected sockets in pool`);
    return false;
  }

  /**
   * Execute a tool on the appropriate client.
   *
   * `tabs_context_mcp` fans out to all connected clients. Any other tool is
   * routed by `args.tabId`; when there is no usable route the first connected
   * client is used.
   *
   * @throws SocketConnectionError when no client is connected.
   */
  public async callTool(
    name: string,
    args: Record<string, unknown>,
    _permissionOverrides?: PermissionOverrides,
  ): Promise<unknown> {
    if (name === "tabs_context_mcp") {
      return this.callTabsContext(args);
    }
    // Preferred target: the client that reported this tab
    const tabId = args.tabId as number | undefined;
    const routedPath =
      tabId === undefined ? undefined : this.tabRoutes.get(tabId);
    const routedClient = routedPath ? this.clients.get(routedPath) : undefined;
    if (routedClient?.isConnected()) {
      return routedClient.callTool(name, args);
    }
    // No route, or its client is disconnected — use any connected client
    const fallback = this.getConnectedClients()[0];
    if (!fallback) {
      throw new SocketConnectionError(
        `[${this.context.serverName}] No connected sockets available`,
      );
    }
    return fallback.callTool(name, args);
  }

  /** Forward the permission mode to every connected client. */
  public async setPermissionMode(
    mode: PermissionMode,
    allowedDomains?: string[],
  ): Promise<void> {
    const pending = this.getConnectedClients().map((client) =>
      client.setPermissionMode(mode, allowedDomains),
    );
    await Promise.all(pending);
  }

  /** True when at least one client in the pool is connected. */
  public isConnected(): boolean {
    return this.getConnectedClients().length !== 0;
  }

  /** Disconnect every client and forget all clients and tab routes. */
  public disconnect(): void {
    this.clients.forEach((client) => {
      client.disconnect();
    });
    this.clients.clear();
    this.tabRoutes.clear();
  }

  private getConnectedClients(): McpSocketClient[] {
    return Array.from(this.clients.values()).filter((client) =>
      client.isConnected(),
    );
  }

  /**
   * Run `tabs_context_mcp` on all connected clients, merge the tab lists and
   * rebuild the routing table from the responses.
   *
   * With a single connected client its response is returned unchanged and the
   * routing table is only added to, not cleared.
   *
   * @throws SocketConnectionError when no client is connected, or when every
   *   client failed.
   */
  private async callTabsContext(
    args: Record<string, unknown>,
  ): Promise<unknown> {
    const { logger, serverName } = this.context;
    const connected = this.getConnectedClients();
    if (connected.length === 0) {
      throw new SocketConnectionError(
        `[${serverName}] No connected sockets available`,
      );
    }
    // Single client: nothing to merge
    if (connected.length === 1) {
      const only = connected[0]!;
      const result = await only.callTool("tabs_context_mcp", args);
      this.updateTabRoutes(result, this.getSocketPathForClient(only));
      return result;
    }
    // Fan out in parallel; a failing socket must not sink the others
    const outcomes = await Promise.allSettled(
      connected.map(async (client) => ({
        result: await client.callTool("tabs_context_mcp", args),
        socketPath: this.getSocketPathForClient(client),
      })),
    );
    const mergedTabs: unknown[] = [];
    this.tabRoutes.clear();
    for (const outcome of outcomes) {
      if (outcome.status === "fulfilled") {
        const { result, socketPath } = outcome.value;
        this.updateTabRoutes(result, socketPath);
        const tabs = this.extractTabs(result);
        if (tabs) {
          mergedTabs.push(...tabs);
        }
      } else {
        logger.info(
          `[${serverName}] tabs_context_mcp failed on one socket: ${outcome.reason}`,
        );
      }
    }
    // Shape the merged list like a response coming from the extension
    if (mergedTabs.length > 0) {
      const tabLines = mergedTabs.map((entry) => {
        const tab = entry as { tabId: number; title: string; url: string };
        return ` • tabId ${tab.tabId}: "${tab.title}" (${tab.url})`;
      });
      const tabListText = tabLines.join("\n");
      return {
        result: {
          content: [
            {
              type: "text",
              text: JSON.stringify({ availableTabs: mergedTabs }),
            },
            {
              type: "text",
              text: `\n\nTab Context:\n- Available tabs:\n${tabListText}`,
            },
          ],
        },
      };
    }
    // No tabs could be extracted: hand back the first successful response
    for (const outcome of outcomes) {
      if (outcome.status === "fulfilled") {
        return outcome.value.result;
      }
    }
    throw new SocketConnectionError(
      `[${serverName}] All sockets failed for tabs_context_mcp`,
    );
  }

  /**
   * Record `socketPath` as the owner of every tab found in `result`.
   */
  private updateTabRoutes(result: unknown, socketPath: string): void {
    for (const tab of this.extractTabs(result) ?? []) {
      if (typeof tab === "object" && tab !== null && "tabId" in tab) {
        this.tabRoutes.set((tab as { tabId: number }).tabId, socketPath);
      }
    }
  }

  /**
   * Pull the tab array out of a tool response.
   *
   * Expected shape:
   * `{ result: { content: [{ type: "text", text: "<JSON>" }] } }` where the
   * JSON is either an array of tabs or `{ availableTabs: [...] }`. The first
   * text item that parses to one of those wins.
   *
   * @returns The tab array, or `null` when none can be found.
   */
  private extractTabs(result: unknown): unknown[] | null {
    if (!result || typeof result !== "object") return null;
    const envelope = result as {
      result?: { content?: Array<{ type: string; text?: string }> };
    };
    const items = envelope.result?.content;
    if (!items || !Array.isArray(items)) return null;
    for (const item of items) {
      if (item.type !== "text" || !item.text) {
        continue;
      }
      try {
        const parsed = JSON.parse(item.text);
        if (Array.isArray(parsed)) return parsed;
        // Handle { availableTabs: [...] } format
        if (Array.isArray(parsed?.availableTabs)) {
          return parsed.availableTabs;
        }
      } catch {
        // Not JSON, try the next item
      }
    }
    return null;
  }

  /** Reverse lookup: socket path for a client, or "" when it is not pooled. */
  private getSocketPathForClient(client: McpSocketClient): string {
    const match = [...this.clients.entries()].find(
      ([, candidate]) => candidate === client,
    );
    return match ? match[0] : "";
  }

  /**
   * Sync the pool with the sockets that currently exist: create clients for
   * new paths, and disconnect/remove clients (and their tab routes) for paths
   * that are gone.
   */
  private refreshClients(): void {
    const socketPaths = this.getAvailableSocketPaths();
    const livePaths = new Set(socketPaths);
    const { logger, serverName } = this.context;
    // Newly discovered sockets
    for (const path of socketPaths) {
      if (this.clients.has(path)) {
        continue;
      }
      logger.info(`[${serverName}] Adding socket to pool: ${path}`);
      // Pin the client to this one path; the pool owns discovery
      const clientContext: ClaudeForChromeContext = {
        ...this.context,
        socketPath: path,
        getSocketPath: undefined,
        getSocketPaths: undefined,
      };
      const client = createMcpSocketClient(clientContext);
      client.disableAutoReconnect = true;
      if (this.notificationHandler) {
        client.setNotificationHandler(this.notificationHandler);
      }
      this.clients.set(path, client);
    }
    // Sockets that disappeared
    for (const [path, client] of this.clients.entries()) {
      if (livePaths.has(path)) {
        continue;
      }
      logger.info(`[${serverName}] Removing stale socket from pool: ${path}`);
      client.disconnect();
      this.clients.delete(path);
      for (const [tabId, ownerPath] of this.tabRoutes.entries()) {
        if (ownerPath === path) {
          this.tabRoutes.delete(tabId);
        }
      }
    }
  }

  private getAvailableSocketPaths(): string[] {
    return this.context.getSocketPaths?.() ?? [];
  }
}
/**
 * Factory for `McpSocketPool`.
 *
 * @param context Context object passed straight to the pool constructor.
 * @returns A newly constructed, empty pool.
 */
export function createMcpSocketPool(
context: ClaudeForChromeContext,
): McpSocketPool {
return new McpSocketPool(context);
}

View File

@@ -0,0 +1,301 @@
import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
import { SocketConnectionError } from "./mcpSocketClient.js";
import type {
ClaudeForChromeContext,
PermissionMode,
PermissionOverrides,
SocketClient,
} from "./types.js";
/**
 * Entry point for a browser tool call.
 *
 * `set_permission_mode` and `switch_browser` are handled by their own
 * handlers. Every other tool first ensures a connection, then is forwarded
 * through the socket client. A `SocketConnectionError` at any point yields the
 * "disconnected" result; any other error yields an `isError` text result.
 *
 * @param context Server context (logger, callbacks, configuration).
 * @param socketClient Transport used to reach the browser.
 * @param name Tool name.
 * @param args Tool arguments.
 * @param permissionOverrides Optional per-call permission state, forwarded to
 *   the socket client.
 */
export const handleToolCall = async (
  context: ClaudeForChromeContext,
  socketClient: SocketClient,
  name: string,
  args: Record<string, unknown>,
  permissionOverrides?: PermissionOverrides,
): Promise<CallToolResult> => {
  switch (name) {
    // Permission mode changes are handled locally (not forwarded to extension)
    case "set_permission_mode":
      return handleSetPermissionMode(socketClient, args);
    // switch_browser manages its own connection
    case "switch_browser":
      return handleSwitchBrowser(context, socketClient);
    default:
      break;
  }
  const { logger, serverName } = context;
  try {
    const isConnected = await socketClient.ensureConnected();
    logger.silly(
      `[${serverName}] Server is connected: ${isConnected}. Received tool call: ${name} with args: ${JSON.stringify(args)}.`,
    );
    if (!isConnected) {
      return handleToolCallDisconnected(context);
    }
    return await handleToolCallConnected(
      context,
      socketClient,
      name,
      args,
      permissionOverrides,
    );
  } catch (error) {
    logger.info(`[${serverName}] Error calling tool:`, error);
    if (error instanceof SocketConnectionError) {
      return handleToolCallDisconnected(context);
    }
    const detail = error instanceof Error ? error.message : String(error);
    return {
      content: [
        {
          type: "text",
          text: `Error calling tool, please try again. : ${detail}`,
        },
      ],
      isError: true,
    };
  }
};
/**
 * Forward a tool call over an established connection and translate the reply
 * into a `CallToolResult`.
 *
 * The reply is expected to carry either a `result` or an `error` field, each
 * with a `content` that is an array of items or a string:
 * - no reply / neither field → a generic "Tool execution completed" result;
 * - error with array content → items passed through, `isError: true`;
 * - result with array content → image items of the form
 *   `{ type: "image", source: { data, media_type? } }` are flattened to
 *   `{ type: "image", data, mimeType }` (default `image/png`);
 * - string content → a single text item;
 * - anything else → the whole reply serialized as text, with a warning logged.
 *
 * `context.onAuthenticationError()` is invoked when an error reply looks like
 * an authentication failure.
 */
async function handleToolCallConnected(
  context: ClaudeForChromeContext,
  socketClient: SocketClient,
  name: string,
  args: Record<string, unknown>,
  permissionOverrides?: PermissionOverrides,
): Promise<CallToolResult> {
  const response = await socketClient.callTool(name, args, permissionOverrides);
  context.logger.silly(
    `[${context.serverName}] Received result from socket bridge: ${JSON.stringify(response)}`,
  );
  if (response == null) {
    return {
      content: [{ type: "text", text: "Tool execution completed" }],
    };
  }
  // The reply carries either `result` or `error`; error wins when both exist
  const { result, error } = response as {
    result?: { content: unknown[] | string };
    error?: { content: unknown[] | string };
  };
  const isError = Boolean(error);
  const payload = error || result;
  if (!payload) {
    return {
      content: [{ type: "text", text: "Tool execution completed" }],
    };
  }
  if (isError && isAuthenticationError(payload.content)) {
    context.onAuthenticationError();
  }
  const { content } = payload;

  // Items that already look like content blocks pass through; the rest become text
  const asContentItem = (item: unknown): unknown =>
    typeof item === "object" && item !== null && "type" in item
      ? item
      : { type: "text", text: String(item) };

  // Flatten extension-style image items, otherwise fall back to asContentItem
  const convertItem = (item: unknown): unknown => {
    if (
      typeof item === "object" &&
      item !== null &&
      "type" in item &&
      "source" in item
    ) {
      const { source } = item;
      if (
        item.type === "image" &&
        typeof source === "object" &&
        source !== null &&
        "data" in source
      ) {
        return {
          type: "image",
          data: source.data,
          mimeType:
            "media_type" in source
              ? source.media_type || "image/png"
              : "image/png",
        };
      }
    }
    return asContentItem(item);
  };

  if (Array.isArray(content)) {
    if (isError) {
      return {
        content: content.map((item: unknown) => asContentItem(item)),
        isError: true,
      } as CallToolResult;
    }
    return {
      content: content.map((item: unknown) => convertItem(item)),
      isError,
    } as CallToolResult;
  }
  if (typeof content === "string") {
    return {
      content: [{ type: "text", text: content }],
      isError,
    } as CallToolResult;
  }
  // Unexpected shape: surface the raw reply rather than dropping it
  context.logger.warn(
    `[${context.serverName}] Unexpected result format from socket bridge`,
    response,
  );
  return {
    content: [{ type: "text", text: JSON.stringify(response) }],
    isError,
  };
}
/**
 * Build the result returned when the browser cannot be reached.
 *
 * The text comes from `context.onToolCallDisconnected()`; the result is not
 * flagged as an error.
 */
function handleToolCallDisconnected(
  context: ClaudeForChromeContext,
): CallToolResult {
  return {
    content: [{ type: "text", text: context.onToolCallDisconnected() }],
  };
}
/**
 * Handle the `set_permission_mode` tool call locally.
 *
 * Security-sensitive: the mode decides whether permission prompts are shown.
 * `args.mode` is checked against the known modes at runtime and anything
 * unrecognized (or missing) falls back to `"ask"`. The mode and
 * `args.allowed_domains` are passed to the client when it implements
 * `setPermissionMode`.
 *
 * @returns A text result naming the mode that was applied.
 */
async function handleSetPermissionMode(
  socketClient: SocketClient,
  args: Record<string, unknown>,
): Promise<CallToolResult> {
  const knownModes: readonly string[] = [
    "ask",
    "skip_all_permission_checks",
    "follow_a_plan",
  ];
  const requested = args.mode as string | undefined;
  // Default to the most restrictive mode unless a known one was requested
  let permissionMode: PermissionMode = "ask";
  if (requested && knownModes.includes(requested)) {
    permissionMode = requested as PermissionMode;
  }
  if (socketClient.setPermissionMode) {
    await socketClient.setPermissionMode(
      permissionMode,
      args.allowed_domains as string[] | undefined,
    );
  }
  const text = `Permission mode set to: ${permissionMode}`;
  return {
    content: [{ type: "text", text }],
  };
}
/**
 * Handle the `switch_browser` tool call.
 *
 * Requires a bridge configuration. After ensuring a connection it awaits
 * `socketClient.switchBrowser()` (when implemented) and maps the outcome:
 * - `"no_other_browsers"` → error result;
 * - an object → success result naming the browser;
 * - `null`/`undefined` (including a client without `switchBrowser`) → error
 *   result telling the user nothing responded in time.
 */
async function handleSwitchBrowser(
  context: ClaudeForChromeContext,
  socketClient: SocketClient,
): Promise<CallToolResult> {
  // All failure outcomes share the same shape
  const failure = (text: string): CallToolResult => ({
    content: [{ type: "text", text }],
    isError: true,
  });

  if (!context.bridgeConfig) {
    return failure(
      "Browser switching is only available with bridge connections.",
    );
  }
  if (!(await socketClient.ensureConnected())) {
    return handleToolCallDisconnected(context);
  }
  const outcome = (await socketClient.switchBrowser?.()) ?? null;
  if (outcome === "no_other_browsers") {
    return failure(
      "No other browsers available to switch to. Open Chrome with the Claude extension in another browser to switch.",
    );
  }
  if (!outcome) {
    return failure(
      "No browser responded within the timeout. Make sure Chrome is open with the Claude extension installed, then try again.",
    );
  }
  return {
    content: [
      { type: "text", text: `Connected to browser "${outcome.name}".` },
    ],
  };
}
/**
 * Decide whether error content points at an authentication problem.
 *
 * Array content is reduced to text by taking string items as-is and the
 * `text` property of object items (everything else contributes an empty
 * string), joined with single spaces. The check is a case-insensitive search
 * for "re-authenticated".
 */
function isAuthenticationError(content: unknown[] | string): boolean {
  let errorText: string;
  if (Array.isArray(content)) {
    const pieces: string[] = [];
    for (const item of content) {
      if (typeof item === "string") {
        pieces.push(item);
      } else if (
        typeof item === "object" &&
        item !== null &&
        "text" in item &&
        typeof item.text === "string"
      ) {
        pieces.push(item.text);
      } else {
        pieces.push("");
      }
    }
    errorText = pieces.join(" ");
  } else {
    errorText = String(content);
  }
  return errorText.toLowerCase().includes("re-authenticated");
}

View File

@@ -0,0 +1,134 @@
/**
 * Logging sink supplied through the context.
 *
 * Each method takes a message plus any number of extra values to log with it.
 * How the levels are filtered or ordered is up to the implementation.
 */
export interface Logger {
info: (message: string, ...args: unknown[]) => void;
error: (message: string, ...args: unknown[]) => void;
warn: (message: string, ...args: unknown[]) => void;
debug: (message: string, ...args: unknown[]) => void;
silly: (message: string, ...args: unknown[]) => void;
}
/**
 * The permission modes a session can be in.
 *
 * NOTE(review): what each mode allows is implemented outside this file; the
 * names suggest "prompt the user", "never prompt" and "act within an approved
 * plan" — confirm against the extension before relying on that reading.
 */
export type PermissionMode =
| "ask"
| "skip_all_permission_checks"
| "follow_a_plan";
/**
 * Settings for the bridge (WebSocket) transport: where to connect and how to
 * obtain the identity and credentials for the connection.
 */
export interface BridgeConfig {
/** Bridge WebSocket base URL (e.g., wss://bridge.claudeusercontent.com) */
url: string;
/** Returns the user's account UUID for the connection path */
getUserId: () => Promise<string | undefined>;
/** Returns a valid OAuth token for bridge authentication */
getOAuthToken: () => Promise<string | undefined>;
/** Optional dev user ID for local development (bypasses OAuth) */
devUserId?: string;
}
/** Metadata about a connected Chrome extension instance. */
export interface ChromeExtensionInfo {
// Identifier of the extension instance / device.
deviceId: string;
// Platform string reported by the extension, when known.
osPlatform?: string;
// NOTE(review): presumably a timestamp of when the extension connected;
// the unit (ms vs s) is not visible here — confirm at the producer.
connectedAt: number;
// Optional human-readable name for the browser.
name?: string;
}
/**
 * Everything the Chrome integration needs from its host: naming, logging,
 * socket/bridge configuration and callbacks into the host application.
 */
export interface ClaudeForChromeContext {
// Name of this server; used as the `[name]` prefix in log and error messages.
serverName: string;
// Sink for all log output.
logger: Logger;
// Path of the native-host socket to connect to.
socketPath: string;
// Optional dynamic resolver for socket path. When provided, called on each
// connection attempt to handle runtime conditions (e.g., TMPDIR mismatch).
getSocketPath?: () => string;
// Optional resolver returning all available socket paths (for multi-profile support).
// When provided, a socket pool connects to all sockets and routes by tab ID.
getSocketPaths?: () => string[];
clientTypeId: string; // "desktop" | "claude-code"
// Called when a tool call cannot reach the browser; the returned string is
// used as the text of the tool result.
onToolCallDisconnected: () => string;
// Called when an error response looks like an authentication failure.
onAuthenticationError: () => void;
// NOTE(review): not used in the visible code; presumably lets the host
// report that the integration is switched off — confirm at the call site.
isDisabled?: () => boolean;
/** Bridge WebSocket configuration. When provided, uses bridge instead of socket. */
bridgeConfig?: BridgeConfig;
/** If set, permission mode is sent to the extension immediately on bridge connection. */
initialPermissionMode?: PermissionMode;
/** Optional callback to track telemetry events for bridge connections */
trackEvent?: <K extends string>(
eventName: K,
metadata: Record<string, unknown> | null,
) => void;
/** Called when user pairs with an extension via the browser pairing flow. */
onExtensionPaired?: (deviceId: string, name: string) => void;
/** Returns the previously paired deviceId, if any. */
getPersistedDeviceId?: () => string | undefined;
/** Called when a remote extension is auto-selected (only option available). */
onRemoteExtensionWarning?: (ext: ChromeExtensionInfo) => void;
}
/**
 * Translate Node's `process.platform` into the platform label Chrome
 * extensions report through `navigator.userAgentData.platform`.
 *
 * @returns `"macOS"` for darwin, `"Windows"` for win32, and `"Linux"` for
 *   every other platform.
 */
export function localPlatformLabel(): string {
  switch (process.platform) {
    case "darwin":
      return "macOS";
    case "win32":
      return "Windows";
    default:
      return "Linux";
  }
}
/**
 * Permission request forwarded from the extension to the desktop for user approval.
 *
 * The answer is a `BridgePermissionResponse` carrying the same `requestId`.
 */
export interface BridgePermissionRequest {
/** Links to the pending tool_call */
toolUseId: string;
/** Unique ID for this permission request */
requestId: string;
/** Tool type, e.g. "navigate", "click", "execute_javascript" */
toolType: string;
/** The URL/domain context */
url: string;
/** Additional action data (click coordinates, text, etc.) */
actionData?: Record<string, unknown>;
}
/** Desktop response to a bridge permission request. */
export interface BridgePermissionResponse {
// ID of the request being answered (matches BridgePermissionRequest.requestId).
requestId: string;
// Whether the requested action was approved.
allowed: boolean;
}
/** Per-call permission overrides, allowing each session to use its own permission state. */
export interface PermissionOverrides {
// Permission mode to apply for this call.
permissionMode: PermissionMode;
// Optional list of domains associated with this call's permission state.
allowedDomains?: string[];
/** Callback invoked when the extension requests user permission via the bridge. */
onPermissionRequest?: (request: BridgePermissionRequest) => Promise<boolean>;
}
/** Shared interface for McpSocketClient and McpSocketPool */
export interface SocketClient {
// Establish a connection if needed. Resolves to whether a connection is
// available; implementations may also reject instead of resolving `false`.
ensureConnected(): Promise<boolean>;
// Execute the named tool with the given arguments and resolve with the raw
// response from the transport.
callTool(
name: string,
args: Record<string, unknown>,
permissionOverrides?: PermissionOverrides,
): Promise<unknown>;
// Synchronous snapshot of the connection state.
isConnected(): boolean;
// Close the connection(s) held by this client.
disconnect(): void;
// Register the function that receives notifications from the transport.
setNotificationHandler(
handler: (notification: {
method: string;
params?: Record<string, unknown>;
}) => void,
): void;
/** Set permission mode for the current session. Only effective on BridgeClient. */
setPermissionMode?(
mode: PermissionMode,
allowedDomains?: string[],
): Promise<void>;
/** Switch to a different browser. Only available on BridgeClient. */
switchBrowser?(): Promise<
| {
deviceId: string;
name: string;
}
| "no_other_browsers"
| null
>;
}

View File

@@ -0,0 +1,25 @@
// Entry point of the native input binding. Exports `{ isSupported: false }`
// on every platform except darwin; on darwin it loads the prebuilt .node
// addon and re-exports its functions next to `isSupported: true`.
const path = require("path");
// Discriminated union: { isSupported: false } on non-darwin,
// { isSupported: true, ...nativeFns } on darwin. Cross-platform consumers
// (claude-cli-internal) require() unconditionally and narrow on isSupported.
if (process.platform !== "darwin") {
module.exports = { isSupported: false };
} else {
// COMPUTER_USE_INPUT_NODE_PATH: escape hatch for bundlers. Bun's --compile
// embeds the .node as an asset, not in a node_modules tree — __dirname is
// the exe dir and ../prebuilds/ doesn't exist. The consuming build bakes
// this var to the embedded asset's path. Unset → normal node_modules layout.
//
// key()/keys() dispatch enigo work onto DispatchQueue.main via
// dispatch2::run_on_main, then block a tokio worker on a channel. Under
// Electron (CFRunLoop drains the main queue) this works; under libuv
// (Node/bun) the main queue never drains and the promise hangs. Consumers
// running under libuv must pump CFRunLoop while key()/keys() are pending —
// e.g. claude-cli-internal borrows @ant/computer-use-swift's _drainMainRunLoop.
//
// NOTE(review): keep the path expression inline in the require() call —
// the build step described above presumably substitutes the env var at this
// exact site; confirm before restructuring.
const native = require(
process.env.COMPUTER_USE_INPUT_NODE_PATH ??
path.resolve(__dirname, "../prebuilds/computer-use-input.node"),
);
// isSupported comes first, so a same-named key on the addon would win.
module.exports = { isSupported: true, ...native };
}

View File

@@ -0,0 +1,553 @@
/**
* App category lookup for tiered CU permissions. Three categories land at a
* restricted tier instead of `"full"`:
*
* - **browser** → `"read"` tier — visible in screenshots, NO interaction.
* The model can read an already-open page but must use the Claude-in-Chrome
* MCP for navigation/clicking/typing.
* - **terminal** → `"click"` tier — visible + clickable, NO typing. The
* model can click a Run button or scroll test output in an IDE, but can't
* type into the integrated terminal. Use the Bash tool for shell work.
* - **trading** → `"read"` tier — same restrictions as browsers, but no
* CiC-MCP alternative exists. For platforms where a stray click can
* execute a trade or send a message to a counterparty.
*
* Uncategorized apps default to `"full"`. See `getDefaultTierForApp`.
*
* Identification is two-layered:
* 1. Bundle ID match (macOS-only; `InstalledApp.bundleId` is a
* CFBundleIdentifier and meaningless on Windows). Fast, exact, the
* primary mechanism while CU is darwin-gated.
* 2. Display-name substring match (cross-platform fallback). Catches
* unresolved requests ("Chrome" when Chrome isn't installed) AND will
* be the primary mechanism on Windows/Linux where there's no bundle ID.
* Windows-relevant names (PowerShell, cmd, Windows Terminal) are
* included now so they activate the moment the darwin gate lifts.
*
* Keep this file **import-free** (like sentinelApps.ts) — the renderer may
* import it via a package.json subpath export, and pulling in
* `@modelcontextprotocol/sdk` (a devDep) through the index → mcpServer chain
* would fail module resolution in Next.js. The `CuAppPermTier` type is
* duplicated as a string literal below rather than imported.
*/
/**
 * App categories that are granted a restricted tier instead of `"full"`.
 * `categoryToTier` maps each one to its tier.
 */
export type DeniedCategory = "browser" | "terminal" | "trading";
/**
 * Resolve a category to its hardcoded permission tier.
 *
 * The return type is spelled out as a string-literal union because this file
 * stays import-free; the authoritative type is `CuAppPermTier` in types.ts —
 * keep the two in sync.
 *
 * The mapping is not one-to-one: `"browser"` and `"trading"` both yield
 * `"read"`. Copy that depends on the category (the "use CiC" hint applies to
 * browsers only) has to look at the category, not only at the tier.
 *
 * @param category The app's category, or `null` when it has none.
 * @returns `"read"` for browser/trading, `"click"` for terminal, `"full"`
 *   otherwise.
 */
export function categoryToTier(
  category: DeniedCategory | null,
): "read" | "click" | "full" {
  switch (category) {
    case "browser":
    case "trading":
      return "read";
    case "terminal":
      return "click";
    default:
      return "full";
  }
}
// ─── Bundle-ID deny sets (macOS) ─────────────────────────────────────────
/**
 * macOS bundle identifiers of web browsers (the `"browser"` category).
 * Matching is by exact string, so every release channel that ships under its
 * own bundle ID (beta, dev, canary, nightly, ...) is listed separately.
 */
const BROWSER_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Apple
"com.apple.Safari",
"com.apple.SafariTechnologyPreview",
// Google
"com.google.Chrome",
"com.google.Chrome.beta",
"com.google.Chrome.dev",
"com.google.Chrome.canary",
// Microsoft
"com.microsoft.edgemac",
"com.microsoft.edgemac.Beta",
"com.microsoft.edgemac.Dev",
"com.microsoft.edgemac.Canary",
// Mozilla
"org.mozilla.firefox",
"org.mozilla.firefoxdeveloperedition",
"org.mozilla.nightly",
// Chromium-based
"org.chromium.Chromium",
"com.brave.Browser",
"com.brave.Browser.beta",
"com.brave.Browser.nightly",
"com.operasoftware.Opera",
"com.operasoftware.OperaGX",
"com.operasoftware.OperaDeveloper",
"com.vivaldi.Vivaldi",
// The Browser Company
"company.thebrowser.Browser", // Arc
"company.thebrowser.dia", // Dia (agentic)
// Privacy-focused
"org.torproject.torbrowser",
"com.duckduckgo.macos.browser",
"ru.yandex.desktop.yandex-browser",
// Agentic / AI browsers — newer entrants with LLM integrations
"ai.perplexity.comet",
"com.sigmaos.sigmaos.macos", // SigmaOS
// WebKit-based misc
"com.kagi.kagimacOS", // Orion
]);
/**
 * macOS bundle identifiers of terminals, IDEs with integrated terminals, and
 * script-execution surfaces (the `"terminal"` category).
 *
 * This is a superset of `SHELL_ACCESS_BUNDLE_IDS` from sentinelApps.ts —
 * terminals proceed to the approval dialog at tier "click", and the sentinel
 * warning renders alongside the tier badge.
 */
const TERMINAL_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Dedicated terminals
"com.apple.Terminal",
"com.googlecode.iterm2",
"dev.warp.Warp-Stable",
"dev.warp.Warp-Beta",
"com.github.wez.wezterm",
"org.alacritty",
"io.alacritty", // pre-v0.11.0 (renamed 2022-07) — kept for legacy installs
"net.kovidgoyal.kitty",
"co.zeit.hyper",
"com.mitchellh.ghostty",
"org.tabby",
"com.termius-dmg.mac", // Termius
// IDEs with integrated terminals — we can't distinguish "type in the
// editor" from "type in the integrated terminal" via screenshot+click.
// VS Code family
"com.microsoft.VSCode",
"com.microsoft.VSCodeInsiders",
"com.vscodium", // VSCodium
"com.todesktop.230313mzl4w4u92", // Cursor
"com.exafunction.windsurf", // Windsurf / Codeium
"dev.zed.Zed",
"dev.zed.Zed-Preview",
// JetBrains family (all have integrated terminals)
"com.jetbrains.intellij",
"com.jetbrains.intellij.ce",
"com.jetbrains.pycharm",
"com.jetbrains.pycharm.ce",
"com.jetbrains.WebStorm",
"com.jetbrains.CLion",
"com.jetbrains.goland",
"com.jetbrains.rubymine",
"com.jetbrains.PhpStorm",
"com.jetbrains.datagrip",
"com.jetbrains.rider",
"com.jetbrains.AppCode",
"com.jetbrains.rustrover",
"com.jetbrains.fleet",
"com.google.android.studio", // Android Studio (JetBrains-based)
// Other IDEs
"com.axosoft.gitkraken", // GitKraken has an integrated terminal panel. Also keeps the "kraken" trading-substring from miscategorizing it — bundle-ID wins.
"com.sublimetext.4",
"com.sublimetext.3",
"org.vim.MacVim",
"com.neovim.neovim",
"org.gnu.Emacs",
// Xcode's previous carve-out (full tier for Interface Builder / simulator)
// was reversed — at tier "click" IB and simulator taps still work (both are
// plain clicks) while the integrated terminal is blocked from keyboard input.
"com.apple.dt.Xcode",
"org.eclipse.platform.ide",
"org.netbeans.ide",
"com.microsoft.visual-studio", // Visual Studio for Mac
// AppleScript/automation execution surfaces — same threat as terminals:
// type(script) → key("cmd+r") runs arbitrary code. Added after #28011
// removed the osascript MCP server, making CU the only tool-call route
// to AppleScript.
"com.apple.ScriptEditor2",
"com.apple.Automator",
"com.apple.shortcuts",
]);
/**
 * Trading / crypto platforms — granted at tier `"read"` so the agent can see
 * balances and prices but can't click into an order, transfer, or IB chat.
 * Bundle IDs populated from Homebrew cask `uninstall.quit` stanzas as they're
 * verified; the name-substring fallback below is the primary check. Bloomberg
 * Terminal has no native macOS build per their FAQ (web/Citrix only).
 *
 * Because the bundle-ID list is intentionally incomplete, an app missing here
 * is not necessarily uncategorized — it may still be caught by name.
 *
 * Budgeting/accounting apps (Quicken, YNAB, QuickBooks, etc.) are NOT listed
 * here — they default to tier `"full"`. The risk model for brokerage/crypto
 * (a stray click can execute a trade) doesn't apply to budgeting apps; the
 * Cowork system prompt carries the soft instruction to never execute trades
 * or transfer money on the user's behalf.
 */
const TRADING_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Verified via Homebrew quit/zap stanzas + mdls + electron-builder source.
// Trading
"com.webull.desktop.v1", // Webull (direct download, Qt)
"com.webull.trade.mac.v1", // Webull (Mac App Store)
"com.tastytrade.desktop",
"com.tradingview.tradingviewapp.desktop",
"com.fidelity.activetrader", // Fidelity Trader+ (new)
"com.fmr.activetrader", // Fidelity Active Trader Pro (legacy)
// Interactive Brokers TWS — install4j wrapper; Homebrew quit stanza is
// authoritative for this exact value but install4j IDs can drift across
// major versions — name-substring "trader workstation" is the fallback.
"com.install4j.5889-6375-8446-2021",
// Crypto
"com.binance.BinanceDesktop",
"com.electron.exodus",
// Electrum uses PyInstaller with bundle_identifier=None → defaults to
// org.pythonmac.unspecified.<AppName>. Confirmed in spesmilo/electrum
// source + Homebrew zap. IntuneBrew's "org.electrum.electrum" is a fork.
"org.pythonmac.unspecified.Electrum",
"com.ledger.live",
"io.trezor.TrezorSuite",
// No native macOS app (name-substring only): Schwab, E*TRADE, TradeStation,
// Robinhood, NinjaTrader, Coinbase, Kraken, Bloomberg. thinkorswim
// install4j ID drifts per-install — substring safer.
]);
// ─── Policy-deny (not a tier — cannot be granted at all) ─────────────────
//
// Streaming / ebook / music apps and a handful of publisher apps. These
// are auto-denied before the approval dialog — no tier can be granted.
// Rationale is copyright / content-control (the agent has no legitimate
// need to screenshot Netflix or click Play on Spotify).
//
// Sourced from the ACP CU-apps blocklist xlsx ("Full block" tab). See
// /tmp/extract_cu_blocklist.py for the extraction script.
// NOTE(review): that /tmp path is machine-local, not part of the repo —
// confirm where the extraction script actually lives.
//
// Checked by exact match in `isPolicyDenied`, before the display-name
// substring fallback runs.
const POLICY_DENIED_BUNDLE_IDS: ReadonlySet<string> = new Set([
// Verified via Homebrew quit/zap + mdls /System/Applications + IntuneBrew.
// Apple built-ins
"com.apple.TV",
"com.apple.Music",
"com.apple.iBooksX",
"com.apple.podcasts",
// Music
"com.spotify.client",
"com.amazon.music",
"com.tidal.desktop",
"com.deezer.deezer-desktop",
"com.pandora.desktop",
"com.electron.pocket-casts", // direct-download Electron wrapper
"au.com.shiftyjelly.PocketCasts", // Mac App Store
// Video
"tv.plex.desktop",
"tv.plex.htpc",
"tv.plex.plexamp",
"com.amazon.aiv.AIVApp", // Prime Video (iOS-on-Apple-Silicon)
// Ebooks
"net.kovidgoyal.calibre",
"com.amazon.Kindle", // legacy desktop, discontinued
"com.amazon.Lassen", // current Mac App Store (iOS-on-Mac)
"com.kobo.desktop.Kobo",
// No native macOS app (name-substring only): Netflix, Disney+, Hulu,
// HBO Max, Peacock, Paramount+, YouTube, Crunchyroll, Tubi, Vudu,
// Audible, Reddit, NYTimes. Their iOS apps don't opt into iPad-on-Mac.
]);
/**
* Display-name fallback for the policy-deny list. `isPolicyDenied` lowercases
* the requested display name and tests each entry with `.includes()`, so
* every entry here must itself be lowercase. Any single hit denies the app;
* entry order does not affect the result.
*/
const POLICY_DENIED_NAME_SUBSTRINGS: readonly string[] = [
// Video streaming
"netflix",
"disney+",
"hulu",
"prime video",
"apple tv",
"peacock",
"paramount+",
// "plex" is too generic — would match "Perplexity". Covered by
// tv.plex.* bundle IDs on macOS.
"tubi",
"crunchyroll",
"vudu",
// E-readers / audiobooks
"kindle",
"apple books",
"kobo",
"play books",
"calibre",
"libby",
"readium",
"audible",
"libro.fm",
"speechify",
// Music
"spotify",
"apple music",
"amazon music",
"youtube music",
"tidal",
"deezer",
"pandora",
"pocket casts",
// Publisher / social apps (from the same blocklist tab)
"naver",
"reddit",
"sony music",
"vegas pro",
"pitchfork",
"economist",
"nytimes",
// Skipped (too generic for substring matching — need bundle ID):
// HBO Max / Max, YouTube (non-Music), Nook, Sony Catalyst, Wired
];
/**
 * Build-time policy deny check. This list ships with the build, unlike
 * `userDeniedBundleIds`, which comes from the per-user Settings page.
 * `buildAccessRequest` strips matching apps before the approval dialog with
 * "blocked by policy" guidance, and the agent is told not to retry.
 *
 * @param bundleId Resolved bundle ID, or undefined when resolution failed.
 *   A falsy value skips the exact-ID check.
 * @param displayName Requested display name; matched case-insensitively
 *   against the policy substring list.
 * @returns true when either the bundle ID or the display name is blocked.
 */
export function isPolicyDenied(
  bundleId: string | undefined,
  displayName: string,
): boolean {
  const blockedById = bundleId
    ? POLICY_DENIED_BUNDLE_IDS.has(bundleId)
    : false;
  if (blockedById) {
    return true;
  }
  // Only fall back to the name scan when the exact ID did not hit.
  const haystack = displayName.toLowerCase();
  return POLICY_DENIED_NAME_SUBSTRINGS.some((needle) =>
    haystack.includes(needle),
  );
}
/**
 * Exact bundle-ID lookup against the three category sets.
 *
 * Sets are consulted in the order browser → terminal → trading; the first
 * set containing `bundleId` decides the category.
 *
 * @returns the matching category, or null when the ID is in none of the sets.
 */
export function getDeniedCategory(bundleId: string): DeniedCategory | null {
  return BROWSER_BUNDLE_IDS.has(bundleId)
    ? "browser"
    : TERMINAL_BUNDLE_IDS.has(bundleId)
      ? "terminal"
      : TRADING_BUNDLE_IDS.has(bundleId)
        ? "trading"
        : null;
}
// ─── Display-name fallback (cross-platform) ──────────────────────────────
/**
* Lowercase substrings checked against the requested display name. Catches:
* - Unresolved requests (app not installed, Spotlight miss)
* - Future Windows/Linux support where bundleId is meaningless
*
* Matched via `.includes()` on `name.toLowerCase()`, so every entry must be
* lowercase. Within one list the entry order does not affect the result —
* any hit yields the same category — so entries are simply grouped for
* readability. (The order in which the *lists* are consulted does matter;
* see `getDeniedCategoryByDisplayName`.)
*/
const BROWSER_NAME_SUBSTRINGS: readonly string[] = [
"safari",
"chrome",
"firefox",
"microsoft edge",
"brave",
"opera",
"vivaldi",
"chromium",
// Arc/Dia: the canonical display name is just "Arc"/"Dia" — too short for
// substring matching (false-positives: "Arcade", "Diagram"). Covered by
// bundle ID on macOS. The "... browser" entries below catch natural-language
// phrasings ("the arc browser") but NOT the canonical short name.
"arc browser",
"tor browser",
"duckduckgo",
"yandex",
"orion browser",
// Agentic / AI browsers
"comet", // Perplexity's browser — "Comet" substring risks false positives
// but leaving for now; "comet" in an app name is rare
"sigmaos",
"dia browser",
];
/**
* Display-name substrings for the `"terminal"` category: terminal emulators,
* script runners, shells, and IDEs. Entries are lowercase and matched with
* `.includes()` against the lowercased display name by
* `getDeniedCategoryByDisplayName`, which consults this list last (after
* the trading and browser lists).
*/
const TERMINAL_NAME_SUBSTRINGS: readonly string[] = [
// macOS / cross-platform terminals
"terminal", // catches Terminal, Windows Terminal (NOT iTerm — separate entry)
"iterm",
"wezterm",
"alacritty",
"kitty",
"ghostty",
"tabby",
"termius",
// AppleScript runners — see bundle-ID comment above. "shortcuts" is too
// generic for substring matching (many apps have "shortcuts" in the name);
// covered by bundle ID only, like warp/hyper.
"script editor",
"automator",
// NOTE: "warp" and "hyper" are too generic for substring matching —
// they'd false-positive on "Warpaint" or "Hyperion". Covered by bundle ID
// (dev.warp.Warp-Stable, co.zeit.hyper) for macOS; Windows exe-name
// matching can be added when Windows CU ships.
// Windows shells (activate when the darwin gate lifts)
"powershell",
"cmd.exe",
"command prompt",
"git bash",
"conemu",
"cmder",
// IDEs (VS Code family)
// "visual studio" alone already covers "visual studio code" as a substring;
// both are listed, which is harmless.
"visual studio code",
"visual studio", // catches VS for Mac + Windows
"vscode",
"vs code",
"vscodium",
"cursor", // Cursor IDE — "cursor" is generic but IDE is the only common app
"windsurf",
// Zed: display name is just "Zed" — too short for substring matching
// (false-positives). Covered by bundle ID (dev.zed.Zed) on macOS.
// IDEs (JetBrains family)
"intellij",
"pycharm",
"webstorm",
"clion",
"goland",
"rubymine",
"phpstorm",
"datagrip",
"rider",
"appcode",
"rustrover",
"fleet",
"android studio",
// Other IDEs
"sublime text",
"macvim",
"neovim",
"emacs",
"xcode",
"eclipse",
"netbeans",
];
/**
* Display-name substrings for the `"trading"` category (brokerage, crypto,
* and related apps). Entries are lowercase and matched with `.includes()`
* against the lowercased display name. `getDeniedCategoryByDisplayName`
* consults this list FIRST, so a name such as "Bloomberg Terminal" is
* categorized as trading rather than terminal.
*/
const TRADING_NAME_SUBSTRINGS: readonly string[] = [
// Trading — brokerage apps. Sourced from the ACP CU-apps blocklist xlsx
// ("Read Only" tab). Name-substring safe for proper nouns below; generic
// names (IG, Delta, HTX) are skipped and need bundle-ID matching once
// verified.
"bloomberg",
"ameritrade",
"thinkorswim",
"schwab",
"fidelity",
"e*trade",
"interactive brokers",
"trader workstation", // Interactive Brokers TWS
"tradestation",
"webull",
"robinhood",
"tastytrade",
"ninjatrader",
"tradingview",
"moomoo",
"tradezero",
"prorealtime",
"plus500",
"saxotrader",
"oanda",
"metatrader",
"forex.com",
"avaoptions",
"ctrader",
"jforex",
"iq option",
"olymp trade",
"binomo",
"pocket option",
"raceoption",
"expertoption",
"quotex",
"naga",
"morgan stanley",
"ubs neo",
"eikon", // Thomson Reuters / LSEG Workspace
// Crypto — exchanges, wallets, portfolio trackers
"coinbase",
"kraken",
"binance",
"okx",
"bybit",
// "gate.io" is too generic — the ".io" TLD suffix is common in app names
// (e.g., "Draw.io"). Needs bundle-ID matching once verified.
"phemex",
"stormgain",
"crypto.com",
// "exodus" is too generic — it's a common noun and would match unrelated
// apps/games. Needs bundle-ID matching once verified.
"electrum",
"ledger live",
"trezor",
"guarda",
"atomic wallet",
"bitpay",
"bisq",
"koinly",
"cointracker",
"blockfi",
"stripe cli",
// Crypto games / metaverse (same trade-execution risk model)
"decentraland",
"axie infinity",
"gods unchained",
];
/**
 * Categorize an app purely from its display name. Used when bundle-ID
 * resolution produced nothing (`resolved === undefined`) or when no
 * bundle-ID deny-list entry matched.
 *
 * The comparison is a case-insensitive substring test, so `"Google Chrome"`,
 * `"chrome"`, and `"Chrome Canary"` all hit the `"chrome"` entry.
 *
 * @param name Requested display name.
 * @returns the first matching category in the order trading → browser →
 *   terminal, or null when no list matches.
 */
export function getDeniedCategoryByDisplayName(
  name: string,
): DeniedCategory | null {
  const haystack = name.toLowerCase();
  const containsAny = (needles: readonly string[]): boolean =>
    needles.some((needle) => haystack.includes(needle));

  // Trading goes first: it is a proper-noun-only list and the most specific.
  // "Bloomberg Terminal" contains "terminal" and would be miscategorized if
  // the terminal list were consulted before it.
  if (containsAny(TRADING_NAME_SUBSTRINGS)) {
    return "trading";
  }
  if (containsAny(BROWSER_NAME_SUBSTRINGS)) {
    return "browser";
  }
  if (containsAny(TERMINAL_NAME_SUBSTRINGS)) {
    return "terminal";
  }
  return null;
}
/**
 * Combined category lookup: exact bundle-ID match first (fast), display-name
 * substring fallback second. Tool-call handlers should use this entry point.
 *
 * @param bundleId Resolved bundle ID. May be undefined for an unresolved
 *   request (the model asked for an app that isn't installed, or Spotlight
 *   didn't find it); then only the display-name check runs.
 * @param displayName Requested display name.
 * @returns the denied category, or null when neither check matches.
 */
export function getDeniedCategoryForApp(
  bundleId: string | undefined,
  displayName: string,
): DeniedCategory | null {
  const categoryById = bundleId ? getDeniedCategory(bundleId) : null;
  return categoryById
    ? categoryById
    : getDeniedCategoryByDisplayName(displayName);
}
/**
 * Default permission tier proposed for an app at grant time: resolves the
 * app's denied category with `getDeniedCategoryForApp`, then maps it through
 * `categoryToTier`. Browsers → `"read"`, terminals/IDEs → `"click"`,
 * everything uncategorized → `"full"`.
 * NOTE(review): the trading category is not mentioned in the original
 * summary; the TRADING_BUNDLE_IDS comment says trading apps are granted
 * `"read"` — confirm against `categoryToTier`.
 *
 * `buildAccessRequest` calls this to fill `ResolvedAppRequest.proposedTier`
 * before the approval dialog is shown.
 */
export function getDefaultTierForApp(
  bundleId: string | undefined,
  displayName: string,
): "read" | "click" | "full" {
  const category = getDeniedCategoryForApp(bundleId, displayName);
  return categoryToTier(category);
}
/**
* Exposes the module-private deny lists (bundle-ID sets and display-name
* substring arrays) under one export. Presumably intended for unit tests
* only, going by the name — not part of the runtime API.
*/
export const _test = {
BROWSER_BUNDLE_IDS,
TERMINAL_BUNDLE_IDS,
TRADING_BUNDLE_IDS,
POLICY_DENIED_BUNDLE_IDS,
BROWSER_NAME_SUBSTRINGS,
TERMINAL_NAME_SUBSTRINGS,
TRADING_NAME_SUBSTRINGS,
POLICY_DENIED_NAME_SUBSTRINGS,
};

View File

@@ -0,0 +1,108 @@
/**
* Port of the API's image transcoder target-size algorithm. Pre-sizing
* screenshots to this function's output means the API's early-return fires
* (tokens ≤ max) and the image is NOT resized server-side — so the model
* sees exactly the dimensions in `ScreenshotResult.width/height` and
* `scaleCoord` stays coherent.
*
* Rust reference: api/api/image_transcoder/rust_transcoder/src/utils/resize.rs
* Sibling TS port: apps/claude-browser-use/src/utils/imageResize.ts (identical
* algorithm, lives in the Chrome extension tree — not a shared package).
*
* See COORDINATES.md for why this matters for click accuracy.
*/
/** Tunables for the target-size computation in `targetImageSize`. */
export interface ResizeParams {
  /** Edge length, in pixels, of one token tile. */
  pxPerToken: number;
  /** Upper bound, in pixels, applied to both the width and the height. */
  maxTargetPx: number;
  /** Upper bound on ceil(w / pxPerToken) × ceil(h / pxPerToken). */
  maxTargetTokens: number;
}

/**
 * Production defaults — match `resize.rs:160-164` and Chrome's
 * `CDPService.ts:638-642`. Vision encoder uses 28px tiles; 1568 is both
 * the long-edge cap (56 tiles) AND the token budget.
 */
export const API_RESIZE_PARAMS: ResizeParams = {
  pxPerToken: 28,
  maxTargetPx: 1568,
  maxTargetTokens: 1568,
};

/**
 * ceil(px / pxPerToken). Matches resize.rs:74-76 (which uses integer
 * ceil-div). The floor-based form equals a true ceiling only for integer
 * `px`; callers are expected to pass whole pixel counts.
 */
export function nTokensForPx(px: number, pxPerToken: number): number {
  return Math.floor((px - 1) / pxPerToken) + 1;
}

/** Token count of a width × height image: the product of the per-axis tile counts. */
function nTokensForImg(
  width: number,
  height: number,
  pxPerToken: number,
): number {
  return nTokensForPx(width, pxPerToken) * nTokensForPx(height, pxPerToken);
}

/**
 * Binary-search along the width dimension for the largest image that:
 * - preserves the input aspect ratio
 * - has long edge ≤ maxTargetPx
 * - has ceil(w/pxPerToken) × ceil(h/pxPerToken) ≤ maxTargetTokens
 *
 * Returns [width, height]. No-op if input already satisfies all three.
 *
 * The long-edge constraint alone (what we used to use) is insufficient on
 * squarer-than-16:9 displays: 1568×1014 (MBP 16" AR) is 56×37 = 2072 tokens,
 * over budget, and gets server-resized to 1372×887 — model then clicks in
 * 1372-space but scaleCoord assumed 1568-space → ~14% coord error.
 *
 * For integer inputs the result matches resize.rs:91-155. In addition, the
 * search is guaranteed to terminate for inputs the integer-only reference
 * never sees: a fractional width (the upper bound is rounded up to an
 * integer) and degenerate params under which not even a 1px-wide image
 * is valid (the 1px-wide candidate is returned).
 *
 * @param width Source width in pixels.
 * @param height Source height in pixels.
 * @param params Tile size and limits; see `ResizeParams`.
 * @returns `[width, height]` of the target image.
 * @throws RangeError if `width` or `height` is NaN or infinite — the search
 *   cannot converge on such values.
 */
export function targetImageSize(
  width: number,
  height: number,
  params: ResizeParams,
): [number, number] {
  if (!Number.isFinite(width) || !Number.isFinite(height)) {
    throw new RangeError(
      `targetImageSize: dimensions must be finite, got ${width}x${height}`,
    );
  }
  const { pxPerToken, maxTargetPx, maxTargetTokens } = params;
  if (
    width <= maxTargetPx &&
    height <= maxTargetPx &&
    nTokensForImg(width, height, pxPerToken) <= maxTargetTokens
  ) {
    return [width, height];
  }
  // Normalize to landscape for the search; transpose result back.
  if (height > width) {
    const [w, h] = targetImageSize(height, width, params);
    return [h, w];
  }
  const aspectRatio = width / height;
  // Loop invariant: lowerBoundWidth is treated as valid, upperBoundWidth as
  // invalid. Both bounds are integers, so every iteration strictly narrows
  // the gap. ~12 iterations for a 4000px image.
  let upperBoundWidth = Math.ceil(width);
  let lowerBoundWidth = 1;
  while (upperBoundWidth - lowerBoundWidth > 1) {
    const middleWidth = Math.floor((lowerBoundWidth + upperBoundWidth) / 2);
    const middleHeight = Math.max(Math.round(middleWidth / aspectRatio), 1);
    if (
      middleWidth <= maxTargetPx &&
      nTokensForImg(middleWidth, middleHeight, pxPerToken) <= maxTargetTokens
    ) {
      lowerBoundWidth = middleWidth;
    } else {
      upperBoundWidth = middleWidth;
    }
  }
  return [
    lowerBoundWidth,
    Math.max(Math.round(lowerBoundWidth / aspectRatio), 1),
  ];
}

View File

@@ -0,0 +1,69 @@
export type {
ComputerExecutor,
DisplayGeometry,
FrontmostApp,
InstalledApp,
ResolvePrepareCaptureResult,
RunningApp,
ScreenshotResult,
} from "./executor.js";
export type {
AppGrant,
CuAppPermTier,
ComputerUseHostAdapter,
ComputerUseOverrides,
ComputerUseSessionContext,
CoordinateMode,
CuGrantFlags,
CuPermissionRequest,
CuPermissionResponse,
CuSubGates,
CuTeachPermissionRequest,
Logger,
ResolvedAppRequest,
ScreenshotDims,
TeachStepRequest,
TeachStepResult,
} from "./types.js";
export { DEFAULT_GRANT_FLAGS } from "./types.js";
export {
SENTINEL_BUNDLE_IDS,
getSentinelCategory,
} from "./sentinelApps.js";
export type { SentinelCategory } from "./sentinelApps.js";
export {
categoryToTier,
getDefaultTierForApp,
getDeniedCategory,
getDeniedCategoryByDisplayName,
getDeniedCategoryForApp,
isPolicyDenied,
} from "./deniedApps.js";
export type { DeniedCategory } from "./deniedApps.js";
export { isSystemKeyCombo, normalizeKeySequence } from "./keyBlocklist.js";
export { ALL_SUB_GATES_OFF, ALL_SUB_GATES_ON } from "./subGates.js";
export { API_RESIZE_PARAMS, targetImageSize } from "./imageResize.js";
export type { ResizeParams } from "./imageResize.js";
export { defersLockAcquire, handleToolCall } from "./toolCalls.js";
export type {
CuCallTelemetry,
CuCallToolResult,
CuErrorKind,
} from "./toolCalls.js";
export { bindSessionContext, createComputerUseMcpServer } from "./mcpServer.js";
export { buildComputerUseTools } from "./tools.js";
export {
comparePixelAtLocation,
validateClickTarget,
} from "./pixelCompare.js";
export type { CropRawPatchFn, PixelCompareResult } from "./pixelCompare.js";

View File

@@ -0,0 +1,153 @@
/**
* Key combos that cross app boundaries or terminate processes. Gated behind
* the `systemKeyCombos` grant flag. When that flag is off, the `key` tool
* rejects these and returns a tool error telling the model to request the
* flag; all other combos work normally.
*
* Matching is canonicalized: every modifier alias the Rust executor accepts
* collapses to one canonical name. Without this, `command+q` / `meta+q` /
* `cmd+alt+escape` bypass the gate — see keyBlocklist.test.ts for the three
* bypass forms and the Rust parity check that catches future alias drift.
*/
/**
 * Every modifier alias enigo_wrap.rs accepts (two copies: :351-359, :564-572),
 * mapped to one canonical per Key:: variant. Left/right variants collapse —
 * the blocklist doesn't distinguish which Ctrl.
 *
 * Canonical names are Rust's own variant names lowercased. Blocklist entries
 * below use ONLY these. "meta" reads odd for Cmd+Q but it's honest: Rust
 * sends Key::Meta, which is Cmd on darwin and Win on win32.
 *
 * This is a plain object, so it must only be read through an own-property
 * check (see `partitionKeys`) — a bare index would also resolve inherited
 * members such as "constructor".
 */
const CANONICAL_MODIFIER: Readonly<Record<string, string>> = {
  // Key::Meta — "meta"|"super"|"command"|"cmd"|"windows"|"win"
  meta: "meta",
  super: "meta",
  command: "meta",
  cmd: "meta",
  windows: "meta",
  win: "meta",
  // Key::Control + LControl + RControl
  ctrl: "ctrl",
  control: "ctrl",
  lctrl: "ctrl",
  lcontrol: "ctrl",
  rctrl: "ctrl",
  rcontrol: "ctrl",
  // Key::Shift + LShift + RShift
  shift: "shift",
  lshift: "shift",
  rshift: "shift",
  // Key::Alt and Key::Option — distinct Rust variants but same keycode on
  // darwin (kVK_Option). Collapse: cmd+alt+escape and cmd+option+escape
  // both Force Quit.
  alt: "alt",
  option: "alt",
};

/** Sort order for canonicals. ctrl < alt < shift < meta. */
const MODIFIER_ORDER = ["ctrl", "alt", "shift", "meta"];

/**
 * Canonical-form entries only. Every modifier must be a CANONICAL_MODIFIER
 * *value* (not key), modifiers must be in MODIFIER_ORDER, non-modifier last.
 * The self-consistency test enforces this.
 */
const BLOCKED_DARWIN = new Set([
  "meta+q", // Cmd+Q — quit frontmost app
  "shift+meta+q", // Cmd+Shift+Q — log out
  "alt+meta+escape", // Cmd+Option+Esc — Force Quit dialog
  "meta+tab", // Cmd+Tab — app switcher
  "meta+space", // Cmd+Space — Spotlight
  "ctrl+meta+q", // Ctrl+Cmd+Q — lock screen
]);

/** Windows counterpart of BLOCKED_DARWIN; same canonical-form rules apply. */
const BLOCKED_WIN32 = new Set([
  "ctrl+alt+delete", // Secure Attention Sequence
  "alt+f4", // close window
  "alt+tab", // window switcher
  "meta+l", // Win+L — lock
  "meta+d", // Win+D — show desktop
]);

/**
 * Partition into sorted-canonical modifiers and non-modifier keys.
 * Shared by normalizeKeySequence (join for display) and isSystemKeyCombo
 * (check mods+each-key to catch the cmd+q+a suffix bypass).
 *
 * The input is lowercased, split on "+", and each part trimmed; empty parts
 * are dropped. Modifiers are deduplicated and ordered per MODIFIER_ORDER;
 * non-modifier keys keep their input order.
 */
function partitionKeys(seq: string): { mods: string[]; keys: string[] } {
  const parts = seq
    .toLowerCase()
    .split("+")
    .map((p) => p.trim())
    .filter(Boolean);
  const mods: string[] = [];
  const keys: string[] = [];
  for (const p of parts) {
    // Own-property lookup only. A bare `CANONICAL_MODIFIER[p]` also resolves
    // members inherited from Object.prototype ("constructor", "__proto__"),
    // which would be misfiled as modifiers and corrupt the prefix — letting
    // e.g. "cmd+q+constructor" slip past the blocklist.
    const canonical = Object.prototype.hasOwnProperty.call(
      CANONICAL_MODIFIER,
      p,
    )
      ? CANONICAL_MODIFIER[p]
      : undefined;
    if (canonical !== undefined) {
      mods.push(canonical);
    } else {
      keys.push(p);
    }
  }
  // Dedupe: "cmd+command+q" → "meta+q", not "meta+meta+q".
  const uniqueMods = [...new Set(mods)];
  uniqueMods.sort(
    (a, b) => MODIFIER_ORDER.indexOf(a) - MODIFIER_ORDER.indexOf(b),
  );
  return { mods: uniqueMods, keys };
}

/**
 * Normalize "Cmd + Shift + Q" → "shift+meta+q": lowercase, trim, alias →
 * canonical, dedupe, sort modifiers, non-modifiers last.
 */
export function normalizeKeySequence(seq: string): string {
  const { mods, keys } = partitionKeys(seq);
  return [...mods, ...keys].join("+");
}

/**
 * True if the sequence would fire a blocked OS shortcut.
 *
 * Checks mods + EACH non-modifier key individually, not just the full
 * joined string. `cmd+q+a` → Rust presses Cmd, then Q (Cmd+Q fires here),
 * then A. Exact-match against "meta+q+a" misses; checking "meta+q" and
 * "meta+a" separately catches the Q.
 *
 * Modifiers-only sequences ("cmd+shift") are checked as-is — no key to
 * pair with, and no blocklist entry is modifier-only, so this is a no-op
 * that falls through to false. Covers the click-modifier case where
 * `left_click(text="cmd")` is legitimate.
 *
 * @param seq Key sequence as supplied by the model, e.g. "Cmd + Q".
 * @param platform Selects the blocklist to check against.
 */
export function isSystemKeyCombo(
  seq: string,
  platform: "darwin" | "win32",
): boolean {
  const blocklist = platform === "darwin" ? BLOCKED_DARWIN : BLOCKED_WIN32;
  const { mods, keys } = partitionKeys(seq);
  const prefix = mods.length > 0 ? mods.join("+") + "+" : "";
  // No non-modifier keys (e.g. "cmd+shift" as click-modifiers) — check the
  // whole thing. Never matches (no blocklist entry is modifier-only) but
  // keeps the contract simple: every call reaches a .has().
  if (keys.length === 0) {
    return blocklist.has(mods.join("+"));
  }
  // mods + each key. Any hit blocks the whole sequence.
  for (const key of keys) {
    if (blocklist.has(prefix + key)) {
      return true;
    }
  }
  return false;
}
/**
* Exposes the module-private alias map, modifier order, and per-platform
* blocklists under one export. Presumably for unit tests only, going by the
* name (the file header refers to keyBlocklist.test.ts).
*/
export const _test = {
CANONICAL_MODIFIER,
BLOCKED_DARWIN,
BLOCKED_WIN32,
MODIFIER_ORDER,
};

View File

@@ -0,0 +1,313 @@
/**
* MCP server factory + session-context binder.
*
* Two entry points:
*
* `bindSessionContext` — the wrapper closure. Takes a `ComputerUseSessionContext`
* (getters + callbacks backed by host session state), returns a dispatcher.
* Reusable by both the MCP CallTool handler here AND Cowork's
* `InternalServerDefinition.handleToolCall` (which doesn't go through MCP).
* This replaces the duplicated wrapper closures in apps/desktop/…/serverDef.ts
* and the Claude Code CLI's CU host wrapper — both did the same thing: build `ComputerUseOverrides`
* fresh from getters, call `handleToolCall`, stash screenshot, merge permissions.
*
* `createComputerUseMcpServer` — the Server object. When `context` is provided,
* the CallTool handler is real (uses `bindSessionContext`). When not, it's the
* legacy stub that returns a not-wired error. The tool-schema ListTools handler
* is the same either way.
*/
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import type { ScreenshotResult } from "./executor.js";
import type { CuCallToolResult } from "./toolCalls.js";
import {
defersLockAcquire,
handleToolCall,
resetMouseButtonHeld,
} from "./toolCalls.js";
import { buildComputerUseTools } from "./tools.js";
import type {
AppGrant,
ComputerUseHostAdapter,
ComputerUseOverrides,
ComputerUseSessionContext,
CoordinateMode,
CuGrantFlags,
CuPermissionResponse,
} from "./types.js";
import { DEFAULT_GRANT_FLAGS } from "./types.js";
/**
* Fallback tool-error text returned when another session holds the
* computer-use lock and the host's `formatLockHeldMessage` is absent or
* returns null/undefined.
*/
const DEFAULT_LOCK_HELD_MESSAGE =
"Another Claude session is currently using the computer. Wait for that " +
"session to finish, or find a non-computer-use approach.";
/**
 * Dedupe `granted` into `existing` on bundleId, spread truthy-only flags over
 * defaults+existing. Truthy-only: a subsequent `request_access` that doesn't
 * request clipboard can't revoke an earlier clipboard grant — revocation lives
 * in a Settings page, not here.
 *
 * Dedupe rules: an app already present in `existing` keeps its existing
 * entry, and if `response.granted` itself repeats a bundleId only the first
 * occurrence is appended.
 *
 * Same merge both hosts implemented independently today.
 *
 * @param existing Apps already granted for the session; never mutated.
 * @param existingFlags Flags already granted for the session.
 * @param response The host's answer to a permission request.
 * @returns the merged app list (existing first, then new grants in response
 *   order) and the merged flags.
 */
function mergePermissionResponse(
  existing: readonly AppGrant[],
  existingFlags: CuGrantFlags,
  response: CuPermissionResponse,
): { apps: AppGrant[]; flags: CuGrantFlags } {
  const seen = new Set(existing.map((a) => a.bundleId));
  const apps: AppGrant[] = [...existing];
  for (const grant of response.granted) {
    if (seen.has(grant.bundleId)) {
      continue;
    }
    // Record the ID as we go so a repeat later in the same response is
    // dropped too, not just repeats of `existing`.
    seen.add(grant.bundleId);
    apps.push(grant);
  }
  // Only flags explicitly set to `true` may override; false/undefined in the
  // response must not clear an earlier grant.
  const truthyFlags = Object.fromEntries(
    Object.entries(response.flags).filter(([, v]) => v === true),
  );
  const flags: CuGrantFlags = {
    ...DEFAULT_GRANT_FLAGS,
    ...existingFlags,
    ...truthyFlags,
  };
  return { apps, flags };
}
/**
* Bind session state to a reusable dispatcher. The returned function is the
* wrapper closure: async lock gate → build overrides fresh → `handleToolCall`
* → stash screenshot → strip piggybacked fields.
*
* The last-screenshot blob is held in a closure cell here (not on `ctx`), so
* hosts don't need to guarantee `ctx` object identity across calls — they just
* need to hold onto the returned dispatcher. Cowork caches per
* `InternalServerContext` in a WeakMap; the CLI host constructs once at server creation.
*
* @param adapter Host adapter; `logger` and `serverName` are read here and
*   the adapter is forwarded to `handleToolCall` on every call.
* @param coordinateMode Passed through unchanged into every call's overrides.
* @param ctx Session getters/callbacks. Getters are re-read on each call.
* @returns An async dispatcher `(name, args)` resolving to the tool result.
*   The result is returned as produced by `handleToolCall` (including any
*   `screenshot`), except when the lock gate short-circuits with an error.
*/
export function bindSessionContext(
adapter: ComputerUseHostAdapter,
coordinateMode: CoordinateMode,
ctx: ComputerUseSessionContext,
): (name: string, args: unknown) => Promise<CuCallToolResult> {
const { logger, serverName } = adapter;
// Screenshot blob persists here across calls — NOT on `ctx`. Hosts hold
// onto the returned dispatcher; that's the identity that matters.
let lastScreenshot: ScreenshotResult | undefined;
// Wraps the host's permission callback: after the host answers, the grant
// is merged with the session's current apps/flags and pushed back through
// `onAllowedAppsChanged`. Undefined when the host supplies no callback.
const wrapPermission = ctx.onPermissionRequest
? async (
req: Parameters<NonNullable<typeof ctx.onPermissionRequest>>[0],
signal: AbortSignal,
): Promise<CuPermissionResponse> => {
const response = await ctx.onPermissionRequest!(req, signal);
const { apps, flags } = mergePermissionResponse(
ctx.getAllowedApps(),
ctx.getGrantFlags(),
response,
);
logger.debug(
`[${serverName}] permission result: granted=${response.granted.length} denied=${response.denied.length}`,
);
ctx.onAllowedAppsChanged?.(apps, flags);
return response;
}
: undefined;
// Same as above for teach-mode permission requests, except that only the
// app list is merged — flags are re-published as defaults + existing.
const wrapTeachPermission = ctx.onTeachPermissionRequest
? async (
req: Parameters<NonNullable<typeof ctx.onTeachPermissionRequest>>[0],
signal: AbortSignal,
): Promise<CuPermissionResponse> => {
const response = await ctx.onTeachPermissionRequest!(req, signal);
logger.debug(
`[${serverName}] teach permission result: granted=${response.granted.length} denied=${response.denied.length}`,
);
// Teach doesn't request grant flags — preserve existing.
const { apps } = mergePermissionResponse(
ctx.getAllowedApps(),
ctx.getGrantFlags(),
response,
);
ctx.onAllowedAppsChanged?.(apps, {
...DEFAULT_GRANT_FLAGS,
...ctx.getGrantFlags(),
});
return response;
}
: undefined;
return async (name, args) => {
// ─── Async lock gate ─────────────────────────────────────────────────
// Replaces the sync Gate-3 in `handleToolCall` — we pass
// `checkCuLock: undefined` below so it no-ops. Hosts with
// cross-process locks (O_EXCL file) await the real primitive here
// instead of pre-computing + feeding a fake sync result.
if (ctx.checkCuLock) {
const lock = await ctx.checkCuLock();
// Held by someone else → fail the call without touching the executor.
if (lock.holder !== undefined && !lock.isSelf) {
const text =
ctx.formatLockHeldMessage?.(lock.holder) ?? DEFAULT_LOCK_HELD_MESSAGE;
return {
content: [{ type: "text", text }],
isError: true,
telemetry: { error_kind: "cu_lock_held" },
};
}
// Unheld → acquire now, unless this tool defers acquisition
// (per `defersLockAcquire`).
if (lock.holder === undefined && !defersLockAcquire(name)) {
await ctx.acquireCuLock?.();
// Re-check: the awaits above yield the microtask queue, so another
// session's check+acquire can interleave with ours. Hosts where
// acquire is a no-op when already held (Cowork's CuLockManager) give
// no signal that we lost — verify we're now the holder before
// proceeding. The CLI's O_EXCL file lock would surface this as a throw from
// acquire instead; this re-check is a belt-and-suspenders for that
// path too.
const recheck = await ctx.checkCuLock();
if (recheck.holder !== undefined && !recheck.isSelf) {
const text =
ctx.formatLockHeldMessage?.(recheck.holder) ??
DEFAULT_LOCK_HELD_MESSAGE;
return {
content: [{ type: "text", text }],
isError: true,
telemetry: { error_kind: "cu_lock_held" },
};
}
// Fresh holder → any prior session's mouseButtonHeld is stale.
// Mirrors what Gate-3 does on the acquire branch. After the
// re-check so we only clear module state when we actually won.
resetMouseButtonHeld();
}
}
// ─── Build overrides fresh ───────────────────────────────────────────
// Blob-first; dims-fallback with base64:"" when the closure cell is
// unset (cross-respawn). scaleCoord reads dims; pixelCompare sees "" →
// isEmpty → skip.
const dimsFallback = lastScreenshot
? undefined
: ctx.getLastScreenshotDims?.();
// Per-call AbortController for dialog dismissal. Aborted in `finally` —
// if handleToolCall finishes (MCP timeout, throw) before the user
// answers, the host's dialog handler sees the abort and tears down.
const dialogAbort = new AbortController();
const overrides: ComputerUseOverrides = {
// Copied so the callee receives its own array instance.
allowedApps: [...ctx.getAllowedApps()],
grantFlags: ctx.getGrantFlags(),
userDeniedBundleIds: ctx.getUserDeniedBundleIds(),
coordinateMode,
selectedDisplayId: ctx.getSelectedDisplayId(),
displayPinnedByModel: ctx.getDisplayPinnedByModel?.(),
displayResolvedForApps: ctx.getDisplayResolvedForApps?.(),
lastScreenshot:
lastScreenshot ??
(dimsFallback ? { ...dimsFallback, base64: "" } : undefined),
onPermissionRequest: wrapPermission
? (req) => wrapPermission(req, dialogAbort.signal)
: undefined,
onTeachPermissionRequest: wrapTeachPermission
? (req) => wrapTeachPermission(req, dialogAbort.signal)
: undefined,
onAppsHidden: ctx.onAppsHidden,
getClipboardStash: ctx.getClipboardStash,
onClipboardStashChanged: ctx.onClipboardStashChanged,
onResolvedDisplayUpdated: ctx.onResolvedDisplayUpdated,
onDisplayPinned: ctx.onDisplayPinned,
onDisplayResolvedForApps: ctx.onDisplayResolvedForApps,
onTeachModeActivated: ctx.onTeachModeActivated,
onTeachStep: ctx.onTeachStep,
onTeachWorking: ctx.onTeachWorking,
getTeachModeActive: ctx.getTeachModeActive,
// Undefined → handleToolCall's sync Gate-3 no-ops. The async gate
// above already ran.
checkCuLock: undefined,
acquireCuLock: undefined,
isAborted: ctx.isAborted,
};
logger.debug(
`[${serverName}] tool=${name} allowedApps=${overrides.allowedApps.length} coordMode=${coordinateMode}`,
);
// ─── Dispatch ────────────────────────────────────────────────────────
try {
const result = await handleToolCall(adapter, name, args, overrides);
if (result.screenshot) {
// Keep the full blob in the closure cell for the next call; the host
// is only notified of the dimensions (base64 stripped).
lastScreenshot = result.screenshot;
const { base64: _blob, ...dims } = result.screenshot;
logger.debug(`[${serverName}] screenshot dims: ${JSON.stringify(dims)}`);
ctx.onScreenshotCaptured?.(dims);
}
return result;
} finally {
// Runs on success and on throw: dismisses any still-open dialog.
dialogAbort.abort();
}
};
}
/**
 * Build the MCP `Server` for computer use.
 *
 * ListTools always serves the tool schemas built from the executor's
 * capabilities (or an empty list while the adapter reports itself disabled).
 * CallTool depends on `context`:
 * - provided → calls go through a `bindSessionContext` dispatcher, and the
 *   `screenshot` / `telemetry` fields are stripped from the returned result;
 * - omitted → a stub handler that logs a warning and returns an error result.
 *
 * @param adapter Host adapter supplying `serverName`, `logger`, the executor
 *   capabilities and the disabled check.
 * @param coordinateMode Forwarded to tool-schema construction and the binder.
 * @param context Optional session context; see above.
 * @returns the configured server (handlers registered, not yet connected).
 */
export function createComputerUseMcpServer(
  adapter: ComputerUseHostAdapter,
  coordinateMode: CoordinateMode,
  context?: ComputerUseSessionContext,
): Server {
  const { serverName, logger } = adapter;
  const server = new Server(
    { name: serverName, version: "0.1.3" },
    { capabilities: { tools: {}, logging: {} } },
  );

  const tools = buildComputerUseTools(
    adapter.executor.capabilities,
    coordinateMode,
  );
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    if (adapter.isDisabled()) {
      return { tools: [] };
    }
    return { tools };
  });

  if (!context) {
    // Legacy: no context → stub handler. Reached only if something calls the
    // server over MCP transport WITHOUT going through a binder (a wiring
    // regression). Clear error instead of silent failure.
    server.setRequestHandler(
      CallToolRequestSchema,
      async (request): Promise<CallToolResult> => {
        logger.warn(
          `[${serverName}] tool call "${request.params.name}" reached the stub handler — no session context bound. Per-session state unavailable.`,
        );
        return {
          content: [
            {
              type: "text",
              text: "This computer-use server instance is not wired to a session. Per-session app permissions are not available on this code path.",
            },
          ],
          isError: true,
        };
      },
    );
    return server;
  }

  const dispatch = bindSessionContext(adapter, coordinateMode, context);
  server.setRequestHandler(
    CallToolRequestSchema,
    async (request): Promise<CallToolResult> => {
      const { name, arguments: toolArgs } = request.params;
      // Drop the piggybacked fields; only the plain tool result is returned.
      const { screenshot: _s, telemetry: _t, ...result } = await dispatch(
        name,
        toolArgs ?? {},
      );
      return result;
    },
  );
  return server;
}

View File

@@ -0,0 +1,171 @@
/**
* Staleness guard ported from the Vercept acquisition.
*
* Compares the model's last-seen screenshot against a fresh-right-now
* screenshot at the click target, so the model never clicks pixels it hasn't
* seen. If the 9×9 patch around the target differs, the click is aborted and
* the model is told to re-screenshot. This is NOT a popup detector.
*
* Semantics preserved exactly:
* - Skip on no `lastScreenshot` (cold start) — click proceeds.
* - Skip on any internal error (crop throws, screenshot fails, etc.) —
* click proceeds. Validation failure must never block the action.
* - 9×9 exact byte equality on raw pixel bytes. No fuzzing, no tolerance.
* - Compare in percentage coords so Retina scale doesn't matter.
*
* JPEG decode + crop is INJECTED via `ComputerUseHostAdapter.cropRawPatch`.
* The original used `sharp` (LGPL, native `.node` addon); we inject Electron's
* `nativeImage` (Chromium decoders, BSD, nothing to bundle) from the host, so
* this package never imports it — the crop is a function parameter.
*/
import type { ScreenshotResult } from "./executor.js";
import type { Logger } from "./types.js";
/**
* Injected by the host. See `ComputerUseHostAdapter.cropRawPatch`.
*
* Takes a base64-encoded JPEG and a crop rectangle and returns the cropped
* patch's bytes as a Buffer, or null.
* NOTE(review): presumably null signals that the decode/crop failed and
* rect is in image pixels — confirm against the host implementation.
*/
export type CropRawPatchFn = (
jpegBase64: string,
rect: { x: number; y: number; width: number; height: number },
) => Buffer | null;
/**
* Default patch edge length, in pixels, for `comparePixelAtLocation`.
* 9×9 is empirically the sweet spot — large enough to catch a tooltip
* appearing, small enough to not false-positive on surrounding animation.
*/
const DEFAULT_GRID_SIZE = 9;
/** Outcome of the pre-click staleness check, as returned by `validateClickTarget`. */
export interface PixelCompareResult {
/** true → click may proceed. false → patch changed, abort the click. */
valid: boolean;
/**
 * true → validation did not run (cold start, sub-gate off, or internal
 * error). The caller MUST treat this identically to `valid: true`.
 */
skipped: boolean;
/** Populated when valid === false. Returned to the model verbatim. */
warning?: string;
}
/**
 * Work out which rectangle of the screenshot to sample for a patch centred
 * on the point (xPercent, yPercent).
 *
 * The image size comes from ScreenshotResult.width/height (physical pixels).
 * Consecutive captures of the same display share those dimensions, so one
 * rect serves both screenshots.
 *
 * @param imgW     Image width in pixels; a falsy value yields `null`.
 * @param imgH     Image height in pixels; a falsy value yields `null`.
 * @param xPercent Horizontal target, clamped into 0..100.
 * @param yPercent Vertical target, clamped into 0..100.
 * @param gridSize Requested side length of the square patch.
 * @returns The rect, trimmed at the right/bottom image edges, or `null` when
 *          nothing with a positive width and height is left.
 */
function computeCropRect(
  imgW: number,
  imgH: number,
  xPercent: number,
  yPercent: number,
  gridSize: number,
): { x: number; y: number; width: number; height: number } | null {
  if (!imgW || !imgH) return null;

  const reach = Math.floor(gridSize / 2);

  // Same arithmetic for both axes: clamp the percentage, project it onto the
  // axis, step back half a grid (never below 0), then trim the span so it
  // does not run past the far edge.
  const alongAxis = (
    percent: number,
    extent: number,
  ): { start: number; span: number } => {
    const bounded = Math.max(0, Math.min(100, percent));
    const center = Math.round((bounded / 100.0) * extent);
    const start = Math.max(0, center - reach);
    return { start, span: Math.min(gridSize, extent - start) };
  };

  const horizontal = alongAxis(xPercent, imgW);
  const vertical = alongAxis(yPercent, imgH);

  if (horizontal.span <= 0 || vertical.span <= 0) return null;

  return {
    x: horizontal.start,
    y: vertical.start,
    width: horizontal.span,
    height: vertical.span,
  };
}
/**
 * Check whether the patch around (xPercent, yPercent) is byte-for-byte the
 * same in two screenshots.
 *
 * @returns true only when both patches were produced and their raw bytes are
 * identical. false on any difference, and ALSO when the comparison could not
 * be performed (no usable crop rect, or the injected crop returned null).
 * The boolean cannot tell those cases apart, so a caller that must not treat
 * "could not compare" as "changed" has to detect that condition itself.
 */
export function comparePixelAtLocation(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult,
  freshScreenshot: ScreenshotResult,
  xPercent: number,
  yPercent: number,
  gridSize: number = DEFAULT_GRID_SIZE,
): boolean {
  // Same display for both captures; the fresh capture's dimensions are the
  // ones least likely to be out of date, so the rect is derived from those.
  const { width, height } = freshScreenshot;
  const patchRect = computeCropRect(width, height, xPercent, yPercent, gridSize);
  if (patchRect === null) return false;

  const before = crop(lastScreenshot.base64, patchRect);
  const after = crop(freshScreenshot.base64, patchRect);

  // Plain buffer equality. The channel layout (nativeImage.toBitmap() is
  // BGRA, sharp's .raw() was RGB) is irrelevant: both buffers come from the
  // same crop function and therefore share a format.
  return before && after ? before.equals(after) : false;
}
/**
 * Battle-tested click-target validation ported from the Vercept acquisition,
 * with the fresh-screenshot capture delegated to the caller (we don't have
 * a global `SystemActions.takeScreenshot()` — the executor is injected).
 *
 * Skip conditions (any of these → `{ valid: true, skipped: true }`):
 *   - `lastScreenshot` is undefined (cold start).
 *   - `takeFreshScreenshot()` throws or returns null.
 *   - No usable crop rect can be computed for the fresh screenshot.
 *   - Injected crop function returns null (decode failure).
 *   - Any other exception.
 *
 * The "no rect" and "crop returned null" cases are detected HERE:
 * `comparePixelAtLocation` reports them as a plain `false`, which is
 * indistinguishable from "pixels changed" and would otherwise abort the
 * click — violating the rule that a validation failure never blocks the
 * action.
 *
 * The caller decides whether to invoke this at all (sub-gate check lives
 * in toolCalls.ts, not here).
 *
 * @param crop                Host-injected JPEG decode + crop.
 * @param lastScreenshot      The screenshot the model last saw, if any.
 * @param xPercent            Click target X as a percentage of image width.
 * @param yPercent            Click target Y as a percentage of image height.
 * @param takeFreshScreenshot Captures the screen as it is right now.
 * @param logger              Receives a debug line whenever validation is
 *                            skipped for a technical reason.
 * @param gridSize            Side length of the compared patch.
 * @returns `valid: false` (with a `warning`) only when both patches were
 *          successfully produced and their bytes differ.
 */
export async function validateClickTarget(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult | undefined,
  xPercent: number,
  yPercent: number,
  takeFreshScreenshot: () => Promise<ScreenshotResult | null>,
  logger: Logger,
  gridSize: number = DEFAULT_GRID_SIZE,
): Promise<PixelCompareResult> {
  if (!lastScreenshot) {
    return { valid: true, skipped: true };
  }
  try {
    const fresh = await takeFreshScreenshot();
    if (!fresh) {
      return { valid: true, skipped: true };
    }

    // Probe the rect with the same inputs comparePixelAtLocation will use.
    // A null rect means "nothing to compare", not "content changed".
    const rect = computeCropRect(
      fresh.width,
      fresh.height,
      xPercent,
      yPercent,
      gridSize,
    );
    if (!rect) {
      logger.debug("[pixelCompare] no usable crop rect, skipping");
      return { valid: true, skipped: true };
    }

    // Wrap the injected crop so a null result (decode failure) is observable
    // after the comparison has collapsed it into `false`.
    let cropFailed = false;
    const observedCrop: CropRawPatchFn = (jpegBase64, patchRect) => {
      const patch = crop(jpegBase64, patchRect);
      if (!patch) cropFailed = true;
      return patch;
    };

    const pixelsMatch = comparePixelAtLocation(
      observedCrop,
      lastScreenshot,
      fresh,
      xPercent,
      yPercent,
      gridSize,
    );
    if (cropFailed) {
      logger.debug("[pixelCompare] crop returned null, skipping");
      return { valid: true, skipped: true };
    }
    if (pixelsMatch) {
      return { valid: true, skipped: false };
    }
    return {
      valid: false,
      skipped: false,
      warning:
        "Screen content at the target location changed since the last screenshot. Take a new screenshot before clicking.",
    };
  } catch (err) {
    // Skip validation on technical errors, execute action anyway.
    // Battle-tested: validation failure must never block the click.
    logger.debug("[pixelCompare] validation error, skipping", err);
    return { valid: true, skipped: true };
  }
}

View File

@@ -0,0 +1,43 @@
/**
* Bundle IDs that are escalations-in-disguise. The approval UI shows a warning
* badge for these; they are NOT blocked. Power users may legitimately want the
* model controlling a terminal.
*
* Imported by the renderer via the `./sentinelApps` subpath (package.json
* `exports`), which keeps Next.js from reaching index.ts → mcpServer.ts →
* @modelcontextprotocol/sdk (devDep, would fail module resolution). Keep
* this file import-free so the subpath stays clean.
*/
/** Which kind of escalation a sentinel app represents. */
export type SentinelCategory = "shell" | "filesystem" | "system_settings";

/** Apps from which arbitrary shell commands can be run. */
const SHELL_ACCESS_BUNDLE_IDS = new Set([
  "com.apple.Terminal",
  "com.googlecode.iterm2",
  "com.microsoft.VSCode",
  "dev.warp.Warp-Stable",
  "com.github.wez.wezterm",
  "io.alacritty",
  "net.kovidgoyal.kitty",
  "com.jetbrains.intellij",
  "com.jetbrains.pycharm",
]);

/** Granting Finder ≈ granting browse + open-any-file. */
const FILESYSTEM_ACCESS_BUNDLE_IDS = new Set(["com.apple.finder"]);

const SYSTEM_SETTINGS_BUNDLE_IDS = new Set(["com.apple.systempreferences"]);

/**
 * Category → members, in lookup priority order. Drives both the combined
 * set and the category lookup so the two cannot drift apart.
 */
const SENTINEL_GROUPS: ReadonlyArray<
  readonly [SentinelCategory, ReadonlySet<string>]
> = [
  ["shell", SHELL_ACCESS_BUNDLE_IDS],
  ["filesystem", FILESYSTEM_ACCESS_BUNDLE_IDS],
  ["system_settings", SYSTEM_SETTINGS_BUNDLE_IDS],
];

/** Every sentinel bundle ID, regardless of category. */
export const SENTINEL_BUNDLE_IDS: ReadonlySet<string> = new Set(
  SENTINEL_GROUPS.flatMap(([, members]) => [...members]),
);

/**
 * Look up the sentinel category of a bundle ID.
 *
 * @param bundleId Bundle identifier; matched exactly (case-sensitive).
 * @returns The category, or `null` when the app is not a sentinel.
 */
export function getSentinelCategory(bundleId: string): SentinelCategory | null {
  const group = SENTINEL_GROUPS.find(([, members]) => members.has(bundleId));
  return group ? group[0] : null;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,706 @@
/**
* MCP tool schemas for the computer-use server. Mirrors
* claude-for-chrome-mcp/src/browserTools.ts in shape (plain `Tool`-shaped
* object literals, no zod).
*
* Coordinate descriptions are baked in at tool-list build time from the
* `chicago_coordinate_mode` gate. The model sees exactly ONE coordinate
* convention in the param descriptions and never learns the other exists.
* The host (`serverDef.ts`) reads the same frozen gate value for
* `scaleCoord` — both must agree or clicks land in the wrong space.
*/
import type { Tool } from "@modelcontextprotocol/sdk/types.js";
import type { CoordinateMode } from "./types.js";
// See packages/desktop/computer-use-mcp/COORDINATES.md before touching any
// model-facing coordinate text. Chrome's browserTools.ts:143 is the reference
// phrasing — "pixels from the left edge", no geometry, no number to do math with.
//
// One entry per coordinate mode; each holds the model-facing description of
// the x and the y component. The range in the normalized entries is written
// with an ASCII hyphen so the separator cannot be lost ("0.0100.0" reads as
// a single number, not a range).
const COORD_DESC: Record<CoordinateMode, { x: string; y: string }> = {
  pixels: {
    x: "Horizontal pixel position read directly from the most recent screenshot image, measured from the left edge. The server handles all scaling.",
    y: "Vertical pixel position read directly from the most recent screenshot image, measured from the top edge. The server handles all scaling.",
  },
  normalized_0_100: {
    x: "Horizontal position as a percentage of screen width, 0.0-100.0 (0 = left edge, 100 = right edge).",
    y: "Vertical position as a percentage of screen height, 0.0-100.0 (0 = top edge, 100 = bottom edge).",
  },
};
/**
 * Sentence spliced into the description of the input tools (clicks, typing,
 * keys, scroll, drag, mouse move/down/up, batch) so the model is told the
 * call errors out without acting unless the frontmost app is allowlisted.
 */
const FRONTMOST_GATE_DESC =
"The frontmost application must be in the session allowlist at the time of this call, or this tool returns an error and does nothing.";
/**
 * Item schema for the `actions` array in `computer_batch`, `teach_step`, and
 * `teach_batch`. All three dispatch through the same `dispatchAction` path
 * with the same validation — keep this enum in sync with `BATCHABLE_ACTIONS`
 * in toolCalls.ts.
 *
 * Only `action` is required; which of the remaining fields apply depends on
 * the action, as spelled out in each field's description.
 */
const BATCH_ACTION_ITEM_SCHEMA = {
  type: "object",
  properties: {
    action: {
      type: "string",
      enum: [
        "key",
        "type",
        "mouse_move",
        "left_click",
        "left_click_drag",
        "right_click",
        "middle_click",
        "double_click",
        "triple_click",
        "scroll",
        "hold_key",
        "screenshot",
        "cursor_position",
        "left_mouse_down",
        "left_mouse_up",
        "wait",
      ],
      description: "The action to perform.",
    },
    coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) for click/mouse_move/scroll/left_click_drag end point.",
    },
    start_coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) drag start — left_click_drag only. Omit to drag from current cursor.",
    },
    text: {
      type: "string",
      description:
        "For type: the text. For key/hold_key: the chord string. For click/scroll: modifier keys to hold.",
    },
    scroll_direction: {
      type: "string",
      enum: ["up", "down", "left", "right"],
    },
    scroll_amount: { type: "integer", minimum: 0, maximum: 100 },
    duration: {
      type: "number",
      // Range separator restored: "(0100)" read as the number one hundred.
      description: "Seconds (0-100). For hold_key/wait.",
    },
    repeat: {
      type: "integer",
      minimum: 1,
      maximum: 100,
      description: "For key: repeat count.",
    },
  },
  required: ["action"],
};
/**
 * Build the tool list. Parameterized by capabilities and coordinate mode so
 * descriptions are honest and unambiguous (plan §1 — "Unfiltered + honest").
 *
 * `coordinateMode` MUST match what the host passes to `scaleCoord` at
 * tool-call time. Both should read the same frozen-at-load gate constant.
 *
 * `installedAppNames` — optional pre-sanitized list of app display names to
 * enumerate in the `request_access` description. The caller is responsible
 * for sanitization (length cap, character allowlist, sort, count cap) —
 * this function just splices the list into the description verbatim. Omit
 * to fall back to the generic "display names or bundle IDs" wording.
 *
 * @param caps              Host capabilities: whether screenshots are
 *                          filtered natively, the platform, and whether the
 *                          teach-mode tools are appended.
 * @param coordinateMode    Selects which `COORD_DESC` entry is baked into
 *                          the coordinate parameter descriptions.
 * @param installedAppNames Optional app names appended to the `apps`
 *                          parameter description; empty/omitted adds nothing.
 * @returns The tool definitions in a fixed order, with the teach-mode tools
 *          last when `caps.teachMode` is truthy.
 */
export function buildComputerUseTools(
  caps: {
    screenshotFiltering: "native" | "none";
    platform: "darwin" | "win32";
    /**
     * Append the teach-mode tools (request_teach_access, teach_step,
     * teach_batch). Read once at server construction.
     */
    teachMode?: boolean;
  },
  coordinateMode: CoordinateMode,
  installedAppNames?: string[],
): Tool[] {
  const coord = COORD_DESC[coordinateMode];

  // Shared hint suffix for BOTH request_access and request_teach_access —
  // they use the same resolveRequestedApps path, so the model should get
  // the same enumeration for both.
  const installedAppsHint =
    installedAppNames && installedAppNames.length > 0
      ? ` Available applications on this machine: ${installedAppNames.join(", ")}.`
      : "";

  // `[x, y]` tuple — param shape for all click/move/scroll tools.
  // NOTE(review): only `coord.x` is spliced in; `coord.y` is never shown to
  // the model anywhere in this file. Left as-is because this is model-facing
  // coordinate text (see COORDINATES.md) — confirm whether that is intended.
  const coordinateTuple = {
    type: "array",
    items: { type: "number" },
    minItems: 2,
    maxItems: 2,
    description: `(x, y): ${coord.x}`,
  };

  // Modifier hold during click. Shared across all 5 click variants.
  const clickModifierText = {
    type: "string",
    description:
      'Modifier keys to hold during the click (e.g. "shift", "ctrl+shift"). Supports the same syntax as the key tool.',
  };

  const screenshotDesc =
    caps.screenshotFiltering === "native"
      ? "Take a screenshot of the primary display. Applications not in the session allowlist are excluded at the compositor level — only granted apps and the desktop are visible."
      : "Take a screenshot of the primary display. On this platform, screenshots are NOT filtered — all open windows are visible. Input actions targeting apps not in the session allowlist are rejected.";

  // The five click variants differ only in name and description; their
  // input schema (coordinate + optional modifier text) is identical.
  const clickTool = (name: string, description: string): Tool => ({
    name,
    description,
    inputSchema: {
      type: "object" as const,
      properties: {
        coordinate: coordinateTuple,
        text: clickModifierText,
      },
      required: ["coordinate"],
    },
  });

  // Schema for tools that take no arguments. A fresh object per tool so no
  // two tool definitions share mutable schema state.
  const noArgsSchema = (): Tool["inputSchema"] => ({
    type: "object" as const,
    properties: {},
    required: [],
  });

  return [
    {
      name: "request_access",
      description:
        "Request user permission to control a set of applications for this session. Must be called before any other tool in this server. " +
        "The user sees a single dialog listing all requested apps and either allows the whole set or denies it. " +
        "Call this again mid-session to add more apps; previously granted apps remain granted. " +
        "Returns the granted apps, denied apps, and screenshot filtering capability.",
      inputSchema: {
        type: "object" as const,
        properties: {
          apps: {
            type: "array",
            items: { type: "string" },
            description:
              "Application display names (e.g. \"Slack\", \"Calendar\") or bundle identifiers (e.g. \"com.tinyspeck.slackmacgap\"). Display names are resolved case-insensitively against installed apps." +
              installedAppsHint,
          },
          reason: {
            type: "string",
            description:
              "One-sentence explanation shown to the user in the approval dialog. Explain the task, not the mechanism.",
          },
          clipboardRead: {
            type: "boolean",
            description:
              "Also request permission to read the user's clipboard (separate checkbox in the dialog).",
          },
          clipboardWrite: {
            type: "boolean",
            description:
              "Also request permission to write the user's clipboard. When granted, multi-line `type` calls use the clipboard fast path.",
          },
          systemKeyCombos: {
            type: "boolean",
            description:
              "Also request permission to send system-level key combos (quit app, switch app, lock screen). Without this, those specific combos are blocked.",
          },
        },
        required: ["apps", "reason"],
      },
    },
    {
      name: "screenshot",
      description:
        screenshotDesc +
        " Returns an error if the allowlist is empty. The returned image is what subsequent click coordinates are relative to.",
      inputSchema: {
        type: "object" as const,
        properties: {
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image — screenshots you're just looking at don't need saving.",
          },
        },
        required: [],
      },
    },
    {
      name: "zoom",
      description:
        "Take a higher-resolution screenshot of a specific region of the last full-screen screenshot. Use this liberally to inspect small text, button labels, or fine UI details that are hard to read in the downsampled full-screen image. " +
        "IMPORTANT: Coordinates in subsequent click calls always refer to the full-screen screenshot, never the zoomed image. This tool is read-only for inspecting detail.",
      inputSchema: {
        type: "object" as const,
        properties: {
          region: {
            type: "array",
            items: { type: "integer" },
            minItems: 4,
            maxItems: 4,
            description:
              "(x0, y0, x1, y1): Rectangle to zoom into, in the coordinate space of the most recent full-screen screenshot. x0,y0 = top-left, x1,y1 = bottom-right.",
          },
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image.",
          },
        },
        required: ["region"],
      },
    },
    clickTool(
      "left_click",
      `Left-click at the given coordinates. ${FRONTMOST_GATE_DESC}`,
    ),
    clickTool(
      "double_click",
      `Double-click at the given coordinates. Selects a word in most text editors. ${FRONTMOST_GATE_DESC}`,
    ),
    clickTool(
      "triple_click",
      `Triple-click at the given coordinates. Selects a line in most text editors. ${FRONTMOST_GATE_DESC}`,
    ),
    clickTool(
      "right_click",
      `Right-click at the given coordinates. Opens a context menu in most applications. ${FRONTMOST_GATE_DESC}`,
    ),
    clickTool(
      "middle_click",
      `Middle-click (scroll-wheel click) at the given coordinates. ${FRONTMOST_GATE_DESC}`,
    ),
    {
      name: "type",
      description: `Type text into whatever currently has keyboard focus. ${FRONTMOST_GATE_DESC} Newlines are supported. For keyboard shortcuts use \`key\` instead.`,
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string", description: "Text to type." },
        },
        required: ["text"],
      },
    },
    {
      name: "key",
      description:
        `Press a key or key combination (e.g. "return", "escape", "cmd+a", "ctrl+shift+tab"). ${FRONTMOST_GATE_DESC} ` +
        "System-level combos (quit app, switch app, lock screen) require the `systemKeyCombos` grant — without it they return an error. All other combos work.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Modifiers joined with "+", e.g. "cmd+shift+a".',
          },
          repeat: {
            type: "integer",
            minimum: 1,
            maximum: 100,
            description: "Number of times to repeat the key press. Default is 1.",
          },
        },
        required: ["text"],
      },
    },
    {
      name: "scroll",
      description: `Scroll at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          scroll_direction: {
            type: "string",
            enum: ["up", "down", "left", "right"],
            description: "Direction to scroll.",
          },
          scroll_amount: {
            type: "integer",
            minimum: 0,
            maximum: 100,
            description: "Number of scroll ticks.",
          },
        },
        required: ["coordinate", "scroll_direction", "scroll_amount"],
      },
    },
    {
      name: "left_click_drag",
      description: `Press, move to target, and release. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: {
            ...coordinateTuple,
            description: `(x, y) end point: ${coord.x}`,
          },
          start_coordinate: {
            ...coordinateTuple,
            description: `(x, y) start point. If omitted, drags from the current cursor position. ${coord.x}`,
          },
        },
        required: ["coordinate"],
      },
    },
    {
      name: "mouse_move",
      description: `Move the mouse cursor without clicking. Useful for triggering hover states. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "open_application",
      description:
        "Bring an application to the front, launching it if necessary. The target application must already be in the session allowlist — call request_access first.",
      inputSchema: {
        type: "object" as const,
        properties: {
          app: {
            type: "string",
            description:
              "Display name (e.g. \"Slack\") or bundle identifier (e.g. \"com.tinyspeck.slackmacgap\").",
          },
        },
        required: ["app"],
      },
    },
    {
      name: "switch_display",
      description:
        "Switch which monitor subsequent screenshots capture. Use this when the " +
        "application you need is on a different monitor than the one shown. " +
        "The screenshot tool tells you which monitor it captured and lists " +
        "other attached monitors by name — pass one of those names here. " +
        "After switching, call screenshot to see the new monitor. " +
        'Pass "auto" to return to automatic monitor selection.',
      inputSchema: {
        type: "object" as const,
        properties: {
          display: {
            type: "string",
            description:
              'Monitor name from the screenshot note (e.g. "Built-in Retina Display", ' +
              '"LG UltraFine"), or "auto" to re-enable automatic selection.',
          },
        },
        required: ["display"],
      },
    },
    {
      name: "list_granted_applications",
      description:
        "List the applications currently in the session allowlist, plus the active grant flags and coordinate mode. No side effects.",
      inputSchema: noArgsSchema(),
    },
    {
      name: "read_clipboard",
      description:
        "Read the current clipboard contents as text. Requires the `clipboardRead` grant.",
      inputSchema: noArgsSchema(),
    },
    {
      name: "write_clipboard",
      description:
        "Write text to the clipboard. Requires the `clipboardWrite` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string" },
        },
        required: ["text"],
      },
    },
    {
      name: "wait",
      description: "Wait for a specified duration.",
      inputSchema: {
        type: "object" as const,
        properties: {
          duration: {
            type: "number",
            // Range separator restored: "(0100)" read as one hundred.
            description: "Duration in seconds (0-100).",
          },
        },
        required: ["duration"],
      },
    },
    {
      name: "cursor_position",
      description:
        "Get the current mouse cursor position. Returns image-pixel coordinates relative to the most recent screenshot, or logical points if no screenshot has been taken.",
      inputSchema: noArgsSchema(),
    },
    {
      name: "hold_key",
      description:
        `Press and hold a key or key combination for the specified duration, then release. ${FRONTMOST_GATE_DESC} ` +
        "System-level combos require the `systemKeyCombos` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Key or chord to hold, e.g. "space", "shift+down".',
          },
          duration: {
            type: "number",
            // Range separator restored, same as the `wait` tool.
            description: "Duration in seconds (0-100).",
          },
        },
        required: ["text", "duration"],
      },
    },
    {
      name: "left_mouse_down",
      description:
        `Press the left mouse button at the current cursor position and leave it held. ${FRONTMOST_GATE_DESC} ` +
        "Use mouse_move first to position the cursor. Call left_mouse_up to release. Errors if the button is already held.",
      inputSchema: noArgsSchema(),
    },
    {
      name: "left_mouse_up",
      description:
        `Release the left mouse button at the current cursor position. ${FRONTMOST_GATE_DESC} ` +
        "Pairs with left_mouse_down. Safe to call even if the button is not currently held.",
      inputSchema: noArgsSchema(),
    },
    {
      name: "computer_batch",
      description:
        "Execute a sequence of actions in ONE tool call. Each individual tool call requires a model→API round trip (seconds); " +
        "batching a predictable sequence eliminates all but one. Use this whenever you can predict the outcome of several actions ahead — " +
        "e.g. click a field, type into it, press Return. Actions execute sequentially and stop on the first error. " +
        `${FRONTMOST_GATE_DESC} The frontmost check runs before EACH action inside the batch — if an action opens a non-allowed app, the next action's gate fires and the batch stops there. ` +
        "Mid-batch screenshot actions are allowed for inspection but coordinates in subsequent clicks always refer to the PRE-BATCH full-screen screenshot.",
      inputSchema: {
        type: "object" as const,
        properties: {
          actions: {
            type: "array",
            minItems: 1,
            items: BATCH_ACTION_ITEM_SCHEMA,
            description:
              'List of actions. Example: [{"action":"left_click","coordinate":[100,200]},{"action":"type","text":"hello"},{"action":"key","text":"Return"}]',
          },
        },
        required: ["actions"],
      },
    },
    ...(caps.teachMode ? buildTeachTools(coord, installedAppsHint) : []),
  ];
}
/**
 * Produce the three teach-mode tool definitions, in order:
 * `request_teach_access`, `teach_step`, `teach_batch`.
 *
 * Kept separate from `buildComputerUseTools` so its tool list can spread the
 * result in as a single expression.
 *
 * @param coord             Coordinate-mode phrasing; `coord.x` is spliced
 *                          into the `anchor` description so anchors are
 *                          described the same way as click coordinates.
 * @param installedAppsHint Suffix appended to the `apps` description, so
 *                          `request_teach_access.apps` gets the same
 *                          enumeration as `request_access.apps` (same
 *                          resolution path → same hint).
 */
function buildTeachTools(
  coord: { x: string; y: string },
  installedAppsHint: string,
): Tool[] {
  // Field definitions of a single teach step. Used at the top level of
  // teach_step and for each item of teach_batch.steps. Built here rather
  // than at module scope because `anchor` depends on `coord`.
  const stepFields = {
    explanation: {
      type: "string",
      description:
        "Tooltip body text. Explain what the user is looking at and why it matters. " +
        "This is the ONLY place the user sees your words — be complete but concise.",
    },
    next_preview: {
      type: "string",
      description:
        "One line describing exactly what will happen when the user clicks Next. " +
        'Example: "Next: I\'ll click Create Bucket and type the name." ' +
        "Shown below the explanation in a smaller font.",
    },
    anchor: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        `(x, y) — where the tooltip arrow points. ${coord.x} ` +
        "Omit to center the tooltip with no arrow (for general-context steps).",
    },
    actions: {
      type: "array",
      // No minItems: an empty list is a "read this, click Next" step.
      items: BATCH_ACTION_ITEM_SCHEMA,
      description:
        "Actions to execute when the user clicks Next. Same item schema as computer_batch.actions. " +
        "Empty array is valid for purely explanatory steps. Actions run sequentially and stop on first error.",
    },
  } as const;

  // Object schema of one step. Called once per use so each schema owns its
  // own `required` array.
  const stepSchema = () => ({
    type: "object" as const,
    properties: stepFields,
    required: ["explanation", "next_preview", "actions"],
  });

  const requestTeachAccess: Tool = {
    name: "request_teach_access",
    description:
      "Request permission to guide the user through a task step-by-step with on-screen tooltips. " +
      "Use this INSTEAD OF request_access when the user wants to LEARN how to do something " +
      '(phrases like "teach me", "walk me through", "show me how", "help me learn"). ' +
      "On approval the main Claude window hides and a fullscreen tooltip overlay appears. " +
      "You then call teach_step repeatedly; each call shows one tooltip and waits for the user to click Next. " +
      "Same app-allowlist semantics as request_access, but no clipboard/system-key flags. " +
      "Teach mode ends automatically when your turn ends.",
    inputSchema: {
      type: "object" as const,
      properties: {
        apps: {
          type: "array",
          items: { type: "string" },
          description:
            'Application display names (e.g. "Slack", "Calendar") or bundle identifiers. Resolved case-insensitively against installed apps.' +
            installedAppsHint,
        },
        reason: {
          type: "string",
          description:
            'What you will be teaching. Shown in the approval dialog as "Claude wants to guide you through {reason}". Keep it short and task-focused.',
        },
      },
      required: ["apps", "reason"],
    },
  };

  const teachStep: Tool = {
    name: "teach_step",
    description:
      "Show one guided-tour tooltip and wait for the user to click Next. On Next, execute the actions, " +
      "take a fresh screenshot, and return both — you do NOT need a separate screenshot call between steps. " +
      "The returned image shows the state after your actions ran; anchor the next teach_step against it. " +
      "IMPORTANT — the user only sees the tooltip during teach mode. Put ALL narration in `explanation`. " +
      "Text you emit outside teach_step calls is NOT visible until teach mode ends. " +
      "Pack as many actions as possible into each step's `actions` array — the user waits through " +
      "the whole round trip between clicks, so one step that fills a form beats five steps that fill one field each. " +
      "Returns {exited:true} if the user clicks Exit — do not call teach_step again after that. " +
      "Take an initial screenshot before your FIRST teach_step to anchor it.",
    inputSchema: stepSchema(),
  };

  const teachBatch: Tool = {
    name: "teach_batch",
    description:
      "Queue multiple teach steps in one tool call. Parallels computer_batch: " +
      "N steps → one model↔API round trip instead of N. Each step still shows a tooltip " +
      "and waits for the user's Next click, but YOU aren't waiting for a round trip between steps. " +
      "You can call teach_batch multiple times in one tour — treat each batch as one predictable " +
      "SEGMENT (typically: all the steps on one page). The returned screenshot shows the state " +
      "after the batch's final actions; anchor the NEXT teach_batch against it. " +
      "WITHIN a batch, all anchors and click coordinates refer to the PRE-BATCH screenshot " +
      "(same invariant as computer_batch) — for steps 2+ in a batch, either omit anchor " +
      "(centered tooltip) or target elements you know won't have moved. " +
      "Good pattern: batch 5 tooltips on page A (last step navigates) → read returned screenshot → " +
      "batch 3 tooltips on page B → done. " +
      "Returns {exited:true, stepsCompleted:N} if the user clicks Exit — do NOT call again after that; " +
      "{stepsCompleted, stepFailed, ...} if an action errors mid-batch; " +
      "otherwise {stepsCompleted, results:[...]} plus a final screenshot. " +
      "Fall back to individual teach_step calls when you need to react to each intermediate screenshot.",
    inputSchema: {
      type: "object" as const,
      properties: {
        steps: {
          type: "array",
          minItems: 1,
          items: stepSchema(),
          description:
            "Ordered steps. Validated upfront — a typo in step 5 errors before any tooltip shows.",
        },
      },
      required: ["steps"],
    },
  };

  return [requestTeachAccess, teachStep, teachBatch];
}

View File

@@ -0,0 +1,622 @@
import type {
ComputerExecutor,
InstalledApp,
ScreenshotResult,
} from "./executor.js";
/**
 * `ScreenshotResult` without the base64 blob, i.e. every field of a
 * screenshot except the image data itself. The shape hosts persist for
 * cross-respawn `scaleCoord` survival.
 */
export type ScreenshotDims = Omit<ScreenshotResult, "base64">;
/**
 * Leveled logging sink. Every level has the same signature: a message
 * followed by any number of extra values.
 *
 * Shape mirrors claude-for-chrome-mcp/src/types.ts:1-7
 */
export interface Logger {
info: (message: string, ...args: unknown[]) => void;
error: (message: string, ...args: unknown[]) => void;
warn: (message: string, ...args: unknown[]) => void;
debug: (message: string, ...args: unknown[]) => void;
silly: (message: string, ...args: unknown[]) => void;
}
/**
 * Per-app permission tier, listed from least to most capable. Hardcoded by
 * category at grant time — the approval dialog displays the tier but the
 * user cannot change it (for now).
 *
 * - `"read"` — visible in screenshots, NO interaction (no clicks, no typing).
 *   Browsers land here: the model can read a page that's already open, but
 *   must use the Claude-in-Chrome MCP for any navigation/clicking. Trading
 *   platforms land here too (no CiC alternative — the model asks the user).
 * - `"click"` — visible + plain left-click, scroll. NO typing/keys,
 *   NO right/middle-click, NO modifier-clicks, NO drag-drop (all text-
 *   injection vectors). Terminals/IDEs land here: the model can click a
 *   Run button or scroll test output, but `type("rm -rf /")` is blocked
 *   and so is right-click→Paste and dragging text onto the terminal.
 * - `"full"` — visible + click + type/key/paste. Everything else.
 *
 * Enforced in `runInputActionGates` via the frontmost-app check: keyboard
 * actions require `"full"`, mouse actions require `"click"` or higher.
 */
export type CuAppPermTier = "read" | "click" | "full";
/**
 * A single app the user has approved for the current session. Session-scoped
 * only — there is no "once" or "forever" scope (unlike Chrome's per-domain
 * three-way). CU has no natural "once" unit; one task = hundreds of clicks.
 * Mirrors how `chromeAllowedDomains` is a plain `string[]` with no per-item
 * scope.
 */
export interface AppGrant {
/** Bundle identifier of the approved app. */
bundleId: string;
/** Human-readable app name. */
displayName: string;
/** Epoch ms. For Settings-page display ("Granted 3m ago"). */
grantedAt: number;
/**
 * Permission tier for this app. Undefined → `"full"` (back-compat for
 * pre-tier grants persisted in session state).
 */
tier?: CuAppPermTier;
}
/**
 * Session-level capability flags. Orthogonal to the app allowlist: each is
 * requested via the same-named `request_access` parameter.
 */
export interface CuGrantFlags {
/** Allows reading the clipboard. */
clipboardRead: boolean;
/** Allows writing the clipboard. */
clipboardWrite: boolean;
/**
 * When false, the `key` tool rejects combos in `keyBlocklist.ts`
 * (cmd+q, cmd+tab, cmd+space, cmd+shift+q, ctrl+alt+delete). All other
 * key sequences work regardless.
 */
systemKeyCombos: boolean;
}
/**
 * Grant flags with everything off.
 *
 * NOTE(review): this is an exported, mutable object — presumably consumers
 * should copy it (e.g. spread) rather than mutate it in place; confirm
 * against callers.
 */
export const DEFAULT_GRANT_FLAGS: CuGrantFlags = {
clipboardRead: false,
clipboardWrite: false,
systemKeyCombos: false,
};
/**
 * Coordinate convention the model is told to use.
 *
 * Host picks via GrowthBook JSON feature `chicago_coordinate_mode`, baked
 * into tool param descriptions at server-construction time. The model sees
 * ONE convention and never learns the other exists. `normalized_0_100`
 * sidesteps the Retina scaleFactor bug class entirely.
 */
export type CoordinateMode = "pixels" | "normalized_0_100";
/**
 * Independent kill switches for subtle/risky ported behaviors. Read from
 * GrowthBook by the host adapter, consulted in `toolCalls.ts`.
 */
export interface CuSubGates {
/** 9×9 exact-byte staleness guard before click. */
pixelValidation: boolean;
/** Route `type("foo\nbar")` through clipboard instead of keystroke-by-keystroke. */
clipboardPasteMultiline: boolean;
/**
 * Ease-out-cubic mouse glide at 60fps, distance-proportional duration
 * (2000 px/sec, capped at 0.5s). Adds up to ~0.5s latency
 * per click. When off, cursor teleports instantly.
 */
mouseAnimation: boolean;
/**
 * Pre-action sequence: hide non-allowlisted apps, then defocus us (from the
 * Vercept acquisition). When off, the frontmost gate fires in the normal
 * case and the model gets stuck — this is the
 * A/B-test-the-old-broken-behavior switch.
 */
hideBeforeAction: boolean;
/**
 * Auto-resolve the target display before each screenshot when the
 * selected display has no allowed-app windows. When on, `handleScreenshot`
 * uses the atomic Swift path; off → sticks with `selectedDisplayId`.
 */
autoTargetDisplay: boolean;
/**
 * Stash+clear the clipboard while a tier-"click" app is frontmost.
 * Closes the gap where a click-tier terminal/IDE has a UI Paste button
 * that's plain-left-clickable — without this, the tier "click"
 * keyboard block can be routed around by clicking Paste. Restored when
 * a non-"click" app becomes frontmost, or at turn end.
 */
clipboardGuard: boolean;
}
// ----------------------------------------------------------------------------
// Permission request/response (mirror of BridgePermissionRequest, types.ts:77-94)
// ----------------------------------------------------------------------------
/**
* One entry per app the model asked for, after name → bundle ID resolution.
* Carried in the `apps` array of both `CuPermissionRequest` and
* `CuTeachPermissionRequest`.
*/
export interface ResolvedAppRequest {
/** What the model asked for (e.g. "Slack", "com.tinyspeck.slackmacgap"). */
requestedName: string;
/** The resolved InstalledApp if found, else undefined (shown greyed in the UI). */
resolved?: InstalledApp;
/** Shell-access-equivalent bundle IDs get a UI warning. See sentinelApps.ts. */
isSentinel: boolean;
/** Already in the allowlist → skip the checkbox, return in `granted` immediately. */
alreadyGranted: boolean;
/** Hardcoded tier for this app (browser→"read", terminal→"click", else "full").
* The dialog displays this read-only; the renderer passes it through
* verbatim in the AppGrant. */
proposedTier: CuAppPermTier;
}
/**
* Payload for the renderer approval dialog. Rides through the existing
* `ToolPermissionRequest.input: unknown` field
* (packages/utils/desktop/bridge/common/claude.web.ts:1262) — no IPC schema
* change needed.
*/
export interface CuPermissionRequest {
/**
* Identifier for this request.
* NOTE(review): presumably used to correlate the renderer's answer with the
* pending tool call — confirm against the host wiring.
*/
requestId: string;
/** Model-provided reason string. Shown prominently in the approval UI. */
reason: string;
/** One entry per app the model asked for; see `ResolvedAppRequest`. */
apps: ResolvedAppRequest[];
/** What the model asked for. User can toggle independently of apps. */
requestedFlags: Partial<CuGrantFlags>;
/**
* For the "On Windows, Claude can see all apps..." footnote. Taken from
* `executor.capabilities.screenshotFiltering` so the renderer doesn't
* need to know about platforms.
*/
screenshotFiltering: "native" | "none";
/**
* Present only when TCC permissions are NOT yet granted. When present,
* the renderer shows a TCC toggle panel (two rows: Accessibility, Screen
* Recording) INSTEAD OF the app list. Clicking a row's "Request" button
* triggers the OS prompt; the store polls on window-focus and flips the
* toggle when the grant is detected. macOS itself prompts the user to
* restart after granting Screen Recording — we don't.
*/
tccState?: {
accessibility: boolean;
screenRecording: boolean;
};
/**
* Apps with windows on the CU display that aren't in the requested
* allowlist. These will be hidden the first time Claude takes an action.
* Computed at request_access time — may be slightly stale by the time the
* user clicks Allow, but it's a preview, not a contract. Absent when
* empty so the renderer can skip the section cleanly.
*/
willHide?: Array<{ bundleId: string; displayName: string }>;
/**
* `chicagoAutoUnhide` app preference at request time. The renderer picks
* between "...then restored when Claude is done" and "...will be hidden"
* copy. Absent when `willHide` is absent (same condition).
*/
autoUnhideEnabled?: boolean;
}
/**
* What the renderer stuffs into `updatedInput._cuGrants` when the user clicks
* "Allow for this session" (mirror of the `_allowAllSites` sentinel at
* LocalAgentModeSessionManager.ts:2794).
*/
export interface CuPermissionResponse {
/**
* App grants produced by the dialog.
* NOTE(review): whether already-granted apps are echoed here is not
* visible from this file — confirm in the request_access handler.
*/
granted: AppGrant[];
/** Bundle IDs the user unchecked, or apps that weren't installed. */
denied: Array<{ bundleId: string; reason: "user_denied" | "not_installed" }>;
/**
* Grant flags produced by the dialog.
* NOTE(review): presumably the requested flags the user left enabled —
* confirm against the renderer.
*/
flags: CuGrantFlags;
/**
* Whether the user clicked Allow in THIS dialog. Only set by the
* teach-mode handler — regular request_access doesn't need it (the
* session manager's `result.behavior` gates the merge there). Needed
* because when all requested apps are already granted (skipDialogGrants
* non-empty, needDialog empty), Allow and Deny produce identical
* `{granted:[], denied:[]}` payloads and the tool handler can't tell
* them apart without this. Undefined → legacy/regular path, do not
* gate on it.
*/
userConsented?: boolean;
}
// ----------------------------------------------------------------------------
// Host adapter (mirror of ClaudeForChromeContext, types.ts:33-62)
// ----------------------------------------------------------------------------
/**
* Process-lifetime singleton dependencies. Everything that does NOT vary per
* tool call. Built once by `apps/desktop/src/main/nest-only/chicago/hostAdapter.ts`.
* No Electron imports in this package — the host injects everything.
*/
export interface ComputerUseHostAdapter {
/**
* NOTE(review): presumably the name the MCP server is registered under —
* confirm against the server construction code.
*/
serverName: string;
/** Logger injected by the host. */
logger: Logger;
/**
* Executor injected by the host. Its `capabilities.screenshotFiltering`
* value is what `CuPermissionRequest.screenshotFiltering` is taken from.
*/
executor: ComputerExecutor;
/**
* TCC state check — Accessibility + Screen Recording on macOS. Pure check,
* no dialog, no relaunch. When either is missing, `request_access` threads
* the state through to the renderer which shows a toggle panel; all other
* tools return a tool error.
*
* Resolves to `{ granted: true }` when nothing is missing, otherwise to
* `{ granted: false, ... }` carrying the two individual permission states.
*/
ensureOsPermissions(): Promise<
| { granted: true }
| { granted: false; accessibility: boolean; screenRecording: boolean }
>;
/** The Settings-page kill switch (`chicagoEnabled` app preference). */
isDisabled(): boolean;
/**
* The `chicagoAutoUnhide` app preference. Consumed by `buildAccessRequest`
* to populate `CuPermissionRequest.autoUnhideEnabled` so the renderer's
* "will be hidden" copy can say "then restored" only when true.
*/
getAutoUnhideEnabled(): boolean;
/**
* Sub-gates re-read on every tool call so GrowthBook flips take effect
* mid-session without restart.
*/
getSubGates(): CuSubGates;
/**
* JPEG decode + crop + raw pixel bytes, for the PixelCompare staleness guard.
* Injected so this package stays Electron-free. The host implements it via
* `nativeImage.createFromBuffer(jpeg).crop(rect).toBitmap()` — Chromium's
* decoders, BSD-licensed, no `.node` binary.
*
* Returns null on decode/crop failure — caller treats null as `skipped`,
* click proceeds (validation failure must never block the action).
*/
cropRawPatch(
jpegBase64: string,
rect: { x: number; y: number; width: number; height: number },
): Buffer | null;
}
// ----------------------------------------------------------------------------
// Session context (getter/callback bag for bindSessionContext)
// ----------------------------------------------------------------------------
/**
* Per-session state binding for `bindSessionContext`. Hosts build this once
* per session with getters that read fresh from their session store and
* callbacks that write back. The returned dispatcher builds
* `ComputerUseOverrides` from these getters on every call.
*
* Callbacks must be set at construction time — `bindSessionContext` reads
* them once at bind, not per call.
*
* The lock hooks are **async** — `bindSessionContext` awaits them before
* `handleToolCall`, then passes `checkCuLock: undefined` in overrides so the
* sync Gate-3 in `handleToolCall` no-ops. Hosts with in-memory sync locks
* (Cowork) wrap them trivially; hosts with cross-process locks (the CLI's
* O_EXCL file) call the real async primitive directly.
*
* Most members share a name with a `ComputerUseOverrides` member; the
* detailed semantics are documented there.
*/
export interface ComputerUseSessionContext {
// ── Read state fresh per call ──────────────────────────────────────
/** Current app allowlist for the session. */
getAllowedApps(): readonly AppGrant[];
/** Current grant flags for the session. */
getGrantFlags(): CuGrantFlags;
/** Per-user auto-deny list (Settings page). Empty array = none. */
getUserDeniedBundleIds(): readonly string[];
/** Display CU operates on; see `ComputerUseOverrides.selectedDisplayId`. */
getSelectedDisplayId(): number | undefined;
/** See `ComputerUseOverrides.displayPinnedByModel`. */
getDisplayPinnedByModel?(): boolean;
/** See `ComputerUseOverrides.displayResolvedForApps`. */
getDisplayResolvedForApps?(): string | undefined;
/** See `ComputerUseOverrides.getTeachModeActive`. */
getTeachModeActive?(): boolean;
/** Dims-only fallback when `lastScreenshot` is unset (cross-respawn).
* `bindSessionContext` reconstructs `{...dims, base64: ""}` so scaleCoord
* works and pixelCompare correctly skips. */
getLastScreenshotDims?(): ScreenshotDims | undefined;
// ── Write-back callbacks ───────────────────────────────────────────
/** Shows the approval dialog. Host routes to its UI, awaits user. The
* signal is aborted if the tool call finishes before the user answers
* (MCP timeout, etc.) — hosts dismiss the dialog on abort. */
onPermissionRequest?(
req: CuPermissionRequest,
signal: AbortSignal,
): Promise<CuPermissionResponse>;
/** Teach-mode sibling of `onPermissionRequest`. */
onTeachPermissionRequest?(
req: CuTeachPermissionRequest,
signal: AbortSignal,
): Promise<CuPermissionResponse>;
/** Called by `bindSessionContext` after merging a permission response into
* the allowlist (dedupe on bundleId, truthy-only flag spread). Host
* persists for resume survival. */
onAllowedAppsChanged?(apps: readonly AppGrant[], flags: CuGrantFlags): void;
/** See `ComputerUseOverrides.onAppsHidden`. */
onAppsHidden?(bundleIds: string[]): void;
/** Reads the session's clipboardGuard stash. undefined → no stash held. */
getClipboardStash?(): string | undefined;
/** Writes the clipboardGuard stash. undefined clears it. */
onClipboardStashChanged?(stash: string | undefined): void;
/** See `ComputerUseOverrides.onResolvedDisplayUpdated`. */
onResolvedDisplayUpdated?(displayId: number): void;
/** See `ComputerUseOverrides.onDisplayPinned`. */
onDisplayPinned?(displayId: number | undefined): void;
/** See `ComputerUseOverrides.onDisplayResolvedForApps`. */
onDisplayResolvedForApps?(sortedBundleIdsKey: string): void;
/** Called after each screenshot. Host persists for respawn survival. */
onScreenshotCaptured?(dims: ScreenshotDims): void;
/** See `ComputerUseOverrides.onTeachModeActivated`. */
onTeachModeActivated?(): void;
/** See `ComputerUseOverrides.onTeachStep`. */
onTeachStep?(req: TeachStepRequest): Promise<TeachStepResult>;
/** See `ComputerUseOverrides.onTeachWorking`. */
onTeachWorking?(): void;
// ── Lock (async) ───────────────────────────────────────────────────
/** At most one session uses CU at a time. Awaited by `bindSessionContext`
* before dispatch. Undefined → no lock gating (proceed). */
checkCuLock?(): Promise<{ holder: string | undefined; isSelf: boolean }>;
/** Take the lock. Called when `checkCuLock` returned `holder: undefined`
* on a non-deferring tool. Host emits enter-CU signals here. */
acquireCuLock?(): Promise<void>;
/** Host-specific lock-held error text. Default is the package's generic
* message. The CLI host includes the holder session-ID prefix. */
formatLockHeldMessage?(holder: string): string;
/** User-abort signal. Passed through to `ComputerUseOverrides.isAborted`
* for the mid-loop checks in handleComputerBatch / handleType. See that
* field for semantics. */
isAborted?(): boolean;
}
// ----------------------------------------------------------------------------
// Per-call overrides (mirror of PermissionOverrides, types.ts:97-102)
// ----------------------------------------------------------------------------
/**
* Built FRESH on every tool call by `bindSessionContext` from
* `ComputerUseSessionContext` getters. This is what lets a singleton MCP
* server carry per-session state — the state lives on the host's session
* store, not the server.
*/
export interface ComputerUseOverrides {
/** Per-session app allowlist as read for this call (contrast with the per-user `userDeniedBundleIds`). */
allowedApps: AppGrant[];
/** Grant flags as read for this call. */
grantFlags: CuGrantFlags;
/** Coordinate convention in effect; see `CoordinateMode`. */
coordinateMode: CoordinateMode;
/**
* User-configured auto-deny list (Settings → Desktop app → Computer Use).
* Bundle IDs here are stripped from request_access BEFORE the approval
* dialog — they never reach the user for approval regardless of tier. The
* response tells the agent to ask the user to remove the app from their
* deny list in Settings if access is genuinely needed.
*
* Per-USER, persists across restarts (read from appPreferences per call,
* not session state). Contrast with `allowedApps` which is per-session.
* Empty array = no user-configured denies (the default).
*/
userDeniedBundleIds: readonly string[];
/**
* Display CU operates on; read fresh per call. `scaleCoord` uses the
* `originX/Y` snapshotted in `lastScreenshot`, so mid-session switches
* only affect the NEXT screenshot/prepare call.
*/
selectedDisplayId?: number;
/**
* The `request_access` tool handler calls this and awaits. The wrapper
* closure in serverDef.ts (mirroring InternalMcpServerManager.ts:131-177)
* routes through `handleToolPermission` → IPC → renderer ChicagoApproval.
* When it resolves, the wrapper side-effectfully mutates
* `InternalServerContext.cuAllowedApps` BEFORE returning here.
*
* Undefined when the session wasn't wired with a permission handler (e.g.
* a future headless mode). `request_access` returns a tool error in that case.
*/
onPermissionRequest?: (req: CuPermissionRequest) => Promise<CuPermissionResponse>;
/**
* For the pixel-validation staleness guard. The model's-last-screenshot,
* stashed by serverDef.ts after each `screenshot` tool call. Undefined on
* cold start → pixel validation skipped (click proceeds).
*/
lastScreenshot?: ScreenshotResult;
/**
* Fired after every `prepareForAction` with the bundle IDs it just hid.
* The wrapper closure in serverDef.ts accumulates these into
* `Session.cuHiddenDuringTurn` via a write-through callback (same pattern
* as `onCuPermissionUpdated`). At turn end (`sdkMessage.type === "result"`),
* if the `chicagoAutoUnhide` setting is on, everything in the set is
* unhidden. Set is cleared regardless of the setting so it doesn't leak
* across turns.
*
* Undefined when the session wasn't wired with a tracker — unhide just
* doesn't happen.
*/
onAppsHidden?: (bundleIds: string[]) => void;
/**
* Reads the clipboardGuard stash from session state. `undefined` means no
* stash is held — `syncClipboardStash` stashes on first entry to click-tier
* and clears on restore. Sibling of the `cuHiddenDuringTurn` getter pattern
* — state lives on the host's session, not module-level here.
*/
getClipboardStash?: () => string | undefined;
/**
* Writes the clipboardGuard stash to session state. `undefined` clears.
* Sibling of `onAppsHidden` — the wrapper closure writes through to
* `Session.cuClipboardStash`. At turn end the host reads + clears it
* directly and restores via Electron's `clipboard.writeText` (no nest-only
* import surface).
*/
onClipboardStashChanged?: (stash: string | undefined) => void;
/**
* Write the resolver's picked display back to session so teach overlay
* positioning and subsequent non-resolver calls use the same display.
* Fired by `handleScreenshot` in the atomic `autoTargetDisplay` path when
* `resolvePrepareCapture`'s pick differs from `selectedDisplayId`.
* Fire-and-forget.
*/
onResolvedDisplayUpdated?: (displayId: number) => void;
/**
* Set when the model explicitly picked a display via `switch_display`.
* When true, `handleScreenshot` passes `autoResolve: false` so the Swift
* resolver honors `selectedDisplayId` directly (straight cuDisplayInfo
* passthrough) instead of running the co-location/chase chain. The
* resolver's Step 2 ("host + allowed co-located → host") otherwise
* overrides any `selectedDisplayId` whenever an allowed app shares the
* host's monitor.
*/
displayPinnedByModel?: boolean;
/**
* Write the model's explicit display pick to session. `displayId:
* undefined` clears both `selectedDisplayId` and the pin (back to auto).
* Sibling of `onResolvedDisplayUpdated` but also sets the pin flag —
* the two are semantically distinct (resolver-picked vs model-picked).
*/
onDisplayPinned?: (displayId: number | undefined) => void;
/**
* Sorted comma-joined bundle-ID set the display was last auto-resolved
* for. `handleScreenshot` compares this to the current allowed set and
* only passes `autoResolve: true` when they differ — so the resolver
* doesn't yank the display on every screenshot, only when the app set
* has changed since the last resolve (or manual switch).
*/
displayResolvedForApps?: string;
/**
* Records which app set the current display selection was made for. Fired
* alongside `onResolvedDisplayUpdated` when the resolver picks, so the next
* screenshot sees a matching set and skips auto-resolve.
*/
onDisplayResolvedForApps?: (sortedBundleIdsKey: string) => void;
/**
* Global CU lock — at most one session actively uses CU at a time. Checked
* in `handleToolCall` after kill-switch/TCC, before dispatch. Every CU tool
* including `request_access` goes through it.
*
* - `holder === undefined` → lock is free, safe to acquire
* - `isSelf === true` → this session already holds it (no-op, proceed)
* - `holder !== undefined && !isSelf` → blocked, return tool error
*
* `undefined` callback → lock system not wired (e.g. CCD). Proceed without
* gating — absence of the mechanism ≠ locked out.
*
* The host manages release (on session idle/stop/archive) — this package
* never releases.
*/
checkCuLock?: () => { holder: string | undefined; isSelf: boolean };
/**
* Take the lock for this session. `handleToolCall` calls this exactly once
* per turn, on the FIRST CU tool call when `checkCuLock().holder` is
* undefined. No-op if already held (defensive — the check should have
* short-circuited). Host emits an event the overlay listens to.
*/
acquireCuLock?: () => void;
/**
* User-abort signal. Checked mid-iteration inside `handleComputerBatch`
* and `handleType`'s grapheme loop so an in-flight batch/type stops
* promptly on overlay Stop instead of running to completion after the
* host has already abandoned the tool result.
*
* Undefined → never aborts (e.g. unwired host). Live per-check read —
* same lazy-getter pattern as `checkCuLock`.
*/
isAborted?: () => boolean;
// ── Teach mode ───────────────────────────────────────────────────────
// Wired only when the host's teachModeEnabled gate is on. All five
// undefined → `request_teach_access` / `teach_step` return tool errors
// and teach mode is effectively off.
/**
* Sibling of `onPermissionRequest`. Same blocking-await-on-renderer-dialog
* semantics, but routes to ComputerUseTeachApproval.tsx (which explains
* the window-hides-during-guide behavior) instead of ComputerUseApproval.
* The wrapper closure in serverDef.ts writes grants through to session state
* via `onCuPermissionUpdated` exactly as `onPermissionRequest` does.
*/
onTeachPermissionRequest?: (
req: CuTeachPermissionRequest,
) => Promise<CuPermissionResponse>;
/**
* Called by `handleRequestTeachAccess` after the user approves and at least
* one app was granted. Host sets `session.teachModeActive = true`, emits
* `teachModeChanged` → teach controller hides the main window and shows the
* fullscreen overlay. Cleared by the host on turn end (`transitionTo("idle")`)
* alongside the CU lock release.
*/
onTeachModeActivated?: () => void;
/**
* Read by `handleRequestAccess` and `handleRequestTeachAccess` to
* short-circuit with a clear tool error when teach mode is active. The
* main window is hidden during teach mode, so permission dialogs render
* invisibly and handleToolPermission blocks forever on an invisible
* prompt. Better to tell the model to exit teach mode first. Getter
* (not a boolean field) because teach mode state lives on the session,
* not on this per-call overrides object.
*/
getTeachModeActive?: () => boolean;
/**
* Called by `handleTeachStep` with the scaled anchor + text. Host stores
* the resolver, emits `teachStepRequested` → teach controller pushes the
* payload to the overlay → user reads, clicks Next → IPC → host calls the
* stored resolver → this promise resolves. `{action: "exit"}` when the user
* clicks Exit (or the turn is interrupted) — `handleTeachStep` short-circuits
* without executing actions.
*
* Same blocking-promise pattern as `onPermissionRequest`, but resolved by
* the teach overlay's own preload (not the main renderer's tool-approval UI).
*/
onTeachStep?: (req: TeachStepRequest) => Promise<TeachStepResult>;
/**
* Called immediately after `onTeachStep` resolves with "next", before
* action dispatch begins. Host emits `teachStepWorking` → overlay flips to
* the spinner state (Next button gone, Exit stays, "Working…" + rotating
* notch). The next `onTeachStep` call replaces the spinner with the new
* tooltip content.
*/
onTeachWorking?: () => void;
}
// ----------------------------------------------------------------------------
// Teach mode (guided-tour tooltips with Next-button action execution)
// ----------------------------------------------------------------------------
/**
* Payload the host pushes to the teach overlay BrowserWindow. Built by
* `handleTeachStep` in toolCalls.ts from the model's `teach_step` args.
*
* `anchorLogical` here is POST-`scaleCoord` — **full-display** logical
* macOS points (origin = monitor top-left, menu bar included, since
* cuDisplayInfo returns CGDisplayBounds). The overlay window is positioned
* at `workArea.{x,y}` (excludes menu bar/Dock), so `updateTeachStep` in
* teach/window.ts subtracts the workArea offset before IPC so the HTML's
* CSS coords match.
*/
export interface TeachStepRequest {
/**
* NOTE(review): presumably the tooltip body text the user reads for this
* step — confirm against the overlay renderer.
*/
explanation: string;
/**
* NOTE(review): presumably a short preview of what happens when the user
* clicks Next — confirm against the overlay renderer.
*/
nextPreview: string;
/** Full-display logical points. Undefined → overlay centers the tooltip, hides the arrow. */
anchorLogical?: { x: number; y: number };
}
/**
* Outcome of one teach step, as resolved by `onTeachStep`: `"next"` when the
* user clicks Next, `"exit"` when the user clicks Exit (or the turn is
* interrupted).
*/
export type TeachStepResult = { action: "next" } | { action: "exit" };
/**
* Payload for the renderer's ComputerUseTeachApproval dialog. Rides through
* `ToolPermissionRequest.input: unknown` same as `CuPermissionRequest`.
* Separate type (not a flag on `CuPermissionRequest`) so the two approval
* components can narrow independently and the teach dialog is free to drop
* fields it doesn't render (no grant-flag checkboxes in teach mode).
*/
export interface CuTeachPermissionRequest {
/**
* Identifier for this request.
* NOTE(review): presumably the same role as `CuPermissionRequest.requestId`
* — confirm against the host wiring.
*/
requestId: string;
/** Model-provided reason. Shown in the dialog headline ("guide you through {reason}"). */
reason: string;
/** One entry per app the model asked for; see `ResolvedAppRequest`. */
apps: ResolvedAppRequest[];
/**
* Same value set as `CuPermissionRequest.screenshotFiltering`.
* NOTE(review): presumably populated from the same executor capability —
* confirm.
*/
screenshotFiltering: "native" | "none";
/** Present only when TCC is ungranted — same semantics as `CuPermissionRequest.tccState`. */
tccState?: {
accessibility: boolean;
screenRecording: boolean;
};
/**
* Same shape as `CuPermissionRequest.willHide`.
* NOTE(review): presumably the same semantics (apps that will be hidden on
* first action) — confirm.
*/
willHide?: Array<{ bundleId: string; displayName: string }>;
/** Same semantics as `CuPermissionRequest.autoUnhideEnabled`. */
autoUnhideEnabled?: boolean;
}

View File

@@ -0,0 +1,23 @@
// CommonJS entry point for the `@ant/computer-use-swift` native addon:
// loads the prebuilt `.node` binary and re-exports its `computerUse` object.
const path = require("path");
// Fail fast at require-time on any non-macOS platform.
if (process.platform !== "darwin") {
throw new Error("@ant/computer-use-swift is only available on macOS");
}
// COMPUTER_USE_SWIFT_NODE_PATH: escape hatch for bundlers. Bun's --compile
// embeds the .node as an asset, not in a node_modules tree — __dirname is the
// exe dir and ../prebuilds/ doesn't exist. The consuming build bakes this var
// to the embedded asset's path. Unset → normal node_modules layout.
//
// Four methods use `Task { @MainActor in ... }` (captureExcluding,
// captureRegion, apps.listInstalled, resolvePrepareCapture) which enqueue
// onto DispatchQueue.main. Electron drains that queue via CFRunLoop; libuv
// (Node/bun) does not — the promises hang. Consumers running under libuv
// must pump `_drainMainRunLoop` via setInterval while those promises are
// pending. Consumers under Electron don't need to (CFRunLoop drains
// automatically).
//
// NOTE(review): the env-var lookup is kept inline inside `require(...)`;
// presumably the bundler's baked-in value relies on this expression shape
// being statically resolvable — confirm before refactoring it into a local.
// Also note `??` only falls back when the variable is unset, not when it is
// set to an empty string.
const native = require(
process.env.COMPUTER_USE_SWIFT_NODE_PATH ??
path.resolve(__dirname, "../prebuilds/computer_use.node"),
);
// Only the addon's `computerUse` property is exposed to consumers.
module.exports = native.computerUse;