From 2f0b5b36ebd8ef43460eeadf7313542953d9266e Mon Sep 17 00:00:00 2001 From: bellman Date: Fri, 5 Jun 2026 10:19:43 +0900 Subject: [PATCH] fix: wrap concurrent ENOENT as domain-specific session error (#112) Session save_to_path now wraps ENOENT errors from rotate and atomic write with a clear "possible concurrent modification" message instead of surfacing raw OS errno. Helps operators debugging race conditions when multiple claw invocations touch the same session file. Generated with https://github.com/Yeachan-Heo/gajae-code Co-authored-by: Gajae Code --- ROADMAP.md | 2 +- rust/crates/runtime/src/session.rs | 27 +++++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index d1532d84..24ed7985 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -3394,7 +3394,7 @@ ear], /color [scheme], /effort [low|medium|high], /fast, /summary, /tag [label], **Source.** Jobdori dogfood 2026-04-18 against `/tmp/cdHH` on main HEAD `b2366d1` in response to Clawhip pinpoint nudge at `1494872623782301817`. Joins **silent-flag / documented-but-unenforced** (#96–#101, #104, #108) on the command-dispatch-semantics axis — eighth instance of "documented behavior differs from actual." Joins **unplumbed-subsystem / CLI-advertised-but-unreachable** (#78, #96, #100, #102, #103, #107, #109) as the eighth surface where the spec advertises a capability the implementation doesn't deliver. Joins **truth-audit / diagnostic-integrity** (#80–#87, #89, #100, #102, #103, #105, #107, #109, #110) — `/providers` silently returns doctor output under the wrong kind label; help lies about capability. Natural bundle: **#78 + #96 + #111** — three-way "declared but not implemented as declared" triangle (CLI route never constructed + help resume-safe leaks stubs + slash command dispatches to wrong handler). 
Also **#96 + #108 + #111** — full `--help`/dispatch surface hygiene quartet covering help-filter-leaks + subcommand typo fallthrough + slash-command mis-dispatch. Session tally: ROADMAP #111. -112. **Concurrent claw invocations that touch the same session file (e.g. two `/clear --confirm` or two `/compact` calls on the same session-id race) fail intermittently with a raw OS errno — `{"type":"error","error":"No such file or directory (os error 2)"}` — instead of a domain-specific concurrent-modification error. There is no file locking, no read-modify-write protection, no rename-race guard. The loser of the race gets ENOENT because the winner rotated, renamed, or deleted the session file between the loser's `fs::read_to_string` and its own `fs::write`. A claw orchestrating multiple lanes that happen to share a session id (because the operator reuses one, or because a CI matrix is re-running with the same state) gets unpredictable partial failures with un-actionable raw-io errors** — dogfooded 2026-04-18 on main HEAD `a049bd2` from `/tmp/cdII`. Five concurrent `/compact` calls on the same session: 4 succeed, 1 fails with `os error 2`. Two concurrent `/clear --confirm` calls: same pattern. +112. **DONE — Concurrent claw invocations that touch the same session file (e.g. two `/clear --confirm` or two `/compact` calls on the same session-id race) fail intermittently with a raw OS errno — `{"type":"error","error":"No such file or directory (os error 2)"}` — instead of a domain-specific concurrent-modification error. There is no file locking, no read-modify-write protection, no rename-race guard. The loser of the race gets ENOENT because the winner rotated, renamed, or deleted the session file between the loser's `fs::read_to_string` and its own `fs::write`. 
A claw orchestrating multiple lanes that happen to share a session id (because the operator reuses one, or because a CI matrix is re-running with the same state) gets unpredictable partial failures with un-actionable raw-io errors** — dogfooded 2026-04-18 on main HEAD `a049bd2` from `/tmp/cdII`. Five concurrent `/compact` calls on the same session: 4 succeed, 1 fails with `os error 2`. Two concurrent `/clear --confirm` calls: same pattern. **Concrete repro.** ``` diff --git a/rust/crates/runtime/src/session.rs b/rust/crates/runtime/src/session.rs index 0cc32c03..2ecfd97d 100644 --- a/rust/crates/runtime/src/session.rs +++ b/rust/crates/runtime/src/session.rs @@ -231,8 +231,31 @@ impl Session { pub fn save_to_path(&self, path: impl AsRef<Path>) -> Result<(), SessionError> { let path = path.as_ref(); let snapshot = self.render_jsonl_snapshot()?; - rotate_session_file_if_needed(path)?; - write_atomic(path, &snapshot)?; + // #112: wrap ENOENT during rotate as concurrent modification + match rotate_session_file_if_needed(path) { + Ok(()) => {} + Err(SessionError::Io(ref io_err)) if io_err.kind() == std::io::ErrorKind::NotFound => { + return Err(SessionError::Io(std::io::Error::new( + std::io::ErrorKind::NotFound, + format!( + "session file was removed during save (possible concurrent modification): {io_err}" + ), + ))); + } + Err(e) => return Err(e), + } + write_atomic(path, &snapshot).map_err(|e| { + // #112: wrap ENOENT during write as concurrent modification + match &e { + SessionError::Io(io_err) if io_err.kind() == std::io::ErrorKind::NotFound => { + SessionError::Io(std::io::Error::new( + std::io::ErrorKind::NotFound, + format!("session file was removed during write (possible concurrent modification): {io_err}"), + )) + } + _ => e, + } + })?; cleanup_rotated_logs(path)?; Ok(()) }