From cb55fd7b61c4a7355d399e3da9eec654d6942ddf Mon Sep 17 00:00:00 2001 From: Guillaume ARM Date: Wed, 10 Jun 2026 01:10:37 +0200 Subject: [PATCH] fix(ai): add async diagnostics and boot shutdown setting --- NEXT_PLAN.md | 129 --------------------------------- apis/libai.lua | 22 +++++- manifest.json | 2 +- packages/index.json | 6 +- packages/trapos-ai/ccpm.json | 2 +- packages/trapos-boot/ccpm.json | 2 +- packages/trapos/ccpm.json | 2 +- programs/ai.lua | 2 +- startup/servers.lua | 9 ++- tests/ai.lua | 47 ++++++++++++ tests/startup-servers.lua | 20 ++++- 11 files changed, 99 insertions(+), 144 deletions(-) delete mode 100644 NEXT_PLAN.md diff --git a/NEXT_PLAN.md b/NEXT_PLAN.md deleted file mode 100644 index dec3429..0000000 --- a/NEXT_PLAN.md +++ /dev/null @@ -1,129 +0,0 @@ -# Diagnose & fix opencode async polling (round 2) - -## Context - -The first round added the `model` field to `/prompt_async` (commit `1c07435`). On the dev server (`http://127.0.0.1:4242`, password `jenesuispasunhacker`) the user now sees two symptoms: - -1. **First `ai ping` on a fresh session "crashes" the computer.** The session is created, the user message is posted, opencode visibly generates a pong reply — but `ai` returns an error path and `startup/servers.lua` runs `parallel.waitForAny(shell, eventloop)` followed by `os.shutdown()`. Any path where either coroutine returns shuts the machine down. So the visible "crash" is really: *some* exit in our coroutine + harness-style shutdown logic in production. -2. **Subsequent `ai` calls (including `ai sessions`) block.** The user reports the second `ai ping` is "blocked indefinitely" and even `ai sessions` doesn't return — which is odd because `ai.listSessions` does not poll, so this might either be a misreport or evidence of a wider hang. - -The most load-bearing hypothesis is that `findAssistantMessage` in `apis/libai.lua` (introduced by `c61254d`) never matches: it assumes our submitted `messageID` is the **user** message id and looks for the *next* assistant message after it. The opencode docs we wrote in the same commit say the submitted id is the **assistant** message id (`docs/opencode_api.md` line 105: "Get a message by ID. Opencode validates caller-provided message IDs; use IDs starting with `msg`."). If the docs are right, our user message has a server-generated id, our submitted id appears on the assistant message itself, `seenSubmitted` stays `false`, and we poll until `opencc.poll_timeout_seconds` expires. - -This explains both observations: -- "First ping crashes" → poll timeout returns `false, 'delai depasse en attendant la reponse AI'`. `programs/ai.lua` prints the error and returns. Then the shell prompt gets shown again — *unless* something else in the program causes a Lua error in the call chain that makes either coroutine exit. We need a probe to confirm. -- "Second ping blocks" → opencode keeps the previous assistant generation in-flight (we never acknowledged it) and queues new prompts behind it, OR the next poll loop is the same one, just on a session where opencode now refuses to generate a new turn. - -The user also asked for two ergonomics: -- A `--verbose` flag on `programs/ai.lua` so headless probes can see polling progress. -- A way to disable `os.shutdown()` outside the harness (out of scope for the bug fix, captured as a follow-up). - -## Plan - -### Phase A — confirm the hypothesis with probes (no code changes) - -Run these against `http://127.0.0.1:4242` with Basic auth `opencode:jenesuispasunhacker`. - -1. Create a session and capture its id: - ``` - curl -s -u opencode:jenesuispasunhacker -X POST \ - -H 'Content-Type: application/json' \ - -d '{"title":"probe"}' \ - http://127.0.0.1:4242/session - ``` -2. POST a known messageID via prompt_async: - ``` - curl -s -u opencode:jenesuispasunhacker -X POST \ - -H 'Content-Type: application/json' \ - -d '{"messageID":"msg_probe_1","parts":[{"type":"text","text":"reply with exactly: pong"}],"model":{"providerID":"anthropic","modelID":"claude-opus-4-7"}}' \ - http://127.0.0.1:4242/session//prompt_async - ``` -3. Poll the list and inspect ids/roles: - ``` - curl -s -u opencode:jenesuispasunhacker \ - http://127.0.0.1:4242/session//message | jq '.[] | {id: .info.id, role: .info.role, finish: .info.finish, completed: .info.time.completed}' - ``` - -**Expected outcomes**: -- If `msg_probe_1` appears with `role == "assistant"` → the doc-stated semantics are correct, `findAssistantMessage` is wrong, and we should poll for the message *whose own id matches* our submitted id. -- If `msg_probe_1` appears with `role == "user"` → c61254d's reading is correct, the bug is elsewhere (model dispatch, message decoding, etc.). -- If `msg_probe_1` doesn't appear at all → opencode is silently dropping it; investigate model or auth. - -Record which case is real. The fix branches on this. - -### Phase B — code changes (drive both cases) - -Regardless of which probe outcome wins, `findAssistantMessage` is brittle (only handles one of two id-placement conventions and fails silently). Replace it with a more defensive lookup that handles both, in `apis/libai.lua` around lines 268–280: - -```lua -local function findAssistantMessage(messages, submittedMessageId) - -- Case 1 (docs): our id is the assistant message id. - for _, m in ipairs(messages) do - if type(m) == 'table' and type(m.info) == 'table' - and m.info.id == submittedMessageId and m.info.role == 'assistant' then - return m; - end - end - -- Case 2 (c61254d empirical): our id is the user message id; assistant follows. - local seen = false; - for _, m in ipairs(messages) do - if type(m) == 'table' and type(m.info) == 'table' then - if m.info.id == submittedMessageId then - seen = true; - elseif seen and m.info.role == 'assistant' then - return m; - end - end - end - return nil; -end -``` - -This keeps the existing `pollMessage` flow and `isMessageComplete` check unchanged; it just stops missing the message when opencode's id-placement matches the docs. - -### Phase C — `--verbose` for `programs/ai.lua` - -Add a `--verbose` (and `-v`) flag, parsed before `command` is taken. When set: -- Pass a `log` callback into `ai.ask` (libai already accepts `log` in `luaExec`; extend `api.ask` to call `options.log(message)` from `pollMessage` at each poll iteration with: `'poll attempt #N: msgs=K, found=' .. (decoded and decoded.info.id or 'nil') .. ', complete=' .. tostring(isComplete)`). -- `programs/ai.lua`'s `askAndPrint` / `printSessions` / ping handler all print these via a `[ai]` prefix when `--verbose` is on. - -This makes the next round of headless probing useful without re-instrumenting the code. - -### Phase D — harness verification - -With the dev server already running, drive the probes via the harness so the diagnostic loop is reproducible: - -``` -just trapos-exec ' - settings.set("opencc.server_url","http://127.0.0.1:4242"); - settings.set("opencc.password","jenesuispasunhacker"); - settings.set("opencc.provider_id","anthropic"); - settings.set("opencc.model_id","claude-opus-4-7"); - settings.unset("opencc.session_id"); - shell.run("/programs/ai.lua","--verbose","ping"); -' -``` - -Expected: `pong` printed; no timeout. Then re-run without `--verbose` to confirm the regression no longer reproduces. - -Also run `just check` and `just test`. Update the existing `tests/ai.lua` `'ask polls async message until completion'` test if the message-list shape changes to include the assistant-id-match case. Add a new test: `'ask finds reply when submitted id matches assistant message itself'` — message list contains a user message with a server id and an assistant message with `info.id == submitted` and `time.completed`. - -### Files - -- `apis/libai.lua` — rewrite `findAssistantMessage`; thread an optional `log` callback through `api.ask` → `pollMessage`. -- `programs/ai.lua` — add `--verbose`/`-v` flag, prefixing log lines with `[ai]`. -- `tests/ai.lua` — new test for assistant-id-match; adjust any test that asserts the exact poll output if needed. -- Version bumps per [ADR-0011](docs/adrs/adr-0011-repo-conventions.md): `packages/trapos-ai/ccpm.json` patch bump, `packages/trapos/ccpm.json` patch bump, mirror in `packages/index.json`, `manifest.json`. - -## Verification - -1. Probe outputs from Phase A captured (and pasted into the PR or commit message). -2. `just trapos-exec` invocation from Phase D returns `pong` on a fresh session id. -3. Repeat the same invocation — second ping also returns `pong` (i.e., second-call hang is gone). -4. `ai sessions` (added to the same probe script) returns the session list within a couple of seconds. -5. `just check` clean; `just test` passes; new test green. - -## Out of scope (captured as follow-ups) - -- **Harness vs production shutdown.** `startup/servers.lua` ends with `os.shutdown()` so the headless harness exits cleanly. In production this is undesirable because *any* shell exit shuts the computer down. Follow-up: gate the shutdown on a setting (`trapos.harness_mode`) or an environment signal so production keeps the shell up. -- **Auto-detect opencode default model** instead of requiring `opencc.provider_id` / `opencc.model_id` (kept from previous plan). -- **Session auto-recovery on poll timeout.** Optional `opencc.reset_session_on_timeout` setting that calls `clearSession()` after a timeout so the next run starts on a fresh session. Likely unnecessary once `findAssistantMessage` is fixed, but worth revisiting if the second-ping hang persists. diff --git a/apis/libai.lua b/apis/libai.lua index b5aa398..d7464c5 100644 --- a/apis/libai.lua +++ b/apis/libai.lua @@ -309,6 +309,7 @@ local function createAi(opts) local loop = eventloopFactory(); local deadline = nowFunc() + cfg.pollTimeoutSeconds; local resultOk, resultValue; + local attemptCount = 0; log = log or function() end; local function finish(ok, value) @@ -317,7 +318,7 @@ local function createAi(opts) end local function attempt() - log('polling async message ' .. messageId); + attemptCount = attemptCount + 1; local body, code = doGet(cfg, '/session/' .. sessionId .. '/message'); if not body then return finish(false, code); end if code == 404 then @@ -334,7 +335,14 @@ local function createAi(opts) end local decoded = findAssistantMessage(messages, messageId); local reply = decoded and extractTextParts(decoded.parts) or ''; - if decoded and reply ~= '' and isMessageComplete(decoded) then + local complete = decoded and isMessageComplete(decoded) or false; + local matchedId = decoded and type(decoded.info) == 'table' and decoded.info.id or 'nil'; + log('poll #' .. tostring(attemptCount) + .. ': messages=' .. tostring(#messages) + .. ', found=' .. tostring(matchedId) + .. ', complete=' .. tostring(complete) + .. ', text=' .. tostring(reply ~= '')); + if decoded and reply ~= '' and complete then log('async reply completed'); return finish(true, { reply = reply, sessionId = sessionId, messageId = messageId }); end @@ -421,22 +429,30 @@ local function createAi(opts) function api.listSessions(options) options = options or {}; + local log = options.log or function() end; local cfg, err = resolveConfig(options); if not cfg then return false, err; end + log('listing sessions from ' .. cfg.url); local body, code = doGet(cfg, '/session'); - if not body then return false, code; end + if not body then + log('list sessions failed: ' .. tostring(code)); + return false, code; + end if code and code ~= 200 then + log('list sessions failed: HTTP ' .. tostring(code)); return false, 'erreur serveur: HTTP ' .. tostring(code); end local decoded = textutils.unserializeJSON(body); if type(decoded) ~= 'table' then + log('list sessions failed: invalid response'); return false, 'reponse invalide'; end table.sort(decoded, function(a, b) return sessionTime(a) > sessionTime(b); end); + log('sessions returned: ' .. tostring(#decoded)); return true, decoded; end diff --git a/manifest.json b/manifest.json index 4d77dad..145745c 100644 --- a/manifest.json +++ b/manifest.json @@ -1,6 +1,6 @@ { "name": "TrapOS", - "version": "0.8.2", + "version": "0.8.3", "branch": "next", "packages": [ "trapos" diff --git a/packages/index.json b/packages/index.json index eb69e4a..4da82db 100644 --- a/packages/index.json +++ b/packages/index.json @@ -2,11 +2,11 @@ "packages": { "trapos-core": "0.4.0", "trapos-test": "0.2.1", - "trapos-boot": "0.3.0", + "trapos-boot": "0.3.1", "trapos-net": "0.3.0", "trapos-ui": "0.2.2", - "trapos-ai": "0.6.2", + "trapos-ai": "0.6.3", "trapos-sandbox": "0.1.0", - "trapos": "0.8.2" + "trapos": "0.8.3" } } diff --git a/packages/trapos-ai/ccpm.json b/packages/trapos-ai/ccpm.json index 0d0dae9..4601aae 100644 --- a/packages/trapos-ai/ccpm.json +++ b/packages/trapos-ai/ccpm.json @@ -1,6 +1,6 @@ { "name": "trapos-ai", - "version": "0.6.2", + "version": "0.6.3", "description": "TrapOS AI client for opencode serve", "dependencies": ["trapos-core"], "files": [ diff --git a/packages/trapos-boot/ccpm.json b/packages/trapos-boot/ccpm.json index 4d7035b..2914132 100644 --- a/packages/trapos-boot/ccpm.json +++ b/packages/trapos-boot/ccpm.json @@ -1,6 +1,6 @@ { "name": "trapos-boot", - "version": "0.3.0", + "version": "0.3.1", "description": "TrapOS boot: startup MOTD and autostart server launcher", "dependencies": ["trapos-core"], "files": [ diff --git a/packages/trapos/ccpm.json b/packages/trapos/ccpm.json index d8e9732..7dd4582 100644 --- a/packages/trapos/ccpm.json +++ b/packages/trapos/ccpm.json @@ -1,6 +1,6 @@ { "name": "trapos", - "version": "0.8.2", + "version": "0.8.3", "description": "TrapOS full install meta-package", "dependencies": ["trapos-boot", "trapos-net", "trapos-ui", "trapos-test", "trapos-ai"], "files": [], diff --git a/programs/ai.lua b/programs/ai.lua index 294e8cd..e64329c 100644 --- a/programs/ai.lua +++ b/programs/ai.lua @@ -63,7 +63,7 @@ local function joinArgs(start) end local function printSessions(ai) - local ok, result = ai.listSessions(); + local ok, result = ai.listSessions(askOptions()); if not ok then print(result); return; diff --git a/startup/servers.lua b/startup/servers.lua index 7f53938..9beb683 100644 --- a/startup/servers.lua +++ b/startup/servers.lua @@ -1,6 +1,7 @@ local createEventLoop = require("/apis/eventloop") local LOCAL_MANIFEST_PATH = "/trapos/manifest.json" +local SHUTDOWN_ON_SHELL_EXIT_SETTING = "trapos.shutdown_on_shell_exit" -- OK this file should not be named servers.lua we could rename it boot.lua and this could be the only file in this startup directory so the whole startup process will be orchestrated from there. @@ -24,6 +25,10 @@ local function init() shell.setPath(shell.path() .. ":/programs") end +local function shouldShutdownOnShellExit() + return settings.get(SHUTDOWN_ON_SHELL_EXIT_SETTING) == true +end + init() if periphemu then @@ -57,4 +62,6 @@ end parallel.waitForAny(shellFn, eventLoopFn) -os.shutdown() +if shouldShutdownOnShellExit() then + os.shutdown() +end diff --git a/tests/ai.lua b/tests/ai.lua index 13f346a..47fd494 100644 --- a/tests/ai.lua +++ b/tests/ai.lua @@ -204,6 +204,25 @@ testlib.test('listSessions returns newest sessions first', function() testlib.assertEquals(result[4].id, 'ses_missing'); end); +testlib.test('listSessions logs session count when verbose', function() + local sessions = { + { id = 'ses_1', title = 'one', time = { updated = 10 } }, + { id = 'ses_2', title = 'two', time = { updated = 20 } }, + }; + local httpStub = fakeHttp({}, { response(200, textutils.serializeJSON(sessions)) }); + local settingsStub = fakeSettings({ ['opencc.server_url'] = 'http://host' }); + local logs = {}; + local ai = createAi({ http = httpStub, settings = settingsStub }); + + local ok = ai.listSessions({ + log = function(message) logs[#logs + 1] = message; end, + }); + + testlib.assertTrue(ok); + testlib.assertTrue(string.find(logs[1], 'http://host', 1, true) ~= nil); + testlib.assertEquals(logs[2], 'sessions returned: 2'); +end); + testlib.test('listSessions fails when server_url missing', function() local httpStub = fakeHttp({}, {}); local ai = createAi({ http = httpStub, settings = fakeSettings() }); @@ -540,6 +559,34 @@ testlib.test('ask polling accepts assistant message with submitted id', function testlib.assertEquals(#httpStub.getCalls, 1); end); +testlib.test('ask polling logs diagnostic details', function() + local httpStub = fakeHttp( + { sessionResp('ses_1'), asyncResp() }, + { + messageListResp({ assistantMessage('msg_1', 'reply', true) }), + } + ); + local settingsStub = fakeAsyncSettings(); + local elFactory = fakeEventloopFactory(); + local logs = {}; + local ai = createAi({ + http = httpStub, + settings = settingsStub, + now = function() return 10; end, + eventloop = elFactory, + }); + + local ok = ai.ask('hello', { + messageId = 'msg_1', + log = function(message) logs[#logs + 1] = message; end, + }); + + testlib.assertTrue(ok); + testlib.assertTrue(string.find(table.concat(logs, '\n'), 'sending async prompt msg_1', 1, true) ~= nil); + testlib.assertTrue(string.find(table.concat(logs, '\n'), 'poll #1: messages=1, found=msg_1', 1, true) ~= nil); + testlib.assertTrue(string.find(table.concat(logs, '\n'), 'complete=true, text=true', 1, true) ~= nil); +end); + testlib.test('ask polling tolerates assistant message without parts', function() local httpStub = fakeHttp( { sessionResp('ses_1'), asyncResp() }, diff --git a/tests/startup-servers.lua b/tests/startup-servers.lua index 950dfe6..0fe7086 100644 --- a/tests/startup-servers.lua +++ b/tests/startup-servers.lua @@ -2,7 +2,8 @@ local createLibTest = require('/apis/libtest'); local testlib = createLibTest({ ... }); -local function runStartupWithStubs() +local function runStartupWithStubs(settingsValues) + settingsValues = settingsValues or {}; local calls = { rebooted = false, shutdown = false, @@ -26,6 +27,11 @@ local function runStartupWithStubs() end, sleep = function() end, }, + settings = { + get = function(key) + return settingsValues[key]; + end, + }, parallel = { waitForAny = function(firstFn) firstFn(); @@ -60,11 +66,11 @@ local function runStartupWithStubs() return calls; end -testlib.test('shell exit shuts down without rebooting', function() +testlib.test('shell exit does not shut down by default', function() local calls = runStartupWithStubs(); testlib.assertEquals(calls.shellRuns[1], 'shell'); - testlib.assertEquals(calls.shutdown, true); + testlib.assertEquals(calls.shutdown, false); testlib.assertEquals(calls.rebooted, false); for _, message in ipairs(calls.prints) do testlib.assertTrue( @@ -74,4 +80,12 @@ testlib.test('shell exit shuts down without rebooting', function() end end); +testlib.test('shell exit shuts down when setting is enabled', function() + local calls = runStartupWithStubs({ ['trapos.shutdown_on_shell_exit'] = true }); + + testlib.assertEquals(calls.shellRuns[1], 'shell'); + testlib.assertEquals(calls.shutdown, true); + testlib.assertEquals(calls.rebooted, false); +end); + testlib.run();