From 3b647090fa1762b7e73cb7d6df2bac1e36ce0839 Mon Sep 17 00:00:00 2001 From: Guillaume ARM Date: Thu, 11 Jun 2026 22:06:34 +0200 Subject: [PATCH] fix(ai): retry transient network timeouts during poll A single poll GET that hit a network-level failure was treated as fatal, aborting the whole poll after the ~60s per-call timeout instead of retrying until the poll timeout (`pollTimeoutSeconds`) elapsed. Network failures during polling are now treated as transient: each one is logged and the poll is rescheduled until the deadline. HTTP 404 and other non-200 codes stay fatal. --- apis/libai.lua | 8 ++++- manifest.json | 2 +- packages/index.json | 4 +-- packages/trapos-ai/ccpm.json | 2 +- packages/trapos/ccpm.json | 2 +- tests/ai.lua | 67 ++++++++++++++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 6 deletions(-) diff --git a/apis/libai.lua b/apis/libai.lua index 3d6fd9b..e6bdd6a 100644 --- a/apis/libai.lua +++ b/apis/libai.lua @@ -386,7 +386,13 @@ local function createAi(opts) local function attempt() attemptCount = attemptCount + 1; local body, code = doGet(cfg, '/session/' .. sessionId .. '/message'); - if not body then return finish(false, code); end + if not body then + log('poll #' .. tostring(attemptCount) .. ': transient error: ' .. 
tostring(code)); + if nowFunc() >= deadline then + return finish(false, code); + end + return loop.setTimeout(attempt, cfg.pollIntervalSeconds); + end if code == 404 then local ok, value = handleMissingSession(persist, sessionSettingKey); return finish(ok, value); diff --git a/manifest.json b/manifest.json index 219c6e9..3684c63 100644 --- a/manifest.json +++ b/manifest.json @@ -1,6 +1,6 @@ { "name": "TrapOS", - "version": "0.8.13", + "version": "0.8.14", "branch": "next", "packages": [ "trapos" diff --git a/packages/index.json b/packages/index.json index 2e0ada1..b3ecb9f 100644 --- a/packages/index.json +++ b/packages/index.json @@ -5,8 +5,8 @@ "trapos-boot": "0.3.2", "trapos-net": "0.3.0", "trapos-ui": "0.2.2", - "trapos-ai": "0.6.11", + "trapos-ai": "0.6.12", "trapos-sandbox": "0.2.2", - "trapos": "0.8.13" + "trapos": "0.8.14" } } diff --git a/packages/trapos-ai/ccpm.json b/packages/trapos-ai/ccpm.json index d5cead5..5723bee 100644 --- a/packages/trapos-ai/ccpm.json +++ b/packages/trapos-ai/ccpm.json @@ -1,6 +1,6 @@ { "name": "trapos-ai", - "version": "0.6.11", + "version": "0.6.12", "description": "TrapOS AI client for opencode serve", "dependencies": ["trapos-core"], "files": [ diff --git a/packages/trapos/ccpm.json b/packages/trapos/ccpm.json index 8ad1c25..b796b1d 100644 --- a/packages/trapos/ccpm.json +++ b/packages/trapos/ccpm.json @@ -1,6 +1,6 @@ { "name": "trapos", - "version": "0.8.13", + "version": "0.8.14", "description": "TrapOS full install meta-package", "dependencies": [ "trapos-boot", diff --git a/tests/ai.lua b/tests/ai.lua index 8ae77ca..7ab1f1d 100644 --- a/tests/ai.lua +++ b/tests/ai.lua @@ -85,6 +85,14 @@ local function httpError(code, body) end; end +-- Builds a fake HTTP responder for a true network-level failure: it returns nil plus an error message (default 'Timed out') instead of a response handle (timeout / unreachable host). +-- Drives callHttp's `not response` path -> 'serveur injoignable: '. 
+local function httpTimeout(message) + return function() + return nil, message or 'Timed out'; + end; +end + -- Synchronous deterministic eventloop double for tests. -- setTimeout drains FIFO; runLoop runs until pending is empty or stopLoop fires. -- Returns (factory, state). state.sleeps accumulates every delay passed across @@ -1023,6 +1031,65 @@ testlib.test('pollMessage stops the private loop on success', function() testlib.assertEquals(#elState.lastLoop.inspect().pending, 0); end); +testlib.test('pollMessage retries transient network timeout then succeeds', function() + local httpStub = fakeHttp( + { sessionResp('ses_1'), asyncResp() }, + { + messageListResp({ userMessage('msg_1', 'hi'), assistantMessage('msg_2', 'partial', false) }), + httpTimeout('Timed out'), + messageListResp({ userMessage('msg_1', 'hi'), assistantMessage('msg_2', 'reply', true) }), + } + ); + local settingsStub = fakeAsyncSettings(); + local elFactory, elState = fakeEventloopFactory(); + local ai = createAi({ + http = httpStub, + settings = settingsStub, + now = function() return 0; end, + eventloop = elFactory, + }); + + local ok, result = ai.ask('hi', { messageId = 'msg_1', pollIntervalSeconds = 1, pollTimeoutSeconds = 60 }); + + testlib.assertTrue(ok); + testlib.assertEquals(result.reply, 'reply'); + testlib.assertEquals(#httpStub.getCalls, 3); + testlib.assertTrue(elState.lastLoop.inspect().stopped); + testlib.assertEquals(#elState.lastLoop.inspect().pending, 0); +end); + +testlib.test('pollMessage fails on persistent timeout only after deadline', function() + local httpStub = fakeHttp( + { sessionResp('ses_1'), asyncResp() }, + { + httpTimeout('Timed out'), + httpTimeout('Timed out'), + httpTimeout('Timed out'), + } + ); + local settingsStub = fakeAsyncSettings(); + local elFactory, elState = fakeEventloopFactory(); + local clock = 0; + local ai = createAi({ + http = httpStub, + settings = settingsStub, + now = function() + local t = clock; + clock = clock + 30; + return t; + end, + 
eventloop = elFactory, + }); + + local ok, err = ai.ask('hi', { messageId = 'msg_1', pollIntervalSeconds = 1, pollTimeoutSeconds = 60 }); + + testlib.assertTrue(not ok); + testlib.assertTrue(string.find(err, 'injoignable', 1, true) ~= nil); + testlib.assertTrue(#httpStub.getCalls > 1); + testlib.assertTrue(elState.lastLoop.inspect().stopped); + testlib.assertEquals(#elState.lastLoop.inspect().pending, 0); +end); + testlib.test('pollMessage stops cleanly on HTTP error mid-poll', function() local httpStub = fakeHttp( { sessionResp('ses_1'), asyncResp() },