fix(ai): retry transient network timeouts during poll

A single poll GET that hit a network-level failure was treated as fatal,
aborting the whole poll after the ~60s per-call timeout instead of retrying
until poll_timeout. Network failures during polling are now treated as
transient: each one is logged and the poll is rescheduled until the deadline.
HTTP 404 and other non-200 codes stay fatal.
This commit is contained in:
Guillaume ARM 2026-06-11 22:06:34 +02:00
parent 79677e2742
commit 3b647090fa
6 changed files with 79 additions and 6 deletions

View File

@ -386,7 +386,13 @@ local function createAi(opts)
local function attempt()
attemptCount = attemptCount + 1;
local body, code = doGet(cfg, '/session/' .. sessionId .. '/message');
if not body then return finish(false, code); end
if not body then
log('poll #' .. tostring(attemptCount) .. ': transient error: ' .. tostring(code));
if nowFunc() >= deadline then
return finish(false, code);
end
return loop.setTimeout(attempt, cfg.pollIntervalSeconds);
end
if code == 404 then
local ok, value = handleMissingSession(persist, sessionSettingKey);
return finish(ok, value);

View File

@ -1,6 +1,6 @@
{
"name": "TrapOS",
"version": "0.8.13",
"version": "0.8.14",
"branch": "next",
"packages": [
"trapos"

View File

@ -5,8 +5,8 @@
"trapos-boot": "0.3.2",
"trapos-net": "0.3.0",
"trapos-ui": "0.2.2",
"trapos-ai": "0.6.11",
"trapos-ai": "0.6.12",
"trapos-sandbox": "0.2.2",
"trapos": "0.8.13"
"trapos": "0.8.14"
}
}

View File

@ -1,6 +1,6 @@
{
"name": "trapos-ai",
"version": "0.6.11",
"version": "0.6.12",
"description": "TrapOS AI client for opencode serve",
"dependencies": ["trapos-core"],
"files": [

View File

@ -1,6 +1,6 @@
{
"name": "trapos",
"version": "0.8.13",
"version": "0.8.14",
"description": "TrapOS full install meta-package",
"dependencies": [
"trapos-boot",

View File

@ -85,6 +85,14 @@ local function httpError(code, body)
end;
end
-- Builds an HTTP stub simulating a true network-level failure: the stub hands
-- back no response handle at all (timeout / unreachable), only a message.
-- Drives callHttp's `not response` path -> 'serveur injoignable: <message>'.
-- `message` is optional; when omitted the stub reports 'Timed out'.
local function httpTimeout(message)
    local reason = message or 'Timed out';
    return function()
        return nil, reason;
    end;
end
-- Synchronous deterministic eventloop double for tests.
-- setTimeout drains FIFO; runLoop runs until pending is empty or stopLoop fires.
-- Returns (factory, state). state.sleeps accumulates every delay passed across
@ -1023,6 +1031,65 @@ testlib.test('pollMessage stops the private loop on success', function()
testlib.assertEquals(#elState.lastLoop.inspect().pending, 0);
end);
-- A single network-level timeout in the middle of polling must not abort the
-- ask: the poll is expected to continue and return the later 'reply' message.
testlib.test('pollMessage retries transient network timeout then succeeds', function()
    -- First response queue handed to fakeHttp (session + async responses).
    local setupResponses = { sessionResp('ses_1'), asyncResp() };
    -- Second response queue: a 'partial' message list, one simulated timeout,
    -- then the 'reply' message list. Presumably consumed by the poll GETs
    -- (the assertion below expects exactly three recorded GET calls).
    local pollResponses = {
        messageListResp({ userMessage('msg_1', 'hi'), assistantMessage('msg_2', 'partial', false) }),
        httpTimeout('Timed out'),
        messageListResp({ userMessage('msg_1', 'hi'), assistantMessage('msg_2', 'reply', true) }),
    };
    local http = fakeHttp(setupResponses, pollResponses);
    local settings = fakeAsyncSettings();
    local loopFactory, loopState = fakeEventloopFactory();

    -- Frozen clock: the time source always reports 0.
    local function frozenClock()
        return 0;
    end

    local client = createAi({
        http = http,
        settings = settings,
        now = frozenClock,
        eventloop = loopFactory,
    });

    local askOptions = { messageId = 'msg_1', pollIntervalSeconds = 1, pollTimeoutSeconds = 60 };
    local succeeded, answer = client.ask('hi', askOptions);

    testlib.assertTrue(succeeded);
    testlib.assertEquals(answer.reply, 'reply');
    testlib.assertEquals(#http.getCalls, 3);

    -- The private loop must be stopped with nothing left scheduled.
    local privateLoop = loopState.lastLoop;
    testlib.assertTrue(privateLoop.inspect().stopped);
    testlib.assertEquals(#privateLoop.inspect().pending, 0);
end);
-- When every queued poll response is a network timeout, the ask must fail
-- with the 'injoignable' error, but only after more than one GET was made.
testlib.test('pollMessage fails on persistent timeout only after deadline', function()
    -- First response queue handed to fakeHttp (session + async responses).
    local setupResponses = { sessionResp('ses_1'), asyncResp() };
    -- Second response queue: nothing but simulated network timeouts.
    local pollResponses = {
        httpTimeout('Timed out'),
        httpTimeout('Timed out'),
        httpTimeout('Timed out'),
    };
    local http = fakeHttp(setupResponses, pollResponses);
    local settings = fakeAsyncSettings();
    local loopFactory, loopState = fakeEventloopFactory();

    -- Stepping clock: each read returns the current value, then advances it
    -- by 30, so time moves forward on every call to `now`.
    local ticks = 0;
    local function steppingClock()
        local current = ticks;
        ticks = current + 30;
        return current;
    end

    local client = createAi({
        http = http,
        settings = settings,
        now = steppingClock,
        eventloop = loopFactory,
    });

    local askOptions = { messageId = 'msg_1', pollIntervalSeconds = 1, pollTimeoutSeconds = 60 };
    local succeeded, failure = client.ask('hi', askOptions);

    testlib.assertTrue(not succeeded);
    -- Plain (non-pattern) search for the unreachable-server marker.
    local markerPos = string.find(failure, 'injoignable', 1, true);
    testlib.assertTrue(markerPos ~= nil);
    local getCount = #http.getCalls;
    testlib.assertTrue(getCount > 1);

    -- The private loop must be stopped with nothing left scheduled.
    local privateLoop = loopState.lastLoop;
    testlib.assertTrue(privateLoop.inspect().stopped);
    testlib.assertEquals(#privateLoop.inspect().pending, 0);
end);
testlib.test('pollMessage stops cleanly on HTTP error mid-poll', function()
local httpStub = fakeHttp(
{ sessionResp('ses_1'), asyncResp() },