Describe the bug
MCP sessions served via McpSession.serve('/mcp') drop every ~5 minutes on the Cloudflare edge due to three defects in the keepalive implementation across two code paths in dist/mcp/index.js.
To Reproduce
- Serve an MCP server via McpSession.serve('/mcp') on a Cloudflare Durable Object
- Open an SSE connection and leave it idle (no tool calls)
- After ~5 minutes the stream closes
- Workers logs show "waitUntil() tasks did not complete" warnings at each drop boundary
Expected behavior
SSE streams remain open indefinitely. The CF edge sees periodic keepalive bytes and does not invoke its idle-stream watchdog.
Screenshots
N/A
Version
agents@0.12.4
Additional context
Three defects combine to cause this:
- createStreamingHttpHandler has no keepalive at all. McpSession.serve() routes through this handler. Its GET path opens the SSE channel and never writes another byte, so the CF edge 5-minute idle watchdog closes it.
- WorkerTransport's setInterval is not wrapped in ctx.waitUntil. The interval body runs bare. The Workers runtime cancels background work ~30s after the fetch handler returns, so the keepalive fires a few times then silently dies.
- Wrong SSE frame format. The interval writes event: ping\ndata: \n\n (a named event) instead of the SSE comment form : ping\n\n. Named events trigger onmessage / event listeners on the client — unintended for a keepalive frame.
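The fix for all three sites follows the same pattern — write the SSE comment frame : ping\n\n on a timer and clear the interval when a write fails or the stream closes/errors. Note that in the patch below only the createStreamingHttpHandler GET site additionally wraps the tick in a ctx.waitUntil-guarded IIFE; the two WorkerTransport sites attach a bare .catch() to the write instead. A 25s interval gives comfortable headroom below both the 30s runtime cancellation window and the 5-minute edge watchdog.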
We are currently carrying this as a pnpm patch against the compiled output:
diff --git a/dist/mcp/index.js b/dist/mcp/index.js
index 06a45a44f72216d24e12ca8d6cafa44456e30d18..81ef9096f58069a5fe7ce52e1501b1628d044c2c 100644
--- a/dist/mcp/index.js
+++ b/dist/mcp/index.js
@@ -271,6 +271,15 @@ const createStreamingHttpHandler = (basePath, namespace, options = {}) => {
return new Response("Failed to establish WS to DO", { status: 500 });
}
ws.accept();
+ const keepAlive = setInterval(() => {
+ ctx.waitUntil((async () => {
+ try {
+ await writer.write(encoder.encode(": ping\n\n"));
+ } catch {
+ clearInterval(keepAlive);
+ }
+ })());
+ }, 25_000);
ws.addEventListener("message", (event) => {
try {
async function onMessage(ev) {
@@ -285,9 +294,11 @@ const createStreamingHttpHandler = (basePath, namespace, options = {}) => {
}
});
ws.addEventListener("error", () => {
+ clearInterval(keepAlive);
writer.close().catch(() => {});
});
ws.addEventListener("close", () => {
+ clearInterval(keepAlive);
writer.close().catch(() => {});
});
return new Response(readable, {
@@ -911,12 +922,8 @@ var WorkerTransport = class {
});
if (this.sessionId !== void 0) headers.set("mcp-session-id", this.sessionId);
const keepAlive = setInterval(() => {
- try {
- writer.write(encoder.encode("event: ping\ndata: \n\n"));
- } catch {
- clearInterval(keepAlive);
- }
- }, 3e4);
+ writer.write(encoder.encode(": ping\n\n")).catch(() => clearInterval(keepAlive));
+ }, 25_000);
this.streamMapping.set(streamId, {
writer,
encoder,
@@ -1097,12 +1104,8 @@ var WorkerTransport = class {
});
if (this.sessionId !== void 0) headers.set("mcp-session-id", this.sessionId);
const keepAlive = setInterval(() => {
- try {
- writer.write(encoder.encode("event: ping\ndata: \n\n"));
- } catch {
- clearInterval(keepAlive);
- }
- }, 3e4);
+ writer.write(encoder.encode(": ping\n\n")).catch(() => clearInterval(keepAlive));
+ }, 25_000);
this.streamMapping.set(streamId, {
writer,
encoder,
Addendum — POST branch of createStreamingHttpHandler was also missing keepalive
The patch above covered three sites:
- createStreamingHttpHandler GET branch
- WorkerTransport (two setInterval sites)
In production we then observed the issue recurring for clients that use the Streamable HTTP request-response mode (Claude Code, web chat agents — anything calling POST /mcp for tool execution). The POST branch of createStreamingHttpHandler (around line 175 in the unpatched 0.12.4 source) is structurally identical to the GET branch but had no keepalive at all.
Symptom: every POST /mcp tool call that runs longer than ~5 minutes (e.g. a slow tool call against a reasoning model) hits Cloudflare's edge idle timeout, and the SSE stream is dropped. The client sees the connection die and reinitialises the MCP session, which surfaces to users as "auth keeps dying" because reinitialisation re-runs the OAuth flow on some clients.
The fix is structurally identical: a setInterval writing : ping\n\n every 25s, cleared at 5 sites — the ws.addEventListener('message') handler when message.close === true, the ws.addEventListener('error') handler, the ws.addEventListener('close') handler, the notifications-only early return, and inside the setInterval callback itself when a write fails.
Note: the POST branch's setInterval omits the ctx.waitUntil wrapper used in the GET branch. The POST handler returns a streaming Response whose body lifecycle keeps the Worker alive through the entire stream, so the 30s background-task cancellation window doesn't apply (the runtime considers the response body to be the foreground work).
Additional hunks for the same pnpm patch:
@@ -175,6 +175,9 @@ const createStreamingHttpHandler = (basePath, namespace, options = {}) => {
return new Response(body, { status: 500 });
}
ws.accept();
+ const keepAlive = setInterval(() => {
+ writer.write(encoder.encode(": ping\n\n")).catch(() => clearInterval(keepAlive));
+ }, 25_000);
ws.addEventListener("message", (event) => {
async function onMessage(event) {
try {
@@ -183,6 +186,7 @@ const createStreamingHttpHandler = (basePath, namespace, options = {}) => {
if (message.type !== "cf_mcp_agent_event") return;
await writer.write(encoder.encode(message.event));
if (message.close) {
+ clearInterval(keepAlive);
ws?.close();
await writer.close().catch(() => {});
}
@@ -194,17 +198,20 @@ const createStreamingHttpHandler = (basePath, namespace, options = {}) => {
});
ws.addEventListener("error", (error) => {
async function onError(_error) {
+ clearInterval(keepAlive);
await writer.close().catch(() => {});
}
onError(error).catch(console.error);
});
ws.addEventListener("close", () => {
async function onClose() {
+ clearInterval(keepAlive);
await writer.close().catch(() => {});
}
onClose().catch(console.error);
});
if (messages.every((msg) => isJSONRPCNotification(msg) || isJSONRPCResultResponse(msg))) {
+ clearInterval(keepAlive);
ws.close();
return new Response(null, {
headers: corsHeaders(request, options.corsOptions),
Describe the bug
MCP sessions served via McpSession.serve('/mcp') drop every ~5 minutes on the Cloudflare edge due to three defects in the keepalive implementation across two code paths in dist/mcp/index.js.
To Reproduce
- Serve an MCP server via McpSession.serve('/mcp') on a Cloudflare Durable Object
- Open an SSE connection and leave it idle (no tool calls)
- After ~5 minutes the stream closes
- Workers logs show "waitUntil() tasks did not complete" warnings at each drop boundary
Expected behavior
SSE streams remain open indefinitely. The CF edge sees periodic keepalive bytes and does not invoke its idle-stream watchdog.
Screenshots
N/A
Version
agents@0.12.4
Additional context
Three defects combine to cause this:
- createStreamingHttpHandler has no keepalive at all. McpSession.serve() routes through this handler. Its GET path opens the SSE channel and never writes another byte, so the CF edge 5-minute idle watchdog closes it.
- WorkerTransport's setInterval is not wrapped in ctx.waitUntil. The interval body runs bare. The Workers runtime cancels background work ~30s after the fetch handler returns, so the keepalive fires a few times then silently dies.
- Wrong SSE frame format. The interval writes event: ping\ndata: \n\n (a named event) instead of the SSE comment form : ping\n\n. Named events trigger onmessage / event listeners on the client — unintended for a keepalive frame.
The fix for all three sites is the same pattern — wrap the tick in a ctx.waitUntil-guarded IIFE, use : ping\n\n, and clear the interval on stream close/error. A 25s interval gives comfortable headroom below both the 30s runtime cancellation window and the 5-minute edge watchdog.
We are currently carrying this as a pnpm patch against the compiled output:
Addendum — POST branch of createStreamingHttpHandler was also missing keepalive
The patch above covered three sites:
- createStreamingHttpHandler GET branch
- WorkerTransport (two setInterval sites)
In production we then observed the issue recurring for clients that use the Streamable HTTP request-response mode (Claude Code, web chat agents — anything calling POST /mcp for tool execution). The POST branch of createStreamingHttpHandler (around line 175 in the unpatched 0.12.4 source) is structurally identical to the GET branch but had no keepalive at all.
Symptom: every POST /mcp for a tool call > ~5 minutes (e.g. a slow tool call against a reasoning model) hits Cloudflare's edge idle timeout and drops the SSE stream. The client sees the connection die and reinitialises the MCP session, which surfaces to users as "auth keeps dying" since reinit re-runs the OAuth flow on some clients.
The fix is structurally identical: a setInterval writing : ping\n\n every 25s, cleared in 5 sites — ws.addEventListener('message') when message.close === true, ws.addEventListener('error'), ws.addEventListener('close'), the notifications-only early-return, and the setInterval itself.
Note: the POST branch's setInterval omits the ctx.waitUntil wrapper used in the GET branch. The POST handler returns a streaming Response whose body lifecycle keeps the Worker alive through the entire stream, so the 30s background-task cancellation window doesn't apply (the runtime considers the response body to be the foreground work).
Additional hunks for the same pnpm patch: