From 73c4cedc8aded3d92c9e0ac8f3f83375ceafe7d9 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Mon, 15 Jun 2026 13:53:08 +0200 Subject: [PATCH 01/24] Added logging of REST response containing host and negotiated HTTP protocol version. Fixed PubNubServerException handling to guard against a null response and fall back to the full exception message. --- src/PubNub/Endpoints/Endpoint.php | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/PubNub/Endpoints/Endpoint.php b/src/PubNub/Endpoints/Endpoint.php index efc38ba5..80dba331 100755 --- a/src/PubNub/Endpoints/Endpoint.php +++ b/src/PubNub/Endpoints/Endpoint.php @@ -356,6 +356,12 @@ protected function sendRequest(RequestInterface $request): PNEnvelope } else { $response = $client->sendRequest($request); } + $this->pubnub->getLogger()->debug(sprintf( + "%s response from %s negotiated HTTP/%s", + $this->getName(), + $request->getUri()->getHost(), + $response->getProtocolVersion() + )); $envelope = $this->parseResponse($response); } catch (NetworkExceptionInterface $exception) { return new PNEnvelope(null, $this->createStatus( @@ -368,10 +374,10 @@ protected function sendRequest(RequestInterface $request): PNEnvelope $statusCode = $exception->getCode(); $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); $pnServerException = new PubNubServerException(); - if (is_callable([$exception, 'getResponse'])) { + if (is_callable([$exception, 'getResponse']) && $exception->getResponse() !== null) { $response = $exception->getResponse()->getBody()->getContents(); } else { - $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); + $response = $exception->getMessage(); } $pnServerException->setRawBody($response); $pnServerException->setStatusCode($exception->getCode()); From c06517a8ad831bf34681d7dda10d8d091767aa40 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Mon, 15 Jun 2026 18:48:35 +0200 Subject: [PATCH 02/24] Disable HereNow tests because implicit heartbeat has been disabled on test keys. --- tests/e2e/HereNowE2eTest.php | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/e2e/HereNowE2eTest.php b/tests/e2e/HereNowE2eTest.php index c5118f24..3c8d1d2e 100644 --- a/tests/e2e/HereNowE2eTest.php +++ b/tests/e2e/HereNowE2eTest.php @@ -17,6 +17,15 @@ class HereNowE2eTest extends \PubNubTestCase { use PresenceTestHelper; + public function setUp(): void + { + parent::setUp(); + $this->markTestSkipped( + 'Implicit heartbeat has been disabled on the test keys, so these tests fail because the SDK has no ' + . 'Heartbeat support and no explicit Heartbeat is executed.' + ); + } + /** * Test pagination with multiple channels - verifies limit applies per-channel * This test requires real background clients to properly test the server-side pagination From 4ccc6ce32783a3d36b57bb7c71cd6bc56ab5c88f Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 13:34:36 +0200 Subject: [PATCH 03/24] Seems like server behaviour change on server side --- examples/Snippets.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Snippets.php b/examples/Snippets.php index 45e231ed..a4cb22bc 100644 --- a/examples/Snippets.php +++ b/examples/Snippets.php @@ -27,7 +27,7 @@ $pubnub->history() ->channel("my_channel") ->count(100) - ->start(-1) + ->start(1) ->end(13847168819178600) ->reverse(true) ->sync(); From 2a5988f775805680d89b512c19090fae05c0ee47 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 13:55:10 +0200 Subject: [PATCH 04/24] Add diagnostic --- .github/workflows/run-tests.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index bd760bb3..62f369e3 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -60,6 +60,28 @@ jobs: run: | composer self-update && composer --version composer install --prefer-dist + # TEMPORARY: diagnose intermittent `cURL error 28` timeouts to ps.pndsn.com + # from the self-hosted runner. Remove once the runner egress issue is resolved. + - name: Diagnose ps.pndsn.com connectivity + continue-on-error: true + run: | + echo "=== Runner public egress IP ===" + curl -s --max-time 15 https://api.ipify.org || echo "(could not determine egress IP)" + echo + echo "=== DNS resolution for ps.pndsn.com ===" + getent hosts ps.pndsn.com || nslookup ps.pndsn.com || true + echo + echo "=== Proxy-related environment ===" + env | grep -iE 'proxy' || echo "(no proxy env vars set)" + echo + echo "=== 10x probe of https://ps.pndsn.com/time/0 (catch intermittent hang) ===" + for i in $(seq 1 10); do + echo "--- attempt $i ---" + curl -sS -o /dev/null --max-time 15 \ + -w 'http_code=%{http_code} dns=%{time_namelookup}s connect=%{time_connect}s tls=%{time_appconnect}s total=%{time_total}s\n' \ + "https://ps.pndsn.com/time/0" \ + || echo "attempt $i FAILED (curl exit $?)" + done - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error From b6a3525b0fa1725c8ab63bb14eea94676fc54f01 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 14:19:11 +0200 Subject: [PATCH 05/24] Add diagnostic --- .github/workflows/run-tests.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 62f369e3..168c1fa3 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -82,6 +82,23 @@ jobs: "https://ps.pndsn.com/time/0" \ || echo "attempt $i FAILED (curl exit $?)" done + echo + echo "=== Burst probe: 200 requests, 20 concurrent (trip rate-limit/throttle under load) ===" + # If the idle probe above is fast but requests here stall ~10s / fail with + # exit 28, that points to server-side throttling under volume rather than a + # broken network path. http_code 429 would confirm explicit rate-limiting. + burst_log="$(mktemp)" + seq 1 200 | xargs -P 20 -I {} \ + curl -sS -o /dev/null --max-time 15 \ + -w 'http_code=%{http_code} total=%{time_total}s\n' \ + "https://ps.pndsn.com/time/0" \ + >> "$burst_log" 2>&1 || true + echo "--- HTTP status code distribution ---" + grep -oE 'http_code=[0-9]+' "$burst_log" | sort | uniq -c || true + echo "--- failures / non-200 / slow (>1s) responses ---" + grep -vE 'http_code=200 total=0' "$burst_log" | grep -E 'http_code=(000|4|5)|total=[1-9]' || echo "(none — all fast 200s)" + echo "--- slowest 5 responses ---" + sort -t= -k3 -rn "$burst_log" | head -5 || true - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error From f42b13b430a6f46e23e66c522794f19be5f5a6b7 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 15:35:01 +0200 Subject: [PATCH 06/24] Add diagnostic --- .github/workflows/run-tests.yml | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 168c1fa3..8a64f3b8 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -83,10 +83,9 @@ jobs: || echo "attempt $i FAILED (curl exit $?)" done echo - echo "=== Burst probe: 200 requests, 20 concurrent (trip rate-limit/throttle under load) ===" - # If the idle probe above is fast but requests here stall ~10s / fail with - # exit 28, that points to server-side throttling under volume rather than a - # broken network path. http_code 429 would confirm explicit rate-limiting. + echo "=== Burst probe A: 200x time/0, 20 concurrent (keyless baseline) ===" + # Cheap, unauthenticated endpoint. Establishes that the network path is fine + # under concurrent load. time/0 is NOT subject to per-key publish throttling. burst_log="$(mktemp)" seq 1 200 | xargs -P 20 -I {} \ curl -sS -o /dev/null --max-time 15 \ @@ -95,10 +94,25 @@ jobs: >> "$burst_log" 2>&1 || true echo "--- HTTP status code distribution ---" grep -oE 'http_code=[0-9]+' "$burst_log" | sort | uniq -c || true - echo "--- failures / non-200 / slow (>1s) responses ---" - grep -vE 'http_code=200 total=0' "$burst_log" | grep -E 'http_code=(000|4|5)|total=[1-9]' || echo "(none — all fast 200s)" - echo "--- slowest 5 responses ---" + echo "--- slowest 5 ---" sort -t= -k3 -rn "$burst_log" | head -5 || true + echo + echo "=== Burst probe B: 200x PUBLISH, 20 concurrent (the actual failing op) ===" + # Publishes to the real keyset — the same operation that times out in CI. + # If THIS bursts shows 429 / 000 / ~10s stalls while probe A stayed fast, + # the cause is per-key publish throttling, not the network or the runner. + pub_log="$(mktemp)" + seq 1 200 | xargs -P 20 -I {} \ + curl -sS -o /dev/null --max-time 15 \ + -w 'http_code=%{http_code} total=%{time_total}s\n' \ + "https://ps.pndsn.com/publish/$PUBLISH_KEY/$SUBSCRIBE_KEY/0/diag-burst/0/%22x%22?uuid=diag-burst" \ + >> "$pub_log" 2>&1 || true + echo "--- HTTP status code distribution ---" + grep -oE 'http_code=[0-9]+' "$pub_log" | sort | uniq -c || true + echo "--- non-200 / slow (>1s) responses ---" + grep -E 'http_code=(000|4|5)|total=[1-9]' "$pub_log" || echo "(none — all fast 200s)" + echo "--- slowest 5 ---" + sort -t= -k3 -rn "$pub_log" | head -5 || true - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error From 65f4a748a51716683318b46e71f5297f457319d2 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:35:14 +0200 Subject: [PATCH 07/24] Add diagnostic --- .github/workflows/run-tests.yml | 15 +++ scripts/diag-guzzle.php | 159 ++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 scripts/diag-guzzle.php diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 8a64f3b8..24c8327b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -113,6 +113,21 @@ jobs: grep -E 'http_code=(000|4|5)|total=[1-9]' "$pub_log" || echo "(none — all fast 200s)" echo "--- slowest 5 ---" sort -t= -k3 -rn "$pub_log" | head -5 || true + # TEMPORARY: PHP-level probe that REUSES one Guzzle client across many + # requests, exactly like the SDK does for a whole PHPUnit run. The curl + # probes above open a fresh connection each time and so cannot reproduce a + # stale/half-closed keep-alive socket — the prime suspect for + # "connected fine, then 0 bytes received after 10s" (cURL error 28). + # DIAG_DELAY_MS inserts an idle gap to provoke LB idle-timeout of pooled + # sockets. Remove once the timeout cause is confirmed/fixed. + - name: Diagnose Guzzle connection reuse (PHP) + continue-on-error: true + run: | + echo "### back-to-back (no idle gap) ###" + DIAG_ITERATIONS=200 DIAG_DELAY_MS=0 php scripts/diag-guzzle.php + echo + echo "### with 5s idle gaps (provokes stale-socket timeout) ###" + DIAG_ITERATIONS=30 DIAG_DELAY_MS=5000 php scripts/diag-guzzle.php - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php new file mode 100644 index 00000000..f3c0b644 --- /dev/null +++ b/scripts/diag-guzzle.php @@ -0,0 +1,159 @@ +get($url, [ + RequestOptions::TIMEOUT => REQUEST_TIMEOUT, + RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, + 'version' => $version, + // num_connects is unavailable in some libcurl builds, so detect + // reuse via local_port: same client port == same pooled socket. + 'on_stats' => function ($stats) use (&$curlInfo) { + $h = $stats->getHandlerStats(); + $curlInfo = [ + 'port' => $h['local_port'] ?? null, + 'http' => $h['http_version'] ?? '?', // curl wire view (may differ from PSR-7) + 'ip' => $h['primary_ip'] ?? '?', + ]; + }, + ]); + $elapsed = (microtime(true) - $start) * 1000; + $times[] = $elapsed; + + $port = $curlInfo['port'] ?? null; + if ($port !== null) { + if (isset($seenPorts[$port])) { + $reuseCount++; // rode a previously-opened (pooled) socket + } + $seenPorts[$port] = true; + } + $hv = (string) ($curlInfo['http'] ?? '?'); + $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; + } catch (\Throwable $e) { + $elapsed = (microtime(true) - $start) * 1000; + $failures[] = sprintf( + " #%d after %dms: %s", + $i, + (int) $elapsed, + $e->getMessage() + ); + } + + if ($delayMs > 0) { + usleep($delayMs * 1000); + } + } + + $ok = count($times); + $fail = count($failures); + sort($times); + $p50 = $times ? $times[(int) (count($times) * 0.50)] ?? end($times) : 0; + $p99 = $times ? $times[(int) (count($times) * 0.99)] ?? end($times) : 0; + $max = $times ? max($times) : 0; + + printf( + "\n=== %s (version=%s, %d iterations, delay=%dms) ===\n", + $label, + $version, + $iterations, + $delayMs + ); + $httpDist = []; + foreach ($httpVersions as $v => $n) { + $httpDist[] = 'http/' . $v . ' x' . $n; + } + printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); + printf(" curl-wire-protocol: %s\n", implode(' ', $httpDist) ?: '(n/a)'); + printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); + if ($failures) { + echo " FAILURES:\n" . implode("\n", $failures) . "\n"; + } else { + echo " (no failures)\n"; + } +} + +$timeUrl = "https://ps.pndsn.com/time/0"; + +echo "######## PHP Guzzle reuse diagnostic ########\n"; +echo "Guzzle reuses one client across the whole run (like the SDK).\n"; + +// Baseline: keyless, both protocols. +probe("time/0 baseline", $timeUrl, '2', $iterations, $delayMs); +probe("time/0 baseline", $timeUrl, '1.1', $iterations, $delayMs); + +// The actual failing op: publish, both protocols. +if ($publishKey !== '' && $subscribeKey !== '') { + $publishUrl = sprintf( + "https://ps.pndsn.com/publish/%s/%s/0/diag-guzzle/0/%%22x%%22?uuid=diag-guzzle", + $publishKey, + $subscribeKey + ); + probe("publish (SDK-forced h2)", $publishUrl, '2', $iterations, $delayMs); + probe("publish (h1.1)", $publishUrl, '1.1', $iterations, $delayMs); +} + +echo "\n######## done ########\n"; From 6a4819e9bf7b89bf2086caade31ee1de2fc5c6b7 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:42:25 +0200 Subject: [PATCH 08/24] Add diagnostic --- scripts/diag-guzzle.php | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index f3c0b644..0bb3ddbf 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -66,17 +66,19 @@ function probe(string $label, string $url, string $version, int $iterations, int $start = microtime(true); $curlInfo = null; try { - $client->get($url, [ + $response = $client->get($url, [ RequestOptions::TIMEOUT => REQUEST_TIMEOUT, RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, 'version' => $version, - // num_connects is unavailable in some libcurl builds, so detect - // reuse via local_port: same client port == same pooled socket. + // Detect socket reuse via local_port (num_connects is absent in + // some libcurl builds). NOTE: do NOT trust the stat's + // 'http_version' field — it echoes the REQUESTED version, not the + // negotiated one. The PSR-7 response below reports the real wire + // protocol (ps.pndsn.com is HTTP/1.1 only — no h2 over ALPN). 'on_stats' => function ($stats) use (&$curlInfo) { $h = $stats->getHandlerStats(); $curlInfo = [ 'port' => $h['local_port'] ?? null, - 'http' => $h['http_version'] ?? '?', // curl wire view (may differ from PSR-7) 'ip' => $h['primary_ip'] ?? '?', ]; }, @@ -91,7 +93,8 @@ function probe(string $label, string $url, string $version, int $iterations, int } $seenPorts[$port] = true; } - $hv = (string) ($curlInfo['http'] ?? '?'); + // Real negotiated protocol, straight from the PSR-7 response. + $hv = $response->getProtocolVersion(); $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; } catch (\Throwable $e) { $elapsed = (microtime(true) - $start) * 1000; @@ -127,7 +130,7 @@ function probe(string $label, string $url, string $version, int $iterations, int $httpDist[] = 'http/' . $v . ' x' . $n; } printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); - printf(" curl-wire-protocol: %s\n", implode(' ', $httpDist) ?: '(n/a)'); + printf(" negotiated-protocol (PSR-7): %s\n", implode(' ', $httpDist) ?: '(n/a)'); printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); if ($failures) { echo " FAILURES:\n" . implode("\n", $failures) . "\n"; From 70f1f33c8ad2c8d90f4ddc7c7f12e55ca77cf924 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:48:58 +0200 Subject: [PATCH 09/24] Add diagnostic --- .github/workflows/run-tests.yml | 8 ++++---- scripts/diag-guzzle.php | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 24c8327b..f2ea5495 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -123,11 +123,11 @@ jobs: - name: Diagnose Guzzle connection reuse (PHP) continue-on-error: true run: | - echo "### back-to-back (no idle gap) ###" + # Reuses one Guzzle client like the SDK. The keyless time/0 baseline + # should be 0 failures; the publish run reproduces the cURL-28 timeout. + # The built-in spacing sweep then tells us whether spacing publishes + # further apart makes the failures disappear (== server rate limit). DIAG_ITERATIONS=200 DIAG_DELAY_MS=0 php scripts/diag-guzzle.php - echo - echo "### with 5s idle gaps (provokes stale-socket timeout) ###" - DIAG_ITERATIONS=30 DIAG_DELAY_MS=5000 php scripts/diag-guzzle.php - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index 0bb3ddbf..40b67d36 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -148,15 +148,25 @@ function probe(string $label, string $url, string $version, int $iterations, int probe("time/0 baseline", $timeUrl, '2', $iterations, $delayMs); probe("time/0 baseline", $timeUrl, '1.1', $iterations, $delayMs); -// The actual failing op: publish, both protocols. +// The actual failing op: publish. if ($publishKey !== '' && $subscribeKey !== '') { $publishUrl = sprintf( "https://ps.pndsn.com/publish/%s/%s/0/diag-guzzle/0/%%22x%%22?uuid=diag-guzzle", $publishKey, $subscribeKey ); - probe("publish (SDK-forced h2)", $publishUrl, '2', $iterations, $delayMs); - probe("publish (h1.1)", $publishUrl, '1.1', $iterations, $delayMs); + + // Primary run honoring the env-provided delay. + probe("publish", $publishUrl, '1.1', $iterations, $delayMs); + + // Spacing sweep: distinguish a per-second RATE limit from a per-request / + // idle issue. If failures vanish as we space publishes further apart, the + // cap is rate-based (quota refills over time). If they persist regardless of + // spacing, it is NOT a simple rate limit. + echo "\n######## spacing sweep (publish, 30 req each) ########\n"; + foreach ([0, 200, 500, 1000, 2000] as $gapMs) { + probe("publish gap=" . $gapMs . "ms", $publishUrl, '1.1', 30, $gapMs); + } } echo "\n######## done ########\n"; From 785eb754abc1a0b71ed9364f6fc4e5dadcb4d099 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:54:27 +0200 Subject: [PATCH 10/24] Add diagnostic --- scripts/diag-guzzle.php | 58 +++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index 40b67d36..fbef9e79 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -57,10 +57,13 @@ function probe(string $label, string $url, string $version, int $iterations, int $client = new GuzzleHttpClient(); $times = []; - $failures = []; + $failures = []; + $failedIndices = []; $reuseCount = 0; $seenPorts = []; // local TCP ports seen — a repeat == a reused socket - $httpVersions = []; // curl-level wire protocol distribution + $httpVersions = []; // negotiated protocol distribution (from PSR-7) + $ipOk = []; // backend IP => count of successful requests + $ipFail = []; // backend IP => count of timed-out requests for ($i = 0; $i < $iterations; $i++) { $start = microtime(true); @@ -96,14 +99,20 @@ function probe(string $label, string $url, string $version, int $iterations, int // Real negotiated protocol, straight from the PSR-7 response. $hv = $response->getProtocolVersion(); $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; + + $ip = $curlInfo['ip'] ?? '?'; + $ipOk[$ip] = ($ipOk[$ip] ?? 0) + 1; } catch (\Throwable $e) { $elapsed = (microtime(true) - $start) * 1000; - $failures[] = sprintf( - " #%d after %dms: %s", - $i, - (int) $elapsed, - $e->getMessage() - ); + $failedIndices[] = $i; + // curl populates primary_ip once connected, so even a post-connect + // timeout tells us WHICH backend black-holed the request. + $ip = $curlInfo['ip'] ?? '?'; + $ipFail[$ip] = ($ipFail[$ip] ?? 0) + 1; + // Keep one representative full message; the rest are summarized by index. + if (count($failures) < 1) { + $failures[] = sprintf(" e.g. #%d after %dms (ip=%s): %s", $i, (int) $elapsed, $ip, $e->getMessage()); + } } if ($delayMs > 0) { @@ -132,8 +141,20 @@ function probe(string $label, string $url, string $version, int $iterations, int printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); printf(" negotiated-protocol (PSR-7): %s\n", implode(' ', $httpDist) ?: '(n/a)'); printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); - if ($failures) { - echo " FAILURES:\n" . implode("\n", $failures) . "\n"; + // Per-backend breakdown: if the failures concentrate on one (or few) IPs + // while other IPs only ever succeed, the cause is a wedged backend node + // behind the LB — not a rate limit (which would return fast 429s, not 10s + // 0-byte hangs). + $allIps = array_unique(array_merge(array_keys($ipOk), array_keys($ipFail))); + sort($allIps); + echo " per-backend (ok/fail):"; + foreach ($allIps as $ip) { + printf(" %s=%d/%d", $ip, $ipOk[$ip] ?? 0, $ipFail[$ip] ?? 0); + } + echo "\n"; + if (!empty($failedIndices)) { + echo " failed-indices: " . implode(',', $failedIndices) . "\n"; + echo implode("\n", $failures) . "\n"; } else { echo " (no failures)\n"; } @@ -156,15 +177,18 @@ function probe(string $label, string $url, string $version, int $iterations, int $subscribeKey ); - // Primary run honoring the env-provided delay. + // The failures are 10s 0-byte HANGS, not 429s, so this is NOT a rate limit. + // The leading hypothesis is a wedged backend behind the publish LB: every + // ~10th request is routed to a node that accepts the connection but never + // responds. The per-backend (ok/fail) line in each run is the key evidence — + // watch whether fails concentrate on one IP. probe("publish", $publishUrl, '1.1', $iterations, $delayMs); - // Spacing sweep: distinguish a per-second RATE limit from a per-request / - // idle issue. If failures vanish as we space publishes further apart, the - // cap is rate-based (quota refills over time). If they persist regardless of - // spacing, it is NOT a simple rate limit. - echo "\n######## spacing sweep (publish, 30 req each) ########\n"; - foreach ([0, 200, 500, 1000, 2000] as $gapMs) { + // Spacing check: if it were a time/rate effect, gaps would change the rate + // of failure. For a per-request routing fault, the ~1-in-10 cadence persists + // regardless of spacing. + echo "\n######## spacing check (publish, 30 req each) ########\n"; + foreach ([0, 1000] as $gapMs) { probe("publish gap=" . $gapMs . "ms", $publishUrl, '1.1', 30, $gapMs); } } From bb7621d09bd18a67ad20b35d73c0e4b62b035cda Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 13:02:22 +0200 Subject: [PATCH 11/24] Revert change to Endpoint --- src/PubNub/Endpoints/Endpoint.php | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/PubNub/Endpoints/Endpoint.php b/src/PubNub/Endpoints/Endpoint.php index 80dba331..f6dcbb53 100755 --- a/src/PubNub/Endpoints/Endpoint.php +++ b/src/PubNub/Endpoints/Endpoint.php @@ -356,12 +356,12 @@ protected function sendRequest(RequestInterface $request): PNEnvelope } else { $response = $client->sendRequest($request); } - $this->pubnub->getLogger()->debug(sprintf( - "%s response from %s negotiated HTTP/%s", - $this->getName(), - $request->getUri()->getHost(), - $response->getProtocolVersion() - )); +// $this->pubnub->getLogger()->debug(sprintf( +// "%s response from %s negotiated HTTP/%s", +// $this->getName(), +// $request->getUri()->getHost(), +// $response->getProtocolVersion() +// )); $envelope = $this->parseResponse($response); } catch (NetworkExceptionInterface $exception) { return new PNEnvelope(null, $this->createStatus( @@ -374,10 +374,10 @@ protected function sendRequest(RequestInterface $request): PNEnvelope $statusCode = $exception->getCode(); $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); $pnServerException = new PubNubServerException(); - if (is_callable([$exception, 'getResponse']) && $exception->getResponse() !== null) { + if (is_callable([$exception, 'getResponse'])) { $response = $exception->getResponse()->getBody()->getContents(); } else { - $response = $exception->getMessage(); + $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); } $pnServerException->setRawBody($response); $pnServerException->setStatusCode($exception->getCode()); From 092b25bb988d2578945e7adef99d199775c57c24 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 13:10:57 +0200 Subject: [PATCH 12/24] Diagnostic --- scripts/diag-guzzle.php | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index fbef9e79..84ae296b 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -178,11 +178,13 @@ function probe(string $label, string $url, string $version, int $iterations, int ); // The failures are 10s 0-byte HANGS, not 429s, so this is NOT a rate limit. - // The leading hypothesis is a wedged backend behind the publish LB: every - // ~10th request is routed to a node that accepts the connection but never - // responds. The per-backend (ok/fail) line in each run is the key evidence — - // watch whether fails concentrate on one IP. - probe("publish", $publishUrl, '1.1', $iterations, $delayMs); + // A raw `curl --http1.1` CLI reusing one connection runs 200/200 clean, so + // the hang is specific to the PHP/Guzzle path. The SDK forces 'version'=>'2' + // (Endpoint::requestOptions line ~501) while curl ran 1.1 — so probe BOTH + // here to isolate whether the forced HTTP/2 option is the trigger. + // Watch: does version=1.1 go clean while version=2 hangs? + probe("publish version=2", $publishUrl, '2', $iterations, $delayMs); + probe("publish version=1.1", $publishUrl, '1.1', $iterations, $delayMs); // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists From eeebc5a273ecce6e83b21b7d6d8e5e536e932ce4 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 18:10:56 +0200 Subject: [PATCH 13/24] StreamHandler --- scripts/diag-guzzle.php | 55 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index 84ae296b..e3f761ad 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -17,6 +17,13 @@ * - version=2 : what the SDK forces today (Endpoint::requestOptions) * - version=1.1 : what ps.pndsn.com actually speaks * + * It also runs a handler swap (publish, version=1.1) through Guzzle's curl + * handler vs its StreamHandler (PHP stream wrapper, no libcurl). The raw curl + * CLI control uses a different libcurl+TLS stack than PHP, so it can't exonerate + * PHP's curl handler; this swap stays inside PHP and changes only the handler. + * If the hang vanishes on the stream handler, the fault is PHP's curl handler / + * its libcurl build; if it persists on both, the cause is above libcurl. + * * Usage (from project root, keys exported as in the CI workflow): * php scripts/diag-guzzle.php * @@ -31,6 +38,9 @@ require __DIR__ . '/../vendor/autoload.php'; use GuzzleHttp\Client as GuzzleHttpClient; +use GuzzleHttp\HandlerStack; +use GuzzleHttp\Handler\CurlHandler; +use GuzzleHttp\Handler\StreamHandler; use GuzzleHttp\RequestOptions; $publishKey = getenv('PUBLISH_KEY') ?: ''; @@ -50,11 +60,29 @@ * Fire $iterations requests at $url through ONE reused client and report. * * @param string $version '2' or '1.1' — passed to Guzzle's 'version' option. + * @param string $handler 'curl' (default, the SDK transport) or 'stream' + * (PHP stream wrapper — no libcurl). Swapping the + * handler isolates whether the hang lives in PHP's + * curl handler / its libcurl+OpenSSL build: if the + * stream handler runs clean where curl hangs, the + * fault is in the curl path, not the server. */ -function probe(string $label, string $url, string $version, int $iterations, int $delayMs): void -{ - // One client for the whole loop == SDK behavior == cURL keep-alive pool reuse. - $client = new GuzzleHttpClient(); +function probe( + string $label, + string $url, + string $version, + int $iterations, + int $delayMs, + string $handler = 'curl' +): void { + // One client for the whole loop == SDK behavior == keep-alive pool reuse. + if ($handler === 'stream') { + // StreamHandler uses PHP's HTTP stream wrapper (no libcurl at all). + $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new StreamHandler())]); + } else { + // Explicit CurlHandler == what Guzzle picks by default == the SDK path. + $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); + } $times = []; $failures = []; @@ -128,12 +156,18 @@ function probe(string $label, string $url, string $version, int $iterations, int $max = $times ? max($times) : 0; printf( - "\n=== %s (version=%s, %d iterations, delay=%dms) ===\n", + "\n=== %s (handler=%s, version=%s, %d iterations, delay=%dms) ===\n", $label, + $handler, $version, $iterations, $delayMs ); + if ($handler === 'stream') { + // StreamHandler exposes no libcurl stats, so socket-reuse and per-backend + // IP columns below will read 0 / '?' — that's expected, not a finding. + echo " (note: stream handler reports no socket/IP stats — ignore those columns)\n"; + } $httpDist = []; foreach ($httpVersions as $v => $n) { $httpDist[] = 'http/' . $v . ' x' . $n; @@ -186,6 +220,17 @@ function probe(string $label, string $url, string $version, int $iterations, int probe("publish version=2", $publishUrl, '2', $iterations, $delayMs); probe("publish version=1.1", $publishUrl, '1.1', $iterations, $delayMs); + // Handler swap: run the SAME publish loop through the curl handler and then + // through the StreamHandler (PHP stream wrapper, no libcurl). The curl CLI + // control runs clean but uses a DIFFERENT libcurl+TLS stack than PHP, so it + // doesn't exonerate PHP's curl handler. This swap stays inside PHP and + // changes only the handler: if the every-10th hang VANISHES on the stream + // handler, it pins the fault to PHP's curl handler / its libcurl+OpenSSL + // build; if it persists on both, the cause is above libcurl (server/network). + echo "\n######## handler swap (publish, version=1.1) ########\n"; + probe("publish handler=curl", $publishUrl, '1.1', $iterations, $delayMs, 'curl'); + probe("publish handler=stream", $publishUrl, '1.1', $iterations, $delayMs, 'stream'); + // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists // regardless of spacing. From 24efecd55196ab3fe5d9d8a491e404fa1243336c Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 18:52:00 +0200 Subject: [PATCH 14/24] LB check --- scripts/diag-guzzle.php | 103 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index e3f761ad..ac506362 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -27,12 +27,28 @@ * Usage (from project root, keys exported as in the CI workflow): * php scripts/diag-guzzle.php * + * Finally it runs a VERBOSE capture (publish, curl handler, ~12 reused requests) + * that dumps libcurl's wire-level trace per request — the `< Connection:`, + * `< Keep-Alive:`, and `* Connection #N ... left intact / will close` lines — + * so we can see EXACTLY what the load balancer sends on the connection it later + * black-holes. The suite ran clean for months until the LB moved nginx -> + * Pingora; the prime suspect is now a server-side keep-alive change (e.g. a low + * `keepalive_requests` cap, or retiring the connection without a clean + * `Connection: close`) that PHP's curl reuse trips over on the ~10th request. + * Watch request #8/#9 (the retirement boundary) for the header the SDK ignores. + * + * Usage (from project root, keys exported as in the CI workflow): + * php scripts/diag-guzzle.php + * * Env: * PUBLISH_KEY, SUBSCRIBE_KEY (required for the publish probe) * DIAG_ITERATIONS (optional, default 200) * DIAG_DELAY_MS (optional, idle gap between requests in ms, * default 0; try 5000 to provoke idle-timeout * of pooled sockets) + * DIAG_VERBOSE_ITERS (optional, default 12 — how many requests the + * verbose capture fires to span the ~10th + * retirement boundary) */ require __DIR__ . '/../vendor/autoload.php'; @@ -47,6 +63,7 @@ $subscribeKey = getenv('SUBSCRIBE_KEY') ?: ''; $iterations = (int) (getenv('DIAG_ITERATIONS') ?: 200); $delayMs = (int) (getenv('DIAG_DELAY_MS') ?: 0); +$verboseIters = (int) (getenv('DIAG_VERBOSE_ITERS') ?: 12); if ($publishKey === '' || $subscribeKey === '') { fwrite(STDERR, "WARNING: PUBLISH_KEY/SUBSCRIBE_KEY not set — publish probe will be skipped.\n"); @@ -194,6 +211,87 @@ function probe( } } +/** + * Fire $iterations publishes over ONE reused curl connection with libcurl + * VERBOSE tracing on, and print, per request, only the lines that reveal how the + * load balancer manages the keep-alive connection: + * - `< Connection:` / `< Keep-Alive:` response headers (does Pingora announce a + * request cap or signal close?) + * - `* Connection #N ... left intact` vs `* Closing connection` (when libcurl + * decides to drop / retire the pooled socket) + * - `* Re-using existing connection` (proof the request rode a pooled socket) + * - any `* ... timed out` / error line on the hung request + * + * This is the smoking-gun capture for the nginx->Pingora hypothesis: watch the + * boundary request (~#8/#9) for a server header the SDK's pooled handle ignores, + * or a silent close that leaves the next reused request writing into a dead + * socket (-> 0 bytes -> 10s timeout). + * + * Uses Guzzle's raw 'curl' option passthrough (CurlHandler only) to set + * CURLOPT_VERBOSE + CURLOPT_STDERR at a temp stream we then read back. + */ +function captureVerbose(string $url, string $version, int $iterations): void +{ + printf("\n######## verbose wire capture (publish, curl, version=%s, %d reused requests) ########\n", $version, $iterations); + echo "Watch the ~#8/#9 boundary: what does the LB send right before the hang?\n"; + + // One client == one keep-alive pool, so requests reuse the same socket. + $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); + + // Only the lines worth reading from libcurl's (very chatty) verbose trace. + $keep = ['Connection', 'Keep-Alive', 'Re-using', 'Closing connection', 'left intact', + 'Connected to', 'timed out', 'Operation timed out', 'Empty reply', 'Connection died']; + + for ($i = 0; $i < $iterations; $i++) { + $trace = fopen('php://temp', 'w+'); + $start = microtime(true); + $status = '?'; + $err = null; + try { + $resp = $client->get($url, [ + RequestOptions::TIMEOUT => REQUEST_TIMEOUT, + RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, + 'version' => $version, + // CurlHandler passes these straight to the easy handle. + 'curl' => [ + CURLOPT_VERBOSE => true, + CURLOPT_STDERR => $trace, + ], + ]); + $status = (string) $resp->getStatusCode(); + } catch (\Throwable $e) { + $err = $e->getMessage(); + } + $elapsed = (int) ((microtime(true) - $start) * 1000); + + // Pull the interesting verbose lines out of the trace stream. + rewind($trace); + $raw = stream_get_contents($trace) ?: ''; + fclose($trace); + $lines = []; + foreach (explode("\n", $raw) as $line) { + $line = rtrim($line); + if ($line === '') { + continue; + } + // The TLS handshake line matches "Connection" but is per-request + // noise, not keep-alive signal — drop it. + if (stripos($line, 'SSL connection using') !== false) { + continue; + } + foreach ($keep as $needle) { + if (stripos($line, $needle) !== false) { + $lines[] = ' ' . $line; + break; + } + } + } + + printf(" #%d status=%s %dms%s\n", $i, $status, $elapsed, $err !== null ? " ERROR: $err" : ''); + echo $lines ? implode("\n", $lines) . "\n" : " (no keep-alive/connection lines in trace)\n"; + } +} + $timeUrl = "https://ps.pndsn.com/time/0"; echo "######## PHP Guzzle reuse diagnostic ########\n"; @@ -231,6 +329,11 @@ function probe( probe("publish handler=curl", $publishUrl, '1.1', $iterations, $delayMs, 'curl'); probe("publish handler=stream", $publishUrl, '1.1', $iterations, $delayMs, 'stream'); + // Verbose wire capture across the every-10th retirement boundary. This is the + // hand-to-ops evidence for the nginx->Pingora keep-alive hypothesis: it shows + // exactly what the LB sends on the connection right before it's black-holed. + captureVerbose($publishUrl, '1.1', $verboseIters); + // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists // regardless of spacing. From 35994573427ef85cb4546b2c62b341b4c53031f6 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 10:27:25 +0200 Subject: [PATCH 15/24] Reuse vs new-connection --- scripts/diag-guzzle.php | 56 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index ac506362..ca04728b 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -230,10 +230,24 @@ function probe( * Uses Guzzle's raw 'curl' option passthrough (CurlHandler only) to set * CURLOPT_VERBOSE + CURLOPT_STDERR at a temp stream we then read back. */ -function captureVerbose(string $url, string $version, int $iterations): void +function captureVerbose(string $url, string $version, int $iterations, int $gapMs = 0): void { - printf("\n######## verbose wire capture (publish, curl, version=%s, %d reused requests) ########\n", $version, $iterations); - echo "Watch the ~#8/#9 boundary: what does the LB send right before the hang?\n"; + printf( + "\n######## verbose wire capture (publish, curl, version=%s, %d requests, gap=%dms) ########\n", + $version, + $iterations, + $gapMs + ); + if ($gapMs > 0) { + // The point of the gap run: at gap=0 the every-10th request hangs; at + // gap=1000ms the loop is clean. The open question this answers is WHY — + // does the boundary request (~#9/#10) RE-USE the same socket (so the gap + // just keeps it healthy), or does the client CONNECT a NEW socket (so the + // gap lets a silent drop become visible and the client reconnects cleanly)? + echo "Question: at this gap, does the ~#9/#10 boundary REUSE the socket or open a NEW connection?\n"; + } else { + echo "Watch the ~#8/#9 boundary: what does the LB send right before the hang?\n"; + } // One client == one keep-alive pool, so requests reuse the same socket. $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); @@ -242,6 +256,9 @@ function captureVerbose(string $url, string $version, int $iterations): void $keep = ['Connection', 'Keep-Alive', 'Re-using', 'Closing connection', 'left intact', 'Connected to', 'timed out', 'Operation timed out', 'Empty reply', 'Connection died']; + $reused = 0; // requests that rode a pooled socket ("Re-using existing") + $newConns = 0; // requests that opened a fresh socket ("Connected to") + for ($i = 0; $i < $iterations; $i++) { $trace = fopen('php://temp', 'w+'); $start = microtime(true); @@ -268,6 +285,21 @@ function captureVerbose(string $url, string $version, int $iterations): void rewind($trace); $raw = stream_get_contents($trace) ?: ''; fclose($trace); + + // Classify reuse-vs-reconnect from the raw trace (independent of $keep + // filtering): "Re-using existing" == pooled socket; "Connected to" == a + // fresh TCP connect this request. This is the direct evidence for the gap + // question above. + $isReuse = stripos($raw, 'Re-using existing') !== false; + $isNew = stripos($raw, 'Connected to') !== false; + if ($isReuse) { + $reused++; + } + if ($isNew) { + $newConns++; + } + $conn = $isReuse ? 'REUSE' : ($isNew ? 'NEW-CONN' : '?'); + $lines = []; foreach (explode("\n", $raw) as $line) { $line = rtrim($line); @@ -287,9 +319,15 @@ function captureVerbose(string $url, string $version, int $iterations): void } } - printf(" #%d status=%s %dms%s\n", $i, $status, $elapsed, $err !== null ? " ERROR: $err" : ''); + printf(" #%d [%s] status=%s %dms%s\n", $i, $conn, $status, $elapsed, $err !== null ? " ERROR: $err" : ''); echo $lines ? implode("\n", $lines) . "\n" : " (no keep-alive/connection lines in trace)\n"; + + if ($gapMs > 0) { + usleep($gapMs * 1000); + } } + + printf(" -- summary: reused=%d new-connections=%d (of %d) --\n", $reused, $newConns, $iterations); } $timeUrl = "https://ps.pndsn.com/time/0"; @@ -332,7 +370,15 @@ function captureVerbose(string $url, string $version, int $iterations): void // Verbose wire capture across the every-10th retirement boundary. This is the // hand-to-ops evidence for the nginx->Pingora keep-alive hypothesis: it shows // exactly what the LB sends on the connection right before it's black-holed. - captureVerbose($publishUrl, '1.1', $verboseIters); + captureVerbose($publishUrl, '1.1', $verboseIters, 0); + + // Same capture WITH a 1000ms gap (the spacing that makes the loop clean). The + // per-request REUSE/NEW-CONN tag + summary answers: does the gap keep ONE + // socket alive across the boundary (all REUSE), or does it let a silent drop + // surface so the client reconnects (a NEW-CONN at ~#9/#10)? We currently have + // NO direct evidence either way — distinct-sockets=1 can't tell reuse from a + // same-port reconnect. This settles it. + captureVerbose($publishUrl, '1.1', $verboseIters, 1000); // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists From d658b9a3b9e9803de0f8655513790043316b47d6 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 11:55:28 +0200 Subject: [PATCH 16/24] run diag-curl.sh --- .github/workflows/run-tests.yml | 12 ++++ scripts/diag-curl.sh | 98 +++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100755 scripts/diag-curl.sh diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index f2ea5495..c3a18dc5 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -113,6 +113,18 @@ jobs: grep -E 'http_code=(000|4|5)|total=[1-9]' "$pub_log" || echo "(none — all fast 200s)" echo "--- slowest 5 ---" sort -t= -k3 -rn "$pub_log" | head -5 || true + # TEMPORARY: raw `curl` CLI control that reuses ONE keep-alive connection + # across many publishes (one curl process, N urls via -K). This is the + # PHP-free reproducer for the server team: if it hangs every ~10th here on + # the dense CI runner, the every-10th black-hole is reproducible with no + # PHP/Guzzle/SDK involvement at all. Locally (high RTT) it ran 200/200 + # clean — this step tests it on the same runner/pool that fails for Guzzle. + # Remove once the timeout cause is confirmed/fixed. + - name: Diagnose connection reuse (raw curl CLI) + continue-on-error: true + run: | + curl --version | head -1 + DIAG_ITERATIONS=200 ./scripts/diag-curl.sh # TEMPORARY: PHP-level probe that REUSES one Guzzle client across many # requests, exactly like the SDK does for a whole PHPUnit run. The curl # probes above open a fresh connection each time and so cannot reproduce a diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh new file mode 100755 index 00000000..40f212d6 --- /dev/null +++ b/scripts/diag-curl.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# +# TEMPORARY diagnostic: reproduce the "every ~10th publish hangs 10s / 0 bytes" +# issue using the raw `curl` CLI instead of PHP/Guzzle. +# +# This is a CONTROL for scripts/diag-guzzle.php. The only way to reproduce the +# hang is to REUSE ONE keep-alive connection across many requests. A fresh curl +# per request opens a new socket every time and never reproduces it -- that is +# why the earlier per-request curl probes looked healthy. +# +# curl reuses the connection when you give it MANY URLs in ONE invocation, so +# this fires all N publishes down a single connection (like the SDK does). +# +# Usage (from project root): +# set -a; source .env.dev; set +a +# ./scripts/diag-curl.sh +# +# Env: +# PUBLISH_KEY, SUBSCRIBE_KEY (required) +# DIAG_ITERATIONS (optional, default 200) +# +# Output per request line: s +# - http_code 000 = no response (the hang / cURL error 28) +# - repeated local_port = same reused socket (keep-alive working) +# - a new local_port = curl had to reopen after a failure + +set -u + +PUBLISH_KEY="${PUBLISH_KEY:-}" +SUBSCRIBE_KEY="${SUBSCRIBE_KEY:-}" +ITER="${DIAG_ITERATIONS:-200}" + +if [[ -z "$PUBLISH_KEY" || -z "$SUBSCRIBE_KEY" ]]; then + echo "ERROR: set PUBLISH_KEY and SUBSCRIBE_KEY (e.g. 'set -a; source .env.dev; set +a')" >&2 + exit 1 +fi + +URL="https://ps.pndsn.com/publish/${PUBLISH_KEY}/${SUBSCRIBE_KEY}/0/diag-curl/0/%22x%22?uuid=diag-curl" + +# Build a curl-config file with N url entries -> ONE curl process, ONE connection. +CFG="$(mktemp)" +trap 'rm -f "$CFG"' EXIT +for ((i = 0; i < ITER; i++)); do + printf 'url = "%s"\n' "$URL" >> "$CFG" +done + +run_probe() { + local label="$1"; shift + echo "" + echo "=== ${label} (${ITER} iterations) ===" + + # -K $CFG : all N URLs, reusing the connection + # --max-time 12 : per-request ceiling (>10s server timeout so the hang shows as 000) + # -s -o /dev/null : discard bodies + # -w ...\n : one machine-readable line per request + # Fields: http_code | time_total | local_port | remote_ip + curl "$@" -K "$CFG" \ + --max-time 12 \ + -s -o /dev/null \ + -w '%{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + | awk ' + { + idx = NR - 1 + code = $1; t = $2; port = $3; ip = $4; nconn = $5 + printf " #%-3d code=%s time=%ss port=%s ip=%s conns=%s\n", idx, code, t, port, ip, nconn + total++ + last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened + ipseen[ip]++ + if (code == "000" || code == "" ) { # hang / no response + fail++; failidx = failidx (failidx=="" ? "" : ",") idx + ipfail[ip]++ + } else { + ok++; ipok[ip]++ + } + if (port != lastport && NR > 1) sockets++ + portseen[port]=1 + lastport = port + } + END { + ndist = 0; for (p in portseen) ndist++ + printf "\n ok=%d fail=%d distinct-sockets=%d tcp-connects-opened=%d (total=%d)\n", ok+0, fail+0, ndist, last_nconn, total + printf " (reuse proof: tcp-connects-opened << total => keep-alive working; ideally 1 + one reconnect per hang)\n" + printf " per-backend (ok/fail):" + for (ip in ipseen) printf " %s=%d/%d", ip, ipok[ip]+0, ipfail[ip]+0 + printf "\n" + if (fail+0 > 0) printf " failed-indices: %s\n", failidx + else printf " (no failures)\n" + }' +} + +echo "######## raw curl reuse diagnostic ########" +echo "One curl process reuses one keep-alive connection across ${ITER} publishes." +echo "Compare against scripts/diag-guzzle.php (the PHP/Guzzle path)." + +run_probe "publish (HTTP/1.1)" --http1.1 + +echo "" +echo "######## done ########" \ No newline at end of file From 9cd2f489dadbf73d6c59048520ec8b389d1b7706 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 12:37:33 +0200 Subject: [PATCH 17/24] run diag-curl.sh --- scripts/diag-curl.sh | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh index 40f212d6..8109ca8e 100755 --- a/scripts/diag-curl.sh +++ b/scripts/diag-curl.sh @@ -21,6 +21,9 @@ # # Output per request line: s # - http_code 000 = no response (the hang / cURL error 28) +# - http_code 429 = Balancer rate limit (fast) — curl reports these every +# ~10th publish; the PHP/Guzzle path black-holes (000/10s) +# at the same cadence instead. See the investigation doc. # - repeated local_port = same reused socket (keep-alive working) # - a new local_port = curl had to reopen after a failure @@ -53,22 +56,32 @@ run_probe() { # --max-time 12 : per-request ceiling (>10s server timeout so the hang shows as 000) # -s -o /dev/null : discard bodies # -w ...\n : one machine-readable line per request - # Fields: http_code | time_total | local_port | remote_ip + # + # CRITICAL: prefix the -w line with a unique sentinel ("@@STAT@@") and parse ONLY + # sentinel lines. With -K and many URLs, response bodies can still reach stdout and + # merge with the metadata line (an earlier version miscounted every-10th HTTP 429s + # as ok=200, e.g. `code={"status":429...`). Parsing on the sentinel makes the field + # positions reliable regardless of body content. + # Fields: http_code | time_total | local_port | remote_ip | num_connects curl "$@" -K "$CFG" \ --max-time 12 \ -s -o /dev/null \ - -w '%{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + -w '@@STAT@@ %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ | awk ' + $1 != "@@STAT@@" { next } # ignore any body text that leaked onto stdout { - idx = NR - 1 - code = $1; t = $2; port = $3; ip = $4; nconn = $5 + idx = total # 0-based request index among sentinel lines + code = $2; t = $3; port = $4; ip = $5; nconn = $6 printf " #%-3d code=%s time=%ss port=%s ip=%s conns=%s\n", idx, code, t, port, ip, nconn total++ - last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened + last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened ipseen[ip]++ - if (code == "000" || code == "" ) { # hang / no response - fail++; failidx = failidx (failidx=="" ? "" : ",") idx + if (code == "000" || code == "") { # hang / no response (cURL 28) + hang++; hangidx = hangidx (hangidx=="" ? "" : ",") idx ipfail[ip]++ + } else if (code == "429") { # Balancer rate limit (fast) + rl++; rlidx = rlidx (rlidx=="" ? "" : ",") idx + ipok[ip]++ # server answered fast; not a hang } else { ok++; ipok[ip]++ } @@ -78,13 +91,15 @@ run_probe() { } END { ndist = 0; for (p in portseen) ndist++ - printf "\n ok=%d fail=%d distinct-sockets=%d tcp-connects-opened=%d (total=%d)\n", ok+0, fail+0, ndist, last_nconn, total + printf "\n ok=%d rate-limited(429)=%d hung(000)=%d distinct-sockets=%d tcp-connects-opened=%d (total=%d)\n", \ + ok+0, rl+0, hang+0, ndist, last_nconn, total printf " (reuse proof: tcp-connects-opened << total => keep-alive working; ideally 1 + one reconnect per hang)\n" - printf " per-backend (ok/fail):" + printf " per-backend (non-hang/hung):" for (ip in ipseen) printf " %s=%d/%d", ip, ipok[ip]+0, ipfail[ip]+0 printf "\n" - if (fail+0 > 0) printf " failed-indices: %s\n", failidx - else printf " (no failures)\n" + if (rl+0 > 0) printf " rate-limited(429) indices: %s\n", rlidx + if (hang+0 > 0) printf " hung(000) indices: %s\n", hangidx + if (rl+0 == 0 && hang+0 == 0) printf " (no 429s, no hangs)\n" }' } From 06f0bc79994eb4e1a391132aa8431b5fa3418189 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 13:01:18 +0200 Subject: [PATCH 18/24] run diag-curl.sh --- scripts/diag-curl.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh index 8109ca8e..a5ce242d 100755 --- a/scripts/diag-curl.sh +++ b/scripts/diag-curl.sh @@ -57,18 +57,21 @@ run_probe() { # -s -o /dev/null : discard bodies # -w ...\n : one machine-readable line per request # - # CRITICAL: prefix the -w line with a unique sentinel ("@@STAT@@") and parse ONLY + # CRITICAL: prefix the -w line with a unique sentinel ("STAT::") and parse ONLY # sentinel lines. With -K and many URLs, response bodies can still reach stdout and # merge with the metadata line (an earlier version miscounted every-10th HTTP 429s # as ok=200, e.g. `code={"status":429...`). Parsing on the sentinel makes the field # positions reliable regardless of body content. + # NOTE: the sentinel must NOT start with '@' — curl reads a -w value beginning with + # '@' as a FILENAME (`-w @file`), which fails with "error encountered when reading a + # file". So use "STAT::", not "@@STAT@@". # Fields: http_code | time_total | local_port | remote_ip | num_connects curl "$@" -K "$CFG" \ --max-time 12 \ -s -o /dev/null \ - -w '@@STAT@@ %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + -w 'STAT:: %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ | awk ' - $1 != "@@STAT@@" { next } # ignore any body text that leaked onto stdout + $1 != "STAT::" { next } # ignore any body text that leaked onto stdout { idx = total # 0-based request index among sentinel lines code = $2; t = $3; port = $4; ip = $5; nconn = $6 From 025931e3faf68758e49b3d2c451728c81a8de0db Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 13:18:58 +0200 Subject: [PATCH 19/24] run diag-curl.sh --- scripts/diag-curl.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh index a5ce242d..cfa3155f 100755 --- a/scripts/diag-curl.sh +++ b/scripts/diag-curl.sh @@ -65,16 +65,28 @@ run_probe() { # NOTE: the sentinel must NOT start with '@' — curl reads a -w value beginning with # '@' as a FILENAME (`-w @file`), which fails with "error encountered when reading a # file". So use "STAT::", not "@@STAT@@". - # Fields: http_code | time_total | local_port | remote_ip | num_connects + # + # ROBUST PARSING: -o /dev/null does NOT reliably suppress bodies on this -K + # multi-URL path (the response body still reaches stdout), AND the body often has + # no trailing newline — so the NEXT request's "STAT::" line gets glued onto the end + # of the body line. A naive `$1=="STAT::"` then matches only the very first line + # (total=1 bug). Fix: lead the format with \n (so each STAT starts fresh where + # possible) and use awk match() to find "STAT::" ANYWHERE on the line, parsing the + # whitespace-separated fields that follow it. Body text before the sentinel is + # discarded. + # Fields after STAT:: -> http_code | time_total | local_port | remote_ip | num_connects curl "$@" -K "$CFG" \ --max-time 12 \ -s -o /dev/null \ - -w 'STAT:: %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + -w '\nSTAT:: %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ | awk ' - $1 != "STAT::" { next } # ignore any body text that leaked onto stdout { + p = index($0, "STAT:: ") + if (p == 0) next # body text with no sentinel — ignore + n = split(substr($0, p + 7), f, /[ \t]+/) + if (n < 5) next # malformed / truncated sentinel — skip + code = f[1]; t = f[2]; port = f[3]; ip = f[4]; nconn = f[5] idx = total # 0-based request index among sentinel lines - code = $2; t = $3; port = $4; ip = $5; nconn = $6 printf " #%-3d code=%s time=%ss port=%s ip=%s conns=%s\n", idx, code, t, port, ip, nconn total++ last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened From aed3291e465abdcc22d19e2e6a312361fdf5b253 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 14:02:04 +0200 Subject: [PATCH 20/24] Increased requestTimeout to 15sec to get 429 --- scripts/diag-guzzle.php | 41 +++++++++++++++++++++++++++++++++++----- tests/PubNubTestCase.php | 12 ++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index ca04728b..178d5e4b 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -69,8 +69,13 @@ fwrite(STDERR, "WARNING: PUBLISH_KEY/SUBSCRIBE_KEY not set — publish probe will be skipped.\n"); } -// Mirror the SDK's timeout defaults (PNConfiguration: 10s request, 10s connect). -const REQUEST_TIMEOUT = 10; +// Request timeout RAISED to 15s (was 10s, the SDK default). The Balancer holds a +// rate-limited publish ~10s before returning a 429; at the old 10s timeout the SDK +// aborted (cURL 28 / 0 bytes) ~8ms before the 429 arrived. 15s lets the response +// land — so this run now CONFIRMS the mechanism: the every-10th request should show +// up as an HTTP 429 (or a slow ~10s success) instead of a timeout. Mirrors raw +// curl's --max-time 12 and the tests' setNonSubscribeRequestTimeout(15). +const REQUEST_TIMEOUT = 15; const CONNECT_TIMEOUT = 10; /** @@ -109,6 +114,7 @@ function probe( $httpVersions = []; // negotiated protocol distribution (from PSR-7) $ipOk = []; // backend IP => count of successful requests $ipFail = []; // backend IP => count of timed-out requests + $statusCodes = []; // HTTP status => count (429s now visible, not thrown) for ($i = 0; $i < $iterations; $i++) { $start = microtime(true); @@ -118,6 +124,11 @@ function probe( RequestOptions::TIMEOUT => REQUEST_TIMEOUT, RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, 'version' => $version, + // Do NOT throw on 4xx/5xx — we WANT to observe the Balancer's 429 + // as a normal response. With http_errors on (Guzzle default) a 429 + // throws ClientException and gets lumped into the opaque failure + // bucket; off, we can tally it by status code below. + RequestOptions::HTTP_ERRORS => false, // Detect socket reuse via local_port (num_connects is absent in // some libcurl builds). NOTE: do NOT trust the stat's // 'http_version' field — it echoes the REQUESTED version, not the @@ -145,6 +156,13 @@ function probe( $hv = $response->getProtocolVersion(); $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; + // Tally status code — every 10th should now be 429 (~10s), the rest 200. + $code = $response->getStatusCode(); + $statusCodes[$code] = ($statusCodes[$code] ?? 0) + 1; + if ($code === 429) { + $failedIndices[] = $i; // record the cadence even though it didn't throw + } + $ip = $curlInfo['ip'] ?? '?'; $ipOk[$ip] = ($ipOk[$ip] ?? 0) + 1; } catch (\Throwable $e) { @@ -190,6 +208,15 @@ function probe( $httpDist[] = 'http/' . $v . ' x' . $n; } printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); + // Status-code distribution: with http_errors off and a 15s timeout, the + // every-10th rate-limited publish now shows as `429 x20` here instead of a + // timeout — the confirmation that the "hang" was a delayed 429 all along. + ksort($statusCodes); + $codeDist = []; + foreach ($statusCodes as $c => $n) { + $codeDist[] = $c . ' x' . $n; + } + printf(" status-codes: %s\n", implode(' ', $codeDist) ?: '(none — all threw)'); printf(" negotiated-protocol (PSR-7): %s\n", implode(' ', $httpDist) ?: '(n/a)'); printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); // Per-backend breakdown: if the failures concentrate on one (or few) IPs @@ -203,11 +230,15 @@ function probe( printf(" %s=%d/%d", $ip, $ipOk[$ip] ?? 0, $ipFail[$ip] ?? 0); } echo "\n"; + // $failedIndices now holds both thrown failures (timeouts) AND 429s observed as + // normal responses. Label it as the every-10th cadence rather than "failures". if (!empty($failedIndices)) { - echo " failed-indices: " . implode(',', $failedIndices) . "\n"; - echo implode("\n", $failures) . "\n"; + echo " rate-limited/failed indices: " . implode(',', $failedIndices) . "\n"; + if (!empty($failures)) { + echo implode("\n", $failures) . "\n"; + } } else { - echo " (no failures)\n"; + echo " (no 429s, no failures)\n"; } } diff --git a/tests/PubNubTestCase.php b/tests/PubNubTestCase.php index 7f8f44cc..feb76ea1 100755 --- a/tests/PubNubTestCase.php +++ b/tests/PubNubTestCase.php @@ -82,27 +82,39 @@ public function setUp(): void parent::setUp(); + // The Balancer rate-limits dense publish bursts by holding the connection + // ~10s before returning a 429 (see .claude/features/cicd-publish-timeout- + // investigation.md). The SDK's default 10s non-subscribe timeout fires ~8ms + // before that 429 arrives, surfacing as `cURL error 28 / 0 bytes`. Raising + // the timeout to 15s lets the SDK actually receive the 429 instead of timing + // out — mirroring what raw curl (--max-time 12) does. + $nonSubscribeRequestTimeout = 15; + $this->config = new PNConfiguration(); $this->config->setSubscribeKey($subscribeKey); $this->config->setPublishKey($publishKey); $this->config->setUuid($uuidMock); + $this->config->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_enc = new PNConfiguration(); $this->config_enc->setSubscribeKey($subscribeKey); $this->config_enc->setPublishKey($publishKey); $this->config_enc->setCipherKey(static::CIPHER_KEY); $this->config_enc->setUuid($uuidMock); + $this->config_enc->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_pam = new PNConfiguration(); $this->config_pam->setSubscribeKey($subscribeKeyPam); $this->config_pam->setPublishKey($publishKeyPam); $this->config_pam->setSecretKey($secretKeyPam); $this->config_pam->setUuid($uuidMock); + $this->config_pam->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_demo = new PNConfiguration(); $this->config_demo->setSubscribeKey('demo'); $this->config_demo->setPublishKey('demo'); $this->config_demo->setUuid($uuidMock); + $this->config_demo->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_demo->disableImmutableCheck(); $this->pubnub = new PubNub($this->config); From 55a7733d72ca63b871e554688da9a6157e60bf13 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 18:23:25 +0200 Subject: [PATCH 21/24] Enable logging --- src/PubNub/Endpoints/Endpoint.php | 38 ++++++++++++++++++++++++++----- tests/PubNubTestCase.php | 5 +++- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/PubNub/Endpoints/Endpoint.php b/src/PubNub/Endpoints/Endpoint.php index f6dcbb53..0e55e706 100755 --- a/src/PubNub/Endpoints/Endpoint.php +++ b/src/PubNub/Endpoints/Endpoint.php @@ -20,6 +20,7 @@ use Psr\Http\Client\ClientExceptionInterface; use Psr\Http\Message\RequestInterface; use Psr\Http\Message\ResponseInterface; +use Psr\Http\Message\UriInterface; abstract class Endpoint { @@ -356,12 +357,13 @@ protected function sendRequest(RequestInterface $request): PNEnvelope } else { $response = $client->sendRequest($request); } -// $this->pubnub->getLogger()->debug(sprintf( -// "%s response from %s negotiated HTTP/%s", -// $this->getName(), -// $request->getUri()->getHost(), -// $response->getProtocolVersion() -// )); + $this->pubnub->getLogger()->debug(sprintf( + "%s %s response from %s negotiated HTTP/%s", + $this->getName(), + $this->redactUri($request->getUri()), + $request->getUri()->getHost(), + $response->getProtocolVersion() + )); $envelope = $this->parseResponse($response); } catch (NetworkExceptionInterface $exception) { return new PNEnvelope(null, $this->createStatus( @@ -409,6 +411,30 @@ protected function sendRequest(RequestInterface $request): PNEnvelope return $envelope; } + /** + * Build a log-safe URI string with sensitive query params masked. + * + * Masks the PAM `auth` token and request `signature` so they don't leak + * into debug logs. All other params (and the pub/sub keys in the path) + * are kept to preserve diagnostic value. + */ + protected function redactUri(UriInterface $uri): string + { + $query = $uri->getQuery(); + if ($query === '') { + return (string) $uri; + } + + parse_str($query, $params); + foreach (['auth', 'signature'] as $sensitive) { + if (isset($params[$sensitive]) && $params[$sensitive] !== '') { + $params[$sensitive] = 'REDACTED'; + } + } + + return (string) $uri->withQuery(http_build_query($params)); + } + /** * @param ResponseInterface $response * @return PNEnvelope diff --git a/tests/PubNubTestCase.php b/tests/PubNubTestCase.php index feb76ea1..fa1306ae 100755 --- a/tests/PubNubTestCase.php +++ b/tests/PubNubTestCase.php @@ -78,7 +78,10 @@ public function setUp(): void $uuidMock = getenv("UUID_MOCK") ?: "UUID_MOCK"; $logger = new Logger('PubNub'); - $logger->pushHandler(new ErrorLogHandler()); + // Log at DEBUG so the SDK's per-request debug lines (endpoint, redacted + // URL, negotiated HTTP version — see Endpoint::sendRequest) surface in + // the test output. Logger::DEBUG is valid in both Monolog 2 and 3. + $logger->pushHandler(new ErrorLogHandler(ErrorLogHandler::OPERATING_SYSTEM, Logger::DEBUG)); parent::setUp(); From d403c946600a3857a3a1d6c0143828fb5929f73c Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Sat, 20 Jun 2026 11:51:10 +0200 Subject: [PATCH 22/24] Cleaning --- .github/workflows/run-tests.yml | 80 ------ scripts/diag-curl.sh | 128 --------- scripts/diag-guzzle.php | 423 ------------------------------ src/PubNub/Endpoints/Endpoint.php | 32 +-- tests/PubNubTestCase.php | 18 +- tests/integrational/FilesTest.php | 12 +- 6 files changed, 15 insertions(+), 678 deletions(-) delete mode 100755 scripts/diag-curl.sh delete mode 100644 scripts/diag-guzzle.php diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index c3a18dc5..bd760bb3 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -60,86 +60,6 @@ jobs: run: | composer self-update && composer --version composer install --prefer-dist - # TEMPORARY: diagnose intermittent `cURL error 28` timeouts to ps.pndsn.com - # from the self-hosted runner. Remove once the runner egress issue is resolved. - - name: Diagnose ps.pndsn.com connectivity - continue-on-error: true - run: | - echo "=== Runner public egress IP ===" - curl -s --max-time 15 https://api.ipify.org || echo "(could not determine egress IP)" - echo - echo "=== DNS resolution for ps.pndsn.com ===" - getent hosts ps.pndsn.com || nslookup ps.pndsn.com || true - echo - echo "=== Proxy-related environment ===" - env | grep -iE 'proxy' || echo "(no proxy env vars set)" - echo - echo "=== 10x probe of https://ps.pndsn.com/time/0 (catch intermittent hang) ===" - for i in $(seq 1 10); do - echo "--- attempt $i ---" - curl -sS -o /dev/null --max-time 15 \ - -w 'http_code=%{http_code} dns=%{time_namelookup}s connect=%{time_connect}s tls=%{time_appconnect}s total=%{time_total}s\n' \ - "https://ps.pndsn.com/time/0" \ - || echo "attempt $i FAILED (curl exit $?)" - done - echo - echo "=== Burst probe A: 200x time/0, 20 concurrent (keyless baseline) ===" - # Cheap, unauthenticated endpoint. Establishes that the network path is fine - # under concurrent load. time/0 is NOT subject to per-key publish throttling. - burst_log="$(mktemp)" - seq 1 200 | xargs -P 20 -I {} \ - curl -sS -o /dev/null --max-time 15 \ - -w 'http_code=%{http_code} total=%{time_total}s\n' \ - "https://ps.pndsn.com/time/0" \ - >> "$burst_log" 2>&1 || true - echo "--- HTTP status code distribution ---" - grep -oE 'http_code=[0-9]+' "$burst_log" | sort | uniq -c || true - echo "--- slowest 5 ---" - sort -t= -k3 -rn "$burst_log" | head -5 || true - echo - echo "=== Burst probe B: 200x PUBLISH, 20 concurrent (the actual failing op) ===" - # Publishes to the real keyset — the same operation that times out in CI. - # If THIS bursts shows 429 / 000 / ~10s stalls while probe A stayed fast, - # the cause is per-key publish throttling, not the network or the runner. - pub_log="$(mktemp)" - seq 1 200 | xargs -P 20 -I {} \ - curl -sS -o /dev/null --max-time 15 \ - -w 'http_code=%{http_code} total=%{time_total}s\n' \ - "https://ps.pndsn.com/publish/$PUBLISH_KEY/$SUBSCRIBE_KEY/0/diag-burst/0/%22x%22?uuid=diag-burst" \ - >> "$pub_log" 2>&1 || true - echo "--- HTTP status code distribution ---" - grep -oE 'http_code=[0-9]+' "$pub_log" | sort | uniq -c || true - echo "--- non-200 / slow (>1s) responses ---" - grep -E 'http_code=(000|4|5)|total=[1-9]' "$pub_log" || echo "(none — all fast 200s)" - echo "--- slowest 5 ---" - sort -t= -k3 -rn "$pub_log" | head -5 || true - # TEMPORARY: raw `curl` CLI control that reuses ONE keep-alive connection - # across many publishes (one curl process, N urls via -K). This is the - # PHP-free reproducer for the server team: if it hangs every ~10th here on - # the dense CI runner, the every-10th black-hole is reproducible with no - # PHP/Guzzle/SDK involvement at all. Locally (high RTT) it ran 200/200 - # clean — this step tests it on the same runner/pool that fails for Guzzle. - # Remove once the timeout cause is confirmed/fixed. - - name: Diagnose connection reuse (raw curl CLI) - continue-on-error: true - run: | - curl --version | head -1 - DIAG_ITERATIONS=200 ./scripts/diag-curl.sh - # TEMPORARY: PHP-level probe that REUSES one Guzzle client across many - # requests, exactly like the SDK does for a whole PHPUnit run. The curl - # probes above open a fresh connection each time and so cannot reproduce a - # stale/half-closed keep-alive socket — the prime suspect for - # "connected fine, then 0 bytes received after 10s" (cURL error 28). - # DIAG_DELAY_MS inserts an idle gap to provoke LB idle-timeout of pooled - # sockets. Remove once the timeout cause is confirmed/fixed. - - name: Diagnose Guzzle connection reuse (PHP) - continue-on-error: true - run: | - # Reuses one Guzzle client like the SDK. The keyless time/0 baseline - # should be 0 failures; the publish run reproduces the cURL-28 timeout. - # The built-in spacing sweep then tells us whether spacing publishes - # further apart makes the failures disappear (== server rate limit). - DIAG_ITERATIONS=200 DIAG_DELAY_MS=0 php scripts/diag-guzzle.php - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh deleted file mode 100755 index cfa3155f..00000000 --- a/scripts/diag-curl.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env bash -# -# TEMPORARY diagnostic: reproduce the "every ~10th publish hangs 10s / 0 bytes" -# issue using the raw `curl` CLI instead of PHP/Guzzle. -# -# This is a CONTROL for scripts/diag-guzzle.php. The only way to reproduce the -# hang is to REUSE ONE keep-alive connection across many requests. A fresh curl -# per request opens a new socket every time and never reproduces it -- that is -# why the earlier per-request curl probes looked healthy. -# -# curl reuses the connection when you give it MANY URLs in ONE invocation, so -# this fires all N publishes down a single connection (like the SDK does). -# -# Usage (from project root): -# set -a; source .env.dev; set +a -# ./scripts/diag-curl.sh -# -# Env: -# PUBLISH_KEY, SUBSCRIBE_KEY (required) -# DIAG_ITERATIONS (optional, default 200) -# -# Output per request line: s -# - http_code 000 = no response (the hang / cURL error 28) -# - http_code 429 = Balancer rate limit (fast) — curl reports these every -# ~10th publish; the PHP/Guzzle path black-holes (000/10s) -# at the same cadence instead. See the investigation doc. -# - repeated local_port = same reused socket (keep-alive working) -# - a new local_port = curl had to reopen after a failure - -set -u - -PUBLISH_KEY="${PUBLISH_KEY:-}" -SUBSCRIBE_KEY="${SUBSCRIBE_KEY:-}" -ITER="${DIAG_ITERATIONS:-200}" - -if [[ -z "$PUBLISH_KEY" || -z "$SUBSCRIBE_KEY" ]]; then - echo "ERROR: set PUBLISH_KEY and SUBSCRIBE_KEY (e.g. 'set -a; source .env.dev; set +a')" >&2 - exit 1 -fi - -URL="https://ps.pndsn.com/publish/${PUBLISH_KEY}/${SUBSCRIBE_KEY}/0/diag-curl/0/%22x%22?uuid=diag-curl" - -# Build a curl-config file with N url entries -> ONE curl process, ONE connection. -CFG="$(mktemp)" -trap 'rm -f "$CFG"' EXIT -for ((i = 0; i < ITER; i++)); do - printf 'url = "%s"\n' "$URL" >> "$CFG" -done - -run_probe() { - local label="$1"; shift - echo "" - echo "=== ${label} (${ITER} iterations) ===" - - # -K $CFG : all N URLs, reusing the connection - # --max-time 12 : per-request ceiling (>10s server timeout so the hang shows as 000) - # -s -o /dev/null : discard bodies - # -w ...\n : one machine-readable line per request - # - # CRITICAL: prefix the -w line with a unique sentinel ("STAT::") and parse ONLY - # sentinel lines. With -K and many URLs, response bodies can still reach stdout and - # merge with the metadata line (an earlier version miscounted every-10th HTTP 429s - # as ok=200, e.g. `code={"status":429...`). Parsing on the sentinel makes the field - # positions reliable regardless of body content. - # NOTE: the sentinel must NOT start with '@' — curl reads a -w value beginning with - # '@' as a FILENAME (`-w @file`), which fails with "error encountered when reading a - # file". So use "STAT::", not "@@STAT@@". - # - # ROBUST PARSING: -o /dev/null does NOT reliably suppress bodies on this -K - # multi-URL path (the response body still reaches stdout), AND the body often has - # no trailing newline — so the NEXT request's "STAT::" line gets glued onto the end - # of the body line. A naive `$1=="STAT::"` then matches only the very first line - # (total=1 bug). Fix: lead the format with \n (so each STAT starts fresh where - # possible) and use awk match() to find "STAT::" ANYWHERE on the line, parsing the - # whitespace-separated fields that follow it. Body text before the sentinel is - # discarded. - # Fields after STAT:: -> http_code | time_total | local_port | remote_ip | num_connects - curl "$@" -K "$CFG" \ - --max-time 12 \ - -s -o /dev/null \ - -w '\nSTAT:: %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ - | awk ' - { - p = index($0, "STAT:: ") - if (p == 0) next # body text with no sentinel — ignore - n = split(substr($0, p + 7), f, /[ \t]+/) - if (n < 5) next # malformed / truncated sentinel — skip - code = f[1]; t = f[2]; port = f[3]; ip = f[4]; nconn = f[5] - idx = total # 0-based request index among sentinel lines - printf " #%-3d code=%s time=%ss port=%s ip=%s conns=%s\n", idx, code, t, port, ip, nconn - total++ - last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened - ipseen[ip]++ - if (code == "000" || code == "") { # hang / no response (cURL 28) - hang++; hangidx = hangidx (hangidx=="" ? "" : ",") idx - ipfail[ip]++ - } else if (code == "429") { # Balancer rate limit (fast) - rl++; rlidx = rlidx (rlidx=="" ? "" : ",") idx - ipok[ip]++ # server answered fast; not a hang - } else { - ok++; ipok[ip]++ - } - if (port != lastport && NR > 1) sockets++ - portseen[port]=1 - lastport = port - } - END { - ndist = 0; for (p in portseen) ndist++ - printf "\n ok=%d rate-limited(429)=%d hung(000)=%d distinct-sockets=%d tcp-connects-opened=%d (total=%d)\n", \ - ok+0, rl+0, hang+0, ndist, last_nconn, total - printf " (reuse proof: tcp-connects-opened << total => keep-alive working; ideally 1 + one reconnect per hang)\n" - printf " per-backend (non-hang/hung):" - for (ip in ipseen) printf " %s=%d/%d", ip, ipok[ip]+0, ipfail[ip]+0 - printf "\n" - if (rl+0 > 0) printf " rate-limited(429) indices: %s\n", rlidx - if (hang+0 > 0) printf " hung(000) indices: %s\n", hangidx - if (rl+0 == 0 && hang+0 == 0) printf " (no 429s, no hangs)\n" - }' -} - -echo "######## raw curl reuse diagnostic ########" -echo "One curl process reuses one keep-alive connection across ${ITER} publishes." -echo "Compare against scripts/diag-guzzle.php (the PHP/Guzzle path)." - -run_probe "publish (HTTP/1.1)" --http1.1 - -echo "" -echo "######## done ########" \ No newline at end of file diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php deleted file mode 100644 index 178d5e4b..00000000 --- a/scripts/diag-guzzle.php +++ /dev/null @@ -1,423 +0,0 @@ - - * Pingora; the prime suspect is now a server-side keep-alive change (e.g. a low - * `keepalive_requests` cap, or retiring the connection without a clean - * `Connection: close`) that PHP's curl reuse trips over on the ~10th request. - * Watch request #8/#9 (the retirement boundary) for the header the SDK ignores. - * - * Usage (from project root, keys exported as in the CI workflow): - * php scripts/diag-guzzle.php - * - * Env: - * PUBLISH_KEY, SUBSCRIBE_KEY (required for the publish probe) - * DIAG_ITERATIONS (optional, default 200) - * DIAG_DELAY_MS (optional, idle gap between requests in ms, - * default 0; try 5000 to provoke idle-timeout - * of pooled sockets) - * DIAG_VERBOSE_ITERS (optional, default 12 — how many requests the - * verbose capture fires to span the ~10th - * retirement boundary) - */ - -require __DIR__ . '/../vendor/autoload.php'; - -use GuzzleHttp\Client as GuzzleHttpClient; -use GuzzleHttp\HandlerStack; -use GuzzleHttp\Handler\CurlHandler; -use GuzzleHttp\Handler\StreamHandler; -use GuzzleHttp\RequestOptions; - -$publishKey = getenv('PUBLISH_KEY') ?: ''; -$subscribeKey = getenv('SUBSCRIBE_KEY') ?: ''; -$iterations = (int) (getenv('DIAG_ITERATIONS') ?: 200); -$delayMs = (int) (getenv('DIAG_DELAY_MS') ?: 0); -$verboseIters = (int) (getenv('DIAG_VERBOSE_ITERS') ?: 12); - -if ($publishKey === '' || $subscribeKey === '') { - fwrite(STDERR, "WARNING: PUBLISH_KEY/SUBSCRIBE_KEY not set — publish probe will be skipped.\n"); -} - -// Request timeout RAISED to 15s (was 10s, the SDK default). The Balancer holds a -// rate-limited publish ~10s before returning a 429; at the old 10s timeout the SDK -// aborted (cURL 28 / 0 bytes) ~8ms before the 429 arrived. 15s lets the response -// land — so this run now CONFIRMS the mechanism: the every-10th request should show -// up as an HTTP 429 (or a slow ~10s success) instead of a timeout. Mirrors raw -// curl's --max-time 12 and the tests' setNonSubscribeRequestTimeout(15). -const REQUEST_TIMEOUT = 15; -const CONNECT_TIMEOUT = 10; - -/** - * Fire $iterations requests at $url through ONE reused client and report. - * - * @param string $version '2' or '1.1' — passed to Guzzle's 'version' option. - * @param string $handler 'curl' (default, the SDK transport) or 'stream' - * (PHP stream wrapper — no libcurl). Swapping the - * handler isolates whether the hang lives in PHP's - * curl handler / its libcurl+OpenSSL build: if the - * stream handler runs clean where curl hangs, the - * fault is in the curl path, not the server. - */ -function probe( - string $label, - string $url, - string $version, - int $iterations, - int $delayMs, - string $handler = 'curl' -): void { - // One client for the whole loop == SDK behavior == keep-alive pool reuse. - if ($handler === 'stream') { - // StreamHandler uses PHP's HTTP stream wrapper (no libcurl at all). - $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new StreamHandler())]); - } else { - // Explicit CurlHandler == what Guzzle picks by default == the SDK path. - $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); - } - - $times = []; - $failures = []; - $failedIndices = []; - $reuseCount = 0; - $seenPorts = []; // local TCP ports seen — a repeat == a reused socket - $httpVersions = []; // negotiated protocol distribution (from PSR-7) - $ipOk = []; // backend IP => count of successful requests - $ipFail = []; // backend IP => count of timed-out requests - $statusCodes = []; // HTTP status => count (429s now visible, not thrown) - - for ($i = 0; $i < $iterations; $i++) { - $start = microtime(true); - $curlInfo = null; - try { - $response = $client->get($url, [ - RequestOptions::TIMEOUT => REQUEST_TIMEOUT, - RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, - 'version' => $version, - // Do NOT throw on 4xx/5xx — we WANT to observe the Balancer's 429 - // as a normal response. With http_errors on (Guzzle default) a 429 - // throws ClientException and gets lumped into the opaque failure - // bucket; off, we can tally it by status code below. - RequestOptions::HTTP_ERRORS => false, - // Detect socket reuse via local_port (num_connects is absent in - // some libcurl builds). NOTE: do NOT trust the stat's - // 'http_version' field — it echoes the REQUESTED version, not the - // negotiated one. The PSR-7 response below reports the real wire - // protocol (ps.pndsn.com is HTTP/1.1 only — no h2 over ALPN). - 'on_stats' => function ($stats) use (&$curlInfo) { - $h = $stats->getHandlerStats(); - $curlInfo = [ - 'port' => $h['local_port'] ?? null, - 'ip' => $h['primary_ip'] ?? '?', - ]; - }, - ]); - $elapsed = (microtime(true) - $start) * 1000; - $times[] = $elapsed; - - $port = $curlInfo['port'] ?? null; - if ($port !== null) { - if (isset($seenPorts[$port])) { - $reuseCount++; // rode a previously-opened (pooled) socket - } - $seenPorts[$port] = true; - } - // Real negotiated protocol, straight from the PSR-7 response. - $hv = $response->getProtocolVersion(); - $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; - - // Tally status code — every 10th should now be 429 (~10s), the rest 200. - $code = $response->getStatusCode(); - $statusCodes[$code] = ($statusCodes[$code] ?? 0) + 1; - if ($code === 429) { - $failedIndices[] = $i; // record the cadence even though it didn't throw - } - - $ip = $curlInfo['ip'] ?? '?'; - $ipOk[$ip] = ($ipOk[$ip] ?? 0) + 1; - } catch (\Throwable $e) { - $elapsed = (microtime(true) - $start) * 1000; - $failedIndices[] = $i; - // curl populates primary_ip once connected, so even a post-connect - // timeout tells us WHICH backend black-holed the request. - $ip = $curlInfo['ip'] ?? '?'; - $ipFail[$ip] = ($ipFail[$ip] ?? 0) + 1; - // Keep one representative full message; the rest are summarized by index. - if (count($failures) < 1) { - $failures[] = sprintf(" e.g. #%d after %dms (ip=%s): %s", $i, (int) $elapsed, $ip, $e->getMessage()); - } - } - - if ($delayMs > 0) { - usleep($delayMs * 1000); - } - } - - $ok = count($times); - $fail = count($failures); - sort($times); - $p50 = $times ? $times[(int) (count($times) * 0.50)] ?? end($times) : 0; - $p99 = $times ? $times[(int) (count($times) * 0.99)] ?? end($times) : 0; - $max = $times ? max($times) : 0; - - printf( - "\n=== %s (handler=%s, version=%s, %d iterations, delay=%dms) ===\n", - $label, - $handler, - $version, - $iterations, - $delayMs - ); - if ($handler === 'stream') { - // StreamHandler exposes no libcurl stats, so socket-reuse and per-backend - // IP columns below will read 0 / '?' — that's expected, not a finding. - echo " (note: stream handler reports no socket/IP stats — ignore those columns)\n"; - } - $httpDist = []; - foreach ($httpVersions as $v => $n) { - $httpDist[] = 'http/' . $v . ' x' . $n; - } - printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); - // Status-code distribution: with http_errors off and a 15s timeout, the - // every-10th rate-limited publish now shows as `429 x20` here instead of a - // timeout — the confirmation that the "hang" was a delayed 429 all along. - ksort($statusCodes); - $codeDist = []; - foreach ($statusCodes as $c => $n) { - $codeDist[] = $c . ' x' . $n; - } - printf(" status-codes: %s\n", implode(' ', $codeDist) ?: '(none — all threw)'); - printf(" negotiated-protocol (PSR-7): %s\n", implode(' ', $httpDist) ?: '(n/a)'); - printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); - // Per-backend breakdown: if the failures concentrate on one (or few) IPs - // while other IPs only ever succeed, the cause is a wedged backend node - // behind the LB — not a rate limit (which would return fast 429s, not 10s - // 0-byte hangs). - $allIps = array_unique(array_merge(array_keys($ipOk), array_keys($ipFail))); - sort($allIps); - echo " per-backend (ok/fail):"; - foreach ($allIps as $ip) { - printf(" %s=%d/%d", $ip, $ipOk[$ip] ?? 0, $ipFail[$ip] ?? 0); - } - echo "\n"; - // $failedIndices now holds both thrown failures (timeouts) AND 429s observed as - // normal responses. Label it as the every-10th cadence rather than "failures". - if (!empty($failedIndices)) { - echo " rate-limited/failed indices: " . implode(',', $failedIndices) . "\n"; - if (!empty($failures)) { - echo implode("\n", $failures) . "\n"; - } - } else { - echo " (no 429s, no failures)\n"; - } -} - -/** - * Fire $iterations publishes over ONE reused curl connection with libcurl - * VERBOSE tracing on, and print, per request, only the lines that reveal how the - * load balancer manages the keep-alive connection: - * - `< Connection:` / `< Keep-Alive:` response headers (does Pingora announce a - * request cap or signal close?) - * - `* Connection #N ... left intact` vs `* Closing connection` (when libcurl - * decides to drop / retire the pooled socket) - * - `* Re-using existing connection` (proof the request rode a pooled socket) - * - any `* ... timed out` / error line on the hung request - * - * This is the smoking-gun capture for the nginx->Pingora hypothesis: watch the - * boundary request (~#8/#9) for a server header the SDK's pooled handle ignores, - * or a silent close that leaves the next reused request writing into a dead - * socket (-> 0 bytes -> 10s timeout). - * - * Uses Guzzle's raw 'curl' option passthrough (CurlHandler only) to set - * CURLOPT_VERBOSE + CURLOPT_STDERR at a temp stream we then read back. - */ -function captureVerbose(string $url, string $version, int $iterations, int $gapMs = 0): void -{ - printf( - "\n######## verbose wire capture (publish, curl, version=%s, %d requests, gap=%dms) ########\n", - $version, - $iterations, - $gapMs - ); - if ($gapMs > 0) { - // The point of the gap run: at gap=0 the every-10th request hangs; at - // gap=1000ms the loop is clean. The open question this answers is WHY — - // does the boundary request (~#9/#10) RE-USE the same socket (so the gap - // just keeps it healthy), or does the client CONNECT a NEW socket (so the - // gap lets a silent drop become visible and the client reconnects cleanly)? - echo "Question: at this gap, does the ~#9/#10 boundary REUSE the socket or open a NEW connection?\n"; - } else { - echo "Watch the ~#8/#9 boundary: what does the LB send right before the hang?\n"; - } - - // One client == one keep-alive pool, so requests reuse the same socket. - $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); - - // Only the lines worth reading from libcurl's (very chatty) verbose trace. - $keep = ['Connection', 'Keep-Alive', 'Re-using', 'Closing connection', 'left intact', - 'Connected to', 'timed out', 'Operation timed out', 'Empty reply', 'Connection died']; - - $reused = 0; // requests that rode a pooled socket ("Re-using existing") - $newConns = 0; // requests that opened a fresh socket ("Connected to") - - for ($i = 0; $i < $iterations; $i++) { - $trace = fopen('php://temp', 'w+'); - $start = microtime(true); - $status = '?'; - $err = null; - try { - $resp = $client->get($url, [ - RequestOptions::TIMEOUT => REQUEST_TIMEOUT, - RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, - 'version' => $version, - // CurlHandler passes these straight to the easy handle. - 'curl' => [ - CURLOPT_VERBOSE => true, - CURLOPT_STDERR => $trace, - ], - ]); - $status = (string) $resp->getStatusCode(); - } catch (\Throwable $e) { - $err = $e->getMessage(); - } - $elapsed = (int) ((microtime(true) - $start) * 1000); - - // Pull the interesting verbose lines out of the trace stream. - rewind($trace); - $raw = stream_get_contents($trace) ?: ''; - fclose($trace); - - // Classify reuse-vs-reconnect from the raw trace (independent of $keep - // filtering): "Re-using existing" == pooled socket; "Connected to" == a - // fresh TCP connect this request. This is the direct evidence for the gap - // question above. - $isReuse = stripos($raw, 'Re-using existing') !== false; - $isNew = stripos($raw, 'Connected to') !== false; - if ($isReuse) { - $reused++; - } - if ($isNew) { - $newConns++; - } - $conn = $isReuse ? 'REUSE' : ($isNew ? 'NEW-CONN' : '?'); - - $lines = []; - foreach (explode("\n", $raw) as $line) { - $line = rtrim($line); - if ($line === '') { - continue; - } - // The TLS handshake line matches "Connection" but is per-request - // noise, not keep-alive signal — drop it. - if (stripos($line, 'SSL connection using') !== false) { - continue; - } - foreach ($keep as $needle) { - if (stripos($line, $needle) !== false) { - $lines[] = ' ' . $line; - break; - } - } - } - - printf(" #%d [%s] status=%s %dms%s\n", $i, $conn, $status, $elapsed, $err !== null ? " ERROR: $err" : ''); - echo $lines ? implode("\n", $lines) . "\n" : " (no keep-alive/connection lines in trace)\n"; - - if ($gapMs > 0) { - usleep($gapMs * 1000); - } - } - - printf(" -- summary: reused=%d new-connections=%d (of %d) --\n", $reused, $newConns, $iterations); -} - -$timeUrl = "https://ps.pndsn.com/time/0"; - -echo "######## PHP Guzzle reuse diagnostic ########\n"; -echo "Guzzle reuses one client across the whole run (like the SDK).\n"; - -// Baseline: keyless, both protocols. -probe("time/0 baseline", $timeUrl, '2', $iterations, $delayMs); -probe("time/0 baseline", $timeUrl, '1.1', $iterations, $delayMs); - -// The actual failing op: publish. -if ($publishKey !== '' && $subscribeKey !== '') { - $publishUrl = sprintf( - "https://ps.pndsn.com/publish/%s/%s/0/diag-guzzle/0/%%22x%%22?uuid=diag-guzzle", - $publishKey, - $subscribeKey - ); - - // The failures are 10s 0-byte HANGS, not 429s, so this is NOT a rate limit. - // A raw `curl --http1.1` CLI reusing one connection runs 200/200 clean, so - // the hang is specific to the PHP/Guzzle path. The SDK forces 'version'=>'2' - // (Endpoint::requestOptions line ~501) while curl ran 1.1 — so probe BOTH - // here to isolate whether the forced HTTP/2 option is the trigger. - // Watch: does version=1.1 go clean while version=2 hangs? - probe("publish version=2", $publishUrl, '2', $iterations, $delayMs); - probe("publish version=1.1", $publishUrl, '1.1', $iterations, $delayMs); - - // Handler swap: run the SAME publish loop through the curl handler and then - // through the StreamHandler (PHP stream wrapper, no libcurl). The curl CLI - // control runs clean but uses a DIFFERENT libcurl+TLS stack than PHP, so it - // doesn't exonerate PHP's curl handler. This swap stays inside PHP and - // changes only the handler: if the every-10th hang VANISHES on the stream - // handler, it pins the fault to PHP's curl handler / its libcurl+OpenSSL - // build; if it persists on both, the cause is above libcurl (server/network). - echo "\n######## handler swap (publish, version=1.1) ########\n"; - probe("publish handler=curl", $publishUrl, '1.1', $iterations, $delayMs, 'curl'); - probe("publish handler=stream", $publishUrl, '1.1', $iterations, $delayMs, 'stream'); - - // Verbose wire capture across the every-10th retirement boundary. This is the - // hand-to-ops evidence for the nginx->Pingora keep-alive hypothesis: it shows - // exactly what the LB sends on the connection right before it's black-holed. - captureVerbose($publishUrl, '1.1', $verboseIters, 0); - - // Same capture WITH a 1000ms gap (the spacing that makes the loop clean). The - // per-request REUSE/NEW-CONN tag + summary answers: does the gap keep ONE - // socket alive across the boundary (all REUSE), or does it let a silent drop - // surface so the client reconnects (a NEW-CONN at ~#9/#10)? We currently have - // NO direct evidence either way — distinct-sockets=1 can't tell reuse from a - // same-port reconnect. This settles it. - captureVerbose($publishUrl, '1.1', $verboseIters, 1000); - - // Spacing check: if it were a time/rate effect, gaps would change the rate - // of failure. For a per-request routing fault, the ~1-in-10 cadence persists - // regardless of spacing. - echo "\n######## spacing check (publish, 30 req each) ########\n"; - foreach ([0, 1000] as $gapMs) { - probe("publish gap=" . $gapMs . "ms", $publishUrl, '1.1', 30, $gapMs); - } -} - -echo "\n######## done ########\n"; diff --git a/src/PubNub/Endpoints/Endpoint.php b/src/PubNub/Endpoints/Endpoint.php index 0e55e706..80dba331 100755 --- a/src/PubNub/Endpoints/Endpoint.php +++ b/src/PubNub/Endpoints/Endpoint.php @@ -20,7 +20,6 @@ use Psr\Http\Client\ClientExceptionInterface; use Psr\Http\Message\RequestInterface; use Psr\Http\Message\ResponseInterface; -use Psr\Http\Message\UriInterface; abstract class Endpoint { @@ -358,9 +357,8 @@ protected function sendRequest(RequestInterface $request): PNEnvelope $response = $client->sendRequest($request); } $this->pubnub->getLogger()->debug(sprintf( - "%s %s response from %s negotiated HTTP/%s", + "%s response from %s negotiated HTTP/%s", $this->getName(), - $this->redactUri($request->getUri()), $request->getUri()->getHost(), $response->getProtocolVersion() )); @@ -376,10 +374,10 @@ protected function sendRequest(RequestInterface $request): PNEnvelope $statusCode = $exception->getCode(); $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); $pnServerException = new PubNubServerException(); - if (is_callable([$exception, 'getResponse'])) { + if (is_callable([$exception, 'getResponse']) && $exception->getResponse() !== null) { $response = $exception->getResponse()->getBody()->getContents(); } else { - $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); + $response = $exception->getMessage(); } $pnServerException->setRawBody($response); $pnServerException->setStatusCode($exception->getCode()); @@ -411,30 +409,6 @@ protected function sendRequest(RequestInterface $request): PNEnvelope return $envelope; } - /** - * Build a log-safe URI string with sensitive query params masked. - * - * Masks the PAM `auth` token and request `signature` so they don't leak - * into debug logs. All other params (and the pub/sub keys in the path) - * are kept to preserve diagnostic value. - */ - protected function redactUri(UriInterface $uri): string - { - $query = $uri->getQuery(); - if ($query === '') { - return (string) $uri; - } - - parse_str($query, $params); - foreach (['auth', 'signature'] as $sensitive) { - if (isset($params[$sensitive]) && $params[$sensitive] !== '') { - $params[$sensitive] = 'REDACTED'; - } - } - - return (string) $uri->withQuery(http_build_query($params)); - } - /** * @param ResponseInterface $response * @return PNEnvelope diff --git a/tests/PubNubTestCase.php b/tests/PubNubTestCase.php index fa1306ae..26371cf1 100755 --- a/tests/PubNubTestCase.php +++ b/tests/PubNubTestCase.php @@ -5,6 +5,7 @@ use PubNub\PubNub; use PubNub\PubNubUtil; use Monolog\Logger; +use Monolog\Level; use Monolog\Handler\ErrorLogHandler; // phpcs:ignore PSR1.Classes.ClassDeclaration @@ -78,46 +79,31 @@ public function setUp(): void $uuidMock = getenv("UUID_MOCK") ?: "UUID_MOCK"; $logger = new Logger('PubNub'); - // Log at DEBUG so the SDK's per-request debug lines (endpoint, redacted - // URL, negotiated HTTP version — see Endpoint::sendRequest) surface in - // the test output. Logger::DEBUG is valid in both Monolog 2 and 3. - $logger->pushHandler(new ErrorLogHandler(ErrorLogHandler::OPERATING_SYSTEM, Logger::DEBUG)); + $logger->pushHandler(new ErrorLogHandler(ErrorLogHandler::OPERATING_SYSTEM, Level::Warning)); parent::setUp(); - // The Balancer rate-limits dense publish bursts by holding the connection - // ~10s before returning a 429 (see .claude/features/cicd-publish-timeout- - // investigation.md). The SDK's default 10s non-subscribe timeout fires ~8ms - // before that 429 arrives, surfacing as `cURL error 28 / 0 bytes`. Raising - // the timeout to 15s lets the SDK actually receive the 429 instead of timing - // out — mirroring what raw curl (--max-time 12) does. - $nonSubscribeRequestTimeout = 15; - $this->config = new PNConfiguration(); $this->config->setSubscribeKey($subscribeKey); $this->config->setPublishKey($publishKey); $this->config->setUuid($uuidMock); - $this->config->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_enc = new PNConfiguration(); $this->config_enc->setSubscribeKey($subscribeKey); $this->config_enc->setPublishKey($publishKey); $this->config_enc->setCipherKey(static::CIPHER_KEY); $this->config_enc->setUuid($uuidMock); - $this->config_enc->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_pam = new PNConfiguration(); $this->config_pam->setSubscribeKey($subscribeKeyPam); $this->config_pam->setPublishKey($publishKeyPam); $this->config_pam->setSecretKey($secretKeyPam); $this->config_pam->setUuid($uuidMock); - $this->config_pam->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_demo = new PNConfiguration(); $this->config_demo->setSubscribeKey('demo'); $this->config_demo->setPublishKey('demo'); $this->config_demo->setUuid($uuidMock); - $this->config_demo->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_demo->disableImmutableCheck(); $this->pubnub = new PubNub($this->config); diff --git a/tests/integrational/FilesTest.php b/tests/integrational/FilesTest.php index bc3e671f..9db3fa34 100644 --- a/tests/integrational/FilesTest.php +++ b/tests/integrational/FilesTest.php @@ -329,10 +329,18 @@ public function testFileUploadWithLargeFile(): void $largeContent = str_repeat('x', 5 * 1024 * 1024); // 5MB of data file_put_contents($largeFilePath, $largeContent); + $config = new PNConfiguration(); + $config->setSubscribeKey(getenv("SUBSCRIBE_KEY") ?: ""); + $config->setPublishKey(getenv("PUBLISH_KEY") ?: ""); + $config->setUuid(getenv("UUID_MOCK") ?: "UUID_MOCK"); + // Use a longer timeout: uploading 5MB can exceed the default 10s + $config->setNonSubscribeRequestTimeout(30); + $pubnub = new PubNub($config); + try { $file = fopen($largeFilePath, "r"); - $response = $this->pubnub->sendFile() + $response = $pubnub->sendFile() ->channel($this->channel) ->fileHandle($file) ->fileName('large.txt') @@ -343,7 +351,7 @@ public function testFileUploadWithLargeFile(): void $this->assertNotEmpty($response->getFileId()); // Verify file can be downloaded - $downloadResponse = $this->pubnub->downloadFile() + $downloadResponse = $pubnub->downloadFile() ->channel($this->channel) ->fileId($response->getFileId()) ->fileName('large.txt') From e3339ee0d088a4d445704b53bfc99648704eb84a Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Mon, 22 Jun 2026 12:56:42 +0200 Subject: [PATCH 23/24] Fix tests on CICD --- tests/integrational/objects/member/MembersHappyPathTest.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integrational/objects/member/MembersHappyPathTest.php b/tests/integrational/objects/member/MembersHappyPathTest.php index a1d0587b..80e9aeb3 100644 --- a/tests/integrational/objects/member/MembersHappyPathTest.php +++ b/tests/integrational/objects/member/MembersHappyPathTest.php @@ -79,6 +79,7 @@ public function testHappyPath(): void ->include($includes) ->sync(); $this->checkResponse($addMembers); + sleep(1); $getMembers = $this->pubnub->getMembers() ->channel($this->channel) From 30df4b84793016eec8f09b53de16b002eb68d750 Mon Sep 17 00:00:00 2001 From: PubNub Release Bot <120067856+pubnub-release-bot@users.noreply.github.com> Date: Mon, 22 Jun 2026 11:16:52 +0000 Subject: [PATCH 24/24] PubNub SDK 9.0.1 release. --- .pubnub.yml | 13 ++++++++++--- CHANGELOG.md | 9 +++++++++ README.md | 2 +- composer.json | 2 +- src/PubNub/PubNub.php | 2 +- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git a/.pubnub.yml b/.pubnub.yml index 3d7b7080..1f2a9466 100644 --- a/.pubnub.yml +++ b/.pubnub.yml @@ -1,8 +1,15 @@ name: php -version: 9.0.0 +version: 9.0.1 schema: 1 scm: github.com/pubnub/php changelog: + - date: 2026-06-22 + version: 9.0.1 + changes: + - type: feature + text: "Added logging of REST response containing host and negotiated HTTP protocol version. ." + - type: bug + text: "Fixed PubNubServerException handling to guard against a null response and fall back to the full exception message." - date: 2025-10-30 version: 9.0.0 changes: @@ -543,8 +550,8 @@ sdks: - x86-64 - distribution-type: library distribution-repository: GitHub release - package-name: php-9.0.0.zip - location: https://github.com/pubnub/php/releases/tag/9.0.0 + package-name: php-9.0.1.zip + location: https://github.com/pubnub/php/releases/tag/9.0.1 requires: - name: rmccue/requests min-version: 1.0.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e31fb87..0843826d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 9.0.1 +June 22 2026 + +#### Added +- Added logging of REST response containing host and negotiated HTTP protocol version. . + +#### Fixed +- Fixed PubNubServerException handling to guard against a null response and fall back to the full exception message. + ## 9.0.0 October 30 2025 diff --git a/README.md b/README.md index 950decf4..5a284733 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ You will need the publish and subscribe keys to authenticate your app. Get your { "require": { - "pubnub/pubnub": "9.0.0" + "pubnub/pubnub": "9.0.1" } } ``` diff --git a/composer.json b/composer.json index ef8a668d..962bfd7c 100644 --- a/composer.json +++ b/composer.json @@ -5,7 +5,7 @@ "keywords": ["api", "real-time", "realtime", "real time", "ajax", "push"], "homepage": "http://www.pubnub.com/", "license": "proprietary", - "version": "9.0.0", + "version": "9.0.1", "authors": [ { "name": "PubNub", diff --git a/src/PubNub/PubNub.php b/src/PubNub/PubNub.php index 4619a779..37da4a4e 100644 --- a/src/PubNub/PubNub.php +++ b/src/PubNub/PubNub.php @@ -69,7 +69,7 @@ class PubNub implements LoggerAwareInterface { - protected const SDK_VERSION = "9.0.0"; + protected const SDK_VERSION = "9.0.1"; protected const SDK_NAME = "PubNub-PHP"; public static $MAX_SEQUENCE = 65535;