From c484c095a984729a9d3e594504cae25b8041eb83 Mon Sep 17 00:00:00 2001 From: David Wein Date: Tue, 9 Dec 2025 09:18:06 -0800 Subject: [PATCH] pgbench: Add latency percentile reporting with histogram-based tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add --percentile[=LIST] option to report latency percentiles during pgbench runs. By default reports p50, p90, p99, p99.9, and p99.99. Users can specify a custom comma-separated list of percentiles (0-100) to report. Implementation details: - Multi-tier histogram with bounded memory (~100KB per histogram): * Fine buckets: 0-100ms at 100μs resolution (1000 buckets) * Coarse buckets: 100ms-10s at 1ms resolution (9900 buckets) * Medium buckets: 10s-120s at 100ms resolution (1100 buckets) * Large buckets: 120s-1200s at 1s resolution (1080 buckets) - Dynamic overflow tracking for latencies exceeding 20 minutes, storing individual values in a growable array for accurate tail percentiles - Per-command percentile reporting when combined with -r flag, with percentiles displayed on a single line per command - Per-script percentile aggregation across all threads New options: - --percentile[=LIST]: Enable percentile reporting with optional custom list - --debug-latency-multiplier=N: Multiply latencies for testing overflow handling Memory considerations documented for high-concurrency scenarios with per-command reporting enabled. Includes comprehensive TAP test coverage for custom percentile lists, overflow handling, and per-command reporting. --- src/bin/pgbench/pgbench.c | 504 ++++++++++++++++++- src/bin/pgbench/t/001_pgbench_with_server.pl | 154 ++++++ 2 files changed, 656 insertions(+), 2 deletions(-) diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index afe3ce65fb7..54d65716aca 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -262,6 +262,17 @@ static int agg_interval; /* log aggregates instead of individual static bool per_script_stats = false; /* whether to collect stats per script */ static int progress = 0; /* thread progress report every this seconds */ static bool progress_timestamp = false; /* progress report with Unix time */ +static bool report_percentiles = false; /* report latency percentiles */ + +/* Default percentiles to report */ +#define MAX_PERCENTILES 20 +static double default_percentiles[] = {50.0, 90.0, 99.0, 99.9, 99.99}; +static double percentile_list[MAX_PERCENTILES]; +static int num_percentiles = 0; /* 0 means use defaults */ + +/* Debug option: multiply latencies for testing histogram overflow */ +static double debug_latency_multiplier = 1.0; + static int nclients = 1; /* number of clients */ static int nthreads = 1; /* number of threads */ static bool is_connect; /* establish connection for each transaction */ @@ -364,6 +375,50 @@ typedef struct SimpleStats double sum2; /* sum of squared values */ } SimpleStats; +/* + * Histogram for computing latency percentiles with bounded memory. + * Uses finer buckets for low latencies (where precision matters) and + * progressively coarser buckets for high latencies. + * + * Bucket tiers: + * Fine: 0-100ms at 100μs resolution (1000 buckets) + * Coarse: 100ms-10s at 1ms resolution (9900 buckets) + * Medium: 10s-120s at 100ms resolution (1100 buckets) + * Large: 120s-1200s at 1s resolution (1080 buckets) + * + * Latencies exceeding the maximum trackable (20 minutes) are stored + * individually in a dynamic array for accurate percentile calculation. 
+ *
+ * This consumes roughly 105KB of memory per histogram.  A histogram is
+ * allocated for each StatsData instance: one per thread, plus one per
+ * script when per-script stats are collected.
+ *
+ * WARNING: When using --percentile with -r (per-command stats), each
+ * command in each script also gets its own histogram (~105KB), shared by
+ * all threads and updated without locking, like the per-command
+ * SimpleStats.  For example: 2 scripts x 50 commands = 100 histograms
+ * = ~10.5MB.  Consider this memory overhead when running scripts with
+ * many commands.
+ */
+#define LATENCY_HIST_FINE_BUCKETS	1000	/* 0-100ms at 100μs resolution */
+#define LATENCY_HIST_COARSE_BUCKETS	9900	/* 100ms-10s at 1ms resolution */
+#define LATENCY_HIST_MEDIUM_BUCKETS	1100	/* 10s-120s at 100ms resolution */
+#define LATENCY_HIST_LARGE_BUCKETS	1080	/* 120s-1200s at 1s resolution */
+#define LATENCY_HIST_BUCKETS	(LATENCY_HIST_FINE_BUCKETS + LATENCY_HIST_COARSE_BUCKETS + \
+								 LATENCY_HIST_MEDIUM_BUCKETS + LATENCY_HIST_LARGE_BUCKETS)
+#define LATENCY_HIST_FINE_LIMIT		100000	/* 100ms in microseconds */
+#define LATENCY_HIST_COARSE_LIMIT	10000000	/* 10s in microseconds */
+#define LATENCY_HIST_MEDIUM_LIMIT	120000000	/* 120s in microseconds */
+#define LATENCY_HIST_FINE_WIDTH		100		/* 100μs per fine bucket */
+#define LATENCY_HIST_COARSE_WIDTH	1000	/* 1ms per coarse bucket */
+#define LATENCY_HIST_MEDIUM_WIDTH	100000	/* 100ms per medium bucket */
+#define LATENCY_HIST_LARGE_WIDTH	1000000	/* 1s per large bucket */
+
+typedef struct LatencyHistogram
+{
+	int64		buckets[LATENCY_HIST_BUCKETS];
+	int64		overflow_count;	/* count of latencies exceeding max trackable */
+	double		overflow_sum;	/* sum of overflow latencies */
+	double	   *overflow_values;	/* dynamic array of individual overflow latencies */
+	int			overflow_capacity;	/* allocated size of overflow_values array */
+} LatencyHistogram;
+
 /*
  * The instr_time type is expensive when dealing with time arithmetic. Define
  * a type to hold microseconds instead. Type int64 is good enough for about
@@ -442,6 +497,7 @@ typedef struct StatsData
 	 *				 error
 	 */
 	SimpleStats latency;
 	SimpleStats lag;
+	LatencyHistogram *latency_hist;	/* histogram for percentile calculation
+									 * (allocated only if report_percentiles) */
 } StatsData;
 
 /*
@@ -735,6 +791,7 @@ static const char *const QUERYMODE[] = {"simple", "extended", "prepared"};
  * aset		do gset on all possible queries of a combined query (\;).
  * expr		Parsed expression, if needed.
  * stats		Time spent in this command.
+ * latency_hist	Histogram for percentile calculation (allocated if needed).
  * retries	Number of retries after a serialization or deadlock error in the
  *			current command.
  * failures	Number of errors in the current command that were not retried.
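To make the tiering concrete, here is a standalone sketch (not part of the patch; the names are ad hoc) of how a microsecond latency maps to a bucket index under the constants above. It mirrors the tier checks that addToLatencyHistogram performs:

	#include <stdio.h>

	/* Tier constants as in the patch (all values in microseconds). */
	#define FINE_BUCKETS	1000
	#define COARSE_BUCKETS	9900
	#define MEDIUM_BUCKETS	1100
	#define FINE_LIMIT		100000		/* 100ms */
	#define COARSE_LIMIT	10000000	/* 10s */
	#define MEDIUM_LIMIT	120000000	/* 120s */

	static int
	bucket_for(double us)
	{
		if (us < FINE_LIMIT)			/* 100us-wide fine buckets */
			return (int) (us / 100);
		if (us < COARSE_LIMIT)			/* 1ms-wide coarse buckets */
			return FINE_BUCKETS + (int) ((us - FINE_LIMIT) / 1000);
		if (us < MEDIUM_LIMIT)			/* 100ms-wide medium buckets */
			return FINE_BUCKETS + COARSE_BUCKETS +
				(int) ((us - COARSE_LIMIT) / 100000);
		return FINE_BUCKETS + COARSE_BUCKETS + MEDIUM_BUCKETS +
			(int) ((us - MEDIUM_LIMIT) / 1000000);	/* 1s-wide large buckets */
	}

	int
	main(void)
	{
		/* 2.5ms -> 25, 250ms -> 1150, 15s -> 10950, 300s -> 12180 */
		printf("%d %d %d %d\n",
			   bucket_for(2500), bucket_for(250000),
			   bucket_for(15000000.0), bucket_for(300000000.0));
		return 0;
	}

The bucket array itself is 13,080 int64 counters, which is where the roughly 105KB figure comes from (13,080 x 8 bytes = 104,640 bytes).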
@@ -751,6 +808,7 @@ typedef struct Command char *varprefix; PgBenchExpr *expr; SimpleStats stats; + LatencyHistogram *latency_hist; /* for percentile calculation (may be NULL) */ int64 retries; int64 failures; } Command; @@ -954,6 +1012,8 @@ usage(void) " --log-prefix=PREFIX prefix for transaction time log file\n" " (default: \"pgbench_log\")\n" " --max-tries=NUM max number of tries to run transaction (default: 1)\n" + " --percentile[=LIST] report latency percentiles (default: 50,90,99,99.9,99.99)\n" + " LIST is comma-separated values between 0 and 100\n" " --progress-timestamp use Unix epoch timestamps for progress\n" " --random-seed=SEED set random seed (\"time\", \"rand\", integer)\n" " --sampling-rate=NUM fraction of transactions to log (e.g., 0.01 for 1%%)\n" @@ -999,6 +1059,96 @@ is_an_int(const char *str) return *ptr == '\0'; } +/* + * parse_percentile_list -- parse a comma-separated list of percentile values + * + * Input: comma-separated list of floats (e.g., "50,90,99,99.9,99.99") + * Each value must be between 0 and 100 (exclusive of 0, inclusive of 100). + * Returns true on success, false on error. + */ +static bool +parse_percentile_list(const char *str) +{ + char *copy; + char *token; + char *saveptr; + int count = 0; + double values[MAX_PERCENTILES]; + + if (str == NULL || *str == '\0') + { + pg_log_error("empty percentile list"); + return false; + } + + /* Make a copy since strtok_r modifies the string */ + copy = pg_strdup(str); + + for (token = strtok_r(copy, ",", &saveptr); + token != NULL; + token = strtok_r(NULL, ",", &saveptr)) + { + char *endptr; + double val; + + /* Skip leading whitespace */ + while (*token && isspace((unsigned char) *token)) + token++; + + if (*token == '\0') + { + pg_log_error("empty value in percentile list"); + pg_free(copy); + return false; + } + + errno = 0; + val = strtod(token, &endptr); + + /* Skip trailing whitespace */ + while (*endptr && isspace((unsigned char) *endptr)) + endptr++; + + if (*endptr != '\0' || errno != 0) + { + pg_log_error("invalid percentile value: \"%s\"", token); + pg_free(copy); + return false; + } + + if (val <= 0.0 || val > 100.0) + { + pg_log_error("percentile must be between 0 and 100 (exclusive of 0): %g", val); + pg_free(copy); + return false; + } + + if (count >= MAX_PERCENTILES) + { + pg_log_error("too many percentiles (maximum %d)", MAX_PERCENTILES); + pg_free(copy); + return false; + } + + values[count++] = val; + } + + pg_free(copy); + + if (count == 0) + { + pg_log_error("no valid percentiles found in list"); + return false; + } + + /* Copy parsed values to global array */ + for (int i = 0; i < count; i++) + percentile_list[i] = values[i]; + num_percentiles = count; + + return true; +} + /* * strtoint64 -- convert a string to 64-bit integer @@ -1453,6 +1603,242 @@ mergeSimpleStats(SimpleStats *acc, SimpleStats *ss) acc->sum2 += ss->sum2; } +/* + * Initialize the given LatencyHistogram struct to all zeroes. + */ +static void +initLatencyHistogram(LatencyHistogram *hist) +{ + memset(hist->buckets, 0, sizeof(hist->buckets)); + hist->overflow_count = 0; + hist->overflow_sum = 0.0; + hist->overflow_values = NULL; + hist->overflow_capacity = 0; +} + +/* + * Free resources associated with a LatencyHistogram. + */ +static void +freeLatencyHistogram(LatencyHistogram *hist) +{ + if (hist->overflow_values != NULL) + { + pg_free(hist->overflow_values); + hist->overflow_values = NULL; + hist->overflow_capacity = 0; + } +} + +/* + * Add an overflow latency value to the dynamic array. 
+ * Grows the array capacity as needed.
+ */
+static void
+addOverflowValue(LatencyHistogram *hist, double latency_us)
+{
+	/* Grow array if needed */
+	if (hist->overflow_count >= hist->overflow_capacity)
+	{
+		int			new_capacity = hist->overflow_capacity == 0 ? 16 : hist->overflow_capacity * 2;
+
+		hist->overflow_values = (double *) pg_realloc(hist->overflow_values,
+													  new_capacity * sizeof(double));
+		hist->overflow_capacity = new_capacity;
+	}
+
+	hist->overflow_values[hist->overflow_count] = latency_us;
+	hist->overflow_count++;
+	hist->overflow_sum += latency_us;
+}
+
+/*
+ * Record a latency value (in microseconds) in the histogram.
+ */
+static void
+addToLatencyHistogram(LatencyHistogram *hist, double latency_us)
+{
+	int			bucket;
+
+	/* Apply debug multiplier for testing overflow behavior */
+	latency_us *= debug_latency_multiplier;
+
+	if (latency_us < 0)
+		latency_us = 0;
+
+	if (latency_us < LATENCY_HIST_FINE_LIMIT)
+	{
+		/* Fine-grained bucket: 0-100ms at 100μs resolution */
+		bucket = (int) (latency_us / LATENCY_HIST_FINE_WIDTH);
+		if (bucket >= LATENCY_HIST_FINE_BUCKETS)
+			bucket = LATENCY_HIST_FINE_BUCKETS - 1;
+	}
+	else if (latency_us < LATENCY_HIST_COARSE_LIMIT)
+	{
+		/* Coarse bucket: 100ms-10s at 1ms resolution */
+		bucket = LATENCY_HIST_FINE_BUCKETS +
+			(int) ((latency_us - LATENCY_HIST_FINE_LIMIT) / LATENCY_HIST_COARSE_WIDTH);
+	}
+	else if (latency_us < LATENCY_HIST_MEDIUM_LIMIT)
+	{
+		/* Medium bucket: 10s-120s at 100ms resolution */
+		bucket = LATENCY_HIST_FINE_BUCKETS + LATENCY_HIST_COARSE_BUCKETS +
+			(int) ((latency_us - LATENCY_HIST_COARSE_LIMIT) / LATENCY_HIST_MEDIUM_WIDTH);
+	}
+	else
+	{
+		/*
+		 * Detect overflow on the double before converting to int: for very
+		 * large values (e.g., under --debug-latency-multiplier) the cast
+		 * below could otherwise overflow the int range.
+		 */
+		if (latency_us >= (double) LATENCY_HIST_MEDIUM_LIMIT +
+			(double) LATENCY_HIST_LARGE_BUCKETS * LATENCY_HIST_LARGE_WIDTH)
+		{
+			/* Overflow: latency exceeds max trackable (20 minutes) */
+			addOverflowValue(hist, latency_us);
+			return;
+		}
+
+		/* Large bucket: 120s-1200s at 1s resolution */
+		bucket = LATENCY_HIST_FINE_BUCKETS + LATENCY_HIST_COARSE_BUCKETS +
+			LATENCY_HIST_MEDIUM_BUCKETS +
+			(int) ((latency_us - LATENCY_HIST_MEDIUM_LIMIT) / LATENCY_HIST_LARGE_WIDTH);
+		if (bucket >= LATENCY_HIST_BUCKETS)
+			bucket = LATENCY_HIST_BUCKETS - 1;	/* guard against FP rounding */
+	}
+
+	hist->buckets[bucket]++;
+}
+
+/*
+ * Merge one LatencyHistogram into another.
+ * Concatenates overflow value arrays from both histograms.
+ */
+static void
+mergeLatencyHistogram(LatencyHistogram *acc, LatencyHistogram *hist)
+{
+	int			i;
+
+	for (i = 0; i < LATENCY_HIST_BUCKETS; i++)
+		acc->buckets[i] += hist->buckets[i];
+
+	/* Merge overflow values by concatenating arrays */
+	if (hist->overflow_count > 0)
+	{
+		int64		new_count = acc->overflow_count + hist->overflow_count;
+
+		/* Grow acc's array if needed */
+		if (new_count > acc->overflow_capacity)
+		{
+			int			new_capacity = acc->overflow_capacity == 0 ? 16 : acc->overflow_capacity;
+
+			while (new_capacity < new_count)
+				new_capacity *= 2;
+			acc->overflow_values = (double *) pg_realloc(acc->overflow_values,
+														 new_capacity * sizeof(double));
+			acc->overflow_capacity = new_capacity;
+		}
+
+		/* Copy hist's overflow values to acc */
+		for (i = 0; i < hist->overflow_count; i++)
+			acc->overflow_values[acc->overflow_count + i] = hist->overflow_values[i];
+
+		acc->overflow_count = new_count;
+		acc->overflow_sum += hist->overflow_sum;
+	}
+}
+
+/*
+ * Comparison function for sorting overflow values (for qsort).
+ */
+static int
+compareDoubles(const void *a, const void *b)
+{
+	double		da = *(const double *) a;
+	double		db = *(const double *) b;
+
+	if (da < db)
+		return -1;
+	if (da > db)
+		return 1;
+	return 0;
+}
+
+/*
+ * Get the latency value (in microseconds) at a given percentile (0-100).
+ * Returns the upper bound of the bucket containing the percentile.
+ */
+static double
+getPercentileLatency(LatencyHistogram *hist, double percentile)
+{
+	int64		total_count = 0;
+	int64		target_count;
+	int64		cumulative = 0;
+	int			i;
+
+	/* Count total entries */
+	for (i = 0; i < LATENCY_HIST_BUCKETS; i++)
+		total_count += hist->buckets[i];
+	total_count += hist->overflow_count;
+
+	if (total_count == 0)
+		return 0.0;
+
+	/* Find the bucket containing the target percentile */
+	target_count = (int64) ((percentile / 100.0) * total_count);
+	if (target_count >= total_count)
+		target_count = total_count - 1;
+
+	for (i = 0; i < LATENCY_HIST_BUCKETS; i++)
+	{
+		cumulative += hist->buckets[i];
+		if (cumulative > target_count)
+		{
+			/* Return upper bound of this bucket */
+			if (i < LATENCY_HIST_FINE_BUCKETS)
+			{
+				/* Fine bucket: 0-100ms */
+				return (double) ((i + 1) * LATENCY_HIST_FINE_WIDTH);
+			}
+			else if (i < LATENCY_HIST_FINE_BUCKETS + LATENCY_HIST_COARSE_BUCKETS)
+			{
+				/* Coarse bucket: 100ms-10s */
+				return (double) (LATENCY_HIST_FINE_LIMIT +
+								 (i - LATENCY_HIST_FINE_BUCKETS + 1) * LATENCY_HIST_COARSE_WIDTH);
+			}
+			else if (i < LATENCY_HIST_FINE_BUCKETS + LATENCY_HIST_COARSE_BUCKETS +
+					 LATENCY_HIST_MEDIUM_BUCKETS)
+			{
+				/* Medium bucket: 10s-120s */
+				return (double) (LATENCY_HIST_COARSE_LIMIT +
+								 (i - LATENCY_HIST_FINE_BUCKETS - LATENCY_HIST_COARSE_BUCKETS + 1) *
+								 LATENCY_HIST_MEDIUM_WIDTH);
+			}
+			else
+			{
+				/* Large bucket: 120s-1200s */
+				return (double) (LATENCY_HIST_MEDIUM_LIMIT +
+								 (i - LATENCY_HIST_FINE_BUCKETS - LATENCY_HIST_COARSE_BUCKETS -
+								  LATENCY_HIST_MEDIUM_BUCKETS + 1) * LATENCY_HIST_LARGE_WIDTH);
+			}
+		}
+	}
+
+	/*
+	 * Percentile is in overflow region.  Sort the overflow values and find
+	 * the exact value at the target position.
+	 */
+	if (hist->overflow_count > 0)
+	{
+		int64		overflow_target = target_count - cumulative;
+
+		/* Sort overflow values for accurate percentile calculation */
+		qsort(hist->overflow_values, hist->overflow_count, sizeof(double), compareDoubles);
+
+		/* Clamp to valid range */
+		if (overflow_target < 0)
+			overflow_target = 0;
+		if (overflow_target >= hist->overflow_count)
+			overflow_target = hist->overflow_count - 1;
+
+		return hist->overflow_values[overflow_target];
+	}
+
+	return 0.0;
+}
+
 /*
  * Initialize a StatsData struct to mostly zeroes, with its start time set to
  * the given value.
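For intuition about the lookup: with 1,000 recorded transactions, p99 targets rank floor(0.99 x 1000) = 990, and the walk accumulates bucket counts until the running total exceeds that rank, reporting the bucket's upper bound. The following standalone sketch (not the patch's code; it omits the multi-tier boundary arithmetic and uses one uniform bucket width) demonstrates the same cumulative walk:

	#include <stdio.h>

	/*
	 * Toy version of the cumulative walk in getPercentileLatency():
	 * buckets[i] counts samples in [i*width, (i+1)*width) microseconds.
	 */
	static double
	percentile_upper_bound(const long *buckets, int nbuckets, long width,
						   double percentile)
	{
		long		total = 0;
		long		target;
		long		cumulative = 0;

		for (int i = 0; i < nbuckets; i++)
			total += buckets[i];
		if (total == 0)
			return 0.0;

		target = (long) ((percentile / 100.0) * total);
		if (target >= total)
			target = total - 1;		/* clamp p100 to the last sample */

		for (int i = 0; i < nbuckets; i++)
		{
			cumulative += buckets[i];
			if (cumulative > target)
				return (double) ((i + 1) * width);	/* bucket upper bound */
		}
		return (double) (nbuckets * width);
	}

	int
	main(void)
	{
		/* 900 samples in 0-100us, 90 in 100-200us, 10 in 900-1000us */
		long		buckets[10] = {900, 90, 0, 0, 0, 0, 0, 0, 0, 10};

		/* prints "100 200 1000": p50, p95, p99.5 upper bounds in us */
		printf("%.0f %.0f %.0f\n",
			   percentile_upper_bound(buckets, 10, 100, 50),
			   percentile_upper_bound(buckets, 10, 100, 95),
			   percentile_upper_bound(buckets, 10, 100, 99.5));
		return 0;
	}

Reporting the bucket's upper bound makes the estimate conservative: the true percentile can be lower by at most one bucket width, which is why the fine tier keeps 100μs resolution where most OLTP latencies land.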
@@ -1469,6 +1855,15 @@ initStats(StatsData *sd, pg_time_usec_t start) sd->deadlock_failures = 0; initSimpleStats(&sd->latency); initSimpleStats(&sd->lag); + if (report_percentiles) + { + sd->latency_hist = (LatencyHistogram *) pg_malloc(sizeof(LatencyHistogram)); + initLatencyHistogram(sd->latency_hist); + } + else + { + sd->latency_hist = NULL; + } } /* @@ -1504,6 +1899,10 @@ accumStats(StatsData *stats, bool skipped, double lat, double lag, addToSimpleStats(&stats->latency, lat); + /* record in histogram for percentile calculation */ + if (report_percentiles) + addToLatencyHistogram(stats->latency_hist, lat); + /* and possibly the same for schedule lag */ if (throttle_delay) addToSimpleStats(&stats->lag, lag); @@ -4079,6 +4478,11 @@ advanceConnectionState(TState *thread, CState *st, StatsData *agg) /* XXX could use a mutex here, but we choose not to */ addToSimpleStats(&command->stats, PG_TIME_GET_DOUBLE(now - st->stmt_begin)); + + /* Also record in histogram for percentile calculation */ + if (report_percentiles && command->latency_hist != NULL) + addToLatencyHistogram(command->latency_hist, + (double) (now - st->stmt_begin)); } /* Go ahead with next command, to be executed or skipped */ @@ -4727,7 +5131,7 @@ processXactStats(TState *thread, CState *st, pg_time_usec_t *now, double latency = 0.0, lag = 0.0; bool detailed = progress || throttle_delay || latency_limit || - use_log || per_script_stats; + use_log || per_script_stats || report_percentiles; if (detailed && !skipped && st->estatus == ESTATUS_NO_ERROR) { @@ -5642,6 +6046,7 @@ create_sql_command(PQExpBuffer buf, const char *source) my_command->varprefix = NULL; /* allocated later, if needed */ my_command->expr = NULL; initSimpleStats(&my_command->stats); + my_command->latency_hist = NULL; /* allocated later if --percentile -r */ my_command->prepname = NULL; /* set later, if needed */ return my_command; @@ -5657,6 +6062,13 @@ free_command(Command *command) pg_free(command->argv[i]); pg_free(command->varprefix); + /* Free latency histogram if allocated */ + if (command->latency_hist != NULL) + { + freeLatencyHistogram(command->latency_hist); + pg_free(command->latency_hist); + } + /* * It should also free expr recursively, but this is currently not needed * as only gset commands (which do not have an expression) are freed. @@ -6312,6 +6724,7 @@ printProgressReport(TState *threads, int64 test_start, pg_time_usec_t now, { mergeSimpleStats(&cur.latency, &threads[i].stats.latency); mergeSimpleStats(&cur.lag, &threads[i].stats.lag); + /* Note: histogram merge skipped here - percentiles not shown in progress */ cur.cnt += threads[i].stats.cnt; cur.skipped += threads[i].stats.skipped; cur.retries += threads[i].stats.retries; @@ -6496,7 +6909,7 @@ printResults(StatsData *total, latency_limit / 1000.0, latency_late, total->cnt, (total->cnt > 0) ? 100.0 * latency_late / total->cnt : 0.0); - if (throttle_delay || progress || latency_limit) + if (throttle_delay || progress || latency_limit || report_percentiles) printSimpleStats("latency", &total->latency); else { @@ -6506,6 +6919,20 @@ printResults(StatsData *total, failures > 0 ? " (including failures)" : ""); } + /* Report latency percentiles if requested */ + if (report_percentiles) + { + double *plist = num_percentiles > 0 ? percentile_list : default_percentiles; + int pcount = num_percentiles > 0 ? 
num_percentiles : (int) (sizeof(default_percentiles) / sizeof(default_percentiles[0])); + + for (int i = 0; i < pcount; i++) + { + printf("latency percentile %g = %.3f ms\n", + plist[i], + 0.001 * getPercentileLatency(total->latency_hist, plist[i])); + } + } + if (throttle_delay) { /* @@ -6602,6 +7029,20 @@ printResults(StatsData *total, } printSimpleStats(" - latency", &sstats->latency); + + /* Report per-script latency percentiles if requested */ + if (report_percentiles) + { + double *plist = num_percentiles > 0 ? percentile_list : default_percentiles; + int pcount = num_percentiles > 0 ? num_percentiles : (int) (sizeof(default_percentiles) / sizeof(default_percentiles[0])); + + for (int j = 0; j < pcount; j++) + { + printf(" - latency percentile %g = %.3f ms\n", + plist[j], + 0.001 * getPercentileLatency(sstats->latency_hist, plist[j])); + } + } } /* @@ -6637,6 +7078,22 @@ printResults(StatsData *total, (*commands)->failures, (*commands)->retries, (*commands)->first_line); + + /* Print per-command percentiles on a single line */ + if (report_percentiles && (*commands)->latency_hist != NULL) + { + double *plist = num_percentiles > 0 ? percentile_list : default_percentiles; + int pcount = num_percentiles > 0 ? num_percentiles : (int) (sizeof(default_percentiles) / sizeof(default_percentiles[0])); + + printf(" "); + for (int k = 0; k < pcount; k++) + { + printf("p%g: %.3f%s", + plist[k], + 0.001 * getPercentileLatency((*commands)->latency_hist, plist[k]), + (k < pcount - 1) ? ", " : "\n"); + } + } } } } @@ -6739,6 +7196,8 @@ main(int argc, char **argv) {"verbose-errors", no_argument, NULL, 15}, {"exit-on-abort", no_argument, NULL, 16}, {"debug", no_argument, NULL, 17}, + {"percentile", optional_argument, NULL, 18}, + {"debug-latency-multiplier", required_argument, NULL, 19}, {NULL, 0, NULL, 0} }; @@ -7092,6 +7551,24 @@ main(int argc, char **argv) case 17: /* debug */ pg_logging_increase_verbosity(); break; + case 18: /* percentile */ + benchmarking_option_set = true; + report_percentiles = true; + if (optarg != NULL) + { + if (!parse_percentile_list(optarg)) + exit(1); + } + break; + case 19: /* debug-latency-multiplier */ + { + double multiplier = atof(optarg); + + if (multiplier <= 0.0) + pg_fatal("invalid latency multiplier: \"%s\"", optarg); + debug_latency_multiplier = multiplier; + } + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -7120,6 +7597,27 @@ main(int argc, char **argv) total_weight += sql_script[i].weight; } + /* + * Allocate per-command histograms if both --percentile and -r are enabled. + * Note: this can consume significant memory with many commands/threads. 
+	 */
+	if (report_percentiles && report_per_command)
+	{
+		for (i = 0; i < num_scripts; i++)
+		{
+			Command   **commands = sql_script[i].commands;
+
+			for (int j = 0; commands[j] != NULL; j++)
+			{
+				commands[j]->latency_hist = (LatencyHistogram *) pg_malloc(sizeof(LatencyHistogram));
+				initLatencyHistogram(commands[j]->latency_hist);
+			}
+		}
+	}
+
+	/*
+	 * Scripts registered before --percentile was parsed were initialized
+	 * while report_percentiles was still false, so their per-script stats
+	 * have no histogram yet; allocate any missing ones now so that option
+	 * order on the command line does not matter.
+	 */
+	if (report_percentiles)
+	{
+		for (i = 0; i < num_scripts; i++)
+		{
+			if (sql_script[i].stats.latency_hist == NULL)
+			{
+				sql_script[i].stats.latency_hist = (LatencyHistogram *) pg_malloc(sizeof(LatencyHistogram));
+				initLatencyHistogram(sql_script[i].stats.latency_hist);
+			}
+		}
+	}
+
 	if (total_weight == 0 && !is_init_mode)
 		pg_fatal("total script weight must not be zero");
 
@@ -7441,6 +7939,8 @@ main(int argc, char **argv)
 		/* aggregate thread level stats */
 		mergeSimpleStats(&stats.latency, &thread->stats.latency);
 		mergeSimpleStats(&stats.lag, &thread->stats.lag);
+		if (report_percentiles)
+			mergeLatencyHistogram(stats.latency_hist, thread->stats.latency_hist);
 		stats.cnt += thread->stats.cnt;
 		stats.skipped += thread->stats.skipped;
 		stats.retries += thread->stats.retries;
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index f61dcf68234..9a49aa54cf5 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -1824,6 +1824,160 @@ BEGIN
 # Clean up
 $node->safe_psql('postgres', 'DROP TABLE counter;');
 
+# Test --percentile output
+$node->pgbench(
+	'-n -t 100 -c 2 --percentile -b select-only',
+	0,
+	[
+		qr{processed: 200/200},
+		qr{latency percentile 50 = \d+\.\d+ ms},
+		qr{latency percentile 90 = \d+\.\d+ ms},
+		qr{latency percentile 99 = \d+\.\d+ ms},
+		qr{latency percentile 99\.9 = \d+\.\d+ ms}
+	],
+	[qr{^$}],
+	'pgbench percentile output');
+
+# Test --percentile with multiple threads
+$node->pgbench(
+	'-n -t 50 -c 4 -j 2 --percentile -b select-only',
+	0,
+	[
+		qr{processed: 200/200},
+		qr{latency percentile 50 = \d+\.\d+ ms},
+		qr{latency percentile 90 = \d+\.\d+ ms}
+	],
+	[qr{^$}],
+	'pgbench percentile with threads');
+
+# Test --percentile with per-script stats (multiple scripts)
+$node->pgbench(
+	'-n -t 50 -c 2 --percentile -b select-only@2 -b simple-update@1',
+	0,
+	[
+		qr{processed: 100/100},
+		qr{latency percentile 50 = \d+\.\d+ ms},
+		qr{SQL script 1:.*\n.*- latency percentile 50 = \d+\.\d+ ms}s,
+		qr{SQL script 2:.*\n.*- latency percentile 50 = \d+\.\d+ ms}s
+	],
+	[qr{^$}],
+	'pgbench percentile with per-script stats');
+
+# Test --percentile with custom percentile list
+$node->pgbench(
+	'-n -t 100 -c 2 --percentile=25,75,95 -b select-only',
+	0,
+	[
+		qr{processed: 200/200},
+		qr{latency percentile 25 = \d+\.\d+ ms},
+		qr{latency percentile 75 = \d+\.\d+ ms},
+		qr{latency percentile 95 = \d+\.\d+ ms}
+	],
+	[qr{^$}],
+	'pgbench custom percentile list');
+
+# Test --percentile with fractional custom percentiles
+$node->pgbench(
+	'-n -t 100 -c 2 --percentile=50,99.5,99.95 -b select-only',
+	0,
+	[
+		qr{processed: 200/200},
+		qr{latency percentile 50 = \d+\.\d+ ms},
+		qr{latency percentile 99\.5 = \d+\.\d+ ms},
+		qr{latency percentile 99\.95 = \d+\.\d+ ms}
+	],
+	[qr{^$}],
+	'pgbench custom fractional percentiles');
+
+# Test --percentile with invalid percentile value (out of range)
+$node->pgbench(
+	'-n -t 10 -c 1 --percentile=50,101 -b select-only',
+	1,
+	[qr{^$}],
+	[qr{percentile must be between 0 and 100}],
+	'pgbench invalid percentile value out of range');
+
+# Test --percentile with invalid percentile value (zero)
+$node->pgbench(
+	'-n -t 10 -c 1 --percentile=0,50 -b select-only',
+	1,
+	[qr{^$}],
+	[qr{percentile must be between 0 and 100}],
+	'pgbench invalid percentile value zero');
+
+# Test --percentile with invalid percentile value (non-numeric)
+$node->pgbench( + '-n -t 10 -c 1 --percentile=50,abc -b select-only', + 1, + [qr{^$}], + [qr{invalid percentile value}], + 'pgbench invalid percentile value non-numeric'); + +# Test --percentile with -r (per-command percentiles) +$node->pgbench( + '-n -t 50 -c 2 --percentile -r -b select-only', + 0, + [ + qr{processed: 100/100}, + qr{latency percentile 50 = \d+\.\d+ ms}, + qr{statement latencies in milliseconds}, + qr{p50: \d+\.\d+, p90: \d+\.\d+, p99: \d+\.\d+} + ], + [qr{^$}], + 'pgbench per-command percentiles'); + +# Test --percentile with custom list and -r +$node->pgbench( + '-n -t 50 -c 2 --percentile=25,50,75 -r -b select-only', + 0, + [ + qr{processed: 100/100}, + qr{latency percentile 25 = \d+\.\d+ ms}, + qr{p25: \d+\.\d+, p50: \d+\.\d+, p75: \d+\.\d+} + ], + [qr{^$}], + 'pgbench per-command custom percentiles'); + +# Test --debug-latency-multiplier to simulate overflow (latencies > 20 min) +# Using a multiplier of 1e9 will push even microsecond latencies into overflow +$node->pgbench( + '-n -t 10 -c 1 --percentile=50,99 --debug-latency-multiplier=1e9 -b select-only', + 0, + [ + qr{processed: 10/10}, + qr{latency percentile 50 = \d+\.\d+ ms}, + qr{latency percentile 99 = \d+\.\d+ ms} + ], + [qr{^$}], + 'pgbench latency multiplier for overflow testing'); + +# Test high percentiles (99.99, 99.999) with overflow to exercise tail retrieval +# With 100 transactions all in overflow, p99.99 should retrieve from sorted overflow array +$node->pgbench( + '-n -t 100 -c 1 --percentile=50,90,99,99.9,99.99,99.999 --debug-latency-multiplier=1e9 -b select-only', + 0, + [ + qr{processed: 100/100}, + qr{latency percentile 50 = \d+\.\d+ ms}, + qr{latency percentile 99\.9 = \d+\.\d+ ms}, + qr{latency percentile 99\.99 = \d+\.\d+ ms}, + qr{latency percentile 99\.999 = \d+\.\d+ ms} + ], + [qr{^$}], + 'pgbench overflow high percentile retrieval'); + +# Test overflow with per-command stats to exercise command histogram overflow +$node->pgbench( + '-n -t 50 -c 1 --percentile=50,99.9 -r --debug-latency-multiplier=1e9 -b select-only', + 0, + [ + qr{processed: 50/50}, + qr{latency percentile 50 = \d+\.\d+ ms}, + qr{p50: \d+\.\d+.*p99\.9: \d+\.\d+} + ], + [qr{^$}], + 'pgbench overflow with per-command percentiles'); + # done $node->safe_psql('postgres', 'DROP TABLESPACE regress_pgbench_tap_1_ts'); $node->stop;
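For reviewers trying the patch, an illustrative session (abridged; the latency figures are invented, but the line formats follow the printf calls added above):

	$ pgbench -n -T 10 -c 8 -j 2 -r --percentile=50,95,99.9 postgres
	...
	latency percentile 50 = 0.412 ms
	latency percentile 95 = 0.903 ms
	latency percentile 99.9 = 4.217 ms
	...
	statement latencies in milliseconds, failures and retries:
	...
	    p50: 0.098, p95: 0.187, p99.9: 1.042

With the default --percentile (no list), the summary instead reports p50, p90, p99, p99.9, and p99.99.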