From 29ceac99ae06c5ade71e1f185faaa4de02e97677 Mon Sep 17 00:00:00 2001
From: Ryan Ji <tairanjiryan@gmail.com>
Date: Thu, 8 Jan 2026 18:58:20 -0500
Subject: [PATCH 1/7] Improved benchmarking suite

---
 Makefile                 |   4 +-
 benchmarks/baseline.json | 155 ++++++++++++++
 profile_ddsketch.py      | 450 ++++++++++++++++++++++++++++++++++++---
 3 files changed, 582 insertions(+), 27 deletions(-)
 create mode 100644 benchmarks/baseline.json
diff --git a/Makefile b/Makefile
index 9cdc91f..d9a173e 100644
--- a/Makefile
+++ b/Makefile
@@ -30,7 +30,7 @@ install: install-env-run install-env-docs install-env-test
 install-env-run:
 	@echo "👷‍♂️ $(BLUE)creating virtual environment $(PROJECT)-run$(NC)"
 	pyenv local --unset
-	-pyenv virtualenv $(PROJECT)-run > /dev/null
+	-pyenv virtualenv $(word 1,$(PYTHON_VERSIONS)) $(PROJECT)-run > /dev/null
 	pyenv local $(PROJECT)-run
 	pip install --no-user -U pip > /dev/null
 	pip install --no-user -r requirements.txt > /dev/null
@@ -39,7 +39,7 @@ install-env-run:
 install-env-docs:
 	@echo "👷‍♂️ $(BLUE)creating virtual environment $(PROJECT)-docs$(NC)"
 	pyenv local --unset
-	-pyenv virtualenv $(PROJECT)-docs > /dev/null
+	-pyenv virtualenv $(word 1,$(PYTHON_VERSIONS)) $(PROJECT)-docs > /dev/null
 	pyenv local $(PROJECT)-docs
 	pip install --no-user -U pip > /dev/null
 	pip install --no-user -r requirements.docs.txt > /dev/null
diff --git a/benchmarks/baseline.json b/benchmarks/baseline.json
new file mode 100644
index 0000000..58c48d8
--- /dev/null
+++ b/benchmarks/baseline.json
@@ -0,0 +1,155 @@
+{
+  "timestamp": "2026-01-08T18:57:53.373734",
+  "name": "baseline",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-08T18:57:53.373709"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:18(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.565406003,
+      "cumtime": 16.1225542324,
+      "percall_tot": 1.565406003,
+      "percall_cum": 16.1225542324
+    },
+    {
+      "function": "core.py:77(insert)",
+      "ncalls": 10000000,
+      "tottime": 3.4180910396,
+      "cumtime": 14.5549722284,
+      "percall_tot": 3.418091039600001e-07,
+      "percall_cum": 1.45549722284e-06
+    },
+    {
+      "function": "contiguous.py:51(add)",
+      "ncalls": 10000000,
+      "tottime": 4.7022346094,
+      "cumtime": 7.0226845072,
+      "percall_tot": 4.7022346094000004e-07,
+      "percall_cum": 7.022684507200001e-07
+    },
+    {
+      "function": "logarithmic.py:12(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 2.7030884928000005,
+      "cumtime": 4.114196681600001,
+      "percall_tot": 2.7030884928e-07,
+      "percall_cum": 4.1141966816000005e-07
+    },
+    {
+      "function": "contiguous.py:37(_get_position)",
+      "ncalls": 10003320,
+      "tottime": 1.7105854762,
+      "cumtime": 2.3211455057999997,
+      "percall_tot": 1.7100174287063457e-07,
+      "percall_cum": 2.3203747022605352e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.7740575776,
+      "cumtime": 0.7740575776,
+      "percall_tot": 7.7405750019425e-08,
+      "percall_cum": 7.7405750019425e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.6370525874000001,
+      "cumtime": 0.6370525874000001,
+      "percall_tot": 6.370525874000001e-08,
+      "percall_cum": 6.370525874000001e-08
+    },
+    {
+      "function": "~:0(<built-in method builtins.len>)",
+      "ncalls": 10003337,
+      "tottime": 0.6105629916,
+      "cumtime": 0.6105629916,
+      "percall_tot": 6.103591742666684e-08,
+      "percall_cum": 6.103591742666684e-08
+    },
+    {
+      "function": "core.py:128(quantile)",
+      "ncalls": 4,
+      "tottime": 0.000567737,
+      "cumtime": 0.0021306240000000002,
+      "percall_tot": 0.00014193425,
+      "percall_cum": 0.0005326560000000001
+    },
+    {
+      "function": "contiguous.py:181(get_count)",
+      "ncalls": 3321,
+      "tottime": 0.0008458188000000002,
+      "cumtime": 0.0015443888,
+      "percall_tot": 2.549099319598771e-07,
+      "percall_cum": 4.6540709090191636e-07
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 8.0284e-06,
+      "cumtime": 4.5377e-05,
+      "percall_tot": 8.0284e-06,
+      "percall_cum": 4.5377e-05
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 3.40906e-05,
+      "cumtime": 3.40906e-05,
+      "percall_tot": 3.40906e-05,
+      "percall_cum": 3.40906e-05
+    },
+    {
+      "function": "contiguous.py:19(__init__)",
+      "ncalls": 2,
+      "tottime": 9.3374e-06,
+      "cumtime": 3.0868200000000004e-05,
+      "percall_tot": 4.6687e-06,
+      "percall_cum": 1.5434100000000002e-05
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 1.95952e-05,
+      "cumtime": 1.95952e-05,
+      "percall_tot": 9.7976e-06,
+      "percall_cum": 9.7976e-06
+    },
+    {
+      "function": "logarithmic.py:18(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 1.54358e-05,
+      "cumtime": 1.8498200000000002e-05,
+      "percall_tot": 3.85895e-06,
+      "percall_cum": 4.6245500000000005e-06
+    },
+    {
+      "function": "logarithmic.py:7(__init__)",
+      "ncalls": 1,
+      "tottime": 4.5042e-06,
+      "cumtime": 6.4804000000000015e-06,
+      "percall_tot": 4.5042e-06,
+      "percall_cum": 6.4804000000000015e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 3.0624000000000005e-06,
+      "cumtime": 3.0624000000000005e-06,
+      "percall_tot": 7.656000000000001e-07,
+      "percall_cum": 7.656000000000001e-07
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 1.9355999999999997e-06,
+      "cumtime": 1.9355999999999997e-06,
+      "percall_tot": 9.677999999999999e-07,
+      "percall_cum": 9.677999999999999e-07
+    }
+  ]
+}
\ No newline at end of file
diff --git a/profile_ddsketch.py b/profile_ddsketch.py
index a086501..f4a769a 100644
--- a/profile_ddsketch.py
+++ b/profile_ddsketch.py
@@ -1,50 +1,450 @@
 import cProfile
 import pstats
 import io
-import numpy as np  # Using numpy for faster random data generation
+import json
+import argparse
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, List, Tuple
+import numpy as np
 
 from QuantileFlow.ddsketch.core import DDSketch
 
-def run_sketch_operations():
-    """Runs typical DDSketch operations for profiling."""
-    print("Initializing DDSketch...")
+
+BENCHMARK_DIR = Path("benchmarks")  # relative path: resolved against the current working directory
+BENCHMARK_DIR.mkdir(exist_ok=True)  # runs at import time; no error if the directory already exists
+
+
+def run_sketch_operations(num_values: int = 10_000_000) -> Dict:
+    """Runs typical DDSketch operations for profiling.
+    
+    Returns:
+        Dict with 'num_values' and 'quantiles' (q -> value, or an "Error: ..." string on ValueError)
+    """
     sketch = DDSketch(relative_accuracy=0.01)
     
-    num_values = 1_000_000
-    print(f"Inserting {num_values} random values...")
-    # Generate random data more efficiently with numpy
+    # Generate num_values random floats, scaled from rand()'s [0, 1) range to [0, 1000)
     data = np.random.rand(num_values) * 1000
+    
+    # Insert one value at a time; no timing happens here (profile() wraps this call in cProfile)
     for value in data:
         sketch.insert(value)
         
+    # Query a fixed set of quantiles, recording a message instead of raising on ValueError
     quantiles_to_compute = [0.5, 0.9, 0.99, 0.999]
-    print(f"Computing quantiles: {quantiles_to_compute}...")
+    quantile_results = {}
     for q in quantiles_to_compute:
         try:
             quantile_value = sketch.quantile(q)
-            print(f"Quantile({q}): {quantile_value}")
+            quantile_results[q] = quantile_value
         except ValueError as e:
-            print(f"Error computing quantile {q}: {e}")
+            quantile_results[q] = f"Error: {e}"
+    
+    return {
+        'num_values': num_values,
+        'quantiles': quantile_results
+    }
+
+
+def extract_profile_stats(profiler: cProfile.Profile, top_n: int = 20) -> List[Dict]:
+    """Return the top_n per-function profile records, highest cumulative time first."""
+    raw = pstats.Stats(profiler).stats
+    records = [
+        {
+            # Key format: "<file basename>:<line>(<function name>)"
+            'function': f"{Path(key[0]).name}:{key[1]}({key[2]})",
+            'ncalls': nc,
+            'tottime': tt,
+            'cumtime': ct,
+            # Per-call averages fall back to 0 when the call count is not positive
+            'percall_tot': tt / nc if nc > 0 else 0,
+            'percall_cum': ct / nc if nc > 0 else 0,
+        }
+        for key, (_cc, nc, tt, ct, _callers) in raw.items()
+    ]
+
+    ranked = sorted(records, key=lambda rec: rec['cumtime'], reverse=True)
+    return ranked[:top_n]
+
+
+def print_comparison_table(current: List[Dict], baseline: List[Dict]):
+    """Print a side-by-side cumulative-time comparison of the current run vs a baseline.
+
+    Rows are matched on the exact 'function' key first; if that fails, on the key
+    with its line number removed, so code that merely moved is not reported as NEW.
+    """
+    import re  # local import so this function does not depend on the module import block
+
+    def loose_key(name: str) -> str:
+        """Drop the ':<line>' part of a 'file:line(func)' key."""
+        return re.sub(r':\d+\(', '(', name, count=1)
+
+    print(f"\n{'=' * 130}")
+    print(f"{'Performance Comparison (Current vs Baseline)':^130}")
+    print('=' * 130)
+    print(f"{'Function':<45} "
+          f"{'Curr Time':>12} {'Base Time':>12} {'Diff':>10} {'Change %':>12} {'Calls Δ':>10}")
+    print('-' * 130)
+
+    # Exact-key lookup plus a line-number-agnostic fallback (last entry wins on a name clash)
+    baseline_dict = {stat['function']: stat for stat in baseline}
+    baseline_loose = {loose_key(stat['function']): stat for stat in baseline}
+
+    for curr_stat in current:
+        func_name = curr_stat['function']
+        if len(func_name) > 42:
+            func_name = "..." + func_name[-42:]
+        baseline_stat = (baseline_dict.get(curr_stat['function'])
+                         or baseline_loose.get(loose_key(curr_stat['function'])))
+        if baseline_stat:
+            time_diff = curr_stat['cumtime'] - baseline_stat['cumtime']
+            time_change = (time_diff / baseline_stat['cumtime'] * 100) if baseline_stat['cumtime'] > 0 else 0
+            calls_diff = curr_stat['ncalls'] - baseline_stat['ncalls']
+            # Pad to the column width BEFORE adding ANSI colour codes: the escape
+            # characters are invisible but would otherwise be counted by the padding.
+            change_str = f"{time_change:+.1f}%".rjust(12)
+            if time_change < -5:  # Significant improvement -> green
+                change_str = f"\033[92m{change_str}\033[0m"
+            elif time_change > 5:  # Significant regression -> red
+                change_str = f"\033[91m{change_str}\033[0m"
+            print(f"{func_name:<45} {curr_stat['cumtime']:>12.4f} {baseline_stat['cumtime']:>12.4f} "
+                  f"{time_diff:>+10.4f} {change_str} {calls_diff:>+10}")
+        else:
+            # No baseline entry under either key: report the function as new
+            print(f"{func_name:<45} {curr_stat['cumtime']:>12.4f} "
+                  f"{'N/A':>12} {'N/A':>10} {'NEW':>12} {'N/A':>10}")
+
+    print('=' * 130)
 
-    print("Profiling complete.")
 
-def profile():
-    """Profiles the run_sketch_operations function."""
-    profiler = cProfile.Profile()
-    profiler.enable()
+def save_benchmark(stats_data: List[Dict], name: str, metadata: Dict):
+    """Write a benchmark record to BENCHMARK_DIR/<name>.json and report the path."""
+    record = dict(
+        timestamp=datetime.now().isoformat(),
+        name=name,
+        metadata=metadata,
+        stats=stats_data,
+    )
     
-    run_sketch_operations()
+    target = BENCHMARK_DIR / f"{name}.json"
+    with target.open('w') as out:
+        json.dump(record, out, indent=2)
     
-    profiler.disable()
+    print(f"\n✓ Benchmark saved to: {target}")
+
+
+def load_benchmark(name: str) -> Tuple[List[Dict], Dict]:
+    """Read BENCHMARK_DIR/<name>.json and return its (stats, metadata) pair.
+
+    Raises FileNotFoundError when no file exists for that benchmark name.
+    """
+    source = BENCHMARK_DIR / f"{name}.json"
+    if not source.exists():
+        raise FileNotFoundError(f"Benchmark '{name}' not found at {source}")
+    with source.open('r') as handle:
+        payload = json.load(handle)
+    return payload['stats'], payload['metadata']
+
+
+def list_benchmarks():
+    """Print a table (name, date, values, trials) of every *.json file in BENCHMARK_DIR."""
+    found = sorted(BENCHMARK_DIR.glob("*.json"))
+    if not found:
+        print("\nNo benchmarks found.")
+        return
+
+    rule = '=' * 90
+    print(f"\n{'Available Benchmarks':^90}")
+    print(rule)
+    print(f"{'Name':<25} {'Date':<22} {'Values':<15} {'Trials':<10}")
+    print('-' * 90)
     
-    s = io.StringIO()
-    # Sort stats by cumulative time spent in the function and its callees
-    sortby = pstats.SortKey.CUMULATIVE 
-    ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
-    ps.print_stats()
+    for path in found:
+        with path.open('r') as handle:
+            record = json.load(handle)
+        when = datetime.fromisoformat(record['timestamp']).strftime('%Y-%m-%d %H:%M:%S')
+        meta = record.get('metadata', {})
+        count = meta.get('num_values', 'N/A')
+        trials = meta.get('num_trials', 1)
+        # Only ints take the thousands separator; anything else is shown as-is
+        shown = f"{count:,}" if isinstance(count, int) else str(count)
+        print(f"{path.stem:<25} {when:<22} {shown:<15} {trials:<10}")
     
-    print("\n--- cProfile Results (Sorted by Cumulative Time) ---")
-    print(s.getvalue())
+    print(rule)
+
+
+def merge_trial_stats(all_trial_stats: List[List[Dict]]) -> Tuple[List[Dict], List[Dict]]:
+    """Merge statistics from multiple trials, computing mean and std dev.
+    
+    Args:
+        all_trial_stats: One list of stat dicts per trial; entries are grouped by their 'function' key
+        
+    Returns:
+        Tuple of (mean_stats, std_stats), both ordered by descending mean 'cumtime'
+    """
+    # Build a dict mapping function -> list of stats from each trial
+    func_stats = {}
+    
+    for trial_stats in all_trial_stats:
+        for stat in trial_stats:
+            func = stat['function']
+            if func not in func_stats:
+                func_stats[func] = []
+            func_stats[func].append(stat)
+    
+    # Compute mean and std dev per function, over only the trials in which it appears
+    mean_stats = []
+    std_stats = []
+    
+    for func, stats_list in func_stats.items():
+        if not stats_list:
+            continue
+            
+        n = len(stats_list)
+        
+        # Compute means
+        mean_stat = {
+            'function': func,
+            'ncalls': int(np.mean([s['ncalls'] for s in stats_list])),  # int() drops the fractional part
+            'tottime': np.mean([s['tottime'] for s in stats_list]),
+            'cumtime': np.mean([s['cumtime'] for s in stats_list]),
+            'percall_tot': np.mean([s['percall_tot'] for s in stats_list]),
+            'percall_cum': np.mean([s['percall_cum'] for s in stats_list]),
+        }
+        
+        # Compute standard deviations (np.std with its default ddof=0, i.e. population std)
+        std_stat = {
+            'function': func,
+            'tottime_std': np.std([s['tottime'] for s in stats_list]),
+            'cumtime_std': np.std([s['cumtime'] for s in stats_list]),
+        }
+        
+        mean_stats.append(mean_stat)
+        std_stats.append(std_stat)
+    
+    # Sort by cumulative time
+    mean_stats.sort(key=lambda x: x['cumtime'], reverse=True)
+    
+    # Reorder std_stats to match mean_stats
+    func_to_std = {s['function']: s for s in std_stats}
+    std_stats = [func_to_std[s['function']] for s in mean_stats]
+    
+    return mean_stats, std_stats
+
+
+def print_profile_table_with_std(stats_data: List[Dict], std_data: List[Dict],
+                                  title: str = "Profile Results", num_trials: int = 1):
+    """Print the profile table; multi-trial runs get an extra ±StdDev column.
+
+    Args:
+        stats_data: stat dicts to show, one row each, in the given order
+        std_data: dicts carrying 'function' and 'cumtime_std' (may be empty)
+        title: centred heading printed above the table
+        num_trials: values above 1 switch on the averaged layout
+    """
+    averaged = num_trials > 1
+    rule = '=' * 120
+
+    print(f"\n{rule}")
+    print(f"{title:^120}")
+    if averaged:
+        print(f"{'(Averaged over ' + str(num_trials) + ' trials)':^120}")
+    print(rule)
+    # The ±StdDev heading is inserted between CumTime and Per Call for averaged runs only
+    std_heading = f" {'±StdDev':>10}" if averaged else ""
+    print(f"{'Function':<45} {'Calls':>10} {'TotTime':>12} {'CumTime':>12}{std_heading} {'Per Call':>12}")
+    print('-' * 120)
+
+    spread_by_func = {entry['function']: entry for entry in std_data} if std_data else {}
+    for row in stats_data:
+        label = row['function']
+        if len(label) > 42:
+            label = "..." + label[-42:]
+        # The spread cell is only rendered when this function has a std entry
+        spread = spread_by_func.get(row['function']) if averaged else None
+        std_cell = f"±{spread['cumtime_std']:>9.4f} " if spread is not None else ""
+        print(f"{label:<45} "
+              f"{row['ncalls']:>10} "
+              f"{row['tottime']:>12.4f} "
+              f"{row['cumtime']:>12.4f} "
+              f"{std_cell}"
+              f"{row['percall_cum']:>12.6f}")
+
+    print(rule)
+
+
+def profile(num_values: int = 10_000_000,
+            save_as: str = None,
+            compare_to: str = None,
+            top_n: int = 20,
+            num_trials: int = 1):
+    """Profiles the run_sketch_operations function.
+    
+    Args:
+        num_values: Number of values to insert per trial
+        save_as: If provided, save results as a benchmark with this name
+        compare_to: If provided, compare results to this benchmark
+        top_n: Number of top functions to display
+        num_trials: Number of trials to run and average
+    Raises:
+        ValueError: If num_trials is less than 1 (there would be no trial to report on)
+    """
+    if num_trials < 1:
+        raise ValueError(f"num_trials must be at least 1, got {num_trials}")
+
+    print(f"\n{'Starting Profile Run':^60}")
+    print(f"{'=' * 60}")
+    print(f"Values per trial: {num_values:,}")
+    print(f"Number of trials: {num_trials}")
+    print(f"Total operations: {num_values * num_trials:,}")
+    print(f"Top functions to show: {top_n}")
+    print('=' * 60)
+    
+    all_trial_stats = []
+    for trial in range(num_trials):
+        if num_trials > 1:
+            print(f"\n▶ Running trial {trial + 1}/{num_trials}...")
+        profiler = cProfile.Profile()
+        profiler.enable()
+        results = run_sketch_operations(num_values)
+        profiler.disable()
+        # Extract statistics for this trial
+        trial_stats = extract_profile_stats(profiler, top_n=top_n * 2)  # Get more to ensure we have enough after averaging
+        all_trial_stats.append(trial_stats)
+    
+    # Merge stats from all trials
+    if num_trials > 1:
+        print(f"\n📊 Computing averages across {num_trials} trials...")
+        stats_data, std_data = merge_trial_stats(all_trial_stats)
+        stats_data = stats_data[:top_n]  # Trim to requested top_n
+        std_data = std_data[:top_n]
+    else:
+        stats_data = all_trial_stats[0][:top_n]
+        std_data = []
+    
+    # Print results
+    print("\n📊 Operation Results:")
+    print(f"  • Values inserted per trial: {results['num_values']:,}")
+    if num_trials > 1:
+        print(f"  • Total values processed: {results['num_values'] * num_trials:,}")
+    print(f"  • Sample quantiles from last trial:")
+    for q, val in results['quantiles'].items():
+        print(f"    - Q({q}): {val}")
+    
+    # Show profile table
+    print_profile_table_with_std(stats_data, std_data,
+                                 "Current Run - Top Functions by Cumulative Time", num_trials=num_trials)
+    
+    # Total time summary (cumtime includes callees, so nested entries overlap in this sum)
+    total_time = sum(stat['cumtime'] for stat in stats_data[:5])
+    if num_trials > 1 and std_data:
+        total_std = np.sqrt(sum(std['cumtime_std']**2 for std in std_data[:5]))
+        print(f"\n⏱️  Top 5 functions avg time: {total_time:.4f}s (±{total_std:.4f}s)")
+        cv = (total_std / total_time * 100) if total_time > 0 else 0
+        print(f"   Coefficient of variation: {cv:.2f}%", end="")
+        if cv < 5:
+            print(" ✓ (Very stable)")
+        elif cv < 10:
+            print(" (Stable)")
+        elif cv < 20:
+            print(" ⚠ (Moderate variance)")
+        else:
+            print(" ⚠ (High variance - consider more trials)")
+    else:
+        print(f"\n⏱️  Top 5 functions total time: {total_time:.4f}s")
+    
+    # Compare to baseline if requested
+    if compare_to:
+        try:
+            baseline_stats, baseline_metadata = load_benchmark(compare_to)
+            print(f"\n📈 Comparing to baseline: '{compare_to}'")
+            print(f"   Baseline date: {baseline_metadata.get('timestamp', 'N/A')}")
+            # 'num_values' may be missing; only ints accept the ',' format spec
+            base_values = baseline_metadata.get('num_values', 'N/A')
+            base_values_str = f"{base_values:,}" if isinstance(base_values, int) else str(base_values)
+            print(f"   Baseline values per trial: {base_values_str}")
+            print(f"   Baseline trials: {baseline_metadata.get('num_trials', 1)}")
+            print_comparison_table(stats_data, baseline_stats)
+            # Summary statistics
+            curr_total = sum(s['cumtime'] for s in stats_data[:5])
+            base_total = sum(s['cumtime'] for s in baseline_stats[:5])
+            change = ((curr_total - base_total) / base_total * 100) if base_total > 0 else 0
+            
+            print(f"\n📊 Overall Performance Change: {change:+.2f}%")
+            if change < -5:
+                print("   ✓ \033[92mSignificant improvement!\033[0m")
+            elif change > 5:
+                print("   ⚠ \033[91mPerformance regression detected\033[0m")
+            else:
+                print("   ≈ Similar performance")
+                
+        except FileNotFoundError as e:
+            print(f"\n⚠️  {e}")
+    
+    # Save as benchmark if requested
+    if save_as:
+        metadata = {
+            'num_values': num_values,
+            'num_trials': num_trials,
+            'timestamp': datetime.now().isoformat(),
+        }
+        save_benchmark(stats_data, save_as, metadata)
+    
+    print("\n✅ Profiling complete.\n")
+
+
+def main():
+    """Parse the command line, then either list saved benchmarks or run a profile."""
+    parser = argparse.ArgumentParser(
+        description="Profile DDSketch operations with benchmarking capabilities",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Run basic profile with multiple trials
+  python profile_ddsketch.py --num-trials 5
+  
+  # Save averaged baseline (recommended: use 3-5 trials)
+  python profile_ddsketch.py --num-trials 5 --save-as baseline
+  
+  # Compare against baseline
+  python profile_ddsketch.py --num-trials 5 --compare-to baseline
+  
+  # Save and compare in one run
+  python profile_ddsketch.py --num-trials 5 --save-as optimized --compare-to baseline
+  
+  # List all saved benchmarks
+  python profile_ddsketch.py --list
+  
+  # Quick test with fewer values
+  python profile_ddsketch.py --num-values 1000000 --num-trials 3
+        """
+    )
+    
+    parser.add_argument('--num-values', type=int, default=10_000_000,
+                        help='Number of values to insert per trial (default: 10,000,000)')
+    parser.add_argument('--num-trials', type=int, default=1,
+                        help='Number of trials to run and average (default: 1, recommended: 3-5)')
+    parser.add_argument('--save-as', type=str,
+                        help='Save results as a benchmark with this name')
+    parser.add_argument('--compare-to', type=str,
+                        help='Compare results to this benchmark')
+    parser.add_argument('--list', action='store_true',
+                        help='List all available benchmarks')
+    parser.add_argument('--top-n', type=int, default=20,
+                        help='Number of top functions to display (default: 20)')
+    
+    args = parser.parse_args()
+    
+    if args.list:
+        list_benchmarks()
+        return
+    # Reject a trial count that would leave profile() with no trial to report on
+    if args.num_trials < 1:
+        parser.error("--num-trials must be at least 1")
+    profile(num_values=args.num_values, save_as=args.save_as,
+            compare_to=args.compare_to, top_n=args.top_n,
+            num_trials=args.num_trials)
+
 
 if __name__ == "__main__":
-    profile() 
\ No newline at end of file
+    main() 
\ No newline at end of file

From ec234988479b35802191347fcc84511df0930a27 Mon Sep 17 00:00:00 2001
From: Ryan Ji <tairanjiryan@gmail.com>
Date: Thu, 8 Jan 2026 19:17:38 -0500
Subject: [PATCH 2/7] Replace calls to _get_position with a one-liner to reduce
 overhead; ~10% speedup

---
 QuantileFlow/ddsketch/storage/contiguous.py |  28 +---
 benchmarks/get_pos_oneliner.json            | 147 ++++++++++++++++++++
 profile_ddsketch.py                         |   5 +-
 3 files changed, 155 insertions(+), 25 deletions(-)
 create mode 100644 benchmarks/get_pos_oneliner.json

diff --git a/QuantileFlow/ddsketch/storage/contiguous.py b/QuantileFlow/ddsketch/storage/contiguous.py
index 366053a..55176d8 100644
--- a/QuantileFlow/ddsketch/storage/contiguous.py
+++ b/QuantileFlow/ddsketch/storage/contiguous.py
@@ -34,20 +34,6 @@ def __init__(self, max_buckets: int = 2048):
         self.arr_index_of_min_bucket = 0  # Array index where min bucket is stored
         self.collapse_count = 0  # Number of times buckets have been collapsed
     
-    def _get_position(self, bucket_index: int) -> int:
-        """
-        Get array position for bucket index using new mapping formula.
-        
-        Args:
-            bucket_index: The bucket index to map to array position.
-            
-        Returns:
-            The array position in the circular buffer.
-        """
-        if self.min_index is None:
-            return 0
-        return (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
-    
     def add(self, bucket_index: int, count: int = 1):
         """
         Add count to bucket_index using new collapsing strategy.
@@ -72,7 +58,7 @@ def add(self, bucket_index: int, count: int = 1):
                 # Handle insertion below current minimum
                 if new_range > len(self.counts):
                     # Range too large, collapse into min bucket
-                    pos = self._get_position(self.min_index)
+                    pos = (self.arr_index_of_min_bucket) % len(self.counts)
                     self.counts[pos] += count
                     self.collapse_count += 1
                 else:
@@ -80,7 +66,7 @@ def add(self, bucket_index: int, count: int = 1):
                     shift = self.min_index - bucket_index
                     self.min_index = bucket_index
                     self.arr_index_of_min_bucket = self.arr_index_of_min_bucket - shift
-                    pos = self._get_position(bucket_index)
+                    pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
                     self.counts[pos] = count
                     self.num_buckets += 1
                     
@@ -103,7 +89,7 @@ def add(self, bucket_index: int, count: int = 1):
                         
                     # Add collapsed values to new min bucket
                     new_min = self.min_index + buckets_to_collapse
-                    new_min_pos = self._get_position(new_min)
+                    new_min_pos = (buckets_to_collapse + self.arr_index_of_min_bucket) % len(self.counts)
                     self.counts[new_min_pos] += collapse_sum
                     
                     # Update tracking variables
@@ -113,14 +99,14 @@ def add(self, bucket_index: int, count: int = 1):
                 
                 # Place new value
                 self.max_index = bucket_index
-                pos = self._get_position(bucket_index)
+                pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
                 was_zero = self.counts[pos] == 0
                 self.counts[pos] += count
                 if was_zero:
                     self.num_buckets += 1
             else:
                 # Normal insertion within current range
-                pos = self._get_position(bucket_index)
+                pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
                 was_zero = self.counts[pos] == 0
                 self.counts[pos] += count
                 if was_zero:
@@ -143,7 +129,7 @@ def remove(self, bucket_index: int, count: int = 1) -> bool:
             return False
             
         if self.min_index <= bucket_index <= self.max_index:
-            pos = self._get_position(bucket_index)
+            pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
             old_count = self.counts[pos]
             
             if old_count == 0:
@@ -191,7 +177,7 @@ def get_count(self, bucket_index: int) -> int:
         if self.min_index is None or bucket_index < self.min_index or bucket_index > self.max_index:
             warnings.warn("Bucket index is out of range. Returning 0.", UserWarning)
             return 0
-        pos = self._get_position(bucket_index)
+        pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
         return int(self.counts[pos])
     
     def merge(self, other: 'ContiguousStorage'):
diff --git a/benchmarks/get_pos_oneliner.json b/benchmarks/get_pos_oneliner.json
new file mode 100644
index 0000000..83befe0
--- /dev/null
+++ b/benchmarks/get_pos_oneliner.json
@@ -0,0 +1,147 @@
+{
+  "timestamp": "2026-01-08T19:16:06.640239",
+  "name": "get_pos_oneliner",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-08T19:16:06.640223"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:17(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.6643150042,
+      "cumtime": 15.3238002182,
+      "percall_tot": 1.6643150042,
+      "percall_cum": 15.3238002182
+    },
+    {
+      "function": "core.py:77(insert)",
+      "ncalls": 10000000,
+      "tottime": 3.5052579114,
+      "cumtime": 13.657695052200001,
+      "percall_tot": 3.5052579114e-07,
+      "percall_cum": 1.36576950522e-06
+    },
+    {
+      "function": "contiguous.py:37(add)",
+      "ncalls": 10000000,
+      "tottime": 5.303862841599999,
+      "cumtime": 5.9952392386,
+      "percall_tot": 5.3038628416e-07,
+      "percall_cum": 5.9952392386e-07
+    },
+    {
+      "function": "logarithmic.py:12(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 2.7194350750000007,
+      "cumtime": 4.1571979022,
+      "percall_tot": 2.7194350750000006e-07,
+      "percall_cum": 4.157197902200001e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.7966471508,
+      "cumtime": 0.7966471508,
+      "percall_tot": 7.966470711352928e-08,
+      "percall_cum": 7.966470711352928e-08
+    },
+    {
+      "function": "~:0(<built-in method builtins.len>)",
+      "ncalls": 10003315,
+      "tottime": 0.6915629914,
+      "cumtime": 0.6915629914,
+      "percall_tot": 6.913336317554129e-08,
+      "percall_cum": 6.913336317554129e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.6411181274000001,
+      "cumtime": 0.6411181274000001,
+      "percall_tot": 6.411181274e-08,
+      "percall_cum": 6.411181274e-08
+    },
+    {
+      "function": "core.py:128(quantile)",
+      "ncalls": 4,
+      "tottime": 0.0005669172000000001,
+      "cumtime": 0.0017470114000000003,
+      "percall_tot": 0.00014172930000000002,
+      "percall_cum": 0.00043675285000000007
+    },
+    {
+      "function": "contiguous.py:167(get_count)",
+      "ncalls": 3297,
+      "tottime": 0.0009659642000000001,
+      "cumtime": 0.0011525586,
+      "percall_tot": 2.927350312418232e-07,
+      "percall_cum": 3.491643699871712e-07
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 4.4151600000000006e-05,
+      "cumtime": 4.4151600000000006e-05,
+      "percall_tot": 4.4151600000000006e-05,
+      "percall_cum": 4.4151600000000006e-05
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 7.417200000000001e-06,
+      "cumtime": 4.3150400000000006e-05,
+      "percall_tot": 7.417200000000001e-06,
+      "percall_cum": 4.3150400000000006e-05
+    },
+    {
+      "function": "contiguous.py:19(__init__)",
+      "ncalls": 2,
+      "tottime": 9.9896e-06,
+      "cumtime": 2.8892999999999996e-05,
+      "percall_tot": 4.9948e-06,
+      "percall_cum": 1.4446499999999998e-05
+    },
+    {
+      "function": "logarithmic.py:18(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 2.4520599999999997e-05,
+      "cumtime": 2.7535599999999998e-05,
+      "percall_tot": 6.130149999999999e-06,
+      "percall_cum": 6.8838999999999994e-06
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 1.6609e-05,
+      "cumtime": 1.6609e-05,
+      "percall_tot": 8.3045e-06,
+      "percall_cum": 8.3045e-06
+    },
+    {
+      "function": "logarithmic.py:7(__init__)",
+      "ncalls": 1,
+      "tottime": 4.3892e-06,
+      "cumtime": 6.8402e-06,
+      "percall_tot": 4.3892e-06,
+      "percall_cum": 6.8402e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 3.0150000000000004e-06,
+      "cumtime": 3.0150000000000004e-06,
+      "percall_tot": 7.537500000000001e-07,
+      "percall_cum": 7.537500000000001e-07
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 2.2944000000000004e-06,
+      "cumtime": 2.2944000000000004e-06,
+      "percall_tot": 1.1472000000000002e-06,
+      "percall_cum": 1.1472000000000002e-06
+    }
+  ]
+}
\ No newline at end of file
diff --git a/profile_ddsketch.py b/profile_ddsketch.py
index f4a769a..e30aa51 100644
--- a/profile_ddsketch.py
+++ b/profile_ddsketch.py
@@ -1,6 +1,5 @@
 import cProfile
 import pstats
-import io
 import json
 import argparse
 from pathlib import Path
@@ -199,8 +198,6 @@ def merge_trial_stats(all_trial_stats: List[List[Dict]]) -> Tuple[List[Dict], Li
     for func, stats_list in func_stats.items():
         if not stats_list:
             continue
-            
-        n = len(stats_list)
         
         # Compute means
         mean_stat = {
@@ -327,7 +324,7 @@ def profile(num_values: int = 10_000_000,
     print(f"  • Values inserted per trial: {results['num_values']:,}")
     if num_trials > 1:
         print(f"  • Total values processed: {results['num_values'] * num_trials:,}")
-    print(f"  • Sample quantiles from last trial:")
+    print("  • Sample quantiles from last trial:")
     for q, val in results['quantiles'].items():
         print(f"    - Q({q}): {val}")
     

From a015364d180dda4c30792d5ae35a4ebede4c8bc5 Mon Sep 17 00:00:00 2001
From: Ryan Ji <tairanjiryan@gmail.com>
Date: Sat, 10 Jan 2026 16:19:02 -0500
Subject: [PATCH 3/7] Remove redundant int casting in mapping; remove redundant
 positive value check

---
 .../ddsketch/mapping/cubic_interpolation.py   |   7 +-
 .../ddsketch/mapping/linear_interpolation.py  |   7 +-
 QuantileFlow/ddsketch/mapping/logarithmic.py  |   4 +-
 .../compute_bucket_index_redundancy.json      | 147 ++++++++++++++++++
 benchmarks/numba_mapping.json                 | 147 ++++++++++++++++++
 5 files changed, 299 insertions(+), 13 deletions(-)
 create mode 100644 benchmarks/compute_bucket_index_redundancy.json
 create mode 100644 benchmarks/numba_mapping.json

diff --git a/QuantileFlow/ddsketch/mapping/cubic_interpolation.py b/QuantileFlow/ddsketch/mapping/cubic_interpolation.py
index b1f37f6..c8db29a 100644
--- a/QuantileFlow/ddsketch/mapping/cubic_interpolation.py
+++ b/QuantileFlow/ddsketch/mapping/cubic_interpolation.py
@@ -59,9 +59,6 @@ def _cubic_interpolation(self, s: float) -> float:
         return s * (self.C + s * (self.B + s * self.A))
         
     def compute_bucket_index(self, value: float) -> int:
-        if value <= 0:
-            raise ValueError("Value must be positive")
-            
         # Get binary exponent and normalized significand
         exponent, significand = self._extract_exponent_and_significand(value)
         
@@ -73,7 +70,7 @@ def compute_bucket_index(self, value: float) -> int:
         # where m is the optimal multiplier, e is the exponent,
         # P(s) is the cubic interpolation, and γ is (1+α)/(1-α)
         index = self.m * (exponent + interpolated) / self.log2_gamma
-        return int(math.ceil(index))
+        return math.ceil(index)
         
     def compute_value_from_index(self, index: float) -> float:
         """
@@ -84,7 +81,7 @@ def compute_value_from_index(self, index: float) -> float:
         target = (index * self.log2_gamma) / self.m
         
         # Extract integer and fractional parts
-        e = int(math.floor(target))
+        e = math.floor(target)
         f = target - e
         
         # If f is close to 0 or 1, return power of 2 directly
diff --git a/QuantileFlow/ddsketch/mapping/linear_interpolation.py b/QuantileFlow/ddsketch/mapping/linear_interpolation.py
index 97506a4..8660453 100644
--- a/QuantileFlow/ddsketch/mapping/linear_interpolation.py
+++ b/QuantileFlow/ddsketch/mapping/linear_interpolation.py
@@ -30,9 +30,6 @@ def _extract_exponent(self, value: float) -> tuple[int, float]:
         return exponent, normalized_fraction
         
     def compute_bucket_index(self, value: float) -> int:
-        if value <= 0:
-            raise ValueError("Value must be positive")
-            
         # Get binary exponent and normalized fraction
         exponent, normalized_fraction = self._extract_exponent(value)
         
@@ -42,7 +39,7 @@ def compute_bucket_index(self, value: float) -> int:
         
         # Compute final index
         log2_value = exponent + log2_fraction
-        return int(math.ceil(log2_value / self.log_gamma))
+        return math.ceil(log2_value / self.log_gamma)
         
     def compute_value_from_index(self, index: int) -> float:
         """
@@ -58,7 +55,7 @@ def compute_value_from_index(self, index: int) -> float:
         log2_value = index * self.log_gamma
         
         # Extract the integer and fractional parts of log2_value
-        exponent = int(math.floor(log2_value) + 1)
+        exponent = math.floor(log2_value) + 1
         mantissa = (log2_value - exponent + 2) / 2.0
         
         # Use ldexp to efficiently compute 2^exponent * mantissa
diff --git a/QuantileFlow/ddsketch/mapping/logarithmic.py b/QuantileFlow/ddsketch/mapping/logarithmic.py
index 3fea7b1..fd25332 100644
--- a/QuantileFlow/ddsketch/mapping/logarithmic.py
+++ b/QuantileFlow/ddsketch/mapping/logarithmic.py
@@ -10,10 +10,8 @@ def __init__(self, relative_accuracy: float):
         self.multiplier = 1 / math.log(self.gamma)
         
     def compute_bucket_index(self, value: float) -> int:
-        if value <= 0:
-            raise ValueError(f"Value must be positive, got {value}")
         # ceil(log_gamma(value) = ceil(log(value) / log(gamma))
-        return int(math.ceil(math.log(value) * self.multiplier))
+        return math.ceil(math.log(value) * self.multiplier)
     
     def compute_value_from_index(self, index: int) -> float:
         # Return geometric mean of bucket boundaries
diff --git a/benchmarks/compute_bucket_index_redundancy.json b/benchmarks/compute_bucket_index_redundancy.json
new file mode 100644
index 0000000..cb79230
--- /dev/null
+++ b/benchmarks/compute_bucket_index_redundancy.json
@@ -0,0 +1,147 @@
+{
+  "timestamp": "2026-01-10T16:17:23.669749",
+  "name": "compute_bucket_index_redundancy",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-10T16:17:23.669737"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:17(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.3887002275999998,
+      "cumtime": 13.0266837678,
+      "percall_tot": 1.3887002275999998,
+      "percall_cum": 13.0266837678
+    },
+    {
+      "function": "core.py:77(insert)",
+      "ncalls": 10000000,
+      "tottime": 3.2283954910000006,
+      "cumtime": 11.6364473224,
+      "percall_tot": 3.228395491e-07,
+      "percall_cum": 1.1636447322399998e-06
+    },
+    {
+      "function": "contiguous.py:37(add)",
+      "ncalls": 10000000,
+      "tottime": 4.568145479,
+      "cumtime": 5.2062685638,
+      "percall_tot": 4.568145479000001e-07,
+      "percall_cum": 5.206268563800001e-07
+    },
+    {
+      "function": "logarithmic.py:12(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 1.903147694,
+      "cumtime": 3.2017832675999998,
+      "percall_tot": 1.9031476940000004e-07,
+      "percall_cum": 3.2017832676e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.7143321826000001,
+      "cumtime": 0.7143321826000001,
+      "percall_tot": 7.14332111166789e-08,
+      "percall_cum": 7.14332111166789e-08
+    },
+    {
+      "function": "~:0(<built-in method builtins.len>)",
+      "ncalls": 10003244,
+      "tottime": 0.6382800820000002,
+      "cumtime": 0.6382800820000002,
+      "percall_tot": 6.380730510039856e-08,
+      "percall_cum": 6.380730510039856e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.584304876,
+      "cumtime": 0.584304876,
+      "percall_tot": 5.8430487600000004e-08,
+      "percall_cum": 5.8430487600000004e-08
+    },
+    {
+      "function": "core.py:128(quantile)",
+      "ncalls": 4,
+      "tottime": 0.0004936750000000001,
+      "cumtime": 0.0014924204000000003,
+      "percall_tot": 0.00012341875000000001,
+      "percall_cum": 0.0003731051000000001
+    },
+    {
+      "function": "contiguous.py:167(get_count)",
+      "ncalls": 3223,
+      "tottime": 0.0008286914000000001,
+      "cumtime": 0.0009856886,
+      "percall_tot": 2.571817063873498e-07,
+      "percall_cum": 3.0584562750259306e-07
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 1.12378e-05,
+      "cumtime": 4.37974e-05,
+      "percall_tot": 1.12378e-05,
+      "percall_cum": 4.37974e-05
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 3.56454e-05,
+      "cumtime": 3.56454e-05,
+      "percall_tot": 3.56454e-05,
+      "percall_cum": 3.56454e-05
+    },
+    {
+      "function": "contiguous.py:19(__init__)",
+      "ncalls": 2,
+      "tottime": 1.10118e-05,
+      "cumtime": 2.7115199999999997e-05,
+      "percall_tot": 5.5059e-06,
+      "percall_cum": 1.3557599999999999e-05
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 1.4002600000000002e-05,
+      "cumtime": 1.4002600000000002e-05,
+      "percall_tot": 7.001300000000001e-06,
+      "percall_cum": 7.001300000000001e-06
+    },
+    {
+      "function": "logarithmic.py:16(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 1.12992e-05,
+      "cumtime": 1.3056800000000002e-05,
+      "percall_tot": 2.8248e-06,
+      "percall_cum": 3.2642000000000004e-06
+    },
+    {
+      "function": "logarithmic.py:7(__init__)",
+      "ncalls": 1,
+      "tottime": 3.959400000000001e-06,
+      "cumtime": 5.4444e-06,
+      "percall_tot": 3.959400000000001e-06,
+      "percall_cum": 5.4444e-06
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 2.1008e-06,
+      "cumtime": 2.1008e-06,
+      "percall_tot": 1.0504e-06,
+      "percall_cum": 1.0504e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 1.7576000000000003e-06,
+      "cumtime": 1.7576000000000003e-06,
+      "percall_tot": 4.394000000000001e-07,
+      "percall_cum": 4.394000000000001e-07
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/numba_mapping.json b/benchmarks/numba_mapping.json
new file mode 100644
index 0000000..7982e63
--- /dev/null
+++ b/benchmarks/numba_mapping.json
@@ -0,0 +1,147 @@
+{
+  "timestamp": "2026-01-10T16:02:37.248118",
+  "name": "numba_mapping",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-10T16:02:37.248088"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:17(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.5803080648,
+      "cumtime": 15.3215635052,
+      "percall_tot": 1.5803080648,
+      "percall_cum": 15.3215635052
+    },
+    {
+      "function": "core.py:80(insert)",
+      "ncalls": 10000000,
+      "tottime": 3.561579956,
+      "cumtime": 13.739444904199999,
+      "percall_tot": 3.561579956e-07,
+      "percall_cum": 1.3739444904200001e-06
+    },
+    {
+      "function": "contiguous.py:37(add)",
+      "ncalls": 10000000,
+      "tottime": 4.967856963600001,
+      "cumtime": 5.6552094486,
+      "percall_tot": 4.9678569636e-07,
+      "percall_cum": 5.6552094486e-07
+    },
+    {
+      "function": "logarithmic.py:35(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 3.040597505,
+      "cumtime": 4.522655499600001,
+      "percall_tot": 3.0405975050000003e-07,
+      "percall_cum": 4.5226554996000005e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.775530691,
+      "cumtime": 0.775530691,
+      "percall_tot": 7.755306134469388e-08,
+      "percall_cum": 7.755306134469388e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.7065290746,
+      "cumtime": 0.7065290746,
+      "percall_tot": 7.065290746000001e-08,
+      "percall_cum": 7.065290746000001e-08
+    },
+    {
+      "function": "~:0(<built-in method builtins.len>)",
+      "ncalls": 10003501,
+      "tottime": 0.6875303102,
+      "cumtime": 0.6875303102,
+      "percall_tot": 6.87289631095811e-08,
+      "percall_cum": 6.87289631095811e-08
+    },
+    {
+      "function": "core.py:131(quantile)",
+      "ncalls": 4,
+      "tottime": 0.0005834166000000001,
+      "cumtime": 0.0017633896,
+      "percall_tot": 0.00014585415000000003,
+      "percall_cum": 0.0004408474
+    },
+    {
+      "function": "contiguous.py:167(get_count)",
+      "ncalls": 3482,
+      "tottime": 0.0009676952,
+      "cumtime": 0.0011455204,
+      "percall_tot": 2.784679111662184e-07,
+      "percall_cum": 3.2965915502525064e-07
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 4.857280000000001e-05,
+      "cumtime": 4.857280000000001e-05,
+      "percall_tot": 4.857280000000001e-05,
+      "percall_cum": 4.857280000000001e-05
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 1.14384e-05,
+      "cumtime": 4.71466e-05,
+      "percall_tot": 1.14384e-05,
+      "percall_cum": 4.71466e-05
+    },
+    {
+      "function": "logarithmic.py:47(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 3.120040000000001e-05,
+      "cumtime": 3.44526e-05,
+      "percall_tot": 7.800100000000002e-06,
+      "percall_cum": 8.61315e-06
+    },
+    {
+      "function": "contiguous.py:19(__init__)",
+      "ncalls": 2,
+      "tottime": 1.0658400000000001e-05,
+      "cumtime": 2.9616400000000003e-05,
+      "percall_tot": 5.329200000000001e-06,
+      "percall_cum": 1.4808200000000002e-05
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 1.6537600000000002e-05,
+      "cumtime": 1.6537600000000002e-05,
+      "percall_tot": 8.268800000000001e-06,
+      "percall_cum": 8.268800000000001e-06
+    },
+    {
+      "function": "logarithmic.py:29(__init__)",
+      "ncalls": 1,
+      "tottime": 4.3208e-06,
+      "cumtime": 6.0918e-06,
+      "percall_tot": 4.3208e-06,
+      "percall_cum": 6.0918e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 3.2522000000000004e-06,
+      "cumtime": 3.2522000000000004e-06,
+      "percall_tot": 8.130500000000001e-07,
+      "percall_cum": 8.130500000000001e-07
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 2.4204e-06,
+      "cumtime": 2.4204e-06,
+      "percall_tot": 1.2102e-06,
+      "percall_cum": 1.2102e-06
+    }
+  ]
+}
\ No newline at end of file

From cc016c1f1107da79421cbf1ab44a23f89a53c9c0 Mon Sep 17 00:00:00 2001
From: Ryan Ji <tairanjiryan@gmail.com>
Date: Sat, 10 Jan 2026 16:28:46 -0500
Subject: [PATCH 4/7] Cached len(self.counts) in contiguous storage

---
 QuantileFlow/ddsketch/storage/contiguous.py |  25 ++--
 benchmarks/cache_len_self_counts.json       | 139 ++++++++++++++++++++
 2 files changed, 152 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/cache_len_self_counts.json

diff --git a/QuantileFlow/ddsketch/storage/contiguous.py b/QuantileFlow/ddsketch/storage/contiguous.py
index 55176d8..b2bd949 100644
--- a/QuantileFlow/ddsketch/storage/contiguous.py
+++ b/QuantileFlow/ddsketch/storage/contiguous.py
@@ -28,6 +28,7 @@ def __init__(self, max_buckets: int = 2048):
         super().__init__(max_buckets, BucketManagementStrategy.FIXED)
         self.total_count = 0
         self.counts = np.zeros(max_buckets, dtype=np.int64)
+        self.max_buckets = max_buckets
         self.min_index = None  # Minimum bucket index seen
         self.max_index = None  # Maximum bucket index seen
         self.num_buckets = 0   # Number of non-zero buckets
@@ -56,9 +57,9 @@ def add(self, bucket_index: int, count: int = 1):
             if bucket_index < self.min_index:
                 new_range = self.max_index - bucket_index + 1
                 # Handle insertion below current minimum
-                if new_range > len(self.counts):
+                if new_range > self.max_buckets:
                     # Range too large, collapse into min bucket
-                    pos = (self.arr_index_of_min_bucket) % len(self.counts)
+                    pos = (self.arr_index_of_min_bucket) % self.max_buckets
                     self.counts[pos] += count
                     self.collapse_count += 1
                 else:
@@ -66,13 +67,13 @@ def add(self, bucket_index: int, count: int = 1):
                     shift = self.min_index - bucket_index
                     self.min_index = bucket_index
                     self.arr_index_of_min_bucket = self.arr_index_of_min_bucket - shift
-                    pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
+                    pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % self.max_buckets
                     self.counts[pos] = count
                     self.num_buckets += 1
                     
             elif bucket_index > self.max_index:
                 new_range = bucket_index - self.min_index + 1
-                if new_range > len(self.counts):
+                if new_range > self.max_buckets:
                     # Handle insertion above current maximum
                     buckets_to_collapse = bucket_index - self.max_index
                     # Collapse lowest buckets
@@ -89,7 +90,7 @@ def add(self, bucket_index: int, count: int = 1):
                         
                     # Add collapsed values to new min bucket
                     new_min = self.min_index + buckets_to_collapse
-                    new_min_pos = (buckets_to_collapse + self.arr_index_of_min_bucket) % len(self.counts)
+                    new_min_pos = (buckets_to_collapse + self.arr_index_of_min_bucket) % self.max_buckets
                     self.counts[new_min_pos] += collapse_sum
                     
                     # Update tracking variables
@@ -99,14 +100,14 @@ def add(self, bucket_index: int, count: int = 1):
                 
                 # Place new value
                 self.max_index = bucket_index
-                pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
+                pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % self.max_buckets
                 was_zero = self.counts[pos] == 0
                 self.counts[pos] += count
                 if was_zero:
                     self.num_buckets += 1
             else:
                 # Normal insertion within current range
-                pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
+                pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % self.max_buckets
                 was_zero = self.counts[pos] == 0
                 self.counts[pos] += count
                 if was_zero:
@@ -129,7 +130,7 @@ def remove(self, bucket_index: int, count: int = 1) -> bool:
             return False
             
         if self.min_index <= bucket_index <= self.max_index:
-            pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
+            pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % self.max_buckets
             old_count = self.counts[pos]
             
             if old_count == 0:
@@ -146,7 +147,7 @@ def remove(self, bucket_index: int, count: int = 1) -> bool:
                 elif bucket_index == self.min_index:
                     # Find new minimum index
                     for i in range(self.max_index - self.min_index + 1):
-                        pos = (self.arr_index_of_min_bucket + i) % len(self.counts)
+                        pos = (self.arr_index_of_min_bucket + i) % self.max_buckets
                         if self.counts[pos] > 0:
                             self.min_index += i
                             self.arr_index_of_min_bucket = pos
@@ -154,7 +155,7 @@ def remove(self, bucket_index: int, count: int = 1) -> bool:
                 elif bucket_index == self.max_index:
                     # Find new maximum index
                     for i in range(self.max_index - self.min_index + 1):
-                        pos = (self.arr_index_of_min_bucket + (self.max_index - self.min_index - i)) % len(self.counts)
+                        pos = (self.arr_index_of_min_bucket + (self.max_index - self.min_index - i)) % self.max_buckets
                         if self.counts[pos] > 0:
                             self.max_index -= i
                             break
@@ -177,7 +178,7 @@ def get_count(self, bucket_index: int) -> int:
         if self.min_index is None or bucket_index < self.min_index or bucket_index > self.max_index:
             warnings.warn("Bucket index is out of range. Returning 0.", UserWarning)
             return 0
-        pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % len(self.counts)
+        pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % self.max_buckets
         return int(self.counts[pos])
     
     def merge(self, other: 'ContiguousStorage'):
@@ -192,7 +193,7 @@ def merge(self, other: 'ContiguousStorage'):
             
         # Add each non-zero bucket
         for i in range(other.max_index - other.min_index + 1):
-            pos = (other.arr_index_of_min_bucket + i) % len(other.counts)
+            pos = (other.arr_index_of_min_bucket + i) % other.max_buckets
             if other.counts[pos] > 0:
                 bucket_index = other.min_index + i
                 self.add(bucket_index, int(other.counts[pos]))
diff --git a/benchmarks/cache_len_self_counts.json b/benchmarks/cache_len_self_counts.json
new file mode 100644
index 0000000..dfcc096
--- /dev/null
+++ b/benchmarks/cache_len_self_counts.json
@@ -0,0 +1,139 @@
+{
+  "timestamp": "2026-01-10T16:26:33.986951",
+  "name": "cache_len_self_counts",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-10T16:26:33.986892"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:17(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.3366227698000002,
+      "cumtime": 11.3580648152,
+      "percall_tot": 1.3366227698000002,
+      "percall_cum": 11.3580648152
+    },
+    {
+      "function": "core.py:77(insert)",
+      "ncalls": 10000000,
+      "tottime": 3.1263041402000002,
+      "cumtime": 10.0199071374,
+      "percall_tot": 3.1263041402e-07,
+      "percall_cum": 1.0019907137400002e-06
+    },
+    {
+      "function": "contiguous.py:38(add)",
+      "ncalls": 10000000,
+      "tottime": 3.8569736876,
+      "cumtime": 3.8569736876,
+      "percall_tot": 3.8569736875999994e-07,
+      "percall_cum": 3.8569736875999994e-07
+    },
+    {
+      "function": "logarithmic.py:12(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 1.8100902910000003,
+      "cumtime": 3.0366293096,
+      "percall_tot": 1.810090291e-07,
+      "percall_cum": 3.0366293096000003e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.671522876,
+      "cumtime": 0.671522876,
+      "percall_tot": 6.71522808847719e-08,
+      "percall_cum": 6.71522808847719e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.555017911,
+      "cumtime": 0.555017911,
+      "percall_tot": 5.550179110000001e-08,
+      "percall_cum": 5.550179110000001e-08
+    },
+    {
+      "function": "core.py:128(quantile)",
+      "ncalls": 4,
+      "tottime": 0.0006492008,
+      "cumtime": 0.0014758362000000002,
+      "percall_tot": 0.0001623002,
+      "percall_cum": 0.00036895905000000005
+    },
+    {
+      "function": "contiguous.py:168(get_count)",
+      "ncalls": 3175,
+      "tottime": 0.0007850656000000001,
+      "cumtime": 0.0007850656000000001,
+      "percall_tot": 2.450894805204026e-07,
+      "percall_cum": 2.450894805204026e-07
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 8.736200000000001e-06,
+      "cumtime": 5.907179999999999e-05,
+      "percall_tot": 8.736200000000001e-06,
+      "percall_cum": 5.907179999999999e-05
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 4.9611800000000005e-05,
+      "cumtime": 4.9611800000000005e-05,
+      "percall_tot": 4.9611800000000005e-05,
+      "percall_cum": 4.9611800000000005e-05
+    },
+    {
+      "function": "contiguous.py:19(__init__)",
+      "ncalls": 2,
+      "tottime": 1.16142e-05,
+      "cumtime": 4.36792e-05,
+      "percall_tot": 5.8071e-06,
+      "percall_cum": 2.18396e-05
+    },
+    {
+      "function": "logarithmic.py:16(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 3.6364999999999995e-05,
+      "cumtime": 4.15698e-05,
+      "percall_tot": 9.091249999999999e-06,
+      "percall_cum": 1.039245e-05
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 2.748e-05,
+      "cumtime": 2.748e-05,
+      "percall_tot": 1.374e-05,
+      "percall_cum": 1.374e-05
+    },
+    {
+      "function": "logarithmic.py:7(__init__)",
+      "ncalls": 1,
+      "tottime": 4.8880000000000005e-06,
+      "cumtime": 6.656400000000001e-06,
+      "percall_tot": 4.8880000000000005e-06,
+      "percall_cum": 6.656400000000001e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 5.204800000000001e-06,
+      "cumtime": 5.204800000000001e-06,
+      "percall_tot": 1.3012000000000003e-06,
+      "percall_cum": 1.3012000000000003e-06
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 4.584999999999999e-06,
+      "cumtime": 4.584999999999999e-06,
+      "percall_tot": 2.2924999999999997e-06,
+      "percall_cum": 2.2924999999999997e-06
+    }
+  ]
+}
\ No newline at end of file

From 91c3c865d17d741d1ca823d2ab7fd11b4edec24d Mon Sep 17 00:00:00 2001
From: Ryan Ji <tairanjiryan@gmail.com>
Date: Sat, 10 Jan 2026 16:38:25 -0500
Subject: [PATCH 5/7] Cleaned up insert comparisons

---
 QuantileFlow/ddsketch/core.py             |   4 +-
 benchmarks/5streamline_insert_method.json | 139 ++++++++++++++++++++++
 2 files changed, 141 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/5streamline_insert_method.json

diff --git a/QuantileFlow/ddsketch/core.py b/QuantileFlow/ddsketch/core.py
index f3d9b3c..1a623a9 100644
--- a/QuantileFlow/ddsketch/core.py
+++ b/QuantileFlow/ddsketch/core.py
@@ -89,10 +89,10 @@ def insert(self, value: Union[int, float]) -> None:
         elif value > 0:
             bucket_idx = self.mapping.compute_bucket_index(value)
             self.positive_store.add(bucket_idx)
-        elif value < 0 and self.cont_neg:
+        elif self.cont_neg:
             bucket_idx = self.mapping.compute_bucket_index(-value)
             self.negative_store.add(bucket_idx)
-        elif value < 0:
+        else:
             raise ValueError("Negative values not supported when cont_neg is False")
         self.count += 1
     
diff --git a/benchmarks/5streamline_insert_method.json b/benchmarks/5streamline_insert_method.json
new file mode 100644
index 0000000..0b4645b
--- /dev/null
+++ b/benchmarks/5streamline_insert_method.json
@@ -0,0 +1,139 @@
+{
+  "timestamp": "2026-01-10T16:37:06.396876",
+  "name": "5streamline_insert_method",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-10T16:37:06.396868"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:17(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.3612975308000004,
+      "cumtime": 11.6981027108,
+      "percall_tot": 1.3612975308000004,
+      "percall_cum": 11.6981027108
+    },
+    {
+      "function": "core.py:77(insert)",
+      "ncalls": 10000000,
+      "tottime": 3.2169409506,
+      "cumtime": 10.3354853876,
+      "percall_tot": 3.2169409506e-07,
+      "percall_cum": 1.0335485387600001e-06
+    },
+    {
+      "function": "contiguous.py:38(add)",
+      "ncalls": 10000000,
+      "tottime": 3.9631574034000003,
+      "cumtime": 3.9631574034000003,
+      "percall_tot": 3.9631574034000003e-07,
+      "percall_cum": 3.9631574034000003e-07
+    },
+    {
+      "function": "logarithmic.py:12(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 1.8843829416000002,
+      "cumtime": 3.1553870336000003,
+      "percall_tot": 1.8843829416000001e-07,
+      "percall_cum": 3.1553870336e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.6960204540000001,
+      "cumtime": 0.6960204540000001,
+      "percall_tot": 6.960203843979616e-08,
+      "percall_cum": 6.960203843979616e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.5749850052000001,
+      "cumtime": 0.5749850052000001,
+      "percall_tot": 5.749850052000001e-08,
+      "percall_cum": 5.749850052000001e-08
+    },
+    {
+      "function": "core.py:128(quantile)",
+      "ncalls": 4,
+      "tottime": 0.0005329822000000001,
+      "cumtime": 0.0012824054,
+      "percall_tot": 0.00013324555000000001,
+      "percall_cum": 0.00032060135
+    },
+    {
+      "function": "contiguous.py:168(get_count)",
+      "ncalls": 3494,
+      "tottime": 0.0007340835999999999,
+      "cumtime": 0.0007340835999999999,
+      "percall_tot": 2.0989430127963593e-07,
+      "percall_cum": 2.0989430127963593e-07
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 5.505400000000001e-06,
+      "cumtime": 3.7387e-05,
+      "percall_tot": 5.505400000000001e-06,
+      "percall_cum": 3.7387e-05
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 3.31294e-05,
+      "cumtime": 3.31294e-05,
+      "percall_tot": 3.31294e-05,
+      "percall_cum": 3.31294e-05
+    },
+    {
+      "function": "contiguous.py:19(__init__)",
+      "ncalls": 2,
+      "tottime": 8.434000000000001e-06,
+      "cumtime": 2.6972e-05,
+      "percall_tot": 4.2170000000000005e-06,
+      "percall_cum": 1.3486e-05
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 1.6823e-05,
+      "cumtime": 1.6823e-05,
+      "percall_tot": 8.4115e-06,
+      "percall_cum": 8.4115e-06
+    },
+    {
+      "function": "logarithmic.py:16(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 1.3271000000000002e-05,
+      "cumtime": 1.53396e-05,
+      "percall_tot": 3.3177500000000005e-06,
+      "percall_cum": 3.8349e-06
+    },
+    {
+      "function": "logarithmic.py:7(__init__)",
+      "ncalls": 1,
+      "tottime": 3.5424000000000006e-06,
+      "cumtime": 4.9096e-06,
+      "percall_tot": 3.5424000000000006e-06,
+      "percall_cum": 4.9096e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 2.0686e-06,
+      "cumtime": 2.0686e-06,
+      "percall_tot": 5.1715e-07,
+      "percall_cum": 5.1715e-07
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 1.7150000000000003e-06,
+      "cumtime": 1.7150000000000003e-06,
+      "percall_tot": 8.575000000000002e-07,
+      "percall_cum": 8.575000000000002e-07
+    }
+  ]
+}
\ No newline at end of file

From b3e245f84dc6190739b841a6ca822b02db50dac0 Mon Sep 17 00:00:00 2001
From: Ryan Ji <tairanjiryan@gmail.com>
Date: Sat, 10 Jan 2026 17:32:03 -0500
Subject: [PATCH 6/7] Added slots

---
 QuantileFlow/ddsketch/mapping/logarithmic.py |   4 +-
 QuantileFlow/ddsketch/storage/contiguous.py  |   4 +
 benchmarks/6add_slots.json                   | 139 ++++++++++++
 benchmarks/7reword_value_comparisons.json    | 139 ++++++++++++
 profile_ddsketch.py                          | 215 ++++++++++++++++++-
 5 files changed, 491 insertions(+), 10 deletions(-)
 create mode 100644 benchmarks/6add_slots.json
 create mode 100644 benchmarks/7reword_value_comparisons.json

diff --git a/QuantileFlow/ddsketch/mapping/logarithmic.py b/QuantileFlow/ddsketch/mapping/logarithmic.py
index fd25332..a495921 100644
--- a/QuantileFlow/ddsketch/mapping/logarithmic.py
+++ b/QuantileFlow/ddsketch/mapping/logarithmic.py
@@ -4,7 +4,9 @@
 from .base import MappingScheme
 
 class LogarithmicMapping(MappingScheme):
-    def __init__(self, relative_accuracy: float):
+    __slots__ = ('relative_accuracy', 'gamma', 'multiplier')
+    
+    def __init__(self, relative_accuracy: float):        
         self.relative_accuracy = relative_accuracy
         self.gamma = (1 + relative_accuracy) / (1 - relative_accuracy)
         self.multiplier = 1 / math.log(self.gamma)
diff --git a/QuantileFlow/ddsketch/storage/contiguous.py b/QuantileFlow/ddsketch/storage/contiguous.py
index b2bd949..92ead6f 100644
--- a/QuantileFlow/ddsketch/storage/contiguous.py
+++ b/QuantileFlow/ddsketch/storage/contiguous.py
@@ -15,6 +15,10 @@ class ContiguousStorage(Storage):
     - If inserting below min: collapse if range too large, otherwise adjust min
     - If inserting above max: collapse lowest buckets to make room
     """
+
+    __slots__ = ('total_count', 'counts', 'min_index', 'max_index', 
+                 'num_buckets', 'arr_index_of_min_bucket', 'collapse_count',
+                 'max_buckets', 'bucket_mask', 'strategy')
     
     def __init__(self, max_buckets: int = 2048):
         """
diff --git a/benchmarks/6add_slots.json b/benchmarks/6add_slots.json
new file mode 100644
index 0000000..243e776
--- /dev/null
+++ b/benchmarks/6add_slots.json
@@ -0,0 +1,139 @@
+{
+  "timestamp": "2026-01-10T16:42:43.578971",
+  "name": "6add_slots",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-10T16:42:43.578962"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:17(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.33988225,
+      "cumtime": 11.5264426936,
+      "percall_tot": 1.33988225,
+      "percall_cum": 11.5264426936
+    },
+    {
+      "function": "core.py:77(insert)",
+      "ncalls": 10000000,
+      "tottime": 3.3348853683999997,
+      "cumtime": 10.185277087,
+      "percall_tot": 3.3348853684e-07,
+      "percall_cum": 1.0185277087e-06
+    },
+    {
+      "function": "contiguous.py:42(add)",
+      "ncalls": 10000000,
+      "tottime": 3.8266771040000003,
+      "cumtime": 3.8266771040000003,
+      "percall_tot": 3.826677104e-07,
+      "percall_cum": 3.826677104e-07
+    },
+    {
+      "function": "logarithmic.py:14(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 1.7708338908,
+      "cumtime": 3.0237146146000002,
+      "percall_tot": 1.7708338908000002e-07,
+      "percall_cum": 3.0237146146e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.6956715138,
+      "cumtime": 0.6956715138,
+      "percall_tot": 6.956714442328556e-08,
+      "percall_cum": 6.956714442328556e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.5572106932,
+      "cumtime": 0.5572106932,
+      "percall_tot": 5.572106932e-08,
+      "percall_cum": 5.572106932e-08
+    },
+    {
+      "function": "core.py:128(quantile)",
+      "ncalls": 4,
+      "tottime": 0.000548316,
+      "cumtime": 0.0012531198,
+      "percall_tot": 0.000137079,
+      "percall_cum": 0.00031327995
+    },
+    {
+      "function": "contiguous.py:172(get_count)",
+      "ncalls": 3324,
+      "tottime": 0.0006952066000000001,
+      "cumtime": 0.0006952066000000001,
+      "percall_tot": 2.092879594892521e-07,
+      "percall_cum": 2.092879594892521e-07
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 5.0892e-06,
+      "cumtime": 3.0236799999999995e-05,
+      "percall_tot": 5.0892e-06,
+      "percall_cum": 3.0236799999999995e-05
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 2.64168e-05,
+      "cumtime": 2.64168e-05,
+      "percall_tot": 2.64168e-05,
+      "percall_cum": 2.64168e-05
+    },
+    {
+      "function": "contiguous.py:23(__init__)",
+      "ncalls": 2,
+      "tottime": 7.3314e-06,
+      "cumtime": 2.0227400000000002e-05,
+      "percall_tot": 3.6657e-06,
+      "percall_cum": 1.0113700000000001e-05
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 1.0474e-05,
+      "cumtime": 1.0474e-05,
+      "percall_tot": 5.237e-06,
+      "percall_cum": 5.237e-06
+    },
+    {
+      "function": "logarithmic.py:18(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 7.9294e-06,
+      "cumtime": 9.5972e-06,
+      "percall_tot": 1.98235e-06,
+      "percall_cum": 2.3993e-06
+    },
+    {
+      "function": "logarithmic.py:9(__init__)",
+      "ncalls": 1,
+      "tottime": 3.4370000000000003e-06,
+      "cumtime": 4.9202e-06,
+      "percall_tot": 3.4370000000000003e-06,
+      "percall_cum": 4.9202e-06
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 2.422e-06,
+      "cumtime": 2.422e-06,
+      "percall_tot": 1.211e-06,
+      "percall_cum": 1.211e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 1.6678000000000002e-06,
+      "cumtime": 1.6678000000000002e-06,
+      "percall_tot": 4.1695000000000004e-07,
+      "percall_cum": 4.1695000000000004e-07
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/7reword_value_comparisons.json b/benchmarks/7reword_value_comparisons.json
new file mode 100644
index 0000000..2efe686
--- /dev/null
+++ b/benchmarks/7reword_value_comparisons.json
@@ -0,0 +1,139 @@
+{
+  "timestamp": "2026-01-10T17:13:13.833726",
+  "name": "7reword_value_comparisons",
+  "metadata": {
+    "num_values": 10000000,
+    "num_trials": 5,
+    "timestamp": "2026-01-10T17:13:13.833712"
+  },
+  "stats": [
+    {
+      "function": "profile_ddsketch.py:26(run_sketch_operations)",
+      "ncalls": 1,
+      "tottime": 1.8573772013999998,
+      "cumtime": 15.3256251814,
+      "percall_tot": 1.8573772013999998,
+      "percall_cum": 15.3256251814
+    },
+    {
+      "function": "core.py:77(insert)",
+      "ncalls": 10000000,
+      "tottime": 4.248560426,
+      "cumtime": 13.4664414144,
+      "percall_tot": 4.2485604260000007e-07,
+      "percall_cum": 1.34664414144e-06
+    },
+    {
+      "function": "contiguous.py:42(add)",
+      "ncalls": 10000000,
+      "tottime": 5.1584048128,
+      "cumtime": 5.1584048128,
+      "percall_tot": 5.158404812800001e-07,
+      "percall_cum": 5.158404812800001e-07
+    },
+    {
+      "function": "logarithmic.py:14(compute_bucket_index)",
+      "ncalls": 10000000,
+      "tottime": 2.3456515052,
+      "cumtime": 4.0594761756,
+      "percall_tot": 2.3456515052000006e-07,
+      "percall_cum": 4.0594761756000005e-07
+    },
+    {
+      "function": "~:0(<built-in method math.log>)",
+      "ncalls": 10000001,
+      "tottime": 0.9696837526,
+      "cumtime": 0.9696837526,
+      "percall_tot": 9.696836556316345e-08,
+      "percall_cum": 9.696836556316345e-08
+    },
+    {
+      "function": "~:0(<built-in method math.ceil>)",
+      "ncalls": 10000000,
+      "tottime": 0.7441432736,
+      "cumtime": 0.7441432736,
+      "percall_tot": 7.441432736e-08,
+      "percall_cum": 7.441432736e-08
+    },
+    {
+      "function": "core.py:129(quantile)",
+      "ncalls": 4,
+      "tottime": 0.0007785592000000001,
+      "cumtime": 0.0017527554,
+      "percall_tot": 0.00019463980000000003,
+      "percall_cum": 0.00043818885
+    },
+    {
+      "function": "contiguous.py:172(get_count)",
+      "ncalls": 3338,
+      "tottime": 0.0009408718,
+      "cumtime": 0.0009408718,
+      "percall_tot": 2.7929728771085877e-07,
+      "percall_cum": 2.7929728771085877e-07
+    },
+    {
+      "function": "core.py:24(__init__)",
+      "ncalls": 1,
+      "tottime": 1.0402600000000001e-05,
+      "cumtime": 5.38102e-05,
+      "percall_tot": 1.0402600000000001e-05,
+      "percall_cum": 5.38102e-05
+    },
+    {
+      "function": "~:0(<method 'disable' of '_lsprof.Profiler' objects>)",
+      "ncalls": 1,
+      "tottime": 4.279240000000001e-05,
+      "cumtime": 4.279240000000001e-05,
+      "percall_tot": 4.279240000000001e-05,
+      "percall_cum": 4.279240000000001e-05
+    },
+    {
+      "function": "contiguous.py:23(__init__)",
+      "ncalls": 2,
+      "tottime": 1.21986e-05,
+      "cumtime": 3.51634e-05,
+      "percall_tot": 6.0993e-06,
+      "percall_cum": 1.75817e-05
+    },
+    {
+      "function": "logarithmic.py:18(compute_value_from_index)",
+      "ncalls": 4,
+      "tottime": 2.9578800000000002e-05,
+      "cumtime": 3.332440000000001e-05,
+      "percall_tot": 7.3947000000000005e-06,
+      "percall_cum": 8.331100000000002e-06
+    },
+    {
+      "function": "~:0(<built-in method numpy.zeros>)",
+      "ncalls": 2,
+      "tottime": 1.8467600000000003e-05,
+      "cumtime": 1.8467600000000003e-05,
+      "percall_tot": 9.233800000000001e-06,
+      "percall_cum": 9.233800000000001e-06
+    },
+    {
+      "function": "logarithmic.py:9(__init__)",
+      "ncalls": 1,
+      "tottime": 5.8884e-06,
+      "cumtime": 8.244200000000001e-06,
+      "percall_tot": 5.8884e-06,
+      "percall_cum": 8.244200000000001e-06
+    },
+    {
+      "function": "base.py:17(__init__)",
+      "ncalls": 2,
+      "tottime": 4.4972e-06,
+      "cumtime": 4.4972e-06,
+      "percall_tot": 2.2486e-06,
+      "percall_cum": 2.2486e-06
+    },
+    {
+      "function": "~:0(<built-in method math.pow>)",
+      "ncalls": 4,
+      "tottime": 3.7456e-06,
+      "cumtime": 3.7456e-06,
+      "percall_tot": 9.364e-07,
+      "percall_cum": 9.364e-07
+    }
+  ]
+}
\ No newline at end of file
diff --git a/profile_ddsketch.py b/profile_ddsketch.py
index e30aa51..68c2e7f 100644
--- a/profile_ddsketch.py
+++ b/profile_ddsketch.py
@@ -6,26 +6,37 @@
 from datetime import datetime
 from typing import Dict, List, Tuple
 import numpy as np
+import sys
+from io import StringIO
 
 from QuantileFlow.ddsketch.core import DDSketch
 
+# Optional line profiler import
+try:
+    from line_profiler import LineProfiler
+    LINE_PROFILER_AVAILABLE = True
+except ImportError:
+    LINE_PROFILER_AVAILABLE = False
+
 
 BENCHMARK_DIR = Path("benchmarks")
 BENCHMARK_DIR.mkdir(exist_ok=True)
 
 
-def run_sketch_operations(num_values: int = 10_000_000) -> Dict:
+def run_sketch_operations(sketch: DDSketch, data: np.ndarray) -> Dict:
     """Runs typical DDSketch operations for profiling.
     
+    This function should be called ONLY when profiler is enabled,
+    so it doesn't include initialization or data generation overhead.
+    
+    Args:
+        sketch: Pre-initialized DDSketch instance
+        data: Pre-generated data array
+    
     Returns:
         Dict containing operation results and metrics
     """
-    sketch = DDSketch(relative_accuracy=0.01)
-    
-    # Generate random data
-    data = np.random.rand(num_values) * 1000
-    
-    # Track insertion time
+    # Core operations - ONLY these are profiled
     for value in data:
         sketch.insert(value)
         
@@ -40,7 +51,7 @@ def run_sketch_operations(num_values: int = 10_000_000) -> Dict:
             quantile_results[q] = f"Error: {e}"
     
     return {
-        'num_values': num_values,
+        'num_values': len(data),
         'quantiles': quantile_results
     }
 
@@ -298,10 +309,15 @@ def profile(num_values: int = 10_000_000,
         if num_trials > 1:
             print(f"\n▶ Running trial {trial + 1}/{num_trials}...")
         
+        # Initialize sketch and data BEFORE profiling (exclude overhead)
+        sketch = DDSketch(relative_accuracy=0.01)
+        data = np.random.rand(num_values) * 1000
+        
+        # Profile ONLY the core operations
         profiler = cProfile.Profile()
         profiler.enable()
         
-        results = run_sketch_operations(num_values)
+        results = run_sketch_operations(sketch, data)
         
         profiler.disable()
         
@@ -390,6 +406,155 @@ def profile(num_values: int = 10_000_000,
     print("\n✅ Profiling complete.\n")
 
 
+def line_profile(num_values: int = 1_000_000,
+                 functions_to_profile: List[str] = None,
+                 num_trials: int = 1):
+    """Perform line-by-line profiling of selected DDSketch functions.
+    
+    Args:
+        num_values: Number of values to insert per trial
+        functions_to_profile: Names to profile; each must be a key of function_map below (unknown names are skipped with a warning)
+        num_trials: Number of trials to run; only the last trial's report is printed (no averaging)
+    """
+    if not LINE_PROFILER_AVAILABLE:
+        print("\n❌ Error: line_profiler is not installed!")
+        print("   Install with: pip install line_profiler")
+        return
+    
+    # Import the classes whose methods can be selected for line profiling
+    from QuantileFlow.ddsketch.core import DDSketch
+    from QuantileFlow.ddsketch.storage.contiguous import ContiguousStorage
+    from QuantileFlow.ddsketch.mapping.logarithmic import LogarithmicMapping
+    
+    # Default functions to profile
+    if functions_to_profile is None:
+        functions_to_profile = ['insert', 'add', 'compute_bucket_index']
+    
+    # Map user-facing function names to the function objects registered with the profiler
+    function_map = {
+        'insert': DDSketch.insert,
+        'add': ContiguousStorage.add,
+        'remove': ContiguousStorage.remove,
+        'get_count': ContiguousStorage.get_count,
+        'compute_bucket_index': LogarithmicMapping.compute_bucket_index,
+        'compute_value_from_index': LogarithmicMapping.compute_value_from_index,
+        'quantile': DDSketch.quantile,
+    }
+    
+    # Resolve requested names to function objects, warning about unknown names
+    functions_to_profile_obj = []
+    for func_name in functions_to_profile:
+        if func_name in function_map:
+            functions_to_profile_obj.append(function_map[func_name])
+        else:
+            print(f"⚠️  Warning: Unknown function '{func_name}', skipping")
+    
+    if not functions_to_profile_obj:
+        print("\n❌ Error: No valid functions to profile!")
+        return
+    
+    print(f"\n{'Starting Line-Level Profile':^60}")
+    print('=' * 60)
+    print(f"Values per trial: {num_values:,}")
+    print(f"Number of trials: {num_trials}")
+    print("Functions to profile:")
+    for func in functions_to_profile:
+        if func in function_map:
+            print(f"  • {func}")
+    print('=' * 60)
+    
+    # Run every trial with a fresh sketch, data set and profiler; keep each report text
+    all_results = []
+    
+    for trial in range(num_trials):
+        if num_trials > 1:
+            print(f"\n▶ Running trial {trial + 1}/{num_trials}...")
+        
+        # Initialize sketch and data BEFORE profiling (exclude overhead)
+        sketch = DDSketch(relative_accuracy=0.01)
+        data = np.random.rand(num_values) * 1000
+        
+        # Create line profiler
+        lp = LineProfiler()
+        
+        # Add functions to profile
+        for func in functions_to_profile_obj:
+            lp.add_function(func)
+        
+        # Profile ONLY the core operations
+        lp.enable()
+        
+        # Run insertions directly in this loop (no wrapper function around insert)
+        for value in data:
+            sketch.insert(value)
+        
+        # Exercise quantile() only when it was requested for profiling
+        if 'quantile' in functions_to_profile:
+            for q in [0.5, 0.9, 0.99, 0.999]:
+                sketch.quantile(q)
+        
+        lp.disable()
+        
+        # Capture this trial's report as text instead of printing it immediately
+        string_buffer = StringIO()
+        lp.print_stats(stream=string_buffer)
+        all_results.append(string_buffer.getvalue())
+    
+    # Print the report of the last trial only (earlier trials are collected but not shown)
+    print("\n" + "=" * 120)
+    print(f"{'Line-by-Line Profiling Results':^120}")
+    if num_trials > 1:
+        print(f"{'(Showing results from trial ' + str(num_trials) + ')':^120}")
+    print("=" * 120)
+    print(all_results[-1])
+    
+    # Remind the user that only the last trial is displayed (nothing is saved here)
+    if num_trials > 1:
+        print(f"\n💡 Tip: Results shown are from the last trial. All {num_trials} trials were run for consistency.")
+    
+    # Print generic tips for reading the report (static text, not derived from the results)
+    print("\n" + "=" * 120)
+    print("🔍 Analysis Tips:")
+    print("  • Look for lines with high '% Time' - these are the bottlenecks")
+    print("  • High '# Hits' with moderate time per hit suggests vectorization opportunities")
+    print("  • Compare 'Time' vs '% Time' to understand relative impact")
+    print("  • Lines with 0 hits but in hot functions may be branches you can optimize")
+    print("=" * 120)
+    
+    print("\n✅ Line profiling complete.\n")
+
+
+def line_profile_to_file(num_values: int = 1_000_000,
+                         functions_to_profile: List[str] = None,
+                         output_file: str = "line_profile_output.txt"):
+    """Run a single-trial line profile and save its printed report to a file.
+
+    Args:
+        num_values: Number of values to insert
+        functions_to_profile: Function names forwarded to line_profile()
+        output_file: Path of the text file to write (encoded as UTF-8)
+    """
+    if not LINE_PROFILER_AVAILABLE:
+        print("\n❌ Error: line_profiler is not installed!")
+        print("   Install with: pip install line_profiler")
+        return
+
+    # Capture everything line_profile() prints in a buffer we keep a handle on.
+    buffer = StringIO()
+    old_stdout = sys.stdout
+    sys.stdout = buffer
+    try:
+        line_profile(num_values, functions_to_profile, num_trials=1)
+    finally:
+        sys.stdout = old_stdout
+
+    # The report contains non-ASCII characters (emoji, bullets), so write it
+    # as UTF-8 explicitly instead of relying on the platform default encoding.
+    output_path = Path(output_file)
+    output_path.write_text(buffer.getvalue(), encoding="utf-8")
+    print(f"\n✅ Line profile saved to: {output_path}")
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Profile DDSketch operations with benchmarking capabilities",
@@ -413,6 +578,16 @@ def main():
   
   # Quick test with fewer values
   python profile_ddsketch.py --num-values 1000000 --num-trials 3
+  
+  # LINE PROFILING (requires line_profiler):
+  # Profile specific functions line-by-line
+  python profile_ddsketch.py --line-profile --num-values 1000000
+  
+  # Profile specific functions
+  python profile_ddsketch.py --line-profile --functions insert add compute_bucket_index
+  
+  # Profile with multiple trials for stability
+  python profile_ddsketch.py --line-profile --num-values 500000 --num-trials 3
         """
     )
     
@@ -429,10 +604,32 @@ def main():
     parser.add_argument('--top-n', type=int, default=20,
                         help='Number of top functions to display (default: 20)')
     
+    # Line profiling arguments
+    parser.add_argument('--line-profile', action='store_true',
+                        help='Enable line-by-line profiling (requires line_profiler)')
+    parser.add_argument('--functions', nargs='+', 
+                        default=['insert', 'add', 'compute_bucket_index'],
+                        help='Functions to line-profile: insert, add, remove, get_count, '
+                             'compute_bucket_index, compute_value_from_index, quantile '
+                             '(default: insert add compute_bucket_index)')
+    parser.add_argument('--line-output', type=str,
+                        help='Save line profile output to file')
+    
     args = parser.parse_args()
     
     if args.list:
         list_benchmarks()
+    elif args.line_profile:
+        # Use fewer values by default for line profiling if not specified
+        num_vals = args.num_values
+        if args.num_values == 10_000_000:  # Default value
+            num_vals = 1_000_000
+            print(f"ℹ️  Using {num_vals:,} values for line profiling (override with --num-values)")
+        
+        if args.line_output:
+            line_profile_to_file(num_vals, args.functions, args.line_output)
+        else:
+            line_profile(num_vals, args.functions, args.num_trials)
     else:
         profile(
             num_values=args.num_values,

From fe9f8228b4461b1dc9dee1739962d6dfd22d9603 Mon Sep 17 00:00:00 2001
From: Ryan Ji <tairanjiryan@gmail.com>
Date: Sat, 10 Jan 2026 18:28:10 -0500
Subject: [PATCH 7/7] Refactor line profiling default value handling and
 improve DDSketch value insertion logic

---
 QuantileFlow/ddsketch/core.py               | 15 ++++++++-------
 QuantileFlow/ddsketch/storage/contiguous.py |  3 ---
 profile_ddsketch.py                         |  4 ----
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/QuantileFlow/ddsketch/core.py b/QuantileFlow/ddsketch/core.py
index 1a623a9..d512790 100644
--- a/QuantileFlow/ddsketch/core.py
+++ b/QuantileFlow/ddsketch/core.py
@@ -84,16 +84,21 @@ def insert(self, value: Union[int, float]) -> None:
         Raises:
             ValueError: If value is negative and cont_neg is False.
         """
-        if value == 0:
-            self.zero_count += 1
-        elif value > 0:
+        if value > 0:
             bucket_idx = self.mapping.compute_bucket_index(value)
             self.positive_store.add(bucket_idx)
-        elif self.cont_neg:
-            bucket_idx = self.mapping.compute_bucket_index(-value)
-            self.negative_store.add(bucket_idx)
+        elif value < 0:
+            if self.cont_neg:
+                bucket_idx = self.mapping.compute_bucket_index(-value)
+                self.negative_store.add(bucket_idx)
+            else:
+                raise ValueError("Negative values not supported when cont_neg is False")
+        elif value == 0:
+            self.zero_count += 1
         else:
-            raise ValueError("Negative values not supported when cont_neg is False")
+            # NaN fails every comparison above; reject it instead of
+            # silently counting it as a zero.
+            raise ValueError("NaN values cannot be inserted")
         self.count += 1
     
     def delete(self, value: Union[int, float]) -> None:
     def delete(self, value: Union[int, float]) -> None:
diff --git a/QuantileFlow/ddsketch/storage/contiguous.py b/QuantileFlow/ddsketch/storage/contiguous.py
index 92ead6f..7ed33dd 100644
--- a/QuantileFlow/ddsketch/storage/contiguous.py
+++ b/QuantileFlow/ddsketch/storage/contiguous.py
@@ -47,8 +47,6 @@ def add(self, bucket_index: int, count: int = 1):
             bucket_index: The bucket index to add to.
             count: The count to add (default 1).
         """
-        if count <= 0:
-            return
             
         if self.min_index is None:
             # First insertion
@@ -180,7 +178,6 @@ def get_count(self, bucket_index: int) -> int:
             The count at the specified bucket index.
         """
         if self.min_index is None or bucket_index < self.min_index or bucket_index > self.max_index:
-            warnings.warn("Bucket index is out of range. Returning 0.", UserWarning)
             return 0
         pos = (bucket_index - self.min_index + self.arr_index_of_min_bucket) % self.max_buckets
         return int(self.counts[pos])
diff --git a/profile_ddsketch.py b/profile_ddsketch.py
index 68c2e7f..b696e4c 100644
--- a/profile_ddsketch.py
+++ b/profile_ddsketch.py
@@ -622,10 +622,6 @@ def main():
     elif args.line_profile:
         # Use fewer values by default for line profiling if not specified
         num_vals = args.num_values
-        if args.num_values == 10_000_000:  # Default value
-            num_vals = 1_000_000
-            print(f"ℹ️  Using {num_vals:,} values for line profiling (override with --num-values)")
-        
         if args.line_output:
             line_profile_to_file(num_vals, args.functions, args.line_output)
         else: