From 2374094c3f21782f7a10554a294cd8ac1d60f50c Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Mon, 23 Feb 2026 10:07:23 -0800
Subject: [PATCH 01/21] Use Flexbuffer
---
Cargo.lock | 99 +++++++++++++++++++++++++---
Cargo.toml | 2 +
core-relations/src/hash_index/mod.rs | 1 +
src/lib.rs | 29 +++++---
src/poach.rs | 2 +-
5 files changed, 114 insertions(+), 19 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index f4a3a0ff5..405b84829 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -94,6 +94,12 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
[[package]]
name = "bitflags"
version = "2.10.0"
@@ -124,6 +130,12 @@ version = "3.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
[[package]]
name = "cc"
version = "1.2.41"
@@ -235,7 +247,7 @@ checksum = "93e373516c58af1c344bfe013b6c9831ce6a08bb59709ab3fa6fe5c9b0e904ff"
dependencies = [
"divan-macros",
"itertools",
- "proc-macro-crate",
+ "proc-macro-crate 3.4.0",
"proc-macro2",
"quote",
"syn 2.0.107",
@@ -460,6 +472,7 @@ dependencies = [
"egglog-reports",
"egraph-serialize",
"env_logger",
+ "flexbuffers",
"glob",
"hashbrown 0.16.0",
"im-rc",
@@ -725,6 +738,19 @@ version = "0.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+[[package]]
+name = "flexbuffers"
+version = "25.12.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bc752b3d049e0705749b9999d0b130d6cf62935bc7762fd3bdb7636047abe43"
+dependencies = [
+ "bitflags 1.3.2",
+ "byteorder",
+ "num_enum",
+ "serde",
+ "serde_derive",
+]
+
[[package]]
name = "foldhash"
version = "0.1.5"
@@ -1016,7 +1042,7 @@ version = "0.30.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
"cfg-if",
"cfg_aliases",
"libc",
@@ -1097,6 +1123,27 @@ dependencies = [
"autocfg",
]
+[[package]]
+name = "num_enum"
+version = "0.5.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9"
+dependencies = [
+ "num_enum_derive",
+]
+
+[[package]]
+name = "num_enum_derive"
+version = "0.5.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799"
+dependencies = [
+ "proc-macro-crate 1.3.1",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
[[package]]
name = "once_cell"
version = "1.21.3"
@@ -1212,13 +1259,23 @@ dependencies = [
"zerocopy",
]
+[[package]]
+name = "proc-macro-crate"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919"
+dependencies = [
+ "once_cell",
+ "toml_edit 0.19.15",
+]
+
[[package]]
name = "proc-macro-crate"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983"
dependencies = [
- "toml_edit",
+ "toml_edit 0.23.7",
]
[[package]]
@@ -1328,7 +1385,7 @@ version = "0.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
]
[[package]]
@@ -1378,7 +1435,7 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
"errno",
"libc",
"linux-raw-sys",
@@ -1595,6 +1652,12 @@ dependencies = [
"syn 2.0.107",
]
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+
[[package]]
name = "toml_datetime"
version = "0.7.3"
@@ -1604,6 +1667,17 @@ dependencies = [
"serde_core",
]
+[[package]]
+name = "toml_edit"
+version = "0.19.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
+dependencies = [
+ "indexmap",
+ "toml_datetime 0.6.11",
+ "winnow 0.5.40",
+]
+
[[package]]
name = "toml_edit"
version = "0.23.7"
@@ -1611,9 +1685,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d"
dependencies = [
"indexmap",
- "toml_datetime",
+ "toml_datetime 0.7.3",
"toml_parser",
- "winnow",
+ "winnow 0.7.13",
]
[[package]]
@@ -1622,7 +1696,7 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e"
dependencies = [
- "winnow",
+ "winnow 0.7.13",
]
[[package]]
@@ -1955,6 +2029,15 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
+[[package]]
+name = "winnow"
+version = "0.5.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876"
+dependencies = [
+ "memchr",
+]
+
[[package]]
name = "winnow"
version = "0.7.13"
diff --git a/Cargo.toml b/Cargo.toml
index 86206ef8d..9860f9912 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -56,6 +56,7 @@ csv = "1.3"
typetag = "0.2"
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1.0"
+flexbuffers = "25.12.19"
######################
# build dependencies
@@ -162,6 +163,7 @@ serde_json_diff = "0.2.0"
anyhow.workspace = true
walkdir = "2.5.0"
egglog-reports = { workspace = true }
+flexbuffers.workspace = true
[build-dependencies]
chrono = { workspace = true, features = ["now"], optional = true }
diff --git a/core-relations/src/hash_index/mod.rs b/core-relations/src/hash_index/mod.rs
index 3f19107fe..b377a3bae 100644
--- a/core-relations/src/hash_index/mod.rs
+++ b/core-relations/src/hash_index/mod.rs
@@ -915,6 +915,7 @@ static THREAD_POOL: Lazy = Lazy::new(|| {
/// to the beginning of an unused vector.
#[derive(Default, Clone, Serialize, Deserialize)]
pub(super) struct FreeList {
+ #[serde(skip)]
data: HashMap>,
}
impl FreeList {
diff --git a/src/lib.rs b/src/lib.rs
index ef2fcdd3a..48b345b7d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,6 +33,7 @@ pub use command_macro::{CommandMacro, CommandMacroRegistry};
// This is used to allow the `add_primitive` macro to work in
// both this crate and other crates by referring to `::egglog`.
extern crate self as egglog;
+extern crate flexbuffers;
use anyhow::{Context, Result};
use ast::*;
pub use ast::{ResolvedExpr, ResolvedFact, ResolvedVar};
@@ -2674,8 +2675,11 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- let value = serde_json::to_value(egraph).context("Failed to encode egraph as json")?;
-
+ //let value = serde_json::to_value(egraph).context("Failed to encode egraph as json")?;
+ let mut buf = flexbuffers::FlexbufferSerializer::new();
+ // Have to use the fully qualified syntax because egraph has a method called serailize
+ Serialize::serialize(egraph, &mut buf).expect("Failed to serialize the egraph in Flexbuffer");
+
timeline.evts.push(EgraphEvent {
sexp_idx: 0,
evt: END,
@@ -2688,10 +2692,11 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- let file = fs::File::create(path)
+ let mut file = fs::File::create(path)
.with_context(|| format!("failed to create file {}", path.display()))?;
- serde_json::to_writer(BufWriter::new(file), &value)
- .context("Failed to write value to file")?;
+ //serde_json::to_writer(BufWriter::new(file), &value)
+ // .context("Failed to write value to file")?;
+ file.write_all(buf.view()).context("Failed to write value to file")?;
timeline.evts.push(EgraphEvent {
sexp_idx: 1,
@@ -2713,11 +2718,13 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- let file = fs::File::open(path)
+ let mut file = fs::File::open(path)
.with_context(|| format!("failed to open file {}", path.display()))?;
- let reader = BufReader::new(file);
- let value: serde_json::Value =
- serde_json::from_reader(reader).context("Failed to read json from file")?;
+ //let reader = BufReader::new(file);
+ //let value: serde_json::Value =
+ // serde_json::from_reader(reader).context("Failed to read json from file")?;
+ let mut buf = Vec::new();
+ file.read_to_end(&mut buf).context("Failed to read Flatbuffer from file")?;
timeline.evts.push(EgraphEvent {
sexp_idx: 0,
@@ -2731,7 +2738,9 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- let egraph: EGraph = serde_json::from_value(value)?;
+ //let egraph: EGraph = serde_json::from_value(value)?;
+ let r = flexbuffers::Reader::get_root(buf.as_slice()).unwrap();
+ let egraph: EGraph = EGraph::deserialize(r).unwrap();
timeline.evts.push(EgraphEvent {
sexp_idx: 1,
diff --git a/src/poach.rs b/src/poach.rs
index 1bc0c361f..5a1a8465e 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -52,7 +52,7 @@ enum RunMode {
// For each egg file under the input path,
// Run the egglog program, recording timing information.
// Round trip to JSON Value, but do not read/write from file
- // Assert the deserialized egraph has hthe same size as the initial egraph.
+ // Assert the deserialized egraph has the same size as the initial egraph.
// Save the completed timeline, for consumption by the nightly frontend
NoIO,
From dae76d2e967ac8f4750a6b8889063cc3f962d5f3 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Mon, 2 Mar 2026 17:17:30 -0800
Subject: [PATCH 02/21] Implement SizeReport
---
src/lib.rs | 8 +++
src/poach.rs | 14 +++++
src/serialize_size.rs | 138 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 160 insertions(+)
create mode 100644 src/serialize_size.rs
diff --git a/src/lib.rs b/src/lib.rs
index 48b345b7d..cf6e97c35 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -30,6 +30,8 @@ mod typechecking;
pub mod util;
pub use command_macro::{CommandMacro, CommandMacroRegistry};
+mod serialize_size;
+
// This is used to allow the `add_primitive` macro to work in
// both this crate and other crates by referring to `::egglog`.
extern crate self as egglog;
@@ -64,6 +66,7 @@ use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize};
use serde_json::json;
pub use serialize_vis::{SerializeConfig, SerializeOutput, SerializedNode};
+use serialize_size::GenerateSizeReport;
use size::GetSizePrimitive;
use sort::*;
use std::any::Any;
@@ -2666,6 +2669,11 @@ impl TimedEgraph {
Ok(())
}
+ pub fn print_size_report(&mut self) -> Result<()> {
+ self.egraphs.last().unwrap().get_sizerp().pretty_print(0);
+ Ok(())
+ }
+
pub fn to_file(&mut self, path: &Path) -> Result<()> {
let mut timeline = ProgramTimeline::new("(serialize)\n(write)");
let egraph = self.egraphs.last().unwrap();
diff --git a/src/poach.rs b/src/poach.rs
index 5a1a8465e..f8543cf11 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -71,6 +71,11 @@ enum RunMode {
// Run the egglog program, skipping declarations of Sorts and Rules
// Save the completed timeline, for consumption by the nightly frontend
Mine,
+
+ // For each egg file under the input path,
+ // run the egglog program and record timing information.
+ // Print size information on the serialized egraphs.
+ SizeReport,
}
impl Display for RunMode {
@@ -87,6 +92,7 @@ impl Display for RunMode {
RunMode::NoIO => "no-io",
RunMode::Extract => "extract",
RunMode::Mine => "mine",
+ RunMode::SizeReport => "size-report"
}
)
}
@@ -651,6 +657,14 @@ fn poach(
},
)
}
+ RunMode::SizeReport => process_files(
+ &files,
+ out_dir,
+ initial_egraph.as_deref(),
+ |egg_file, _, timed_egraph| {
+ timed_egraph.run_from_file(egg_file)?;
+ timed_egraph.print_size_report()
+ }),
}
}
diff --git a/src/serialize_size.rs b/src/serialize_size.rs
new file mode 100644
index 000000000..168e26899
--- /dev/null
+++ b/src/serialize_size.rs
@@ -0,0 +1,138 @@
+use crate::{CommandMacroRegistry, EGraph, RunReport, TypeInfo, term_encoding::EncodingState};
+
+/// Generate a json report for the size of a serialized structu
+/// By default, only uses serialize
+/// Allow specalization to look into subfields
+
+#[allow(dead_code)]
+#[derive (Debug, Clone)]
+pub struct SizeReport {
+ name: String,
+ size: usize,
+ fields: Vec<(String, Box)>,
+}
+
+fn up_to_two_decimals(a : usize, b : usize) -> String {
+ let a100 = a * 100 / b;
+ let high = a100 / 100;
+ let low = a100 % 100;
+ let low_str =
+ if low < 10 {
+ "0".to_string() + &low.to_string()
+ } else {
+ low.to_string()
+ };
+ return high.to_string() + "." + &low_str;
+}
+
+fn pretty_print_nbytes(size: usize) -> String {
+ if size < 200 {
+ size.to_string() + "B"
+ } else if size < 200 * 1024 {
+ up_to_two_decimals(size, 1024) + "KB"
+ } else if size < 200 * 1024 * 1024 {
+ up_to_two_decimals(size, 1024 * 1024) + "MB"
+ } else {
+ up_to_two_decimals(size, 1024 * 1024 * 1024) + "GB"
+ }
+}
+
+impl SizeReport {
+
+ pub fn pretty_print(&self, level: usize) {
+ if level == 0 {
+ println!("{} : {}", self.name, pretty_print_nbytes(self.size));
+ }
+ let mut sorted_fields = self.fields.clone();
+ sorted_fields.sort_by(|(_, a), (_, b)| b.size.cmp(&a.size));
+ for (name, sr) in sorted_fields {
+ let percentage = (sr.size as f64 / self.size as f64) * 100.0;
+ println!(". {:level$}{} : {} ({:.2}%)", "", name, pretty_print_nbytes(sr.size), percentage);
+ sr.pretty_print(level + 2);
+ }
+ }
+}
+
+pub trait GenerateSizeReport: serde::Serialize {
+ fn get_sizerp(&self) -> SizeReport {
+ let mut buf = flexbuffers::FlexbufferSerializer::new();
+ serde::Serialize::serialize(self, &mut buf).expect("Failed to serialize in Flexbuffer");
+ SizeReport {
+ name: std::any::type_name::().to_string(),
+ size: buf.view().len(),
+ fields: Vec::new(),
+ }
+ }
+}
+
+impl GenerateSizeReport for egglog_bridge::EGraph {}
+
+impl GenerateSizeReport for Option {}
+
+impl GenerateSizeReport for egglog::util::IndexMap {}
+
+impl GenerateSizeReport for TypeInfo {}
+
+impl GenerateSizeReport for RunReport {}
+
+impl GenerateSizeReport for egglog_numeric_id::DenseIdMap {}
+
+impl GenerateSizeReport for CommandMacroRegistry {}
+
+impl GenerateSizeReport for EncodingState {}
+
+
+impl GenerateSizeReport for EGraph {
+ fn get_sizerp(&self) -> SizeReport {
+ let mut buf = flexbuffers::FlexbufferSerializer::new();
+ serde::Serialize::serialize(self, &mut buf).expect("Failed to serialize in Flexbuffer");
+ let mut ret = SizeReport {
+ name: std::any::type_name::().to_string(),
+ size: buf.view().len(),
+ fields: Vec::new(),
+ };
+ ret.fields.push(("backend".to_string(), Box::new(self.backend.get_sizerp())));
+ ret.fields.push(("pushed_egraph".to_string(), Box::new(self.pushed_egraph.get_sizerp())));
+ ret.fields.push(("functions".to_string(), Box::new(self.functions.get_sizerp())));
+ ret.fields.push(("rulesets".to_string(), Box::new(self.rulesets.get_sizerp())));
+ ret.fields.push(("type_info".to_string(), Box::new(self.type_info.get_sizerp())));
+ ret.fields.push(("overall_run_report".to_string(), Box::new(self.overall_run_report.get_sizerp())));
+ ret.fields.push(("schedulers".to_string(), Box::new(self.schedulers.get_sizerp())));
+ ret.fields.push(("commands".to_string(), Box::new(self.commands.get_sizerp())));
+ ret.fields.push(("command_macros".to_string(), Box::new(self.command_macros.get_sizerp())));
+ ret.fields.push(("proof_state".to_string(), Box::new(self.proof_state.get_sizerp())));
+ ret
+ }
+}
+
+/*
+pub struct EGraph {
+ backend: egglog_bridge::EGraph,
+
+ pub parser: Parser,
+
+ names: check_shadowing::Names,
+ /// pushed_egraph forms a linked list of pushed egraphs.
+ /// Pop reverts the egraph to the last pushed egraph.
+ pushed_egraph: Option>,
+
+ functions: IndexMap,
+
+ rulesets: IndexMap,
+ pub fact_directory: Option,
+ pub seminaive: bool,
+
+ type_info: TypeInfo,
+ /// The run report unioned over all runs so far.
+ overall_run_report: RunReport,
+
+ schedulers: DenseIdMap,
+
+ commands: IndexMap>,
+ strict_mode: bool,
+ warned_about_missing_global_prefix: bool,
+ /// Registry for command-level macros
+ command_macros: CommandMacroRegistry,
+ proof_state: EncodingState,
+}
+ */
\ No newline at end of file
From 156f463e5f34b1cfdc3e0817f96a60e36db650ee Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Tue, 3 Mar 2026 14:06:37 -0800
Subject: [PATCH 03/21] Dig deeper into the size blowup
---
src/lib.rs | 15 +--
src/poach.rs | 17 ++--
src/serialize_size.rs | 228 ++++++++++++++++++++++++++++++------------
3 files changed, 182 insertions(+), 78 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index cf6e97c35..db1bc52ca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,8 +34,8 @@ mod serialize_size;
// This is used to allow the `add_primitive` macro to work in
// both this crate and other crates by referring to `::egglog`.
-extern crate self as egglog;
extern crate flexbuffers;
+extern crate self as egglog;
use anyhow::{Context, Result};
use ast::*;
pub use ast::{ResolvedExpr, ResolvedFact, ResolvedVar};
@@ -65,8 +65,8 @@ use scheduler::{SchedulerId, SchedulerRecord};
use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize};
use serde_json::json;
-pub use serialize_vis::{SerializeConfig, SerializeOutput, SerializedNode};
use serialize_size::GenerateSizeReport;
+pub use serialize_vis::{SerializeConfig, SerializeOutput, SerializedNode};
use size::GetSizePrimitive;
use sort::*;
use std::any::Any;
@@ -2686,8 +2686,9 @@ impl TimedEgraph {
//let value = serde_json::to_value(egraph).context("Failed to encode egraph as json")?;
let mut buf = flexbuffers::FlexbufferSerializer::new();
// Have to use the fully qualified syntax because egraph has a method called serailize
- Serialize::serialize(egraph, &mut buf).expect("Failed to serialize the egraph in Flexbuffer");
-
+ Serialize::serialize(egraph, &mut buf)
+ .expect("Failed to serialize the egraph in Flexbuffer");
+
timeline.evts.push(EgraphEvent {
sexp_idx: 0,
evt: END,
@@ -2704,7 +2705,8 @@ impl TimedEgraph {
.with_context(|| format!("failed to create file {}", path.display()))?;
//serde_json::to_writer(BufWriter::new(file), &value)
// .context("Failed to write value to file")?;
- file.write_all(buf.view()).context("Failed to write value to file")?;
+ file.write_all(buf.view())
+ .context("Failed to write value to file")?;
timeline.evts.push(EgraphEvent {
sexp_idx: 1,
@@ -2732,7 +2734,8 @@ impl TimedEgraph {
//let value: serde_json::Value =
// serde_json::from_reader(reader).context("Failed to read json from file")?;
let mut buf = Vec::new();
- file.read_to_end(&mut buf).context("Failed to read Flatbuffer from file")?;
+ file.read_to_end(&mut buf)
+ .context("Failed to read Flexbuffer from file")?;
timeline.evts.push(EgraphEvent {
sexp_idx: 0,
diff --git a/src/poach.rs b/src/poach.rs
index f8543cf11..14f972771 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -92,7 +92,7 @@ impl Display for RunMode {
RunMode::NoIO => "no-io",
RunMode::Extract => "extract",
RunMode::Mine => "mine",
- RunMode::SizeReport => "size-report"
+ RunMode::SizeReport => "size-report",
}
)
}
@@ -658,13 +658,14 @@ fn poach(
)
}
RunMode::SizeReport => process_files(
- &files,
- out_dir,
- initial_egraph.as_deref(),
- |egg_file, _, timed_egraph| {
- timed_egraph.run_from_file(egg_file)?;
- timed_egraph.print_size_report()
- }),
+ &files,
+ out_dir,
+ initial_egraph.as_deref(),
+ |egg_file, _, timed_egraph| {
+ timed_egraph.run_from_file(egg_file)?;
+ timed_egraph.print_size_report()
+ },
+ ),
}
}
diff --git a/src/serialize_size.rs b/src/serialize_size.rs
index 168e26899..63a22195c 100644
--- a/src/serialize_size.rs
+++ b/src/serialize_size.rs
@@ -1,27 +1,32 @@
-use crate::{CommandMacroRegistry, EGraph, RunReport, TypeInfo, term_encoding::EncodingState};
+use crate::{
+ ast::ResolvedVar,
+ core::{GenericCoreAction, GenericCoreActions, GenericAtom, Query, ResolvedCall, ResolvedCoreRule},
+ egglog::util::IndexMap,
+ term_encoding::EncodingState,
+ CommandMacroRegistry, EGraph, RunReport, TypeInfo,
+};
/// Generate a json report for the size of a serialized structu
/// By default, only uses serialize
/// Allow specalization to look into subfields
#[allow(dead_code)]
-#[derive (Debug, Clone)]
+#[derive(Debug, Clone)]
pub struct SizeReport {
name: String,
size: usize,
fields: Vec<(String, Box)>,
}
-fn up_to_two_decimals(a : usize, b : usize) -> String {
+fn up_to_two_decimals(a: usize, b: usize) -> String {
let a100 = a * 100 / b;
let high = a100 / 100;
let low = a100 % 100;
- let low_str =
- if low < 10 {
- "0".to_string() + &low.to_string()
- } else {
- low.to_string()
- };
+ let low_str = if low < 10 {
+ "0".to_string() + &low.to_string()
+ } else {
+ low.to_string()
+ };
return high.to_string() + "." + &low_str;
}
@@ -37,102 +42,197 @@ fn pretty_print_nbytes(size: usize) -> String {
}
}
-impl SizeReport {
+fn truncate_string_with_ellipsis(s: &str, max_len: usize) -> String {
+ if s.chars().count() > max_len {
+ let mut truncated = s.chars().take(max_len).collect::();
+ truncated.push_str(&format!("...{:} chars total", s.len()));
+ truncated
+ } else {
+ s.to_string()
+ }
+}
+impl SizeReport {
pub fn pretty_print(&self, level: usize) {
if level == 0 {
println!("{} : {}", self.name, pretty_print_nbytes(self.size));
}
let mut sorted_fields = self.fields.clone();
sorted_fields.sort_by(|(_, a), (_, b)| b.size.cmp(&a.size));
- for (name, sr) in sorted_fields {
+ for (name, sr) in sorted_fields.iter().take(10) {
let percentage = (sr.size as f64 / self.size as f64) * 100.0;
- println!(". {:level$}{} : {} ({:.2}%)", "", name, pretty_print_nbytes(sr.size), percentage);
+ println!(
+ " {:level$}{} : {} ({:.2}%)",
+ "",
+ name,
+ pretty_print_nbytes(sr.size),
+ percentage
+ );
sr.pretty_print(level + 2);
}
+ if sorted_fields.len() > 10 {
+ println!(" {:level$} ... {:} fields total", "", sorted_fields.len());
+ }
+ }
+}
+
+fn get_sizerp_default(obj: &T) -> SizeReport {
+ let mut buf = flexbuffers::FlexbufferSerializer::new();
+ serde::Serialize::serialize(obj, &mut buf).expect("Failed to serialize in Flexbuffer");
+ SizeReport {
+ name: std::any::type_name::().to_string(),
+ size: buf.view().len(),
+ fields: Vec::new(),
}
}
-pub trait GenerateSizeReport: serde::Serialize {
+pub trait GenerateSizeReport: serde::Serialize + Sized {
fn get_sizerp(&self) -> SizeReport {
- let mut buf = flexbuffers::FlexbufferSerializer::new();
- serde::Serialize::serialize(self, &mut buf).expect("Failed to serialize in Flexbuffer");
- SizeReport {
- name: std::any::type_name::().to_string(),
- size: buf.view().len(),
- fields: Vec::new(),
- }
+ get_sizerp_default(self)
}
}
impl GenerateSizeReport for egglog_bridge::EGraph {}
-impl GenerateSizeReport for Option {}
+impl GenerateSizeReport for Option {}
-impl GenerateSizeReport for egglog::util::IndexMap {}
+impl GenerateSizeReport
+ for IndexMap
+{
+ fn get_sizerp(&self) -> SizeReport {
+ let mut ret = get_sizerp_default(self);
+ for (k, v) in self {
+ ret.fields.push((
+ truncate_string_with_ellipsis(&k.to_string(), 20),
+ Box::new(v.get_sizerp()),
+ ));
+ }
+ ret
+ }
+}
impl GenerateSizeReport for TypeInfo {}
impl GenerateSizeReport for RunReport {}
-impl GenerateSizeReport for egglog_numeric_id::DenseIdMap {}
+impl GenerateSizeReport
+ for egglog_numeric_id::DenseIdMap
+{
+}
impl GenerateSizeReport for CommandMacroRegistry {}
impl GenerateSizeReport for EncodingState {}
+impl GenerateSizeReport for egglog::Function {}
-impl GenerateSizeReport for EGraph {
+use egglog::ast::Ruleset;
+use egglog_ast::span::Span;
+
+impl GenerateSizeReport for Span {}
+
+impl GenerateSizeReport for GenericAtom {}
+
+impl GenerateSizeReport for Query {
+ fn get_sizerp(&self) -> SizeReport {
+ self.atoms.get_sizerp()
+ }
+}
+
+impl GenerateSizeReport for Vec {
fn get_sizerp(&self) -> SizeReport {
- let mut buf = flexbuffers::FlexbufferSerializer::new();
- serde::Serialize::serialize(self, &mut buf).expect("Failed to serialize in Flexbuffer");
- let mut ret = SizeReport {
- name: std::any::type_name::().to_string(),
- size: buf.view().len(),
- fields: Vec::new(),
- };
- ret.fields.push(("backend".to_string(), Box::new(self.backend.get_sizerp())));
- ret.fields.push(("pushed_egraph".to_string(), Box::new(self.pushed_egraph.get_sizerp())));
- ret.fields.push(("functions".to_string(), Box::new(self.functions.get_sizerp())));
- ret.fields.push(("rulesets".to_string(), Box::new(self.rulesets.get_sizerp())));
- ret.fields.push(("type_info".to_string(), Box::new(self.type_info.get_sizerp())));
- ret.fields.push(("overall_run_report".to_string(), Box::new(self.overall_run_report.get_sizerp())));
- ret.fields.push(("schedulers".to_string(), Box::new(self.schedulers.get_sizerp())));
- ret.fields.push(("commands".to_string(), Box::new(self.commands.get_sizerp())));
- ret.fields.push(("command_macros".to_string(), Box::new(self.command_macros.get_sizerp())));
- ret.fields.push(("proof_state".to_string(), Box::new(self.proof_state.get_sizerp())));
+ let mut ret = get_sizerp_default(self);
+ for e in self {
+ let rep = e.get_sizerp();
+ ret.fields.push((rep.name.clone(), Box::new(rep)));
+ }
ret
}
}
-/*
-pub struct EGraph {
- backend: egglog_bridge::EGraph,
+impl GenerateSizeReport for GenericCoreAction {}
- pub parser: Parser,
+impl GenerateSizeReport for GenericCoreActions {
+ fn get_sizerp(&self) -> SizeReport {
+ self.0.get_sizerp()
+ }
+}
- names: check_shadowing::Names,
- /// pushed_egraph forms a linked list of pushed egraphs.
- /// Pop reverts the egraph to the last pushed egraph.
- pushed_egraph: Option>,
+impl GenerateSizeReport for ResolvedCall {}
- functions: IndexMap,
+impl GenerateSizeReport for ResolvedVar {}
- rulesets: IndexMap,
- pub fact_directory: Option,
- pub seminaive: bool,
+impl GenerateSizeReport for ResolvedCoreRule {
+ fn get_sizerp(&self) -> SizeReport {
+ let mut ret = get_sizerp_default(self);
+ ret.fields
+ .push(("span".to_string(), Box::new(self.span.get_sizerp())));
+ ret.fields
+ .push(("body".to_string(), Box::new(self.body.get_sizerp())));
+ ret.fields
+ .push(("head".to_string(), Box::new(self.head.get_sizerp())));
+ ret
+ }
+}
- type_info: TypeInfo,
- /// The run report unioned over all runs so far.
- overall_run_report: RunReport,
+impl GenerateSizeReport for (T, S) {
+ fn get_sizerp(&self) -> SizeReport {
+ let mut ret = get_sizerp_default(self);
+ ret.fields
+ .push(("0".to_string(), Box::new(self.0.get_sizerp())));
+ ret.fields
+ .push(("1".to_string(), Box::new(self.1.get_sizerp())));
+ ret
+ }
+}
- schedulers: DenseIdMap,
+impl GenerateSizeReport for egglog_bridge::RuleId {}
- commands: IndexMap>,
- strict_mode: bool,
- warned_about_missing_global_prefix: bool,
- /// Registry for command-level macros
- command_macros: CommandMacroRegistry,
- proof_state: EncodingState,
+impl GenerateSizeReport for egglog::ast::Ruleset {
+ fn get_sizerp(&self) -> SizeReport {
+ match &self {
+ Ruleset::Rules(mp) => mp.get_sizerp(),
+ Ruleset::Combined(_l) => {
+ //TODO if needed
+ get_sizerp_default(self)
+ }
+ }
+ }
+}
+
+impl GenerateSizeReport for EGraph {
+ fn get_sizerp(&self) -> SizeReport {
+ let mut ret = get_sizerp_default(&self);
+ ret.fields
+ .push(("backend".to_string(), Box::new(self.backend.get_sizerp())));
+ ret.fields.push((
+ "pushed_egraph".to_string(),
+ Box::new(self.pushed_egraph.get_sizerp()),
+ ));
+ ret.fields.push((
+ "functions".to_string(),
+ Box::new(self.functions.get_sizerp()),
+ ));
+ ret.fields
+ .push(("rulesets".to_string(), Box::new(self.rulesets.get_sizerp())));
+ ret.fields.push((
+ "type_info".to_string(),
+ Box::new(self.type_info.get_sizerp()),
+ ));
+ ret.fields.push((
+ "overall_run_report".to_string(),
+ Box::new(self.overall_run_report.get_sizerp()),
+ ));
+ ret.fields.push((
+ "schedulers".to_string(),
+ Box::new(self.schedulers.get_sizerp()),
+ ));
+ //ret.fields.push(("commands".to_string(), Box::new(self.commands.get_sizerp())));
+ //ret.fields.push(("command_macros".to_string(), Box::new(self.command_macros.get_sizerp())));
+ ret.fields.push((
+ "proof_state".to_string(),
+ Box::new(self.proof_state.get_sizerp()),
+ ));
+ ret
+ }
}
- */
\ No newline at end of file
From 92cc3334a1b5ac106386d80d63582dc89005412b Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Tue, 3 Mar 2026 14:06:58 -0800
Subject: [PATCH 04/21] Serialize span into unit
---
egglog-ast/src/span.rs | 25 +++++++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)
diff --git a/egglog-ast/src/span.rs b/egglog-ast/src/span.rs
index c2c8db320..d062426cf 100644
--- a/egglog-ast/src/span.rs
+++ b/egglog-ast/src/span.rs
@@ -3,13 +3,32 @@ use std::sync::Arc;
use serde::{Deserialize, Serialize};
-#[derive(Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Span {
Panic,
Egglog(Arc),
Rust(Arc),
+ POACH,
}
+impl serde::Serialize for Span {
+ fn serialize(&self, serializer: S) -> Result
+ where
+ S: serde::Serializer {
+ serializer.serialize_unit()
+ }
+}
+
+impl<'de> serde::Deserialize<'de> for Span {
+ fn deserialize(_: D) -> Result
+ where
+ D: serde::Deserializer<'de> {
+ Ok(Self::POACH)
+ }
+}
+
+
+
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct EgglogSpan {
pub file: Arc,
@@ -55,6 +74,7 @@ impl Span {
Span::Panic => panic!("Span::Panic in Span::string"),
Span::Rust(_) => panic!("Span::Rust cannot track end position"),
Span::Egglog(span) => &span.file.contents[span.i..span.j],
+ Span::POACH => "From POACH deserialization",
}
}
}
@@ -96,7 +116,8 @@ impl Display for Span {
write!(f, "In {}:{}-{}: {quote}", start_line, start_col, end_col)
}
}
- }
+ },
+ Span::POACH => write!(f, "From POACH deserialization"),
}
}
}
From c37fd3a877d537d345ba07aa90338190a0c30d4a Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Tue, 3 Mar 2026 14:29:00 -0800
Subject: [PATCH 05/21] Add control for how much size information to output
---
src/lib.rs | 4 ++--
src/poach.rs | 2 +-
src/serialize_size.rs | 10 +++++++---
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index db1bc52ca..3eafa05f5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2669,8 +2669,8 @@ impl TimedEgraph {
Ok(())
}
- pub fn print_size_report(&mut self) -> Result<()> {
- self.egraphs.last().unwrap().get_sizerp().pretty_print(0);
+ pub fn print_size_report(&mut self, max_level: usize) -> Result<()> {
+ self.egraphs.last().unwrap().get_sizerp().pretty_print(0, max_level);
Ok(())
}
diff --git a/src/poach.rs b/src/poach.rs
index 14f972771..a3da3ed87 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -663,7 +663,7 @@ fn poach(
initial_egraph.as_deref(),
|egg_file, _, timed_egraph| {
timed_egraph.run_from_file(egg_file)?;
- timed_egraph.print_size_report()
+ timed_egraph.print_size_report(0)
},
),
}
diff --git a/src/serialize_size.rs b/src/serialize_size.rs
index 63a22195c..c9b49ae03 100644
--- a/src/serialize_size.rs
+++ b/src/serialize_size.rs
@@ -53,7 +53,10 @@ fn truncate_string_with_ellipsis(s: &str, max_len: usize) -> String {
}
impl SizeReport {
- pub fn pretty_print(&self, level: usize) {
+ pub fn pretty_print(&self, level: usize, max_level: usize) {
+ if level > max_level {
+ return;
+ }
if level == 0 {
println!("{} : {}", self.name, pretty_print_nbytes(self.size));
}
@@ -61,14 +64,15 @@ impl SizeReport {
sorted_fields.sort_by(|(_, a), (_, b)| b.size.cmp(&a.size));
for (name, sr) in sorted_fields.iter().take(10) {
let percentage = (sr.size as f64 / self.size as f64) * 100.0;
+ let indent = level * 2;
println!(
- " {:level$}{} : {} ({:.2}%)",
+ " {:indent$}{} : {} ({:.2}%)",
"",
name,
pretty_print_nbytes(sr.size),
percentage
);
- sr.pretty_print(level + 2);
+ sr.pretty_print(level + 1, max_level);
}
if sorted_fields.len() > 10 {
println!(" {:level$} ... {:} fields total", "", sorted_fields.len());
From 4234f79f04aab3e56bf57a8f97e703be9e49d243 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 10:43:17 -0800
Subject: [PATCH 06/21] Extract experiment runs
---
infra/nightly.py | 36 ++++++++++++++--------
infra/nightly.sh | 7 +++--
src/lib.rs | 33 ++++++++++----------
src/poach.rs | 80 +++++++++++++++++++++++++++++++++++-------------
4 files changed, 102 insertions(+), 54 deletions(-)
mode change 100644 => 100755 infra/nightly.sh
diff --git a/infra/nightly.py b/infra/nightly.py
index 3e833356a..f968ec4f0 100644
--- a/infra/nightly.py
+++ b/infra/nightly.py
@@ -88,23 +88,32 @@ def run_test_experiments(top_dir, tmp_dir, aggregator):
run_poach(benchmark, tmp_dir, run_mode)
add_benchmark_data(aggregator, timeline_file, f"tests/{benchmark_name}/{benchmark.stem}/timeline.json")
extra_files = {
- "sequential-round-trip": [tmp_dir / f"{benchmark.stem}-serialize1.json"],
+ "sequential-round-trip": [tmp_dir / f"{benchmark.stem}-serialize1.fbs"],
"old-serialize": [
- tmp_dir / f"{benchmark.stem}-serialize-poach.json",
+ tmp_dir / f"{benchmark.stem}-serialize-poach.fbs",
tmp_dir / f"{benchmark.stem}-serialize-old.json",
],
}.get(run_mode, [])
cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json", *extra_files)
+def run_extract_experiments(resource_dir, tmp_dir, aggregator):
+ timeline_suites = ["easteregg", "herbie-hamming", "herbie-math-rewrite", "herbie-math-taylor"]
+ for suite in timeline_suites:
+ for benchmark in benchmark_files(resource_dir / "test-files" / suite):
+ timeline_file = tmp_dir / f"{benchmark.stem}-timeline.json"
+ run_poach(benchmark, tmp_dir, "extract")
+ add_benchmark_data(aggregator, timeline_file, f"{suite}/timeline/{benchmark.stem}/timeline.json")
+ cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json")
+
def run_mined_experiments(resource_dir, tmp_dir, aggregator):
- mega_serialize_file = tmp_dir / "mega-easteregg-serialize.json"
+ mega_serialize_file = tmp_dir / "mega-easteregg-serialize.fbs"
mega_timeline_file = tmp_dir / "mega-easteregg-timeline.json"
run_poach(resource_dir / "mega-easteregg.egg", tmp_dir, "serialize")
add_benchmark_data(aggregator, mega_timeline_file, "easteregg/serialize/mega-easteregg/timeline.json")
cleanup_benchmark_files(mega_timeline_file, tmp_dir / "summary.json")
for benchmark in benchmark_files(resource_dir / "test-files" / "easteregg"):
timeline_file = tmp_dir / f"{benchmark.stem}-timeline.json"
- serialize_file = tmp_dir / f"{benchmark.stem}-serialize.json"
+ serialize_file = tmp_dir / f"{benchmark.stem}-serialize.fbs"
run_poach(benchmark, tmp_dir, "serialize")
add_benchmark_data(aggregator, timeline_file, f"easteregg/serialize/{benchmark.stem}/timeline.json")
cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json")
@@ -146,22 +155,25 @@ def run_mined_experiments(resource_dir, tmp_dir, aggregator):
##############################################################################
# Run the benchmarks and record timeline-only data.
- run_timeline_experiments(resource_dir, tmp_dir, aggregator)
+ # run_timeline_experiments(resource_dir, tmp_dir, aggregator)
# Re-run the benchmarks with JSON round-tripping kept entirely in memory.
- run_no_io_experiments(resource_dir, tmp_dir, aggregator)
+ # run_no_io_experiments(resource_dir, tmp_dir, aggregator)
# Run the egglog tests under each serialization experiment mode.
- run_test_experiments(top_dir, tmp_dir, aggregator)
+ # run_test_experiments(top_dir, tmp_dir, aggregator)
# Run the mined-egraph experiment using both per-benchmark and mega-egraph seeds.
- run_mined_experiments(resource_dir, tmp_dir, aggregator)
+ # run_mined_experiments(resource_dir, tmp_dir, aggregator)
+
+ # Run the extract experiment on our heavy benchmarks
+ run_extract_experiments(resource_dir, tmp_dir, aggregator)
##############################################################################
aggregator.save()
- if shutil.which("perf") is not None:
- # Generate flamegraphs
- for egg_file in glob.glob("tests/*.egg") + glob.glob("tests/web-demo/*.egg"):
- run_cmd([str(script_dir / "flamegraph.sh"), egg_file, str(nightly_dir / "output" / "flamegraphs")])
+ #if shutil.which("perf") is not None:
+ # # Generate flamegraphs
+ # for egg_file in glob.glob("tests/*.egg") + glob.glob("tests/web-demo/*.egg"):
+ # run_cmd([str(script_dir / "flamegraph.sh"), egg_file, str(nightly_dir / "output" / "flamegraphs")])
diff --git a/infra/nightly.sh b/infra/nightly.sh
old mode 100644
new mode 100755
index 766e417cb..777641a8e
--- a/infra/nightly.sh
+++ b/infra/nightly.sh
@@ -47,7 +47,8 @@ mkdir -p nightly/output
mkdir -p nightly/output/flamegraphs
mkdir -p nightly/tmp
-git clone https://github.com/brendangregg/FlameGraph.git
+# Skip FlameGraphs for mining MVP
+# git clone https://github.com/brendangregg/FlameGraph.git
# Build in release mode before running nightly.py
cargo build --release
@@ -61,9 +62,9 @@ if [ ! -f nightly/output/data/data.json ]; then
exit 1
fi
-ls nightly/output/flamegraphs > nightly/output/flamegraphs.txt
+# ls nightly/output/flamegraphs > nightly/output/flamegraphs.txt
cp infra/nightly-resources/web/* nightly/output
# Uncomment for local development
-# cd nightly/output && python3 -m http.server 8002
+cd nightly/output && python3 -m http.server 8002
diff --git a/src/lib.rs b/src/lib.rs
index 941282916..950778ab5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -73,7 +73,7 @@ use std::any::Any;
use std::fmt::{Debug, Display, Formatter};
use std::fs::{self, read_to_string, File};
use std::hash::Hash;
-use std::io::{BufReader, BufWriter, Read, Write as _};
+use std::io::{BufWriter, Read, Write as _};
use std::iter::once;
use std::ops::Deref;
use std::path::{Path, PathBuf};
@@ -2485,10 +2485,14 @@ impl TimedEgraph {
}
pub fn new_from_file(path: &Path) -> Self {
- let file = File::open(path).expect("failed to open egraph file");
- let reader = BufReader::new(file);
+ let mut file = fs::File::open(path)
+ .expect("failed to open file");
+ let mut buf = Vec::new();
+ file.read_to_end(&mut buf)
+ .expect("Failed to read Flatbuffer from file");
- let egraph: EGraph = serde_json::from_reader(reader).expect("failed to parse egraph JSON");
+ let r = flexbuffers::Reader::get_root(buf.as_slice()).unwrap();
+ let egraph: EGraph = EGraph::deserialize(r).unwrap();
Self {
egraphs: vec![egraph],
@@ -2624,7 +2628,7 @@ impl TimedEgraph {
Ok(())
}
- pub fn to_value(&mut self) -> Result {
+ pub fn to_value(&mut self) -> Result> {
let mut timeline = ProgramTimeline::new("(serialize)");
let egraph = self.egraphs.last().unwrap();
@@ -2634,7 +2638,10 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- let value = serde_json::to_value(egraph).context("Failed to encode egraph as json")?;
+ let mut buf = flexbuffers::FlexbufferSerializer::new();
+ Serialize::serialize(egraph, &mut buf)
+ .expect("Failed to serialize the egraph in Flexbuffer");
+ let value = Vec::from(buf.view());
timeline.evts.push(EgraphEvent {
sexp_idx: 0,
@@ -2646,7 +2653,7 @@ impl TimedEgraph {
Ok(value)
}
- pub fn from_value(&mut self, value: serde_json::Value) -> Result<()> {
+ pub fn from_value(&mut self, value: Vec) -> Result<()> {
let mut timeline = ProgramTimeline::new("(deserialize)");
timeline.evts.push(EgraphEvent {
@@ -2655,8 +2662,8 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- let egraph: EGraph =
- serde_json::from_value(value).context("Failed to decode egraph from json")?;
+ let r = flexbuffers::Reader::get_root(value.as_slice()).unwrap();
+ let egraph: EGraph = EGraph::deserialize(r).unwrap();
timeline.evts.push(EgraphEvent {
sexp_idx: 0,
@@ -2684,9 +2691,7 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- //let value = serde_json::to_value(egraph).context("Failed to encode egraph as json")?;
let mut buf = flexbuffers::FlexbufferSerializer::new();
- // Have to use the fully qualified syntax because egraph has a method called serailize
Serialize::serialize(egraph, &mut buf)
.expect("Failed to serialize the egraph in Flexbuffer");
@@ -2704,8 +2709,6 @@ impl TimedEgraph {
let mut file = fs::File::create(path)
.with_context(|| format!("failed to create file {}", path.display()))?;
- //serde_json::to_writer(BufWriter::new(file), &value)
- // .context("Failed to write value to file")?;
file.write_all(buf.view())
.context("Failed to write value to file")?;
@@ -2731,9 +2734,6 @@ impl TimedEgraph {
let mut file = fs::File::open(path)
.with_context(|| format!("failed to open file {}", path.display()))?;
- //let reader = BufReader::new(file);
- //let value: serde_json::Value =
- // serde_json::from_reader(reader).context("Failed to read json from file")?;
let mut buf = Vec::new();
file.read_to_end(&mut buf)
.context("Failed to read Flatbuffer from file")?;
@@ -2750,7 +2750,6 @@ impl TimedEgraph {
time_micros: self.timer.elapsed().as_micros(),
});
- //let egraph: EGraph = serde_json::from_value(value)?;
let r = flexbuffers::Reader::get_root(buf.as_slice()).unwrap();
let egraph: EGraph = EGraph::deserialize(r).unwrap();
diff --git a/src/poach.rs b/src/poach.rs
index 3d4de64e6..71471f3c5 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -134,6 +134,8 @@ fn check_egraph_size(egraph: &TimedEgraph) -> Result<()> {
Ok(())
}
+// TODO: This is not working right now due to no longer using serde_json
+/*
fn check_idempotent(p1: &PathBuf, p2: &PathBuf, name: &str, out_dir: &PathBuf) {
let json1: serde_json::Value = serde_json::from_str(
&fs::read_to_string(p1).expect(&format!("failed to open {}", p1.display())),
@@ -153,6 +155,7 @@ fn check_idempotent(p1: &PathBuf, p2: &PathBuf, name: &str, out_dir: &PathBuf) {
panic!("Diff for {}", name)
}
}
+*/
fn benchmark_name(egg_file: &Path) -> &str {
egg_file
@@ -210,6 +213,7 @@ where
(successes, failures)
}
+#[allow(dead_code)]
fn compare_extracts(
initial_extracts: &[CommandOutput],
final_extracts: &[CommandOutput],
@@ -248,6 +252,38 @@ fn compare_extracts(
Ok(())
}
+fn compare_extracts_weak(
+ initial_extracts: &[CommandOutput],
+ final_extracts: &[CommandOutput],
+) -> Result<()> {
+ if initial_extracts.len() != final_extracts.len() {
+ anyhow::bail!("extract lengths mismatch")
+ }
+
+ for (x, y) in initial_extracts.iter().zip(final_extracts) {
+ match (x, y) {
+ (CommandOutput::ExtractBest(_, _, _), CommandOutput::ExtractBest(_, _, _)) => {
+
+ }
+ (
+ CommandOutput::ExtractVariants(_, _),
+ CommandOutput::ExtractVariants(_, _),
+ ) => {
+
+ }
+ (
+ CommandOutput::MultiExtractVariants(_, _),
+ CommandOutput::MultiExtractVariants(_, _),
+ ) => {
+
+ }
+ _ => anyhow::bail!("No match : {:?} {:?}", x, y),
+ }
+ }
+
+ Ok(())
+}
+
fn poach(
files: Vec,
out_dir: &PathBuf,
@@ -275,7 +311,7 @@ fn poach(
|egg_file, out_dir, timed_egraph| {
let name = benchmark_name(egg_file);
timed_egraph.run_from_file(egg_file)?;
- timed_egraph.to_file(&out_dir.join(format!("{name}-serialize.json")))?;
+ timed_egraph.to_file(&out_dir.join(format!("{name}-serialize.fbs")))?;
timed_egraph.write_timeline(&out_dir.join(format!("{name}-timeline.json")))?;
Ok(())
},
@@ -288,15 +324,15 @@ fn poach(
|egg_file, out_dir: &PathBuf, timed_egraph| {
let name = benchmark_name(egg_file);
timed_egraph.run_from_file(egg_file)?;
- let s1 = out_dir.join(format!("{name}-serialize1.json"));
+ let s1 = out_dir.join(format!("{name}-serialize1.fbs"));
timed_egraph
.to_file(&s1)
- .context("Failed to write s1.json")?;
+ .context("Failed to write s1.fbs")?;
timed_egraph
.from_file(&s1)
- .context("failed to read s1.json")?;
+ .context("failed to read s1.fbs")?;
check_egraph_number(&timed_egraph, 2)?;
@@ -314,37 +350,37 @@ fn poach(
|egg_file, out_dir, timed_egraph| {
let name = benchmark_name(egg_file);
timed_egraph.run_from_file(egg_file)?;
- let s1 = out_dir.join(format!("{name}-serialize1.json"));
- let s2 = out_dir.join(format!("{name}-serialize2.json"));
- let s3 = out_dir.join(format!("{name}-serialize3.json"));
+ let s1 = out_dir.join(format!("{name}-serialize1.fbs"));
+ let s2 = out_dir.join(format!("{name}-serialize2.fbs"));
+ let s3 = out_dir.join(format!("{name}-serialize3.fbs"));
timed_egraph
.to_file(&s1)
- .context("failed to serialize s1.json")?;
+ .context("failed to serialize s1.fbs")?;
timed_egraph
.from_file(&s1)
- .context("failed to read s1.json")?;
+ .context("failed to read s1.fbs")?;
timed_egraph
.to_file(&s2)
- .context("failed to serialize s2.json")?;
+ .context("failed to serialize s2.fbs")?;
timed_egraph
.from_file(&s2)
- .context("failed to read s2.json")?;
+ .context("failed to read s2.fbs")?;
timed_egraph
.to_file(&s3)
- .context("failed to serialize s3.json")?;
+ .context("failed to serialize s3.fbs")?;
timed_egraph
.from_file(&s3)
- .context("failed to read s3.json")?;
+ .context("failed to read s3.fbs")?;
check_egraph_number(&timed_egraph, 4)?;
check_egraph_size(&timed_egraph)?;
- check_idempotent(&s2, &s3, name, out_dir);
+ //check_idempotent(&s2, &s3, name, out_dir);
timed_egraph.write_timeline(&out_dir.join(format!("{name}-timeline.json")))?;
Ok(())
@@ -360,8 +396,8 @@ fn poach(
timed_egraph.run_from_file(egg_file)?;
timed_egraph
- .to_file(&out_dir.join(format!("{name}-serialize-poach.json")))
- .context("failed to write poach.json")?;
+ .to_file(&out_dir.join(format!("{name}-serialize-poach.fbs")))
+ .context("failed to write poach.fbs")?;
timed_egraph
.old_serialize_egraph(&out_dir.join(format!("{name}-serialize-old.json")))
@@ -382,11 +418,11 @@ fn poach(
let value = timed_egraph
.to_value()
- .context("Failed to encode egraph as json")?;
+ .context("Failed to encode egraph as flatbuffer")?;
timed_egraph
.from_value(value)
- .context("failed to decode egraph from json")?;
+ .context("failed to decode egraph from flatbuffer")?;
check_egraph_number(&timed_egraph, 2)?;
@@ -424,7 +460,7 @@ fn poach(
if let Sexp::List(xs, _) = sexp {
if !xs.is_empty() {
match &xs[0] {
- Sexp::Atom(s, _) => s == "extract",
+ Sexp::Atom(s, _) => s == "extract" || s == "multi-extract",
_ => false,
}
} else {
@@ -452,18 +488,18 @@ fn poach(
let value = timed_egraph
.to_value()
- .context("Failed to encode egraph as JSON")?;
+ .context("Failed to encode egraph as Flatbuffer")?;
timed_egraph
.from_value(value)
- .context("failed to decode egraph from json")?;
+ .context("Failed to decode egraph from Flatbuffer")?;
check_egraph_number(&timed_egraph, 2)?;
let final_extracts =
timed_egraph.run_program_with_timeline(extract_cmds, &extracts)?;
- compare_extracts(&initial_extracts, &final_extracts)?;
+ compare_extracts_weak(&initial_extracts, &final_extracts)?;
timed_egraph.write_timeline(&out_dir.join(format!("{name}-timeline.json")))?;
From 9c854695edf8716d220c30b575102aa0f711f79f Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 12:06:14 -0800
Subject: [PATCH 07/21] Tweak nightly frontend to display extract experiment
results
---
infra/nightly-resources/web/chart.js | 4 +-
infra/nightly-resources/web/extract.html | 2 +
infra/nightly-resources/web/extract.js | 54 +++++++++++++++++++-----
infra/nightly.py | 7 ++-
4 files changed, 54 insertions(+), 13 deletions(-)
diff --git a/infra/nightly-resources/web/chart.js b/infra/nightly-resources/web/chart.js
index 466b69975..da73b509b 100644
--- a/infra/nightly-resources/web/chart.js
+++ b/infra/nightly-resources/web/chart.js
@@ -156,8 +156,8 @@ function initializeCharts() {
},
},
y: {
- min: -25,
- max: 25,
+ min: -1000,
+ max: 3000,
title: {
display: true,
text: "time (ms)",
diff --git a/infra/nightly-resources/web/extract.html b/infra/nightly-resources/web/extract.html
index 55de269b5..8af73a4dc 100644
--- a/infra/nightly-resources/web/extract.html
+++ b/infra/nightly-resources/web/extract.html
@@ -25,6 +25,8 @@ POACH vs Vanilla Egglog
Serialization time is not counted
+
+
diff --git a/infra/nightly-resources/web/extract.js b/infra/nightly-resources/web/extract.js
index e83b0c854..562dc5c90 100644
--- a/infra/nightly-resources/web/extract.js
+++ b/infra/nightly-resources/web/extract.js
@@ -1,9 +1,43 @@
function initializeExtract() {
- initializeGlobalData().then(initializeCharts).then(plotExtract);
+ initializeGlobalData()
+ .then(initializeExtractOptions)
+ .then(initializeCharts)
+ .then(plotExtract);
}
+function initializeExtractOptions() {
+ const suiteElt = document.getElementById("suite");
+ Object.keys(GLOBAL_DATA.data).forEach((suite, idx) => {
+ const label = document.createElement("label");
+ const input = document.createElement("input");
+
+ input.type = "radio";
+ input.name = "suiteToggle";
+ input.value = suite;
+
+ if (idx === 0) {
+ input.checked = true; // select first run mode
+ }
+
+ label.appendChild(input);
+ label.append(" " + suite);
+
+ suiteElt.appendChild(label);
+ });
+}
+
+
function plotExtract() {
- const all_data = GLOBAL_DATA.data.tests.extract;
+
+ const suite = document.querySelector(
+ 'input[name="suiteToggle"]:checked'
+ ).value;
+
+ if (!suite) {
+ return;
+ }
+
+ const all_data = GLOBAL_DATA.data[suite].extract;
if (GLOBAL_DATA.extractChart === null) {
return;
@@ -31,7 +65,7 @@ function plotExtract() {
data[b].poachDeser = aggregate(all_data[b].deserialize, "total");
data[b].poachTotal = data[b].poachDeser + data[b].poachExtract;
- data[b].difference = data[b].poachTotal - data[b].vanillaTotal;
+ data[b].difference = data[b].vanillaTotal - data[b].poachTotal;
});
GLOBAL_DATA.differenceChart.data = {
@@ -41,18 +75,16 @@ function plotExtract() {
label: "poach - vanilla",
data: Object.values(data).map((d) => d.difference),
backgroundColor: Object.values(data).map((d) => {
- if (Math.abs(d.difference) > 25) {
- return "gray";
- } else {
- return d.difference >= 0
- ? "rgba(255, 99, 132, 0.7)"
- : "rgba(54, 162, 235, 0.7)";
- }
+ return d.difference >= 0
+ ? "rgba(54, 162, 235, 0.7)"
+ : "rgba(255, 99, 132, 0.7)";
}),
},
],
};
+ GLOBAL_DATA.differenceChart.update();
+
GLOBAL_DATA.extractChart.data = {
labels: benchmarks,
datasets: [
@@ -85,4 +117,6 @@ function plotExtract() {
},
],
};
+
+ GLOBAL_DATA.extractChart.update();
}
diff --git a/infra/nightly.py b/infra/nightly.py
index f968ec4f0..745e62b1b 100644
--- a/infra/nightly.py
+++ b/infra/nightly.py
@@ -102,8 +102,13 @@ def run_extract_experiments(resource_dir, tmp_dir, aggregator):
for benchmark in benchmark_files(resource_dir / "test-files" / suite):
timeline_file = tmp_dir / f"{benchmark.stem}-timeline.json"
run_poach(benchmark, tmp_dir, "extract")
- add_benchmark_data(aggregator, timeline_file, f"{suite}/timeline/{benchmark.stem}/timeline.json")
+ add_benchmark_data(aggregator, timeline_file, f"{suite}/extract/{benchmark.stem}/timeline.json")
cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json")
+ for benchmark in benchmark_files(top_dir / "tests", recursive = True):
+ timeline_file = tmp_dir / f"{benchmark.stem}-timeline.json"
+ run_poach(benchmark, tmp_dir, "extract")
+ add_benchmark_data(aggregator, timeline_file, f"tests/extract/{benchmark.stem}/timeline.json")
+ cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json")
def run_mined_experiments(resource_dir, tmp_dir, aggregator):
mega_serialize_file = tmp_dir / "mega-easteregg-serialize.fbs"
From 54533db2ac629947c44783eae21ed2418f0d6c3a Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 12:22:59 -0800
Subject: [PATCH 08/21] Show egraph size in size report
---
src/lib.rs | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/lib.rs b/src/lib.rs
index 950778ab5..3c7a0f180 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2678,7 +2678,9 @@ impl TimedEgraph {
}
pub fn print_size_report(&mut self, max_level: usize) -> Result<()> {
- self.egraphs.last().unwrap().get_sizerp().pretty_print(0, max_level);
+ let egraph = self.egraphs.last().unwrap();
+ println!("egraph size: {:}", egraph.num_tuples());
+ egraph.get_sizerp().pretty_print(0, max_level);
Ok(())
}
From dcf81e5c7cb8074aec02d1745a8a35471be86662 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 12:48:07 -0800
Subject: [PATCH 09/21] Add include ser time option, add a speedup graph
---
infra/nightly-resources/web/chart.js | 45 ++++++++++++++++++++++++
infra/nightly-resources/web/extract.html | 7 ++++
infra/nightly-resources/web/extract.js | 26 +++++++++++++-
3 files changed, 77 insertions(+), 1 deletion(-)
diff --git a/infra/nightly-resources/web/chart.js b/infra/nightly-resources/web/chart.js
index da73b509b..aed046dd9 100644
--- a/infra/nightly-resources/web/chart.js
+++ b/infra/nightly-resources/web/chart.js
@@ -124,6 +124,51 @@ function initializeCharts() {
);
}
+ if (!!document.getElementById("speedup-chart")) {
+ console.assert(GLOBAL_DATA.differenceChart === null);
+
+ GLOBAL_DATA.speedupChart = new Chart(
+ document.getElementById("speedup-chart"),
+ {
+ type: "bar",
+ data: {},
+ options: {
+ responsive: true,
+ plugins: {
+ legend: {
+ display: false,
+ },
+ title: {
+ display: true,
+ text: "Per-benchmark Runtime Speedup",
+ },
+ tooltip: {
+ callbacks: {
+ label: (ctx) => `${ctx.raw.toFixed(2)}x`,
+ },
+ },
+ },
+ scales: {
+ x: {
+ ticks: {
+ maxRotation: 90,
+ minRotation: 45,
+ },
+ },
+ y: {
+ min: 0,
+ max: 50,
+ title: {
+ display: true,
+ text: "Speedup (times)",
+ },
+ },
+ },
+ },
+ },
+ );
+ }
+
if (!!document.getElementById("difference-chart")) {
console.assert(GLOBAL_DATA.differenceChart === null);
diff --git a/infra/nightly-resources/web/extract.html b/infra/nightly-resources/web/extract.html
index 8af73a4dc..f8a2483d9 100644
--- a/infra/nightly-resources/web/extract.html
+++ b/infra/nightly-resources/web/extract.html
@@ -25,8 +25,15 @@ POACH vs Vanilla Egglog
Serialization time is not counted
+
+
+
+
diff --git a/infra/nightly-resources/web/extract.js b/infra/nightly-resources/web/extract.js
index 562dc5c90..e922b536a 100644
--- a/infra/nightly-resources/web/extract.js
+++ b/infra/nightly-resources/web/extract.js
@@ -37,6 +37,8 @@ function plotExtract() {
return;
}
+ const includeser = document.querySelector("input[name='icldser1']:checked");
+
const all_data = GLOBAL_DATA.data[suite].extract;
if (GLOBAL_DATA.extractChart === null) {
@@ -63,11 +65,33 @@ function plotExtract() {
data[b].poachExtract = aggregate(extracts.slice(midpoint), "total");
data[b].poachDeser = aggregate(all_data[b].deserialize, "total");
- data[b].poachTotal = data[b].poachDeser + data[b].poachExtract;
+ if (includeser) {
+ data[b].poachTotal = data[b].poachDeser + data[b].poachExtract;
+ } else {
+ data[b].poachTotal = data[b].poachExtract;
+ }
data[b].difference = data[b].vanillaTotal - data[b].poachTotal;
+ data[b].speedup = data[b].vanillaTotal / data[b].poachTotal;
});
+ GLOBAL_DATA.speedupChart.data = {
+ labels: benchmarks,
+ datasets: [
+ {
+ label: "poach - vanilla",
+ data: Object.values(data).map((d) => d.speedup),
+ backgroundColor: Object.values(data).map((d) => {
+ return d.speedup >= 1
+ ? "rgba(54, 162, 235, 0.7)"
+ : "rgba(255, 99, 132, 0.7)";
+ }),
+ },
+ ],
+ };
+
+ GLOBAL_DATA.speedupChart.update();
+
GLOBAL_DATA.differenceChart.data = {
labels: benchmarks,
datasets: [
From 63d2be20fa3610dae0e338ca4e86697ae0435143 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 12:57:53 -0800
Subject: [PATCH 10/21] fmt
---
egglog-ast/src/span.rs | 14 +++++++-------
src/lib.rs | 3 +--
src/poach.rs | 15 +++------------
src/serialize_size.rs | 8 ++++++--
4 files changed, 17 insertions(+), 23 deletions(-)
diff --git a/egglog-ast/src/span.rs b/egglog-ast/src/span.rs
index d062426cf..2651d9cdc 100644
--- a/egglog-ast/src/span.rs
+++ b/egglog-ast/src/span.rs
@@ -13,22 +13,22 @@ pub enum Span {
impl serde::Serialize for Span {
fn serialize(&self, serializer: S) -> Result
- where
- S: serde::Serializer {
+ where
+ S: serde::Serializer,
+ {
serializer.serialize_unit()
}
}
impl<'de> serde::Deserialize<'de> for Span {
fn deserialize(_: D) -> Result
- where
- D: serde::Deserializer<'de> {
+ where
+ D: serde::Deserializer<'de>,
+ {
Ok(Self::POACH)
}
}
-
-
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct EgglogSpan {
pub file: Arc,
@@ -116,7 +116,7 @@ impl Display for Span {
write!(f, "In {}:{}-{}: {quote}", start_line, start_col, end_col)
}
}
- },
+ }
Span::POACH => write!(f, "From POACH deserialization"),
}
}
diff --git a/src/lib.rs b/src/lib.rs
index 3c7a0f180..baf03eb21 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2485,8 +2485,7 @@ impl TimedEgraph {
}
pub fn new_from_file(path: &Path) -> Self {
- let mut file = fs::File::open(path)
- .expect("failed to open file");
+ let mut file = fs::File::open(path).expect("failed to open file");
let mut buf = Vec::new();
file.read_to_end(&mut buf)
.expect("Failed to read Flatbuffer from file");
diff --git a/src/poach.rs b/src/poach.rs
index 71471f3c5..d1b7d45d3 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -262,21 +262,12 @@ fn compare_extracts_weak(
for (x, y) in initial_extracts.iter().zip(final_extracts) {
match (x, y) {
- (CommandOutput::ExtractBest(_, _, _), CommandOutput::ExtractBest(_, _, _)) => {
-
- }
- (
- CommandOutput::ExtractVariants(_, _),
- CommandOutput::ExtractVariants(_, _),
- ) => {
-
- }
+ (CommandOutput::ExtractBest(_, _, _), CommandOutput::ExtractBest(_, _, _)) => {}
+ (CommandOutput::ExtractVariants(_, _), CommandOutput::ExtractVariants(_, _)) => {}
(
CommandOutput::MultiExtractVariants(_, _),
CommandOutput::MultiExtractVariants(_, _),
- ) => {
-
- }
+ ) => {}
_ => anyhow::bail!("No match : {:?} {:?}", x, y),
}
}
diff --git a/src/serialize_size.rs b/src/serialize_size.rs
index c9b49ae03..8683e27ca 100644
--- a/src/serialize_size.rs
+++ b/src/serialize_size.rs
@@ -1,6 +1,8 @@
use crate::{
ast::ResolvedVar,
- core::{GenericCoreAction, GenericCoreActions, GenericAtom, Query, ResolvedCall, ResolvedCoreRule},
+ core::{
+ GenericAtom, GenericCoreAction, GenericCoreActions, Query, ResolvedCall, ResolvedCoreRule,
+ },
egglog::util::IndexMap,
term_encoding::EncodingState,
CommandMacroRegistry, EGraph, RunReport, TypeInfo,
@@ -179,7 +181,9 @@ impl GenerateSizeReport for ResolvedCoreRule {
}
}
-impl GenerateSizeReport for (T, S) {
+impl
+ GenerateSizeReport for (T, S)
+{
fn get_sizerp(&self) -> SizeReport {
let mut ret = get_sizerp_default(self);
ret.fields
From c54b1a20f392da11a95cdd5b9538ee0a520ced85 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 13:44:46 -0800
Subject: [PATCH 11/21] Skip tests because containers are not yet supported
---
infra/nightly.py | 5 -----
1 file changed, 5 deletions(-)
diff --git a/infra/nightly.py b/infra/nightly.py
index 745e62b1b..50011ff04 100644
--- a/infra/nightly.py
+++ b/infra/nightly.py
@@ -104,11 +104,6 @@ def run_extract_experiments(resource_dir, tmp_dir, aggregator):
run_poach(benchmark, tmp_dir, "extract")
add_benchmark_data(aggregator, timeline_file, f"{suite}/extract/{benchmark.stem}/timeline.json")
cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json")
- for benchmark in benchmark_files(top_dir / "tests", recursive = True):
- timeline_file = tmp_dir / f"{benchmark.stem}-timeline.json"
- run_poach(benchmark, tmp_dir, "extract")
- add_benchmark_data(aggregator, timeline_file, f"tests/extract/{benchmark.stem}/timeline.json")
- cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json")
def run_mined_experiments(resource_dir, tmp_dir, aggregator):
mega_serialize_file = tmp_dir / "mega-easteregg-serialize.fbs"
From 85dcdcf32d4441d70a953b74b3bfc0ec31167bb6 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 14:13:20 -0800
Subject: [PATCH 12/21] Comment local dev setup
---
infra/nightly.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/infra/nightly.sh b/infra/nightly.sh
index 777641a8e..a0777ca6c 100755
--- a/infra/nightly.sh
+++ b/infra/nightly.sh
@@ -67,4 +67,4 @@ fi
cp infra/nightly-resources/web/* nightly/output
# Uncomment for local development
-cd nightly/output && python3 -m http.server 8002
+# cd nightly/output && python3 -m http.server 8002
From 1d46162dec79c11b115a9fb80ee0115be1252e36 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 16:43:44 -0800
Subject: [PATCH 13/21] Output a csv file with serialization size data
---
Cargo.toml | 2 +-
infra/nightly.py | 10 +++++++---
infra/transform.py | 16 ++++++++++++++++
src/lib.rs | 26 +++++++++++++++++++++++++-
src/poach.rs | 26 ++++++++++++++++++++++++++
5 files changed, 75 insertions(+), 5 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 9860f9912..2190a235e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -52,7 +52,7 @@ getrandom = "0.3"
once_cell = "1.21"
num-bigint = { version = "0.4", features = ["serde"] }
num-rational = {version = "0.4", features = ["serde"]}
-csv = "1.3"
+csv = "1.4"
typetag = "0.2"
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1.0"
diff --git a/infra/nightly.py b/infra/nightly.py
index 50011ff04..2ce3f6ae1 100644
--- a/infra/nightly.py
+++ b/infra/nightly.py
@@ -96,14 +96,16 @@ def run_test_experiments(top_dir, tmp_dir, aggregator):
}.get(run_mode, [])
cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json", *extra_files)
-def run_extract_experiments(resource_dir, tmp_dir, aggregator):
+def run_extract_experiments(resource_dir, tmp_dir, aggregator, csv_aggregator):
timeline_suites = ["easteregg", "herbie-hamming", "herbie-math-rewrite", "herbie-math-taylor"]
for suite in timeline_suites:
for benchmark in benchmark_files(resource_dir / "test-files" / suite):
timeline_file = tmp_dir / f"{benchmark.stem}-timeline.json"
run_poach(benchmark, tmp_dir, "extract")
add_benchmark_data(aggregator, timeline_file, f"{suite}/extract/{benchmark.stem}/timeline.json")
- cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json")
+ extra_files = [tmp_dir / f"{benchmark.stem}.csv"]
+ csv_aggregator.add_file(extra_files[0])
+ cleanup_benchmark_files(timeline_file, tmp_dir / "summary.json", *extra_files)
def run_mined_experiments(resource_dir, tmp_dir, aggregator):
mega_serialize_file = tmp_dir / "mega-easteregg-serialize.fbs"
@@ -146,6 +148,7 @@ def run_mined_experiments(resource_dir, tmp_dir, aggregator):
tmp_dir = nightly_dir / "tmp"
output_data_dir = nightly_dir / "output" / "data"
aggregator = transform.TimelineAggregator(output_data_dir)
+ csv_aggregator = transform.CSVAggregator(output_data_dir)
# Make sure we're in the right place
os.chdir(top_dir)
@@ -167,11 +170,12 @@ def run_mined_experiments(resource_dir, tmp_dir, aggregator):
# run_mined_experiments(resource_dir, tmp_dir, aggregator)
# Run the extract experiment on our heavy benchmarks
- run_extract_experiments(resource_dir, tmp_dir, aggregator)
+ run_extract_experiments(resource_dir, tmp_dir, aggregator, csv_aggregator)
##############################################################################
aggregator.save()
+ csv_aggregator.save()
#if shutil.which("perf") is not None:
# # Generate flamegraphs
diff --git a/infra/transform.py b/infra/transform.py
index 2fe95fbfd..f6a334003 100644
--- a/infra/transform.py
+++ b/infra/transform.py
@@ -1,4 +1,5 @@
import json
+import pandas
import os
from pathlib import Path
@@ -111,3 +112,18 @@ def add_file(self, input_file, benchmark_name):
def save(self):
os.makedirs(self.output_dir, exist_ok=True)
save_json(self.data_path, self.aggregated)
+
+class CSVAggregator:
+ def __init__(self, output_dir):
+ self.output_dir = Path(output_dir)
+ self.data_path = self.output_dir / "data.csv"
+ self.records = []
+
+ def add_file(self, input_file):
+ df = pandas.read_csv(input_file)
+ self.records.append(df)
+
+ def save(self):
+ os.makedirs(self.output_dir, exist_ok=True)
+ combined = pandas.concat(self.records)
+ combined.to_csv(self.data_path, index=False)
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index baf03eb21..64273b913 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2442,13 +2442,33 @@ mod tests {
static START: &'static str = "start";
static END: &'static str = "end";
-#[derive(Serialize, Clone)]
+#[derive(Serialize, Clone, Eq)]
pub struct EgraphEvent {
sexp_idx: i32,
evt: &'static str,
time_micros: u128,
}
+impl Ord for EgraphEvent {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ self.time_micros.cmp(&other.time_micros)
+ }
+}
+
+impl PartialOrd for EgraphEvent {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl PartialEq for EgraphEvent {
+ fn eq(&self, other: &Self) -> bool {
+ self.sexp_idx == other.sexp_idx &&
+ self.evt == other.evt &&
+ self.time_micros == other.time_micros
+ }
+}
+
#[derive(Serialize, Clone)]
pub struct ProgramTimeline {
program_text: String,
@@ -2500,6 +2520,10 @@ impl TimedEgraph {
}
}
+ pub fn get_total_time(&self, id : usize) -> u128 {
+ self.timeline[id].evts.iter().max().unwrap().time_micros - self.timeline[id].evts.iter().min().unwrap().time_micros
+ }
+
pub fn egraphs(&self) -> Vec<&EGraph> {
self.egraphs.iter().map(|x| x).collect()
}
diff --git a/src/poach.rs b/src/poach.rs
index d1b7d45d3..33479e2b6 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -481,6 +481,8 @@ fn poach(
.to_value()
.context("Failed to encode egraph as Flatbuffer")?;
+ let serialized_size = value.len();
+
timed_egraph
.from_value(value)
.context("Failed to decode egraph from Flatbuffer")?;
@@ -494,6 +496,30 @@ fn poach(
timed_egraph.write_timeline(&out_dir.join(format!("{name}-timeline.json")))?;
+
+ #[derive(Serialize)]
+ struct CSVRecord{
+ benchname: String,
+ egraph_size: usize,
+ serialized_size: usize,
+ ser_time: u128,
+ der_time: u128,
+ ext_time: u128,
+ run_time: u128,
+ }
+
+ let r = CSVRecord {
+ benchname: name.to_string(),
+ egraph_size: timed_egraph.egraphs().last().unwrap().num_tuples(),
+ serialized_size: serialized_size,
+ ser_time: timed_egraph.get_total_time(1),
+ der_time: timed_egraph.get_total_time(2),
+ ext_time: timed_egraph.get_total_time(3),
+ run_time: timed_egraph.get_total_time(0)
+ };
+
+ csv::Writer::from_path(&out_dir.join(format!("{name}.csv")))?.serialize(r)?;
+
Ok(())
},
),
From a5758297f1720883219230ca09db6f87900f0690 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Thu, 5 Mar 2026 16:45:16 -0800
Subject: [PATCH 14/21] fmt
---
src/lib.rs | 11 ++++++-----
src/poach.rs | 7 +++----
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index 64273b913..725506bfb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2463,9 +2463,9 @@ impl PartialOrd for EgraphEvent {
impl PartialEq for EgraphEvent {
fn eq(&self, other: &Self) -> bool {
- self.sexp_idx == other.sexp_idx &&
- self.evt == other.evt &&
- self.time_micros == other.time_micros
+ self.sexp_idx == other.sexp_idx
+ && self.evt == other.evt
+ && self.time_micros == other.time_micros
}
}
@@ -2520,8 +2520,9 @@ impl TimedEgraph {
}
}
- pub fn get_total_time(&self, id : usize) -> u128 {
- self.timeline[id].evts.iter().max().unwrap().time_micros - self.timeline[id].evts.iter().min().unwrap().time_micros
+ pub fn get_total_time(&self, id: usize) -> u128 {
+ self.timeline[id].evts.iter().max().unwrap().time_micros
+ - self.timeline[id].evts.iter().min().unwrap().time_micros
}
pub fn egraphs(&self) -> Vec<&EGraph> {
diff --git a/src/poach.rs b/src/poach.rs
index 33479e2b6..05a28b653 100644
--- a/src/poach.rs
+++ b/src/poach.rs
@@ -496,9 +496,8 @@ fn poach(
timed_egraph.write_timeline(&out_dir.join(format!("{name}-timeline.json")))?;
-
#[derive(Serialize)]
- struct CSVRecord{
+ struct CSVRecord {
benchname: String,
egraph_size: usize,
serialized_size: usize,
@@ -515,11 +514,11 @@ fn poach(
ser_time: timed_egraph.get_total_time(1),
der_time: timed_egraph.get_total_time(2),
ext_time: timed_egraph.get_total_time(3),
- run_time: timed_egraph.get_total_time(0)
+ run_time: timed_egraph.get_total_time(0),
};
csv::Writer::from_path(&out_dir.join(format!("{name}.csv")))?.serialize(r)?;
-
+
Ok(())
},
),
From 96ea2262c4589d77e0b4c8517c763309719d1fa9 Mon Sep 17 00:00:00 2001
From: Haobin Ni
Date: Fri, 6 Mar 2026 15:41:13 -0800
Subject: [PATCH 15/21] Hacks
---
core-relations/src/free_join/mod.rs | 6 ++-
core-relations/src/lib.rs | 6 ++-
core-relations/src/row_buffer/mod.rs | 71 ++++++++++++++++++++++++++++
core-relations/src/table/mod.rs | 42 ++++++++++++++--
core-relations/src/table_spec.rs | 3 +-
core-relations/src/uf/mod.rs | 2 +
egglog-bridge/src/lib.rs | 3 +-
numeric-id/src/lib.rs | 6 ++-
src/lib.rs | 2 +-
src/poach.rs | 2 +-
src/serialize_size.rs | 58 +++++++++++++++++++----
11 files changed, 178 insertions(+), 23 deletions(-)
diff --git a/core-relations/src/free_join/mod.rs b/core-relations/src/free_join/mod.rs
index c97378fa0..9bccd0d1f 100644
--- a/core-relations/src/free_join/mod.rs
+++ b/core-relations/src/free_join/mod.rs
@@ -118,7 +118,8 @@ pub(crate) type HashColumnIndex = Arc>>;
pub struct TableInfo {
pub(crate) name: Option>,
pub(crate) spec: TableSpec,
- pub(crate) table: WrappedTable,
+ // TODO: evil hack for looking at serialization size
+ pub table: WrappedTable,
#[serde(skip)]
pub(crate) indexes: IndexCatalog, HashIndex>,
#[serde(skip)]
@@ -276,7 +277,8 @@ impl Counters {
pub struct Database {
// NB: some fields are pub(crate) to allow some internal modules to avoid
// borrowing the whole table.
- pub(crate) tables: DenseIdMap<TableId, TableInfo>,
+ // TODO: evil hack for looking at serialization size
+ pub tables: DenseIdMap<TableId, TableInfo>,
// TODO: having a single AtomicUsize per counter can lead to contention. We
// should look into prefetching counters when creating a new ExecutionState
// and incrementing locally. Note that the batch size shouldn't be too big
diff --git a/core-relations/src/lib.rs b/core-relations/src/lib.rs
index 7d0e66140..66fe1248c 100644
--- a/core-relations/src/lib.rs
+++ b/core-relations/src/lib.rs
@@ -7,7 +7,8 @@ pub(crate) mod base_values;
pub(crate) mod common;
pub(crate) mod containers;
pub(crate) mod dependency_graph;
-pub(crate) mod free_join;
+// TODO: evil hack for looking at serialization size
+pub mod free_join;
pub(crate) mod hash_index;
pub(crate) mod offsets;
pub(crate) mod parallel_heuristics;
@@ -16,7 +17,8 @@ pub(crate) mod query;
pub(crate) mod row_buffer;
pub(crate) mod table;
-pub(crate) mod table_spec;
+// TODO: evil hack for looking at serialization size
+pub mod table_spec;
pub(crate) mod uf;
#[cfg(test)]
diff --git a/core-relations/src/row_buffer/mod.rs b/core-relations/src/row_buffer/mod.rs
index a4426940c..e24af95f8 100644
--- a/core-relations/src/row_buffer/mod.rs
+++ b/core-relations/src/row_buffer/mod.rs
@@ -35,6 +35,7 @@ impl<'de> Deserialize<'de> for RowBuffer {
where
D: Deserializer<'de>,
{
+ /*
#[derive(Deserialize)]
struct Partial {
n_columns: usize,
@@ -49,19 +50,89 @@ impl<'de> Deserialize<'de> for RowBuffer {
total_rows: helper.total_rows,
data: Pooled::new(helper.data),
})
+ */
+
+ let bytes = <Vec<u8>>::deserialize(deserializer).expect("Failed to parse RowBuffer");
+ let mut it = bytes.iter();
+ let n_columns = deserialize_compressed(&mut it);
+ let total_rows = deserialize_compressed(&mut it);
+ let mut data = <Vec<Cell<Value>>>::new();
+ for i in 0..n_columns * total_rows {
+ data.push(Cell::new(Value::new(deserialize_compressed(&mut it))));
+ }
+ Ok(RowBuffer {
+ n_columns: n_columns.try_into().unwrap(),
+ total_rows: total_rows.try_into().unwrap(),
+ data: Pooled::new(data),
+ })
}
}
+#[allow(dead_code)]
+fn get_n_compressed_bytes(x: u32) -> usize {
+ if x < (1u32 << 7) {
+ 1
+ } else if x < (1u32 << 14) {
+ 2
+ } else if x < (1u32 << 21) {
+ 3
+ } else if x < (1u32 << 28) {
+ 4
+ } else {
+ 5
+ }
+}
+
+fn compressed_serialize(buf: &mut Vec<u8>, x: u32) {
+ let mut rem = x;
+ while (rem >= (1u32 << 7)) {
+ buf.push((rem & ((1u32 << 7) - 1)).try_into().unwrap());
+ rem = rem >> 7;
+ }
+ buf.push((rem | (1u32 << 7)).try_into().unwrap());
+}
+
+fn deserialize_compressed<'a, T: Iterator<Item = &'a u8>>(it: &mut T) -> u32 {
+ let mut ret = 0u32;
+ let mut delta = 0u32;
+ let mut val: u32 = <u8>::into(*it.next().unwrap());
+ while (val < (1u32 << 7)) {
+ ret = ret | (val << delta);
+ delta += 7;
+ val = <u8>::into(*it.next().unwrap());
+ }
+ let last = (val ^ (1u32 << 7)) << delta;
+ ret | last
+}
+
impl Serialize for RowBuffer {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
+ /*
let mut state = serializer.serialize_struct("RowBuffer", 3)?;
state.serialize_field("n_columns", &self.n_columns)?;
state.serialize_field("total_rows", &self.total_rows)?;
state.serialize_field("data", &*self.data)?;
state.end()
+ */
+ //let len = mem::size_of::() * 2 + self.n_columns * self.total_rows * mem::size_of::();
+ /*
+ let mut len = get_n_compressed_bytes(self.n_columns.try_into().unwrap()) + get_n_compressed_bytes(self.total_rows.try_into().unwrap());
+ for r in self.data.iter() {
+ len = len + get_n_compressed_bytes(r.get().rep);
+ }
+ let mut buf = vec![0u8; len];
+ //TODO: put data in
+ */
+ let mut buf = Vec::new();
+ compressed_serialize(&mut buf, self.n_columns.try_into().unwrap());
+ compressed_serialize(&mut buf, self.total_rows.try_into().unwrap());
+ for r in self.data.iter() {
+ compressed_serialize(&mut buf, r.get().rep);
+ }
+ serializer.serialize_bytes(&buf)
}
}
diff --git a/core-relations/src/table/mod.rs b/core-relations/src/table/mod.rs
index 4628a25b1..01ef79d8e 100644
--- a/core-relations/src/table/mod.rs
+++ b/core-relations/src/table/mod.rs
@@ -20,7 +20,10 @@ use crossbeam_queue::SegQueue;
use hashbrown::HashTable;
use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
use rustc_hash::FxHasher;
-use serde::{ser::SerializeStruct, Deserialize, Deserializer, Serialize, Serializer};
+use serde::{
+ ser::{SerializeStruct, SerializeTuple},
+ Deserialize, Deserializer, Serialize, Serializer,
+};
use sharded_hash_table::ShardedHashTable;
use crate::{
@@ -51,12 +54,41 @@ mod tests;
type HashCode = u64;
/// A pointer to a row in the table.
-#[derive(Clone, Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug)]
pub(crate) struct TableEntry {
hashcode: HashCode,
row: RowId,
}
+impl Serialize for TableEntry {
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+ where
+ S: Serializer,
+ {
+ let mut bytes = [0u8; 12];
+ let b1 = self.hashcode.to_be_bytes();
+ bytes[..b1.len()].copy_from_slice(&b1);
+ let b2 = self.row.rep.to_be_bytes();
+ bytes[b1.len()..].copy_from_slice(&b2);
+ serializer.serialize_bytes(&bytes)
+ }
+}
+
+impl<'de> Deserialize<'de> for TableEntry {
+ fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+ where
+ D: Deserializer<'de>,
+ {
+ let bytes = <[u8; 16]>::deserialize(deserializer).expect("Failed to parse TabelEntry");
+ Ok(TableEntry {
+ hashcode: u64::from_be_bytes(bytes[0..8].try_into().unwrap()),
+ row: RowId {
+ rep: u32::from_be_bytes(bytes[8..12].try_into().unwrap()),
+ },
+ })
+ }
+}
+
impl TableEntry {
fn hashcode(&self) -> u64 {
// We keep the cast here to make it easy to switch to HashCode=u32.
@@ -242,8 +274,8 @@ impl Serialize for SortedWritesTable {
let mut state = serializer.serialize_struct("SortedWritesTable", 11)?;
state.serialize_field("generation", &self.generation)?;
- state.serialize_field("shard_data", &self.hash.shard_data())?;
- state.serialize_field("shards", &serialized_shards)?;
+ //state.serialize_field("shard_data", &self.hash.shard_data())?;
+ //state.serialize_field("shards", &serialized_shards)?;
state.serialize_field("data", &self.data)?;
state.serialize_field("n_keys", &self.n_keys)?;
state.serialize_field("n_columns", &self.n_columns)?;
@@ -251,7 +283,7 @@ impl Serialize for SortedWritesTable {
state.serialize_field("offsets", &self.offsets)?;
state.serialize_field("pending_state", &self.pending_state)?;
state.serialize_field("to_rebuild", &self.to_rebuild)?;
- state.serialize_field("rebuild_index", &self.rebuild_index)?;
+ //state.serialize_field("rebuild_index", &self.rebuild_index)?;
state.serialize_field("subset_tracker", &self.subset_tracker)?;
state.end()
diff --git a/core-relations/src/table_spec.rs b/core-relations/src/table_spec.rs
index dc50ce360..5ec0fb8cc 100644
--- a/core-relations/src/table_spec.rs
+++ b/core-relations/src/table_spec.rs
@@ -522,7 +522,8 @@ impl TableWrapper for WrapperImpl {
/// The implementations here downcast manually to the type used when
/// constructing the WrappedTable.
pub struct WrappedTable {
- inner: Box,
+ // TODO: evil hack
+ pub inner: Box,
wrapper: Box,
}
diff --git a/core-relations/src/uf/mod.rs b/core-relations/src/uf/mod.rs
index 5688ddb9e..531706fc6 100644
--- a/core-relations/src/uf/mod.rs
+++ b/core-relations/src/uf/mod.rs
@@ -63,8 +63,10 @@ pub struct DisplacedTable {
// k columns, k-1 are args, kth is the ID
// enode is the row index
// on deserialize: need to recompute this from `displaced`
+ #[serde(skip)]
displaced: Vec<(Value, Value)>, // this is "the table" everything else can be recomputed from this
// can even recanonicalize on serialization to get rid of dead things
+ #[serde(skip)]
changed: bool,
#[serde(skip)]
lookup_table: HashMap,
diff --git a/egglog-bridge/src/lib.rs b/egglog-bridge/src/lib.rs
index 7232d5def..702fe4cb4 100644
--- a/egglog-bridge/src/lib.rs
+++ b/egglog-bridge/src/lib.rs
@@ -68,7 +68,8 @@ impl Timestamp {
/// The state associated with an egglog program.
#[derive(Clone, Serialize, Deserialize)]
pub struct EGraph {
- db: Database,
+ // TODO: evil hack for looking at serialization size
+ pub db: Database,
uf_table: TableId,
id_counter: CounterId,
reason_counter: CounterId,
diff --git a/numeric-id/src/lib.rs b/numeric-id/src/lib.rs
index 9825268f2..df7e14cbc 100644
--- a/numeric-id/src/lib.rs
+++ b/numeric-id/src/lib.rs
@@ -47,7 +47,8 @@ impl NumericId for usize {
/// with no hashing. For sparse mappings, use a HashMap.
#[derive(Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct DenseIdMap {
- data: Vec