From 3c401d84cd83ad3318a9914a714d87afeca1d763 Mon Sep 17 00:00:00 2001 From: Max <34987259+mparisi20@users.noreply.github.com> Date: Sun, 12 Apr 2026 02:13:18 -0400 Subject: [PATCH 1/3] Extract object size information from the PDB - When possible, deduce the size of an object symbol from the type information associated with it - Deduce the sizes of floats, doubles, strings, and wide strings from the decorated names of their symbols --- src/util/xpdb.rs | 159 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 144 insertions(+), 15 deletions(-) diff --git a/src/util/xpdb.rs b/src/util/xpdb.rs index 16b8df80..fa31093e 100644 --- a/src/util/xpdb.rs +++ b/src/util/xpdb.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::{ensure, Result}; use itertools::Itertools; -use pdb2::{self, FallibleIterator}; +use pdb2::{self, FallibleIterator, Indirection, PrimitiveKind, TypeFinder, TypeIndex}; use typed_path::Utf8NativePathBuf; use crate::{ @@ -52,6 +52,132 @@ fn to_section_addr( SectionAddress { section: jeff_sect, address: sect_base as u32 + sect_offs.offset } } +/// Attempt to deduce the size of the type referenced by index using the +/// TypeFinder +fn lookup_type_size(ty_finder: &TypeFinder, index: TypeIndex) -> Result { + match ty_finder.find(index)?.parse()? { + pdb2::TypeData::Array(data) => { + // Note: the last "dimension" is the total size in bytes of the + // array. See the documentation for the dimensions field + Ok(data.dimensions[data.dimensions.len() - 1].into()) + } + pdb2::TypeData::Class(data) => Ok(data.size), + pdb2::TypeData::Enumeration(data) => lookup_type_size(ty_finder, data.underlying_type), + pdb2::TypeData::Modifier(data) => lookup_type_size(ty_finder, data.underlying_type), + pdb2::TypeData::Pointer(_) => Ok(4), + pdb2::TypeData::Union(data) => Ok(data.size), + pdb2::TypeData::Primitive(data) => { + // Check if this is a pointer to a primitive type + match data.indirection { + Some(Indirection::Near32) => { + return Ok(4); + } + Some(Indirection::Near64) => { + return Ok(8); + } + None => {} + _ => { + return Err(anyhow::anyhow!(format!( + "Unsupported pointer kind {:?} for index #{}", + data.indirection, index.0 + ))); + } + } + match data.kind { + PrimitiveKind::Char + | PrimitiveKind::RChar + | PrimitiveKind::UChar + | PrimitiveKind::I8 + | PrimitiveKind::U8 + | PrimitiveKind::Bool8 => Ok(1), + PrimitiveKind::WChar + | PrimitiveKind::RChar16 + | PrimitiveKind::Short + | PrimitiveKind::UShort + | PrimitiveKind::I16 + | PrimitiveKind::U16 + | PrimitiveKind::F16 + | PrimitiveKind::Bool16 => Ok(2), + PrimitiveKind::RChar32 + | PrimitiveKind::Long + | PrimitiveKind::ULong + | PrimitiveKind::I32 + | PrimitiveKind::U32 + | PrimitiveKind::F32 + | PrimitiveKind::Complex32 + | PrimitiveKind::Bool32 => Ok(4), + PrimitiveKind::Quad + | PrimitiveKind::UQuad + | PrimitiveKind::I64 + | PrimitiveKind::U64 + | PrimitiveKind::F64 + | PrimitiveKind::Complex64 + | PrimitiveKind::Bool64 => Ok(8), + PrimitiveKind::Octa + | PrimitiveKind::UOcta + | PrimitiveKind::I128 + | PrimitiveKind::U128 + | PrimitiveKind::F128 + | PrimitiveKind::Complex128 => Ok(16), + _ => Err(anyhow::anyhow!(format!( + "Unsupported PrimitiveKind {:?} for index #{}", + data, index.0 + ))), + } + } + _ => Err(anyhow::anyhow!(format!("Unrecognized type record for index 0x{:X}", index.0))), + } +} + +/// Try to set the size of the object symbol according to its type +fn set_obj_size_by_type(obj_sym: &mut ObjSymbol, ty_finder: &TypeFinder, index: TypeIndex) { + match lookup_type_size(ty_finder, index) { + Ok(ty_size) => { + obj_sym.size = ty_size; + obj_sym.size_known = true; + } + Err(err) => { + log::warn!("Object size lookup failed for {}: {:?}", obj_sym.name, err); + } + } +} + +/// Try to set the size of the object symbol according to its name +fn set_obj_size_by_name(obj_sym: &mut ObjSymbol, name: &str) { + if name.starts_with("??_C@_0") || name.starts_with("??_C@_1") { + // This is a string or a wide string. In either case, the size + // is encoded into the symbol name itself. + obj_sym.data_kind = ObjDataKind::String; + let ptr = &mut name["??_C@_0".len()..].chars(); + let mut str_size = 0; + for ch in ptr.by_ref() { + if ch.is_ascii_digit() { + str_size = ch.to_digit(10).unwrap() as u64 + 1; + break; + } else if ('A'..='P').contains(&ch) { + str_size = str_size * 16 + (ch as u8 - b'A') as u64; + } else { + assert!( + ch == '@', + "Expected '@'-terminator while parsing length of string constant" + ); + break; + } + } + obj_sym.size = str_size; + obj_sym.size_known = true; + } else if name.starts_with("__real@") { + if name.len() == "__real@00000000".len() { + obj_sym.data_kind = ObjDataKind::Float; + obj_sym.size = 4; + } else if name.len() == "__real@0000000000000000".len() { + obj_sym.data_kind = ObjDataKind::Double; + obj_sym.size = 8; + } + obj_sym.size_known = true; + } +} + #[derive(Debug, PartialEq, PartialOrd, Eq, Ord)] struct CoffGroup { /// Starting address of the group @@ -116,6 +242,16 @@ pub fn try_parse_pdb( } } + // Index all of the type records from the TPI; type data will be parsed + // as needed to deduce the sizes of object symbols + let tpi = dbfile.type_information()?; + let mut ty_finder = tpi.finder(); + let mut ty_iter = tpi.iter(); + + while (ty_iter.next()?).is_some() { + ty_finder.update(&ty_iter); + } + let pdbmap = dbfile.address_map()?; let mut unsupported_sym_kinds = HashSet::new(); let mut syms: BTreeMap = BTreeMap::new(); @@ -165,15 +301,16 @@ pub fn try_parse_pdb( // TODO: handle code/data merging properly, instead of // overwriting the name - // TODO: Not all S_PUB32 records represent functions or objects; - // Some may just be labels, which can be skipped - obj_sym.name = data.name.to_string().into(); obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); obj_sym.flags = ObjSymbolFlagSet(ObjSymbolFlags::Global.into()); obj_sym.kind = if data.function { ObjSymbolKind::Function } else { ObjSymbolKind::Object }; obj_sym.data_kind = ObjDataKind::Unknown; + + let name: String = data.name.to_string().into(); + set_obj_size_by_name(obj_sym, &name); + obj_sym.name = name; } Ok(pdb2::SymbolData::Data(data)) => { if data.offset.section == 0 { @@ -192,11 +329,7 @@ pub fn try_parse_pdb( obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); } - // TODO: We can also deduce the size by using the type - // field to index into the TPI. - // Build a TypeFinder, then use it to compute object sizes - // while iterating through the data symbols. - // See https://docs.rs/pdb2/latest/pdb2/struct.ItemInformation.html + set_obj_size_by_type(obj_sym, &ty_finder, data.type_index); } Ok(pdb2::SymbolData::ThreadStorage(data)) => { if data.offset.section == 0 { @@ -215,8 +348,7 @@ pub fn try_parse_pdb( obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); } - - // TODO: Above note for DATA records also applies here + set_obj_size_by_type(obj_sym, &ty_finder, data.type_index); } Ok(pdb2::SymbolData::Procedure(data)) => { if data.offset.section == 0 { @@ -249,7 +381,7 @@ pub fn try_parse_pdb( // This is an S_THUNK32 record obj_sym.size = data.len as u64; obj_sym.size_known = true; - obj_sym.align = Some(4); + obj_sym.align = Some(8); } Ok(pdb2::SymbolData::Label(data)) => { if data.offset.section == 0 { @@ -372,9 +504,6 @@ pub fn try_parse_pdb( let mut contribs = dbi.section_contributions()?; while let Some(contrib) = contribs.next()? { - // TODO: Extract file names from the Sources substream to replace the - // auto-generated names. Take only the base name, fix the extension, - // and disambiguate identical names with a prefix let s_addr = to_section_addr(&pdbmap, section_addrs, &contrib.offset); let sec_idx = s_addr.section as usize; let start: u64 = s_addr.address.into(); From b0013bcf6e16ccde6b8320617b1468194a72703f Mon Sep 17 00:00:00 2001 From: Max <34987259+mparisi20@users.noreply.github.com> Date: Sun, 12 Apr 2026 12:35:56 -0400 Subject: [PATCH 2/3] Add use_pdb_types config key - Projects can add use_pdb_types: false to their config.ymls to avoid TPI parsing, e.g. if the type data in their PDB does not parse correctly --- src/cmd/dol.rs | 3 +++ src/cmd/xex.rs | 4 ++-- src/util/xpdb.rs | 31 ++++++++++++++++++++----------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/cmd/dol.rs b/src/cmd/dol.rs index 531cba79..2461d18b 100644 --- a/src/cmd/dol.rs +++ b/src/cmd/dol.rs @@ -79,6 +79,8 @@ pub struct ProjectConfig { #[serde(default = "bool_true", skip_serializing_if = "is_true")] pub detect_strings: bool, #[serde(default = "bool_true", skip_serializing_if = "is_true")] + pub use_pdb_types: bool, + #[serde(default = "bool_true", skip_serializing_if = "is_true")] pub write_asm: bool, /// Specifies the start of the common BSS section. #[serde(skip_serializing_if = "is_default")] @@ -113,6 +115,7 @@ impl Default for ProjectConfig { modules: vec![], detect_objects: true, detect_strings: true, + use_pdb_types: true, write_asm: true, common_start: None, symbols_known: false, diff --git a/src/cmd/xex.rs b/src/cmd/xex.rs index 1534651c..4f63676a 100644 --- a/src/cmd/xex.rs +++ b/src/cmd/xex.rs @@ -473,7 +473,7 @@ fn load_analyze_xex(config: &ProjectConfig) -> Result { if let Some(pdb_path) = &config.base.pdb { let pdb_path: Utf8NativePathBuf = pdb_path.with_encoding(); - let pdb = try_parse_pdb(&pdb_path, &obj.sections)?; + let pdb = try_parse_pdb(&pdb_path, &obj.sections, config.use_pdb_types)?; // Apply all the splits // FIXME: Don't add splits unconditionally here; it may conflict with @@ -742,7 +742,7 @@ fn map(args: MapArgs) -> Result<()> { fn pdb(args: PdbArgs) -> Result<()> { println!("pdb: {}", args.input); - let data = try_parse_pdb(&args.input, &ObjSections::new(ObjKind::Executable, vec![]))?; + let data = try_parse_pdb(&args.input, &ObjSections::new(ObjKind::Executable, vec![]), true)?; println!("{:#?}", data); Ok(()) } diff --git a/src/util/xpdb.rs b/src/util/xpdb.rs index fa31093e..39035ad4 100644 --- a/src/util/xpdb.rs +++ b/src/util/xpdb.rs @@ -144,10 +144,11 @@ fn set_obj_size_by_type(obj_sym: &mut ObjSymbol, ty_finder: &TypeFinder, index: /// Try to set the size of the object symbol according to its name fn set_obj_size_by_name(obj_sym: &mut ObjSymbol, name: &str) { - if name.starts_with("??_C@_0") || name.starts_with("??_C@_1") { - // This is a string or a wide string. In either case, the size - // is encoded into the symbol name itself. - obj_sym.data_kind = ObjDataKind::String; + let is_string = name.starts_with("??_C@_0"); + let is_wstring = name.starts_with("??_C@_1"); + if is_string || is_wstring { + // In either case, the size is encoded into the symbol name itself + obj_sym.data_kind = if is_string { ObjDataKind::String } else { ObjDataKind::String16 }; let ptr = &mut name["??_C@_0".len()..].chars(); let mut str_size = 0; for ch in ptr.by_ref() { @@ -203,10 +204,14 @@ pub struct PdbAnalyzeResult { pub labels: Vec, } -/// Extract translation units, splits, and symbols from a PDB +/// Extract translation units, splits, and symbols from a PDB. The use_types +/// flag enables the parsing of the type information stream to deduce object +/// sizes. This is optional, since some PDBs are known to have corrupt data +/// in this stream. pub fn try_parse_pdb( path: &Utf8NativePathBuf, section_addrs: &ObjSections, + use_types: bool, ) -> Result { let mut dbfile = pdb2::PDB::open(File::open(path)?)?; @@ -248,8 +253,10 @@ pub fn try_parse_pdb( let mut ty_finder = tpi.finder(); let mut ty_iter = tpi.iter(); - while (ty_iter.next()?).is_some() { - ty_finder.update(&ty_iter); + if use_types { + while (ty_iter.next()?).is_some() { + ty_finder.update(&ty_iter); + } } let pdbmap = dbfile.address_map()?; @@ -329,7 +336,9 @@ pub fn try_parse_pdb( obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); } - set_obj_size_by_type(obj_sym, &ty_finder, data.type_index); + if use_types { + set_obj_size_by_type(obj_sym, &ty_finder, data.type_index); + } } Ok(pdb2::SymbolData::ThreadStorage(data)) => { if data.offset.section == 0 { @@ -348,7 +357,9 @@ pub fn try_parse_pdb( obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); } - set_obj_size_by_type(obj_sym, &ty_finder, data.type_index); + if use_types { + set_obj_size_by_type(obj_sym, &ty_finder, data.type_index); + } } Ok(pdb2::SymbolData::Procedure(data)) => { if data.offset.section == 0 { @@ -544,8 +555,6 @@ pub fn try_parse_pdb( rename: rename.clone(), }); } - // FIXME: This currently requires detect_objects=false to work. - // Deducing exact object sizes from the PDB should fix this curr_split.end = end as u32; } From baed9a06be93f3ffd53d3033360563640dc8aa6c Mon Sep 17 00:00:00 2001 From: Max <34987259+mparisi20@users.noreply.github.com> Date: Sun, 12 Apr 2026 15:35:19 -0400 Subject: [PATCH 3/3] Remove update_splits from the quick_analysis path - This function interferes with PDB-aided analysis, from which we already get accurate .pdata splits --- src/cmd/map.rs | 2 +- src/cmd/xex.rs | 4 ++-- src/util/split.rs | 11 +++++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/cmd/map.rs b/src/cmd/map.rs index 43ee2c8b..d66aba0e 100644 --- a/src/cmd/map.rs +++ b/src/cmd/map.rs @@ -187,7 +187,7 @@ fn config(args: ConfigArgs) -> Result<()> { log::info!("Processing map..."); let entries = process_map(file.as_mut(), None, None)?; let mut obj = create_obj(&entries)?; - if let Err(e) = update_splits(&mut obj, None, false) { + if let Err(e) = update_splits(&mut obj, None, false, true) { error!("Failed to update splits: {}", e) } DirBuilder::new().recursive(true).create(&args.out_dir)?; diff --git a/src/cmd/xex.rs b/src/cmd/xex.rs index 4f63676a..f5da7c06 100644 --- a/src/cmd/xex.rs +++ b/src/cmd/xex.rs @@ -255,7 +255,7 @@ fn split_write_obj_exe( debug!("Adjusting splits"); let module_id = module.obj.module_id; - update_splits(&mut module.obj, None, false)?; + update_splits(&mut module.obj, None, false, !config.quick_analysis)?; debug!("Writing configuration"); if let Some(symbols_path) = &module.config.symbols { @@ -633,7 +633,7 @@ fn disasm(args: DisasmArgs) -> Result<()> { // Gamepad Release apply_splits_file(&args.out, &mut obj)?; - update_splits(&mut obj, None, false)?; + update_splits(&mut obj, None, false, true)?; let split_objs = split_obj(&obj, None)?; for coff_obj in &split_objs { diff --git a/src/util/split.rs b/src/util/split.rs index 18c9595d..439bb240 100644 --- a/src/util/split.rs +++ b/src/util/split.rs @@ -544,7 +544,12 @@ fn split_pdata(obj: &mut ObjInfo) -> Result<()> { /// - Creating splits for gaps between existing splits /// - Resolving a new object link order #[instrument(level = "debug", skip(obj))] -pub fn update_splits(obj: &mut ObjInfo, common_start: Option, fill_gaps: bool) -> Result<()> { +pub fn update_splits( + obj: &mut ObjInfo, + common_start: Option, + fill_gaps: bool, + redo_pdata_splits: bool, +) -> Result<()> { // // Create splits for extab and extabindex entries // if let Some((section_index, section)) = obj.sections.by_name("extabindex")? { // if !section.data.is_empty() { @@ -576,7 +581,9 @@ pub fn update_splits(obj: &mut ObjInfo, common_start: Option, fill_gaps: bo // } // Create splits for .pdata entries - split_pdata(obj)?; + if redo_pdata_splits { + split_pdata(obj)?; + } // Remove linker generated symbols from splits // trim_linker_generated_symbols(obj)?;