From 52db3e0558af00f5f6c84d60b5b363f54d525ebf Mon Sep 17 00:00:00 2001 From: Max <34987259+mparisi20@users.noreply.github.com> Date: Mon, 6 Apr 2026 23:46:20 -0400 Subject: [PATCH] Extract label information from the PDB - The S_LABEL32 records include named labels for blocks within functions, such as the __catch and __unwind markers in a function using SEH. - Remove some unnecessary code, such as the 'static_dup_' prefixing for identically-named local symbols. Since we already derive splits from the PDB, these duplicate names no longer lead to unwanted automatic splitting --- src/analysis/pass.rs | 9 +++- src/cmd/xex.rs | 71 ++++++++++++++------------- src/util/xpdb.rs | 113 ++++++++++++++++++++++++------------------- 3 files changed, 106 insertions(+), 87 deletions(-) diff --git a/src/analysis/pass.rs b/src/analysis/pass.rs index e9948913..286da1d4 100644 --- a/src/analysis/pass.rs +++ b/src/analysis/pass.rs @@ -37,8 +37,13 @@ impl AnalysisPass for FindSaveRestSledsXbox { // save/restore gpr/fpr/vmx should've been found in pdata if !func.contains("_upper") { - assert!(obj.known_functions.contains_key(&start), - "Could not find reg intrinsic from pdata. Is that even possible for an xex?"); + assert!( + obj.known_functions.contains_key(&start), + concat!( + "Could not find reg intrinsic from pdata. ", + "For PDB analysis, add 'symbols_known: true' to config.yml" + ) + ); } // add known symbols for them if obj.known_functions.contains_key(&start) { diff --git a/src/cmd/xex.rs b/src/cmd/xex.rs index ffc657ab..1534651c 100644 --- a/src/cmd/xex.rs +++ b/src/cmd/xex.rs @@ -1,4 +1,5 @@ use std::{ + cmp, collections::{BTreeMap, HashSet}, fs::{self, DirBuilder, File}, io::{BufWriter, Write}, @@ -39,7 +40,7 @@ use crate::{ config::{apply_splits_file, apply_symbols_file, write_splits_file, write_symbols_file}, dep::DepFile, file::{buf_writer, FileReadInfo}, - map_exe::{apply_map_file_exe, is_reg_intrinsic, process_map_exe}, + map_exe::{apply_map_file_exe, process_map_exe}, path::native_path, split::{split_obj, update_splits}, xex::{ @@ -472,13 +473,13 @@ fn load_analyze_xex(config: &ProjectConfig) -> Result { if let Some(pdb_path) = &config.base.pdb { let pdb_path: Utf8NativePathBuf = pdb_path.with_encoding(); - let (pdb_units, pdb_splits, pdb_syms) = try_parse_pdb(&pdb_path, &obj.sections)?; + let pdb = try_parse_pdb(&pdb_path, &obj.sections)?; // Apply all the splits // FIXME: Don't add splits unconditionally here; it may conflict with // user-provided splits. For now, users can comment out the pdb key // in config.yml after initial analysis - for (i, splits_for_section) in pdb_splits.into_iter().enumerate() { + for (i, splits_for_section) in pdb.splits.into_iter().enumerate() { for (start, split) in splits_for_section.iter() { obj.sections[i as u32].splits.push(start, split.clone()); } @@ -489,7 +490,7 @@ fn load_analyze_xex(config: &ProjectConfig) -> Result { for split in obj.sections.all_splits() { nonempty_mods.insert(&split.3.unit); } - for unit in pdb_units { + for unit in pdb.units { if nonempty_mods.contains(&unit) { obj.link_order.push(ObjUnit { name: unit, autogenerated: false, order: None }); } else { @@ -497,40 +498,38 @@ fn load_analyze_xex(config: &ProjectConfig) -> Result { } } + // Remove known labels from known_functions/pdata_funcs + obj.pdata_funcs.sort(); + for known_label in pdb.labels { + if obj.known_functions.remove(&known_label).is_some() { + log::debug!("Demoted {} from func to label", known_label); + } + if let Result::Ok(idx) = obj.pdata_funcs.binary_search(&known_label) { + obj.pdata_funcs.remove(idx); + }; + } + // Apply all the symbols - for mut sym in pdb_syms.into_iter() { - if !is_reg_intrinsic(&sym.name) && sym.name != "__NLG_Return" { - match obj.sections.at_address(sym.address as u32).ok() { - Some((sec_idx, _sec)) => { - let the_sec_addr = SectionAddress::new(sec_idx, sym.address as u32); - if obj.pdata_funcs.contains(&the_sec_addr) { - let pdata_sz = - obj.known_functions.get(&the_sec_addr).unwrap().unwrap() as u64; - let pdb_sz = sym.size; - if pdata_sz != pdb_sz { - log::debug!( - concat!( - "Size of {} according to .pdata is {}", - ", but according to pdb is {}. " - ), - &sym.name, - pdata_sz, - pdb_sz - ); - // Defer to .pdata size for now. Empirically, - // the sizes only differ for XDK functions. - // Note that this approach results in extra - // 'fn_XXXXXXXX' symbols for certain XDK labels. - // TODO: Fix this without breaking CFA - sym.size = pdata_sz; - } - } - obj.add_symbol(sym, true)?; - } - // if we couldn't find the section (like maybe it was stripped), just continue on - _ => continue, - }; + for mut sym in pdb.symbols.into_iter() { + let (sec_idx, _sec) = obj.sections.at_address(sym.address as u32)?; + let the_sec_addr = SectionAddress::new(sec_idx, sym.address as u32); + if obj.pdata_funcs.contains(&the_sec_addr) { + let pdata_sz = obj.known_functions.get(&the_sec_addr).unwrap().unwrap() as u64; + let pdb_sz = sym.size; + if pdata_sz != pdb_sz { + log::debug!( + concat!( + "Size of {} according to .pdata is {}", + ", but according to pdb is {}. " + ), + &sym.name, + pdata_sz, + pdb_sz + ); + sym.size = cmp::max(pdata_sz, pdb_sz); + } } + obj.add_symbol(sym, true)?; } dep.push(pdb_path); } diff --git a/src/util/xpdb.rs b/src/util/xpdb.rs index 283dde01..2d4f5d81 100644 --- a/src/util/xpdb.rs +++ b/src/util/xpdb.rs @@ -1,5 +1,5 @@ use std::{ - collections::{BTreeMap, HashMap, HashSet}, + collections::{BTreeMap, HashSet}, fs::File, vec::Vec, }; @@ -40,28 +40,16 @@ fn warn_unsupported_sym_kind(sym: &pdb2::Symbol, set: &mut HashSet SectionAddress { - let s_addr = pdb_offs.to_section_offset(pdbmap).unwrap_or_default(); - SectionAddress { - address: s_addr.offset, - // jeff sections are 0-indexed - section: s_addr.section as u32 - 1, - } -} - -fn section_addr_to_virtual_addr(section_addrs: &ObjSections, s_addr: &SectionAddress) -> u64 { - let sect_base = section_addrs.get(s_addr.section).unwrap_or(&ObjSection::default()).address; - s_addr.address as u64 + sect_base -} - -fn to_virtual_address( pdbmap: &pdb2::AddressMap, section_addrs: &ObjSections, pdb_offs: &pdb2::PdbInternalSectionOffset, -) -> u64 { - section_addr_to_virtual_addr(section_addrs, &to_section_addr(pdbmap, pdb_offs)) +) -> SectionAddress { + let sect_offs = pdb_offs.to_section_offset(pdbmap).unwrap_or_default(); + // Subtracting 1 because jeff sections are 0-indexed + // TODO: Do optimized binaries have a different mapping? + let jeff_sect = (sect_offs.section - 1) as u32; + let sect_base = section_addrs.get(jeff_sect).unwrap_or(&ObjSection::default()).address; + SectionAddress { section: jeff_sect, address: sect_base as u32 + sect_offs.offset } } #[derive(Debug, PartialEq, PartialOrd, Eq, Ord)] @@ -76,11 +64,24 @@ struct CoffGroup { pub name: String, } +/// All the useful information parsed from a PDB +#[derive(Debug)] +pub struct PdbAnalyzeResult { + /// Names of translation units + pub units: Vec, + /// Splits derived from the Section Contributions Stream + pub splits: Vec, + /// Symbols derived from the Global and Module Symbol Streams + pub symbols: Vec, + /// Locations of labels. Note that all labels are symbols + pub labels: Vec, +} + /// Extract translation units, splits, and symbols from a PDB pub fn try_parse_pdb( path: &Utf8NativePathBuf, section_addrs: &ObjSections, -) -> Result<(Vec, Vec, Vec)> { +) -> Result { let mut dbfile = pdb2::PDB::open(File::open(path)?)?; // Ensure pdb sections match the exe sections and that all the names match @@ -147,14 +148,14 @@ pub fn try_parse_pdb( let all_syms_iter = all_syms.into_iter(); let mut groups: Vec = vec![]; - let mut ldata_dupes: HashMap = HashMap::new(); + let mut known_labels: Vec = vec![]; for symbol in all_syms_iter { match symbol.parse() { Ok(pdb2::SymbolData::Public(data)) => { if data.offset.section == 0 { continue; } - let symaddr = to_section_addr(&pdbmap, &data.offset); + let symaddr = to_section_addr(&pdbmap, section_addrs, &data.offset); let obj_sym = syms.entry(symaddr).or_default(); // We get almost everything we need to know about a symbol @@ -167,7 +168,7 @@ pub fn try_parse_pdb( // TODO: Not all S_PUB32 records represent functions or objects; // Some may just be labels, which can be skipped obj_sym.name = data.name.to_string().into(); - obj_sym.address = to_virtual_address(&pdbmap, section_addrs, &data.offset); + obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); obj_sym.flags = ObjSymbolFlagSet(ObjSymbolFlags::Global.into()); obj_sym.kind = @@ -178,7 +179,7 @@ pub fn try_parse_pdb( if data.offset.section == 0 { continue; } - let symaddr = to_section_addr(&pdbmap, &data.offset); + let symaddr = to_section_addr(&pdbmap, section_addrs, &data.offset); let obj_sym = syms.entry(symaddr).or_default(); // This is an S_GDATA32 or S_LDATA32 record @@ -187,19 +188,8 @@ pub fn try_parse_pdb( } else { obj_sym.flags.set_scope(ObjSymbolScope::Local); obj_sym.kind = ObjSymbolKind::Object; - // TODO: Now that we extract object files and splits, we can - // update this renaming so it is only done for repeat - // names of symbols in the same file - let name = data.name.to_string().clone(); - let c = - *ldata_dupes.entry(name.to_string()).and_modify(|c| *c += 1).or_insert(1); - let name: String = if c > 1 { - format!("static_dup_{}_{}", c - 1, name) - } else { - data.name.to_string().into() - }; - obj_sym.name = name; - obj_sym.address = to_virtual_address(&pdbmap, section_addrs, &data.offset); + obj_sym.name = data.name.to_string().into(); + obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); } // TODO: We can also deduce the size by using the type @@ -212,7 +202,7 @@ pub fn try_parse_pdb( if data.offset.section == 0 { continue; } - let symaddr = to_section_addr(&pdbmap, &data.offset); + let symaddr = to_section_addr(&pdbmap, section_addrs, &data.offset); let obj_sym = syms.entry(symaddr).or_default(); // This is an S_GTHREAD32 or S_LTHREAD32 record @@ -222,7 +212,7 @@ pub fn try_parse_pdb( obj_sym.flags.set_scope(ObjSymbolScope::Local); obj_sym.kind = ObjSymbolKind::Object; obj_sym.name = data.name.to_string().into(); - obj_sym.address = to_virtual_address(&pdbmap, section_addrs, &data.offset); + obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); } @@ -232,20 +222,20 @@ pub fn try_parse_pdb( if data.offset.section == 0 { continue; } - let symaddr = to_section_addr(&pdbmap, &data.offset); + let symaddr = to_section_addr(&pdbmap, section_addrs, &data.offset); let obj_sym = syms.entry(symaddr).or_default(); // This is an S_GPROC32 or S_LPROC32 record obj_sym.size = data.len as u64; obj_sym.size_known = true; - obj_sym.align = Some(4); + obj_sym.align = Some(8); if data.global { obj_sym.flags.set_scope(ObjSymbolScope::Global); } else { obj_sym.flags.set_scope(ObjSymbolScope::Local); obj_sym.kind = ObjSymbolKind::Function; obj_sym.name = data.name.to_string().into(); - obj_sym.address = to_virtual_address(&pdbmap, section_addrs, &data.offset); + obj_sym.address = symaddr.address.into(); obj_sym.section = Some(symaddr.section); } } @@ -253,7 +243,7 @@ pub fn try_parse_pdb( if data.offset.section == 0 { continue; } - let symaddr = to_section_addr(&pdbmap, &data.offset); + let symaddr = to_section_addr(&pdbmap, section_addrs, &data.offset); let obj_sym = syms.entry(symaddr).or_default(); // This is an S_THUNK32 record @@ -261,11 +251,31 @@ pub fn try_parse_pdb( obj_sym.size_known = true; obj_sym.align = Some(4); } + Ok(pdb2::SymbolData::Label(data)) => { + if data.offset.section == 0 { + continue; + } + let name: String = data.name.to_string().into(); + if name.starts_with("$M") || name.starts_with("$LN") { + // These are uninteresting auto-generated symbols + continue; + } + let symaddr = to_section_addr(&pdbmap, section_addrs, &data.offset); + let obj_sym = syms.entry(symaddr).or_default(); + obj_sym.address = symaddr.address.into(); + obj_sym.data_kind = ObjDataKind::Unknown; + obj_sym.flags.set_scope(ObjSymbolScope::Local); + obj_sym.kind = ObjSymbolKind::Unknown; + obj_sym.name = name; + obj_sym.section = Some(symaddr.section); + + known_labels.push(symaddr); + } Ok(pdb2::SymbolData::CoffGroup(data)) => groups.push(CoffGroup { - address: to_virtual_address(&pdbmap, section_addrs, &data.offset), + address: to_section_addr(&pdbmap, section_addrs, &data.offset).address.into(), size: data.cb, name: data.name.to_string().into(), - section: to_section_addr(&pdbmap, &data.offset).section, + section: to_section_addr(&pdbmap, section_addrs, &data.offset).section, }), Ok(pdb2::SymbolData::Section(_data)) => { // TODO: We already have most section info from the EXE, but @@ -326,9 +336,9 @@ pub fn try_parse_pdb( // TODO: Extract file names from the Sources substream to replace the // auto-generated names. Take only the base name, fix the extension, // and disambiguate identical names with a prefix - let s_addr = to_section_addr(&pdbmap, &contrib.offset); + let s_addr = to_section_addr(&pdbmap, section_addrs, &contrib.offset); let sec_idx = s_addr.section as usize; - let start = section_addr_to_virtual_addr(section_addrs, &s_addr); + let start: u64 = s_addr.address.into(); let end = start + contrib.size as u64; let mod_idx = contrib.module as i32; @@ -414,5 +424,10 @@ pub fn try_parse_pdb( }; } - Ok((module_names, splits_by_section, addr_vec)) + Ok(PdbAnalyzeResult { + units: module_names, + splits: splits_by_section, + symbols: addr_vec, + labels: known_labels, + }) }