From b42d1eca32570bc3988ff5cd1ff9ca48919ae068 Mon Sep 17 00:00:00 2001 From: Corey Ryan Dean Date: Tue, 9 Jun 2026 17:28:48 -0500 Subject: [PATCH] fix(codegen): stop emitting GC releases for globals through uninitialized frame offsets Root cause of the long-standing intermittent exit-time crash in downstream rcce2 CI (ItemsTest / OnlinePlayerChainTest, ~2-12% of runs, historically mislabeled "Stack overflow!"). node.cpp deleteVars() emitted the GC release __bbRelease( [ebp + Decl::offset], "type" ) for every struct/blitz-typed decl in the environ -- missing the d->kind gate every sibling branch has. main()'s environ includes the program GLOBALS, whose Decl::offset is never assigned by frame layout and was omitted from the Decl constructor's initializer list: uninitialized memory. When the stale value was 0 (most runs) the instruction read the saved-EBP slot and _bbRelease lookup-missed -- harmless. When it was heap garbage, main's epilogue performed a wild [ebp+garbage] read and access- violated. GC on/off is irrelevant: the wild read precedes the release, which is why the non-GC chain test crashed identically. Fix: gate the emission on frame-resident kinds (DECL_LOCAL || DECL_PARAM -- params must stay released: the call protocol references args on the way in and the callee releases them; a LOCAL-only gate fails GarbageCollectionTest), and zero-init Decl::offset as defense-in-depth. Also extends the crash diagnostics that produced the evidence: registers, a hex dump of the faulting anonymous executable allocation (dynamically emitted Blitz code has no symbols), and a dbghelp-symbolized EBP-chain stack walk in seTranslator. Verification: pre-fix baseline ItemsTest 2/100 + 12/300 failures, chain test 12/100 + 27/300; post-fix 0/300 + 0/300 (expected ~6-12 and ~27-36 at baseline rates; p < 1e-8). Full test.bat green. Co-Authored-By: Claude Fable 5 --- docs/notes/exit-teardown-crash.md | 39 ++++++++- src/blitzrc/bbruntime_dll/bbruntime_dll.cpp | 90 ++++++++++++++++++++- src/blitzrc/compiler/decl.h | 5 +- src/blitzrc/compiler/node.cpp | 17 +++- 4 files changed, 145 insertions(+), 6 deletions(-) diff --git a/docs/notes/exit-teardown-crash.md b/docs/notes/exit-teardown-crash.md index 12920013..890a3bbb 100644 --- a/docs/notes/exit-teardown-crash.md +++ b/docs/notes/exit-teardown-crash.md @@ -65,7 +65,44 @@ repro), fix it minimally in the runtime, and pin it with regression tests. 6. **Land**: BlitzForge PR → merge → rcce2 submodule-bump PR → update rcce2 CLAUDE.md flake note. -## Fallback floor (if root cause resists one iteration) +## RESOLUTION (round 2) — the real root cause was in CODEGEN, not the pool + +The two pool defects above were real (deterministically pinned by +`tests/DeleteEachChainTest.bb` / `tests/DeleteEachGCRefMapTest.bb`, fixed in +PR #85) but the 300-run statistical gate showed the flake persisted at the +baseline rate — they were not its mechanism. + +Round-2 diagnostics (registers + full hex dump of the generated-code +allocation + dbghelp-symbolized EBP walk) pinned the truth: + +- The fault was always at the same generated-image offset, in **main's + epilogue**, executing `mov ebx,[ebp+]` — and comparing + against the object image embedded in a `-o` exe showed the same + instruction as `mov ebx,[ebp+0x00]` (disp8). The displacement differed + *at assembly time* per process run. +- Source: `node.cpp deleteVars()` emitted the GC release + `__bbRelease( mem(local(d->offset)), type )` for **every** struct/blitz + typed decl in the environ — without the `d->kind` gate every sibling + branch has. For main(), the environ includes the program **GLOBALS**, + whose `Decl::offset` is never assigned by frame layout — + **uninitialized memory** (the `Decl` constructor omitted it from the + initializer list). Usually the stale heap value was 0 → `[ebp+0]` reads + the saved-EBP slot, `_bbRelease` lookup-misses, run passes. Sometimes it + was heap garbage (run-varying ASCII fragments in the dumps) → wild read + → access violation. GC on/off is irrelevant — the wild *read* precedes + the no-op release, which is why the non-GC chain test crashed the same + way. + +Fix: gate the emission on frame-resident kinds (`DECL_LOCAL || DECL_PARAM` +— params must stay: the call protocol references args on the way in and +the callee releases them; a LOCAL-only gate fails GarbageCollectionTest), +plus zero-init `Decl::offset` as defense-in-depth. + +Verification: pre-fix baseline ItemsTest 2/100 + 12/300, chain test 12/100 ++ 27/300; post-fix **0/300 + 0/300** (at baseline rates ~6-12 and ~27-36 +failures were expected; p < 1e-8). BlitzForge suite green. + +## Fallback floor (superseded — root cause found) The instrumentation, measured baseline, probe results, and this note merge anyway — converting an opaque flake into an addressable bug with evidence. diff --git a/src/blitzrc/bbruntime_dll/bbruntime_dll.cpp b/src/blitzrc/bbruntime_dll/bbruntime_dll.cpp index 65a4a874..5c11d4c2 100644 --- a/src/blitzrc/bbruntime_dll/bbruntime_dll.cpp +++ b/src/blitzrc/bbruntime_dll/bbruntime_dll.cpp @@ -13,6 +13,7 @@ using namespace std; #include #include #include +#include // SYMBOL_INFO for the lazy dbghelp symbolizer (no link dep) #include "../bbruntime/bbruntime.h" @@ -52,6 +53,36 @@ static void killer(){ } #endif +// Best-effort symbol lookup through dbghelp (system DLL, loaded lazily so we +// add no link dependency). With PDBs alongside the binaries this turns a +// module-relative crash offset into a function name + displacement. +typedef BOOL (WINAPI *SymInitialize_t)( HANDLE,PCSTR,BOOL ); +typedef BOOL (WINAPI *SymFromAddr_t)( HANDLE,DWORD64,PDWORD64,PSYMBOL_INFO ); +static string symbolName( void *addr ){ + static SymFromAddr_t pSymFromAddr=0; + static bool tried=false; + if( !tried ){ + tried=true; + if( HMODULE dh=LoadLibraryA( "dbghelp.dll" ) ){ + SymInitialize_t pInit=(SymInitialize_t)GetProcAddress( dh,"SymInitialize" ); + SymFromAddr_t pFrom=(SymFromAddr_t)GetProcAddress( dh,"SymFromAddr" ); + if( pInit && pFrom && pInit( GetCurrentProcess(),0,TRUE ) ) pSymFromAddr=pFrom; + } + } + if( !pSymFromAddr ) return ""; + char symbuf[sizeof(SYMBOL_INFO)+256]; + SYMBOL_INFO *sym=(SYMBOL_INFO*)symbuf; + sym->SizeOfStruct=sizeof(SYMBOL_INFO); + sym->MaxNameLen=255; + DWORD64 disp=0; + if( pSymFromAddr( GetCurrentProcess(),(DWORD64)(uintptr_t)addr,&disp,sym ) ){ + char out[320]; + sprintf( out," (%s+0x%X)",sym->Name,(unsigned int)disp ); + return out; + } + return ""; +} + // Describe a code/data address as "module+0xOFFSET" when it falls inside a // loaded module, else as a raw pointer. Module-relative offsets stay stable // across ASLR runs, so an intermittent fault becomes attributable to a @@ -66,7 +97,7 @@ static string describeAddress( void *addr ){ const char *base=path; for( const char *p=path;*p;++p ) if( *p=='\\'||*p=='/' ) base=p+1; sprintf( buf,"%s+0x%X",base,(unsigned int)((char*)addr-(char*)mbi.AllocationBase) ); - return buf; + return string( buf )+symbolName( addr ); } } sprintf( buf,"0x%p",addr ); @@ -99,7 +130,7 @@ static void _cdecl seTranslator( unsigned int u,EXCEPTION_POINTERS* pExp ){ // describeAddress allocates stack, and the guard page is already blown. if( pExp && pExp->ExceptionRecord && u!=EXCEPTION_STACK_OVERFLOW ){ EXCEPTION_RECORD *er=pExp->ExceptionRecord; - char info[64]; + char info[96]; sprintf( info," [code 0x%08X at ",(unsigned int)er->ExceptionCode ); panicStr+=info; panicStr+=describeAddress( er->ExceptionAddress ); @@ -110,6 +141,61 @@ static void _cdecl seTranslator( unsigned int u,EXCEPTION_POINTERS* pExp ){ panicStr+=info; } panicStr+="]"; + + // Code bytes at the faulting instruction: lets an intermittent fault + // in dynamically-emitted Blitz code (not part of any module, so no + // symbols) be disassembled after the fact. + if( pExp->ContextRecord ){ + CONTEXT *cx=pExp->ContextRecord; + unsigned char *ip=(unsigned char*)cx->Eip; + MEMORY_BASIC_INFORMATION mbi; + if( ip && VirtualQuery( ip,&mbi,sizeof(mbi) ) && mbi.State==MEM_COMMIT ){ + panicStr+="\ncode:"; + for( int k=0;k<16;++k ){ + sprintf( info," %02X",ip[k] ); + panicStr+=info; + } + } + sprintf( info,"\nregs: eax=%08X ebx=%08X ecx=%08X edx=%08X esi=%08X edi=%08X esp=%08X ebp=%08X", + (unsigned)cx->Eax,(unsigned)cx->Ebx,(unsigned)cx->Ecx,(unsigned)cx->Edx, + (unsigned)cx->Esi,(unsigned)cx->Edi,(unsigned)cx->Esp,(unsigned)cx->Ebp ); + panicStr+=info; + // When the fault is inside an anonymous (non-module) executable + // allocation -- dynamically emitted Blitz code -- dump the image + // from its allocation base so the instruction stream leading to + // the fault can be disassembled offline. + if( ip && VirtualQuery( ip,&mbi,sizeof(mbi) ) && mbi.State==MEM_COMMIT && mbi.AllocationBase ){ + char path[MAX_PATH]; + if( !GetModuleFileName( (HMODULE)mbi.AllocationBase,path,MAX_PATH ) ){ + unsigned char *base=(unsigned char*)mbi.AllocationBase; + unsigned int len=0x280; + sprintf( info,"\nimage @0x%p (fault offset 0x%X):",base,(unsigned int)(ip-base) ); + panicStr+=info; + for( unsigned int k=0;kEbp; + for( int f=0;f<12 && ebp;++f ){ + if( !VirtualQuery( (void*)ebp,&mbi,sizeof(mbi) ) || mbi.State!=MEM_COMMIT + || (mbi.Protect&(PAGE_GUARD|PAGE_NOACCESS)) ) break; + if( (char*)ebp+8 > (char*)mbi.BaseAddress+mbi.RegionSize ) break; + unsigned int ret=((unsigned int*)ebp)[1]; + unsigned int prev=((unsigned int*)ebp)[0]; + if( !ret ) break; + panicStr+=" <- "; + panicStr+=describeAddress( (void*)ret ); + if( prev<=ebp ) break; + ebp=prev; + } + } } bbruntime_panic( panicStr.c_str() ); diff --git a/src/blitzrc/compiler/decl.h b/src/blitzrc/compiler/decl.h index f4467eab..b637c9b6 100644 --- a/src/blitzrc/compiler/decl.h +++ b/src/blitzrc/compiler/decl.h @@ -15,7 +15,10 @@ struct Decl{ Type *type; //type int kind,offset; ConstType *defType; //default value - Decl( const string &s,Type *t,int k,ConstType *d=0 ):name(s),type(t),kind(k),defType(d){} + // offset is only assigned during frame layout (locals/params); default + // it to 0 so any future read of a non-frame Decl's offset is a + // deterministic 0 instead of run-varying uninitialized memory. + Decl( const string &s,Type *t,int k,ConstType *d=0 ):name(s),type(t),kind(k),offset(0),defType(d){} ~Decl(); virtual void getName( char *buff ); diff --git a/src/blitzrc/compiler/node.cpp b/src/blitzrc/compiler/node.cpp index 1ece4ee8..c02f5943 100644 --- a/src/blitzrc/compiler/node.cpp +++ b/src/blitzrc/compiler/node.cpp @@ -125,14 +125,27 @@ TNode *Node::deleteVars( Environ *e, Codegen *g ){ } } - if (type->structType() || type->blitzType()) { + // GC release of frame-resident object refs (locals AND params -- + // the call protocol references args on the way in, the callee + // releases them here). Must be gated on frame-resident kinds: for + // main() this environ also contains the program GLOBALS, whose + // Decl::offset is never assigned by frame layout. Ungated, main's + // epilogue emitted + // __bbRelease( [ebp + ], "type" ) + // -- a wild stack-relative read. When the stale offset happened to + // be 0 it read the saved-EBP slot and _bbRelease lookup-missed + // (harmless); when it was heap garbage it access-violated. That was + // the long-standing intermittent exit-time crash in downstream + // (rcce2) CI: ItemsTest / OnlinePlayerChainTest failing ~2-12% of + // runs with an AV at a stable program offset after the last test. + if ((type->structType() || type->blitzType()) && (d->kind==DECL_LOCAL || d->kind==DECL_PARAM)) { string typeName = ""; if (type->structType()) { typeName = "BBCustom"; } else if (type->blitzType()) { BlitzType* ty = type->blitzType(); - + typeName = ty->ident; }