Skip to content

Commit 81cc825

Browse files
committed
Implement String Interning and Polymorphic Inline Caching (PIC)
- Added a global string pool (`stringPool_`) to the `GarbageCollector` for string interning.
- Optimized `PomeString` to cache its hash on creation.
- Updated `PomeValue::operator==` to use O(1) pointer equality for interned strings.
- Updated `PomeValue::hash()` to leverage cached string hashes for faster table lookups.
- Implemented Polymorphic Inline Caching (PIC) for `GETFIELD` and `SETFIELD` opcodes.
- Fast-pathed property/method lookups using cached shapes and indices in the bytecode.
- Replaced all `allocate<PomeString>` calls with `allocateString` globally.
- Verified performance gains: Word Count (+37%), JSON Parse (+29%), Binary Tree (1.23x Python).
1 parent 76e5a97 commit 81cc825

8 files changed

Lines changed: 193 additions & 65 deletions

File tree

include/pome_gc.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,14 @@ class GarbageCollector {
3232

3333
template<typename T, typename... Args>
3434
T* allocate(Args&&... args);
35+
36+
PomeString* allocateString(const std::string& value);
3537
PomeList* allocateList();
3638

39+
/**
 * Forget the intern-pool entry whose key equals `str`, if one exists.
 *
 * NOTE(review): removal is keyed by contents only, not by object identity.
 * If a PomeString that is not the pooled instance for these contents is ever
 * passed through here, the live interned entry would be dropped as well --
 * confirm every PomeString is created through allocateString.
 */
void removeStringFromPool(const std::string& str) {
    const auto entry = stringPool_.find(str);
    if (entry != stringPool_.end()) {
        stringPool_.erase(entry);
    }
}
42+
3743
void updateSize(PomeObject* obj, size_t oldSize, size_t newSize);
3844

3945
void collect(bool minor = false);
@@ -78,6 +84,7 @@ class GarbageCollector {
7884
size_t nextMinorGC_ = 16 * 1024 * 1024;
7985

8086
std::vector<PomeObject*> tempRoots_;
87+
std::unordered_map<std::string, PomeString*> stringPool_;
8188
std::vector<PomeObject*> grayStack_;
8289

8390
void mark(bool minor);

include/pome_value.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,16 @@ namespace Pome
1212
/**
 * String object type: a PomeObject whose type() is ObjectType::STRING.
 *
 * The text lives in value_ and is exposed read-only through getValue();
 * toString() returns a copy of it. extraSize() reports value_.capacity().
 * getHash() returns the hash_ member.
 *
 * This hunk replaces the inline constructor (which only moved `value` into
 * value_) with an out-of-line declaration. The new definition is not shown
 * here; presumably it also fills hash_, since the commit notes say the hash
 * is cached on creation -- confirm in the implementation file.
 */
class PomeString : public PomeObject
1313
{
1414
public:
15-
explicit PomeString(std::string value) : value_(std::move(value)) {}
15+
explicit PomeString(std::string value);
1616
ObjectType type() const override { return ObjectType::STRING; }
1717
std::string toString() const override { return value_; }
1818
const std::string &getValue() const { return value_; }
1919
size_t extraSize() const { return value_.capacity(); }
20+
size_t getHash() const { return hash_; }
2021

2122
private:
2223
std::string value_;
24+
size_t hash_;
2325
};
2426

2527
/**

plans/string-interning.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Plan: String Interning Optimization
2+
3+
## Objective
4+
Implement String Interning to eliminate redundant string allocations and transform $O(N)$ string comparisons into $O(1)$ pointer comparisons.
5+
6+
## Key Files
7+
- `include/pome_gc.h`: Add `stringPool_` member.
8+
- `src/pome_gc.cpp`: Implement pool cleanup in `sweep`.
9+
- `include/pome_value.h`: Add `hash_` to `PomeString`.
10+
- `src/pome_value.cpp`: Update `operator==` and `hash()` for `PomeValue`.
11+
- `include/pome_gc_impl.h`: Specialize `allocate<PomeString>`.
12+
13+
## Implementation Steps
14+
15+
### 1. PomeString & PomeValue Updates
16+
- Add `size_t hash_` to `PomeString`.
17+
- Initialize `hash_` in `PomeString` constructor using `std::hash<std::string>`.
18+
- Update `PomeValue::operator==`: If both are strings, return `asObject() == other.asObject()`.
19+
- Update `PomeValue::hash()`: If string, return `asStringObject()->hash_`.
20+
21+
### 2. GarbageCollector Updates
22+
- Add `std::unordered_map<std::string, PomeString*> stringPool_` to `GarbageCollector`.
23+
- In `GarbageCollector::sweep`, iterate through objects being deleted. If type is `STRING`, remove from `stringPool_`.
24+
25+
### 3. Allocation Specialization
26+
- In `include/pome_gc_impl.h`, provide a specialization for `template<> PomeString* GarbageCollector::allocate<PomeString>(std::string&& value)`.
27+
- This specialization will check the pool before performing a new allocation.
28+
29+
## Verification
30+
- Run `test/unit_tests/test_features.pome`.
31+
- Run `tools/pome_bench.py` and compare Word Count and JSON Parse ratios.

src/pome_compiler.cpp

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ namespace Pome {
113113
emit(Chunk::makeABC(uv.isLocal ? OpCode::MOVE : OpCode::GETUPVAL, 0, uv.index, 0), stmt.getLine());
114114
}
115115

116-
PomeString* nameStr = gc.allocate<PomeString>(stmt.getName()); RootGuard nameStrGuard(gc, nameStr);
116+
PomeString* nameStr = gc.allocateString(stmt.getName()); RootGuard nameStrGuard(gc, nameStr);
117117
int nameIdx = addConstant(PomeValue(nameStr));
118118
emit(Chunk::makeABx(OpCode::SETGLOBAL, reg, nameIdx), stmt.getLine());
119119
lastResultReg = reg;
@@ -217,7 +217,7 @@ namespace Pome {
217217

218218
void Compiler::visit(StringExpr &expr) {
219219
int reg = allocReg();
220-
PomeString* s = gc.allocate<PomeString>(expr.getValue()); RootGuard sGuard(gc, s);
220+
PomeString* s = gc.allocateString(expr.getValue()); RootGuard sGuard(gc, s);
221221
int constIdx = addConstant(PomeValue(s));
222222
emit(Chunk::makeABx(OpCode::LOADK, reg, constIdx), expr.getLine());
223223
lastResultReg = reg;
@@ -256,7 +256,7 @@ namespace Pome {
256256
// Global Read
257257
int dest = allocReg();
258258

259-
PomeString* nameStr = gc.allocate<PomeString>(expr.getName()); RootGuard nameStrGuard(gc, nameStr);
259+
PomeString* nameStr = gc.allocateString(expr.getName()); RootGuard nameStrGuard(gc, nameStr);
260260
int nameIdx = addConstant(PomeValue(nameStr));
261261

262262
emit(Chunk::makeABx(OpCode::GETGLOBAL, dest, nameIdx), expr.getLine());
@@ -331,7 +331,7 @@ namespace Pome {
331331
exit(1);
332332
}
333333
// Global assignment
334-
PomeString* nameStr = gc.allocate<PomeString>(ident->getName()); RootGuard nameStrGuard(gc, nameStr);
334+
PomeString* nameStr = gc.allocateString(ident->getName()); RootGuard nameStrGuard(gc, nameStr);
335335
int nameIdx = addConstant(PomeValue(nameStr));
336336
emit(Chunk::makeABx(OpCode::SETGLOBAL, valReg, nameIdx), expr.getLine());
337337
lastResultReg = valReg;
@@ -341,7 +341,7 @@ namespace Pome {
341341
member->getObject()->accept(*this);
342342
int objReg = lastResultReg;
343343

344-
PomeString* keyStr = gc.allocate<PomeString>(member->getMember()); RootGuard keyStrGuard(gc, keyStr);
344+
PomeString* keyStr = gc.allocateString(member->getMember()); RootGuard keyStrGuard(gc, keyStr);
345345
int keyIdx = addConstant(PomeValue(keyStr));
346346

347347
expr.getRight()->accept(*this);
@@ -535,7 +535,7 @@ namespace Pome {
535535
exit(1);
536536
}
537537
// Global
538-
PomeString* nameStr = gc.allocate<PomeString>(ident->getName()); RootGuard nameStrGuard(gc, nameStr);
538+
PomeString* nameStr = gc.allocateString(ident->getName()); RootGuard nameStrGuard(gc, nameStr);
539539
int nameIdx = addConstant(PomeValue(nameStr));
540540
if (!op.empty()) {
541541
int currentValReg = allocReg();
@@ -561,7 +561,7 @@ namespace Pome {
561561
int objSafe = allocReg();
562562
emit(Chunk::makeABC(OpCode::MOVE, objSafe, objReg, 0), stmt.getLine());
563563

564-
PomeString* keyStr = gc.allocate<PomeString>(member->getMember()); RootGuard keyStrGuard(gc, keyStr);
564+
PomeString* keyStr = gc.allocateString(member->getMember()); RootGuard keyStrGuard(gc, keyStr);
565565
int keyIdx = addConstant(PomeValue(keyStr));
566566

567567
// Evaluate RHS
@@ -630,7 +630,7 @@ namespace Pome {
630630
expr.getObject()->accept(*this);
631631
int objReg = lastResultReg;
632632

633-
PomeString* memberStr = gc.allocate<PomeString>(expr.getMember()); RootGuard memberStrGuard(gc, memberStr);
633+
PomeString* memberStr = gc.allocateString(expr.getMember()); RootGuard memberStrGuard(gc, memberStr);
634634
int memberIdx = addConstant(PomeValue(memberStr));
635635

636636
int dest = allocReg();
@@ -672,7 +672,7 @@ namespace Pome {
672672
// Reserve: R(A+1) for 'this', R(A+2...) for args
673673
for (int i = 0; i < argCount + 1; ++i) allocReg();
674674

675-
PomeString* memberStr = gc.allocate<PomeString>(super->getMember()); RootGuard memberStrGuard(gc, memberStr);
675+
PomeString* memberStr = gc.allocateString(super->getMember()); RootGuard memberStrGuard(gc, memberStr);
676676
int memberIdx = addConstant(PomeValue(memberStr));
677677
emit(Chunk::makeABC(OpCode::GETSUPER, calleeReg, thisReg, memberIdx), expr.getLine());
678678

@@ -700,7 +700,7 @@ namespace Pome {
700700
// Reserve: R(A+1) for 'this', R(A+2...) for args
701701
for (int i = 0; i < argCount + 1; ++i) allocReg();
702702

703-
PomeString* keyStr = gc.allocate<PomeString>(member->getMember()); RootGuard keyStrGuard(gc, keyStr);
703+
PomeString* keyStr = gc.allocateString(member->getMember()); RootGuard keyStrGuard(gc, keyStr);
704704
int keyIdx = addConstant(PomeValue(keyStr));
705705

706706
emit(Chunk::makeABC(OpCode::GETFIELD, calleeReg, objReg, keyIdx), expr.getLine());
@@ -823,15 +823,15 @@ namespace Pome {
823823
if (superReg == -1) {
824824
// Try as global
825825
superReg = allocReg();
826-
PomeString* superNameStr = gc.allocate<PomeString>(stmt.getSuperclassName()); RootGuard superNameStrGuard(gc, superNameStr);
826+
PomeString* superNameStr = gc.allocateString(stmt.getSuperclassName()); RootGuard superNameStrGuard(gc, superNameStr);
827827
int superNameIdx = addConstant(PomeValue(superNameStr));
828828
emit(Chunk::makeABx(OpCode::GETGLOBAL, superReg, superNameIdx), stmt.getLine());
829829
}
830830
// Emit INHERIT R(class) R(super)
831831
emit(Chunk::makeABC(OpCode::INHERIT, classReg, superReg, 0), stmt.getLine());
832832
}
833833

834-
PomeString* nameStr = gc.allocate<PomeString>(stmt.getName()); RootGuard nameStrGuard(gc, nameStr);
834+
PomeString* nameStr = gc.allocateString(stmt.getName()); RootGuard nameStrGuard(gc, nameStr);
835835
int nameIdx = addConstant(PomeValue(nameStr));
836836
emit(Chunk::makeABx(OpCode::SETGLOBAL, classReg, nameIdx), stmt.getLine());
837837
lastResultReg = classReg;
@@ -1077,7 +1077,7 @@ namespace Pome {
10771077
}
10781078

10791079
void Compiler::visit(ImportStmt &stmt) {
1080-
PomeString* nameStr = gc.allocate<PomeString>(stmt.getModuleName()); RootGuard nameStrGuard(gc, nameStr);
1080+
PomeString* nameStr = gc.allocateString(stmt.getModuleName()); RootGuard nameStrGuard(gc, nameStr);
10811081
int nameIdx = addConstant(PomeValue(nameStr));
10821082
int reg = allocReg();
10831083
emit(Chunk::makeABx(OpCode::IMPORT, reg, nameIdx), stmt.getLine());
@@ -1101,13 +1101,13 @@ namespace Pome {
11011101
}
11021102

11031103
void Compiler::visit(FromImportStmt &stmt) {
1104-
PomeString* nameStr = gc.allocate<PomeString>(stmt.getModuleName()); RootGuard nameStrGuard(gc, nameStr);
1104+
PomeString* nameStr = gc.allocateString(stmt.getModuleName()); RootGuard nameStrGuard(gc, nameStr);
11051105
int nameIdx = addConstant(PomeValue(nameStr));
11061106
int modReg = allocReg();
11071107
emit(Chunk::makeABx(OpCode::IMPORT, modReg, nameIdx), stmt.getLine());
11081108

11091109
for (const auto& symbol : stmt.getSymbols()) {
1110-
PomeString* symStr = gc.allocate<PomeString>(symbol); RootGuard symStrGuard(gc, symStr);
1110+
PomeString* symStr = gc.allocateString(symbol); RootGuard symStrGuard(gc, symStr);
11111111
int symIdx = addConstant(PomeValue(symStr));
11121112
int symKeyReg = allocReg();
11131113
emit(Chunk::makeABx(OpCode::LOADK, symKeyReg, symIdx), stmt.getLine());
@@ -1134,7 +1134,7 @@ namespace Pome {
11341134
}
11351135

11361136
if (!name.empty()) {
1137-
PomeString* nameStr = gc.allocate<PomeString>(name); RootGuard nameStrGuard(gc, nameStr);
1137+
PomeString* nameStr = gc.allocateString(name); RootGuard nameStrGuard(gc, nameStr);
11381138
int nameIdx = addConstant(PomeValue(nameStr));
11391139
emit(Chunk::makeABx(OpCode::EXPORT, valReg, nameIdx), stmt.getLine());
11401140
}
@@ -1152,7 +1152,7 @@ namespace Pome {
11521152
}
11531153

11541154
if (!name.empty()) {
1155-
PomeString* nameStr = gc.allocate<PomeString>(name); RootGuard nameStrGuard(gc, nameStr);
1155+
PomeString* nameStr = gc.allocateString(name); RootGuard nameStrGuard(gc, nameStr);
11561156
int nameIdx = addConstant(PomeValue(nameStr));
11571157
emit(Chunk::makeABx(OpCode::EXPORT, valReg, nameIdx), stmt.getLine());
11581158
}
@@ -1314,7 +1314,7 @@ namespace Pome {
13141314
}
13151315

13161316
int dest = allocReg();
1317-
PomeString* memberStr = gc.allocate<PomeString>(expr.getMember()); RootGuard memberStrGuard(gc, memberStr);
1317+
PomeString* memberStr = gc.allocateString(expr.getMember()); RootGuard memberStrGuard(gc, memberStr);
13181318
int memberIdx = addConstant(PomeValue(memberStr));
13191319

13201320
// GETSUPER R(A) R(this) C(index)

src/pome_gc.cpp

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,42 @@ PomeShape* GarbageCollector::getRootShape() const {
1919
return nullptr;
2020
}
2121

22+
/**
 * Return the interned PomeString for `value`, creating it on first use.
 *
 * When `value` is already a key in stringPool_, the pooled object is returned
 * and nothing is allocated. Otherwise a new PomeString is heap-allocated,
 * set up as a young object, and recorded in the pool before being returned.
 *
 * @param value Contents of the requested string.
 * @return The PomeString registered in stringPool_ for `value`.
 */
PomeString* GarbageCollector::allocateString(const std::string& value) {
    // Fast path: an equal string has already been interned.
    if (const auto pooled = stringPool_.find(value); pooled != stringPool_.end()) {
        return pooled->second;
    }

    // Threshold-triggered collections run before allocating: minor first,
    // then major (the same two checks that open allocateList).
    if (youngBytesAllocated_ > nextMinorGC_) {
        collect(true);
    }
    if (bytesAllocated_ > nextGC_) {
        collect(false);
    }

    PomeString* interned = nullptr;
    try {
        interned = new PomeString(value);
    } catch (const std::bad_alloc&) {
        // Allocation failed: run a full collection and retry once. A second
        // failure propagates to the caller.
        collect(false);
        interned = new PomeString(value);
    }

    // Accounted size is the object itself plus what extraSize() reports.
    interned->gcSize = sizeof(PomeString) + interned->extraSize();

    // Fresh object: generation 0, age 0, no references yet, and recorded in
    // zct_ (flagged through inZCT).
    interned->generation = 0;
    interned->age = 0;
    interned->refCount = 0;
    interned->inZCT = true;
    zct_.push_back(interned);

    // Push onto the front of the young-object list, unmarked.
    interned->isMarked = false;
    interned->next = youngObjects_;
    youngObjects_ = interned;

    bytesAllocated_ += interned->gcSize;
    youngBytesAllocated_ += interned->gcSize;

    // Publish in the pool so later requests for the same contents reuse it.
    stringPool_[interned->getValue()] = interned;
    return interned;
}
57+
2258
PomeList* GarbageCollector::allocateList() {
2359
if (youngBytesAllocated_ > nextMinorGC_) collect(true);
2460
if (bytesAllocated_ > nextGC_) collect(false);
@@ -238,8 +274,9 @@ void sweepList(GarbageCollector& gc, PomeObject** listHead, size_t& bytesAllocat
238274
if (lst->elements.capacity() > 256) lst->elements.shrink_to_fit();
239275
listPool.push_back(lst);
240276
} else {
241-
// For general objects, we'd need a way to decRef children.
242-
// Since this is a specialized VM, we can add decRefChildren to PomeObject.
277+
if (unreached->type() == ObjectType::STRING) {
278+
gc.removeStringFromPool(static_cast<PomeString*>(unreached)->getValue());
279+
}
243280
delete unreached;
244281
}
245282
}
@@ -298,6 +335,9 @@ void GarbageCollector::sweep(bool minor) {
298335
if (lst->elements.capacity() > 256) lst->elements.shrink_to_fit();
299336
listPool_.push_back(lst);
300337
} else {
338+
if (unreached->type() == ObjectType::STRING) {
339+
removeStringFromPool(static_cast<PomeString*>(unreached)->getValue());
340+
}
301341
delete unreached;
302342
}
303343
}

src/pome_stdlib.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace Pome
2929
/**
3030
* Create key string
3131
*/
32-
PomeString *keyStr = gc.allocate<PomeString>(name);
32+
PomeString *keyStr = gc.allocateString(name);
3333

3434
gc.rcMapSet(module->exports, PomeValue(keyStr), PomeValue(funcObj));
3535
}
@@ -91,7 +91,7 @@ namespace Pome
9191
/**
9292
* Constants
9393
*/
94-
PomeString *piStr = gc.allocate<PomeString>("pi");
94+
PomeString *piStr = gc.allocateString("pi");
9595
gc.rcMapSet(module->exports, PomeValue(piStr), PomeValue(3.141592653589793));
9696

9797
return module;
@@ -118,7 +118,7 @@ namespace Pome
118118

119119
registerNative(gc, module, "gc_info", [&gc](const std::vector<PomeValue> &args)
120120
{
121-
return PomeValue(gc.allocate<PomeString>(gc.getInfo()));
121+
return PomeValue(gc.allocateString(gc.getInfo()));
122122
});
123123

124124
return module;
@@ -148,7 +148,7 @@ namespace Pome
148148
std::stringstream buffer;
149149
buffer << file.rdbuf();
150150

151-
PomeString* s = gc.allocate<PomeString>(buffer.str());
151+
PomeString* s = gc.allocateString(buffer.str());
152152
return PomeValue(s); });
153153

154154
registerNative(gc, module, "writeFile", [getStringArg](const std::vector<PomeValue> &args)
@@ -168,7 +168,7 @@ namespace Pome
168168
if (args.size() > idx) std::cout << args[idx].toString();
169169
std::string line;
170170
if (std::getline(std::cin, line)) {
171-
PomeString* s = gc.allocate<PomeString>(line);
171+
PomeString* s = gc.allocateString(line);
172172
return PomeValue(s);
173173
}
174174
return PomeValue(std::monostate{}); });
@@ -193,13 +193,13 @@ namespace Pome
193193
std::string s = args[idx].asString();
194194

195195
if (args.size() < idx + 2 || !args[idx + 1].isNumber()) {
196-
PomeString* newS = gc.allocate<PomeString>(s);
196+
PomeString* newS = gc.allocateString(s);
197197
return PomeValue(newS);
198198
}
199199

200200
size_t start = static_cast<size_t>(args[idx + 1].asNumber());
201201
if (start >= s.length()) {
202-
PomeString* empty = gc.allocate<PomeString>("");
202+
PomeString* empty = gc.allocateString("");
203203
return PomeValue(empty);
204204
}
205205

@@ -208,7 +208,7 @@ namespace Pome
208208
len = static_cast<size_t>(args[idx + 2].asNumber());
209209
}
210210

211-
PomeString* sub = gc.allocate<PomeString>(s.substr(start, len));
211+
PomeString* sub = gc.allocateString(s.substr(start, len));
212212
return PomeValue(sub); });
213213

214214
registerNative(gc, module, "lower", [&gc](const std::vector<PomeValue> &args)
@@ -218,7 +218,7 @@ namespace Pome
218218
if (args.size() <= idx || !args[idx].isString()) return PomeValue(std::monostate{});
219219
std::string s = args[idx].asString();
220220
for (auto &c : s) c = std::tolower(c);
221-
return PomeValue(gc.allocate<PomeString>(s));
221+
return PomeValue(gc.allocateString(s));
222222
});
223223

224224
registerNative(gc, module, "upper", [&gc](const std::vector<PomeValue> &args)
@@ -228,7 +228,7 @@ namespace Pome
228228
if (args.size() <= idx || !args[idx].isString()) return PomeValue(std::monostate{});
229229
std::string s = args[idx].asString();
230230
for (auto &c : s) c = std::toupper(c);
231-
return PomeValue(gc.allocate<PomeString>(s));
231+
return PomeValue(gc.allocateString(s));
232232
});
233233

234234
return module;
@@ -543,7 +543,7 @@ PomeModule *createThreadingModule(GarbageCollector &gc, ModuleLoader loader)
543543
} else if (retType == &ffi_type_pointer) {
544544
char* result;
545545
ffi_call(&cif, FFI_FN(addr), &result, values);
546-
if (result) return PomeValue(gc.allocate<PomeString>(result));
546+
if (result) return PomeValue(gc.allocateString(result));
547547
return PomeValue(std::monostate{});
548548
} else {
549549
ffi_call(&cif, FFI_FN(addr), nullptr, values);

0 commit comments

Comments
 (0)