diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..3b480ab --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,3 @@ +## 2024-05-23 - [O(N) to O(1) Data Generator Sampling] +**Learning:** In ML data generators, naive list comprehensions inside the `__getitem__` loop can be disastrous for performance. Specifically, filtering a large dataset (`N` items) to find samples of a specific class for every item in a batch results in `O(N * batch_size)` complexity. +**Action:** Always pre-compute a `label -> [paths]` (or `label -> [indices]`) map during initialization or after shuffling. This allows constant-time sampling of positive/negative pairs per anchor, reducing per-batch complexity from `O(N * batch_size)` to `O(batch_size)`. diff --git a/deeptuner/datagenerators/triplet_data_generator.py b/deeptuner/datagenerators/triplet_data_generator.py index 18523e7..c138771 100644 --- a/deeptuner/datagenerators/triplet_data_generator.py +++ b/deeptuner/datagenerators/triplet_data_generator.py @@ -17,6 +17,13 @@ def __init__(self, image_paths, labels, batch_size, image_size, num_classes): self.on_epoch_end() print(f"Initialized TripletDataGenerator with {len(self.image_paths)} images") + def _build_index(self): + """Builds an index of label -> image paths for O(1) access.""" + self.unique_labels = np.unique(self.encoded_labels) + self.label_to_paths = {label: [] for label in self.unique_labels} + for path, label in zip(self.image_paths, self.encoded_labels): + self.label_to_paths[label].append(path) + def __len__(self): return max(1, len(self.image_paths) // self.batch_size) # Ensure at least one batch @@ -30,6 +37,7 @@ def on_epoch_end(self): combined = list(zip(self.image_paths, self.encoded_labels)) np.random.shuffle(combined) self.image_paths[:], self.encoded_labels[:] = zip(*combined) + self._build_index() def _generate_triplet_batch(self, batch_image_paths, batch_labels): anchor_images = [] @@ -40,12 +48,19 @@ anchor_path = batch_image_paths[i] 
anchor_label = batch_labels[i] - positive_path = np.random.choice( - [p for p, l in zip(self.image_paths, self.encoded_labels) if l == anchor_label] - ) - negative_path = np.random.choice( - [p for p, l in zip(self.image_paths, self.encoded_labels) if l != anchor_label] - ) + # Optimized selection using pre-computed index + positive_path = np.random.choice(self.label_to_paths[anchor_label]) + + # Select a random negative label different from anchor_label + negative_labels = self.unique_labels[self.unique_labels != anchor_label] + if len(negative_labels) > 0: + negative_label = np.random.choice(negative_labels) + negative_path = np.random.choice(self.label_to_paths[negative_label]) + else: + # Fallback or error if only 1 class exists (cannot form triplet) + # For robustness, we might just pick a random path if no negatives exist + # but mathematically triplet loss requires negatives. + raise ValueError("Cannot select negative sample: only 1 class found.") anchor_image = load_img(anchor_path, target_size=self.image_size) positive_image = load_img(positive_path, target_size=self.image_size)