From 46ff651f7da40d340180dccc8f912ead4deca918 Mon Sep 17 00:00:00 2001 From: krish Date: Mon, 23 Feb 2026 19:07:13 -0600 Subject: [PATCH] Modernize Python version, add type hints/docstrings, improve .gitignore and README --- .gitignore | 51 +++++++++- README.md | 34 +++++-- kbc/datasets.py | 78 ++++++++++++++-- kbc/learn.py | 20 ++-- kbc/models.py | 200 +++++++++++++++++++++++++++++++++++----- kbc/optimizers.py | 37 +++++++- kbc/process_datasets.py | 43 ++++++--- kbc/regularizers.py | 62 ++++++++++++- requirements.txt | 15 ++- setup.py | 15 ++- 10 files changed, 479 insertions(+), 76 deletions(-) diff --git a/.gitignore b/.gitignore index b97bc7b..2ecc605 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,49 @@ -build -*.egg-info -**/__pycache__ +# Build artifacts +build/ +dist/ +*.egg-info/ +*.egg + +# Python cache +__pycache__/ +**/__pycache__/ +*.py[cod] +*$py.class **/*cpython* + +# Virtual environments +.venv/ +venv/ +env/ +.env/ + +# Distribution / packaging +*.so +*.dylib + +# C extension build artifacts kbc/lib/bindings.cpp -dist -kbc/data + +# Dataset files (downloaded separately) +kbc/data/ + +# NFS lock files **/.nfs* + +# IDE / editor +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Jupyter +.ipynb_checkpoints/ + +# Testing +.pytest_cache/ +htmlcov/ +.coverage diff --git a/README.md b/README.md index 571f5c7..8504926 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,38 @@ # Knowledge Base Completion (kbc) + This code reproduces results in [Canonical Tensor Decomposition for Knowledge Base Completion](https://arxiv.org/abs/1806.07297) (ICML 2018). 
+## Prerequisites + +- **Python** 3.10 or later +- **Conda** (recommended) or pip +- **PyTorch** 2.0+ — install separately from [pytorch.org](https://pytorch.org/get-started/locally/) for your platform +- **GPU** (optional but recommended) — NVIDIA GPU with CUDA support speeds up training significantly + ## Installation -Create a conda environment with pytorch cython and scikit-learn : + +Create a conda environment with Python 3.10 and activate it: + +```bash +conda create --name kbc_env python=3.10 +conda activate kbc_env ``` -conda create --name kbc_env python=3.7 -source activate kbc_env -conda install --file requirements.txt -c pytorch + +Install PyTorch for your platform by following the instructions at [pytorch.org](https://pytorch.org/get-started/locally/). For example, for a CPU-only install: + +```bash +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu ``` -Then install the kbc package to this environment +Then install the remaining dependencies: + +```bash +pip install -r requirements.txt ``` + +Finally, install the kbc package into the environment: + +```bash python setup.py install ``` @@ -47,7 +69,7 @@ regularized with the weighted N3 on several datasets, for several dimensions. We For rank 2000 : learning rate 1e-2, batch-size 100, max epochs 200. | rank | 5|25|50|100|500|2000| -|------------|--|--|--|---|---|----| +|------------|--|--|--|---|---|-----| | MRR | 0.36|0.61|0.78|0.83|0.84|0.86 | | H@1 | 0.27|0.52|0.73|0.79|0.80|0.83 | | H@3 | 0.41|0.67|0.81|0.85|0.87|0.87 | diff --git a/kbc/datasets.py b/kbc/datasets.py index c4d32c2..001d64c 100644 --- a/kbc/datasets.py +++ b/kbc/datasets.py @@ -19,7 +19,29 @@ class Dataset(object): - def __init__(self, name: str): + """Knowledge base dataset loader and evaluator. + + Loads pre-processed train / valid / test splits from pickle files produced + by ``process_datasets.py`` and provides utilities for feeding data to a + model and computing filtered ranking metrics. 
+ + Args: + name: Name of the dataset subfolder inside ``kbc/data/`` + (e.g. ``"FB15K"`` or ``"WN18RR"``). + + Attributes: + root: Path to the dataset directory. + data: Dictionary mapping split name to a NumPy array of + ``(lhs, rel, rhs)`` triples. + n_entities: Total number of unique entities. + n_predicates: Total number of unique predicates (doubled to include + reciprocal relations). + to_skip: Filtered-metric lookup table. ``to_skip[side][(s, r)]`` + gives the list of objects that are considered correct and + should be masked during ranking. + """ + + def __init__(self, name: str) -> None: self.root = DATA_PATH / name self.data = {} @@ -36,10 +58,27 @@ def __init__(self, name: str): self.to_skip: Dict[str, Dict[Tuple[int, int], List[int]]] = pickle.load(inp_f) inp_f.close() - def get_examples(self, split): + def get_examples(self, split: str) -> np.ndarray: + """Return the raw NumPy array for a given data split. + + Args: + split: One of ``"train"``, ``"valid"``, or ``"test"``. + + Returns: + NumPy array of shape ``(n_triples, 3)`` with uint64 values. + """ return self.data[split] - def get_train(self): + def get_train(self) -> np.ndarray: + """Return the augmented training set including reciprocal triples. + + Each training triple ``(lhs, rel, rhs)`` is accompanied by its + reciprocal ``(rhs, rel + n_predicates//2, lhs)``, doubling the + training set size. + + Returns: + NumPy array of shape ``(2 * n_train_triples, 3)``. + """ copy = np.copy(self.data['train']) tmp = np.copy(copy[:, 0]) copy[:, 0] = copy[:, 2] @@ -49,16 +88,34 @@ def get_train(self): def eval( self, model: KBCModel, split: str, n_queries: int = -1, missing_eval: str = 'both', - at: Tuple[int] = (1, 3, 10) - ): + at: Tuple[int, ...] = (1, 3, 10) + ) -> Tuple[Dict[str, float], Dict[str, torch.FloatTensor]]: + """Evaluate a model on a given data split using filtered ranking metrics. + + Args: + model: The KBC model to evaluate. 
+ split: Data split to evaluate on (``"train"``, ``"valid"``, + or ``"test"``). + n_queries: Number of queries to sample. Evaluates all queries + when set to ``-1``. + missing_eval: Which side to evaluate: ``"lhs"``, ``"rhs"``, + or ``"both"``. + at: Hits@k thresholds to compute. + + Returns: + A tuple ``(mean_reciprocal_rank, hits_at)`` where both are + dictionaries keyed by ``"lhs"`` / ``"rhs"`` (or both). + ``mean_reciprocal_rank[side]`` is a float; ``hits_at[side]`` + is a FloatTensor of length ``len(at)``. + """ test = self.get_examples(split) examples = torch.from_numpy(test.astype('int64')).cuda() missing = [missing_eval] if missing_eval == 'both': missing = ['rhs', 'lhs'] - mean_reciprocal_rank = {} - hits_at = {} + mean_reciprocal_rank: Dict[str, float] = {} + hits_at: Dict[str, torch.FloatTensor] = {} for m in missing: q = examples.clone() @@ -79,5 +136,10 @@ def eval( return mean_reciprocal_rank, hits_at - def get_shape(self): + def get_shape(self) -> Tuple[int, int, int]: + """Return the shape of the entity-relation-entity space. + + Returns: + Tuple ``(n_entities, n_predicates, n_entities)``. + """ return self.n_entities, self.n_predicates, self.n_entities diff --git a/kbc/learn.py b/kbc/learn.py index 2355b82..8e8de62 100644 --- a/kbc/learn.py +++ b/kbc/learn.py @@ -111,12 +111,18 @@ optimizer = KBCOptimizer(model, regularizer, optim_method, args.batch_size) -def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]): - """ - aggregate metrics for missing lhs and rhs - :param mrrs: d - :param hits: - :return: +def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]) -> Dict[str, object]: + """Aggregate MRR and Hits@k metrics by averaging over lhs and rhs directions. + + Args: + mrrs: Dictionary mapping ``"lhs"`` and ``"rhs"`` to their respective + mean reciprocal rank values. + hits: Dictionary mapping ``"lhs"`` and ``"rhs"`` to FloatTensors of + Hits@k values. 
+ + Returns: + Dictionary with keys ``"MRR"`` (float) and ``"hits@[1,3,10]"`` + (FloatTensor), each averaged over both evaluation directions. """ m = (mrrs['lhs'] + mrrs['rhs']) / 2. h = (hits['lhs'] + hits['rhs']) / 2. @@ -124,7 +130,7 @@ def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]): cur_loss = 0 -curve = {'train': [], 'valid': [], 'test': []} +curve: Dict[str, list] = {'train': [], 'valid': [], 'test': []} for e in range(args.max_epochs): cur_loss = optimizer.epoch(examples) diff --git a/kbc/models.py b/kbc/models.py index 9985a4a..6954a7a 100644 --- a/kbc/models.py +++ b/kbc/models.py @@ -6,36 +6,83 @@ # from abc import ABC, abstractmethod -from typing import Tuple, List, Dict +from typing import Tuple, List, Dict, Optional import torch from torch import nn class KBCModel(nn.Module, ABC): + """Abstract base class for Knowledge Base Completion models. + + Subclasses must implement ``get_rhs``, ``get_queries``, and ``score`` + so that they can be used with the shared ``get_ranking`` evaluation loop. + """ + @abstractmethod - def get_rhs(self, chunk_begin: int, chunk_size: int): + def get_rhs(self, chunk_begin: int, chunk_size: int) -> torch.Tensor: + """Return right-hand-side entity embeddings for a chunk of entities. + + Args: + chunk_begin: Index of the first entity in this chunk. + chunk_size: Number of entities to include in the chunk. + + Returns: + A tensor of shape ``(embedding_dim, chunk_size)`` ready for + dot-product scoring against query representations. + """ pass @abstractmethod - def get_queries(self, queries: torch.Tensor): + def get_queries(self, queries: torch.Tensor) -> torch.Tensor: + """Return query representations for a batch of (lhs, rel, rhs) triples. + + Args: + queries: LongTensor of shape ``(batch_size, 3)`` containing + (lhs, rel, rhs) indices. + + Returns: + A float tensor of shape ``(batch_size, embedding_dim)`` + representing each query in the scoring space. 
+ """ pass @abstractmethod - def score(self, x: torch.Tensor): + def score(self, x: torch.Tensor) -> torch.Tensor: + """Compute the scalar score for each (lhs, rel, rhs) triple. + + Args: + x: LongTensor of shape ``(batch_size, 3)`` containing + (lhs, rel, rhs) indices. + + Returns: + A float tensor of shape ``(batch_size, 1)`` with one score per triple. + """ pass def get_ranking( self, queries: torch.Tensor, filters: Dict[Tuple[int, int], List[int]], batch_size: int = 1000, chunk_size: int = -1 - ): - """ - Returns filtered ranking for each queries. - :param queries: a torch.LongTensor of triples (lhs, rel, rhs) - :param filters: filters[(lhs, rel)] gives the rhs to filter from ranking - :param batch_size: maximum number of queries processed at once - :param chunk_size: maximum number of candidates processed at once - :return: + ) -> torch.Tensor: + """Return filtered ranking for each query triple. + + For each query ``(lhs, rel, rhs)`` the method computes, over all + candidate entities, how many score at least as high as the true answer + after masking out known true answers (filtered setting). + + Args: + queries: LongTensor of shape ``(n_queries, 3)`` with + (lhs, rel, rhs) indices. + filters: Mapping from ``(lhs, rel)`` to a list of rhs entities that + should be filtered out of the ranking (all known correct + answers in the dataset). + batch_size: Maximum number of queries processed simultaneously. + chunk_size: Maximum number of candidate entities scored at once. + Defaults to all entities when set to ``-1``. + + Returns: + A float tensor of shape ``(n_queries,)`` where each entry is the + filtered rank of the correct answer (1 = best). """ if chunk_size < 0: chunk_size = self.sizes[2] @@ -76,10 +123,24 @@ def get_ranking( class CP(KBCModel): + """Canonical Polyadic (CP) decomposition model for knowledge base completion. 
+ + Represents each entity and relation as a separate real-valued embedding + and scores a triple ``(lhs, rel, rhs)`` with the three-way dot product + ``lhs · rel · rhs``. + + Args: + sizes: Triple ``(n_lhs_entities, n_relations, n_rhs_entities)``. + For most KBC datasets lhs and rhs share the same entity vocab, + so ``sizes[0] == sizes[2]``. + rank: Dimensionality of the entity and relation embeddings. + init_size: Scale factor applied to the initial random weights. + """ + def __init__( self, sizes: Tuple[int, int, int], rank: int, init_size: float = 1e-3 - ): + ) -> None: super(CP, self).__init__() self.sizes = sizes self.rank = rank @@ -92,33 +153,89 @@ def __init__( self.rel.weight.data *= init_size self.rhs.weight.data *= init_size - def score(self, x): + def score(self, x: torch.Tensor) -> torch.Tensor: + """Compute the CP score for each triple. + + Args: + x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids. + + Returns: + Float tensor of shape ``(batch_size, 1)``. + """ lhs = self.lhs(x[:, 0]) rel = self.rel(x[:, 1]) rhs = self.rhs(x[:, 2]) return torch.sum(lhs * rel * rhs, 1, keepdim=True) - def forward(self, x): + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: + """Forward pass returning scores over all entities and embedding factors. + + Args: + x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids. + + Returns: + A tuple ``(scores, factors)`` where: + - ``scores`` is a float tensor of shape ``(batch_size, n_entities)`` + giving the score of each candidate rhs entity for each query. + - ``factors`` is a tuple ``(lhs_emb, rel_emb, rhs_emb)`` used by + the regularizer. 
+ """ lhs = self.lhs(x[:, 0]) rel = self.rel(x[:, 1]) rhs = self.rhs(x[:, 2]) return (lhs * rel) @ self.rhs.weight.t(), (lhs, rel, rhs) - def get_rhs(self, chunk_begin: int, chunk_size: int): + def get_rhs(self, chunk_begin: int, chunk_size: int) -> torch.Tensor: + """Return transposed rhs embeddings for a chunk of entities. + + Args: + chunk_begin: Index of the first entity in the chunk. + chunk_size: Number of entities in the chunk. + + Returns: + Float tensor of shape ``(rank, chunk_size)``. + """ return self.rhs.weight.data[ chunk_begin:chunk_begin + chunk_size ].transpose(0, 1) - def get_queries(self, queries: torch.Tensor): + def get_queries(self, queries: torch.Tensor) -> torch.Tensor: + """Return lhs * rel query vectors for a batch of triples. + + Args: + queries: LongTensor of shape ``(batch_size, 3)``. + + Returns: + Float tensor of shape ``(batch_size, rank)``. + """ return self.lhs(queries[:, 0]).data * self.rel(queries[:, 1]).data class ComplEx(KBCModel): + """ComplEx decomposition model for knowledge base completion. + + Uses complex-valued embeddings where each entity/relation embedding of + dimension ``rank`` is stored as a ``2*rank`` real vector (first half = real + part, second half = imaginary part). Scoring follows the Hermitian inner + product of the form ``Re(<lhs, rel, conj(rhs)>)``. + + Reference: Trouillon et al., "Complex Embeddings for Simple Link + Prediction", ICML 2016. + + Args: + sizes: Triple ``(n_lhs_entities, n_relations, n_rhs_entities)``. + Entities share a single embedding table (``sizes[0] == sizes[2]`` + is assumed). + rank: Half the total embedding width (real and imaginary parts each + have ``rank`` dimensions). + init_size: Scale factor applied to the initial random weights. 
+ """ + def __init__( self, sizes: Tuple[int, int, int], rank: int, init_size: float = 1e-3 - ): + ) -> None: super(ComplEx, self).__init__() self.sizes = sizes self.rank = rank @@ -130,7 +247,15 @@ def __init__( self.embeddings[0].weight.data *= init_size self.embeddings[1].weight.data *= init_size - def score(self, x): + def score(self, x: torch.Tensor) -> torch.Tensor: + """Compute the ComplEx score for each triple. + + Args: + x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids. + + Returns: + Float tensor of shape ``(batch_size, 1)``. + """ lhs = self.embeddings[0](x[:, 0]) rel = self.embeddings[1](x[:, 1]) rhs = self.embeddings[0](x[:, 2]) @@ -145,7 +270,18 @@ def score(self, x): 1, keepdim=True ) - def forward(self, x): + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: + """Forward pass returning scores over all entities and embedding moduli. + + Args: + x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids. + + Returns: + A tuple ``(scores, factors)`` where: + - ``scores`` is a float tensor of shape ``(batch_size, n_entities)``. + - ``factors`` is a tuple ``(|lhs|, |rel|, |rhs|)`` (element-wise + modulus) used by the N3 regularizer. + """ lhs = self.embeddings[0](x[:, 0]) rel = self.embeddings[1](x[:, 1]) rhs = self.embeddings[0](x[:, 2]) @@ -165,12 +301,32 @@ def forward(self, x): torch.sqrt(rhs[0] ** 2 + rhs[1] ** 2) ) - def get_rhs(self, chunk_begin: int, chunk_size: int): + def get_rhs(self, chunk_begin: int, chunk_size: int) -> torch.Tensor: + """Return transposed entity embeddings for a chunk. + + Args: + chunk_begin: Index of the first entity in the chunk. + chunk_size: Number of entities in the chunk. + + Returns: + Float tensor of shape ``(2 * rank, chunk_size)``. 
+ """ return self.embeddings[0].weight.data[ chunk_begin:chunk_begin + chunk_size ].transpose(0, 1) - def get_queries(self, queries: torch.Tensor): + def get_queries(self, queries: torch.Tensor) -> torch.Tensor: + """Return complex query vectors for a batch of triples. + + Computes ``lhs * conj(rel)`` in the complex sense and concatenates + real and imaginary parts into a single ``2*rank`` vector per query. + + Args: + queries: LongTensor of shape ``(batch_size, 3)``. + + Returns: + Float tensor of shape ``(batch_size, 2 * rank)``. + """ lhs = self.embeddings[0](queries[:, 0]) rel = self.embeddings[1](queries[:, 1]) lhs = lhs[:, :self.rank], lhs[:, self.rank:] diff --git a/kbc/optimizers.py b/kbc/optimizers.py index c1ad05a..9f40d84 100644 --- a/kbc/optimizers.py +++ b/kbc/optimizers.py @@ -15,17 +15,49 @@ class KBCOptimizer(object): + """Training optimizer wrapper for knowledge base completion models. + + Wraps a KBC model, a regularizer, and a PyTorch optimizer to provide a + single ``epoch`` call that iterates over shuffled mini-batches, computes + the combined cross-entropy + regularization loss, and performs a gradient + update. + + Args: + model: The KBC model to train (must expose a ``forward`` method that + returns ``(scores, factors)``). + regularizer: Regularizer applied to the embedding factors returned by + ``model.forward``. + optimizer: A PyTorch optimizer (e.g. ``torch.optim.Adagrad``) already + constructed for ``model.parameters()``. + batch_size: Number of training triples per gradient update step. + verbose: If ``True``, display a tqdm progress bar during training. 
+ """ + def __init__( self, model: KBCModel, regularizer: Regularizer, optimizer: optim.Optimizer, batch_size: int = 256, verbose: bool = True - ): + ) -> None: self.model = model self.regularizer = regularizer self.optimizer = optimizer self.batch_size = batch_size self.verbose = verbose - def epoch(self, examples: torch.LongTensor): + def epoch(self, examples: torch.LongTensor) -> float: + """Run one full training epoch over the provided triples. + + Shuffles ``examples``, then iterates in mini-batches. For each batch + the method computes cross-entropy loss on right-hand-side prediction + plus the regularization term, back-propagates, and updates parameters. + + Args: + examples: LongTensor of shape ``(n_triples, 3)`` containing + (lhs, rel, rhs) training triples. + + Returns: + The total (fit + regularization) loss for the final mini-batch of + the epoch (for logging purposes). + """ actual_examples = examples[torch.randperm(examples.shape[0]), :] loss = nn.CrossEntropyLoss(reduction='mean') with tqdm.tqdm(total=examples.shape[0], unit='ex', disable=not self.verbose) as bar: @@ -49,3 +81,4 @@ def epoch(self, examples: torch.LongTensor): b_begin += self.batch_size bar.update(input_batch.shape[0]) bar.set_postfix(loss=f'{l.item():.0f}') + return l.item() diff --git a/kbc/process_datasets.py b/kbc/process_datasets.py index 7d8631a..d16301e 100644 --- a/kbc/process_datasets.py +++ b/kbc/process_datasets.py @@ -9,6 +9,7 @@ import errno from pathlib import Path import pickle +from typing import Dict, List import numpy as np @@ -17,16 +18,31 @@ DATA_PATH = pkg_resources.resource_filename('kbc', 'data/') -def prepare_dataset(path, name): - """ - Given a path to a folder containing tab separated files : - train, test, valid - In the format : - (lhs)\t(rel)\t(rhs)\n - Maps each entity and relation to a unique id, create corresponding folder - name in pkg/data, with mapped train/test/valid files. 
- Also create to_skip_lhs / to_skip_rhs for filtered metrics and - rel_id / ent_id for analysis. +def prepare_dataset(path: str, name: str) -> None: + """Process raw tab-separated KBC data files and save them as pickles. + + Given a directory containing ``train``, ``valid``, and ``test`` files in + TSV format (one ``lhs\\trel\\trhs`` triple per line), this function: + + 1. Builds entity and relation vocabularies and writes ``ent_id`` and + ``rel_id`` mapping files. + 2. Re-encodes each split as a NumPy ``uint64`` array and saves it as a + pickle inside ``kbc/data/<name>/``. + 3. Builds filtered-metric lookup tables (``to_skip.pickle``) mapping + ``(subject, relation)`` pairs to the full list of correct objects in + the dataset. + 4. Computes entity frequency distributions and saves them as + ``probas.pickle`` for optional importance-weighted sampling. + + Args: + path: Path to the folder containing the raw ``train``, ``valid``, + and ``test`` TSV files. + name: Name of the dataset (e.g. ``"FB15K"``). Used as the output + subdirectory name under ``kbc/data/``. + + Raises: + OSError: If the output directory already exists (errno.EEXIST) or + another filesystem error occurs. 
""" files = ['train', 'valid', 'test'] entities, relations = set(), set() @@ -71,14 +87,14 @@ def prepare_dataset(path, name): print("creating filtering lists") # create filtering files - to_skip = {'lhs': defaultdict(set), 'rhs': defaultdict(set)} + to_skip: Dict[str, Dict] = {'lhs': defaultdict(set), 'rhs': defaultdict(set)} for f in files: examples = pickle.load(open(Path(DATA_PATH) / name / (f + '.pickle'), 'rb')) for lhs, rel, rhs in examples: to_skip['lhs'][(rhs, rel + n_relations)].add(lhs) # reciprocals to_skip['rhs'][(lhs, rel)].add(rhs) - to_skip_final = {'lhs': {}, 'rhs': {}} + to_skip_final: Dict[str, Dict] = {'lhs': {}, 'rhs': {}} for kk, skip in to_skip.items(): for k, v in skip.items(): to_skip_final[kk][k] = sorted(list(v)) @@ -107,7 +123,7 @@ def prepare_dataset(path, name): if __name__ == "__main__": - datasets = ['FB15K', 'WN', 'WN18RR', 'FB237', 'YAGO3-10'] + datasets: List[str] = ['FB15K', 'WN', 'WN18RR', 'FB237', 'YAGO3-10'] for d in datasets: print("Preparing dataset {}".format(d)) try: @@ -123,4 +139,3 @@ def prepare_dataset(path, name): print("File exists. skipping...") else: raise - diff --git a/kbc/regularizers.py b/kbc/regularizers.py index c617f36..ea3e6b1 100644 --- a/kbc/regularizers.py +++ b/kbc/regularizers.py @@ -12,17 +12,50 @@ class Regularizer(nn.Module, ABC): + """Abstract base class for knowledge base embedding regularizers. + + Subclasses implement ``forward`` to compute a scalar regularization loss + from a tuple of embedding tensors returned by a model's ``forward`` pass. + """ + @abstractmethod - def forward(self, factors: Tuple[torch.Tensor]): + def forward(self, factors: Tuple[torch.Tensor, ...]) -> torch.Tensor: + """Compute the regularization term for the given embedding factors. + + Args: + factors: A tuple of embedding tensors, typically + ``(lhs_emb, rel_emb, rhs_emb)`` or their moduli. + + Returns: + A scalar tensor representing the regularization loss. 
+ """ pass class F2(Regularizer): - def __init__(self, weight: float): + """Squared Frobenius (L2) regularizer. + + Penalizes the sum of squared embedding values across all factors, + normalized by batch size. + + Args: + weight: Regularization strength (lambda). Higher values impose + stronger regularization. + """ + + def __init__(self, weight: float) -> None: super(F2, self).__init__() self.weight = weight - def forward(self, factors): + def forward(self, factors: Tuple[torch.Tensor, ...]) -> torch.Tensor: + """Compute mean squared L2 penalty across all embedding factors. + + Args: + factors: Tuple of embedding tensors of shape ``(batch_size, dim)``. + + Returns: + Scalar regularization loss. + """ norm = 0 for f in factors: norm += self.weight * torch.sum(f ** 2) @@ -30,11 +63,30 @@ def forward(self, factors): class N3(Regularizer): - def __init__(self, weight: float): + """Nuclear 3-norm (N3) regularizer. + + Penalizes the sum of cubed absolute embedding values across all factors, + normalized by batch size. This regularizer, introduced in the ICML 2018 + paper, encourages sparse, low-magnitude embeddings while remaining + compatible with complex (ComplEx) models. + + Args: + weight: Regularization strength (lambda). + """ + + def __init__(self, weight: float) -> None: super(N3, self).__init__() self.weight = weight - def forward(self, factors): + def forward(self, factors: Tuple[torch.Tensor, ...]) -> torch.Tensor: + """Compute mean N3 penalty across all embedding factors. + + Args: + factors: Tuple of embedding tensors of shape ``(batch_size, dim)``. + + Returns: + Scalar regularization loss. 
+ """ norm = 0 for f in factors: norm += self.weight * torch.sum( diff --git a/requirements.txt b/requirements.txt index a93676f..0bb9577 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,10 @@ -tqdm -pytorch -numpy -scikit-learn -scipy \ No newline at end of file +# Core deep learning framework — install pytorch separately via https://pytorch.org +# for your platform (CPU or CUDA). Example: +# conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia +# or for CPU-only: +# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + +tqdm>=4.66 +numpy>=1.26 +scikit-learn>=1.4 +scipy>=1.12 \ No newline at end of file diff --git a/setup.py b/setup.py index dabe37b..9405897 100644 --- a/setup.py +++ b/setup.py @@ -5,11 +5,22 @@ # LICENSE file in the root directory of this source tree. # -from setuptools import setup +from setuptools import setup, find_packages setup( name='kbc', + version='0.1.0', ext_package='', - packages=['kbc'], + packages=find_packages(), package_data={'kbc': ['data/**/*']}, + python_requires='>=3.10', + classifiers=[ + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'License :: Other/Proprietary License', + 'Operating System :: OS Independent', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + ], )