facebookresearch · krish-arora-88 · Feb 24, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,49 @@
-build
-*.egg-info
-**/__pycache__
+# Build artifacts
+build/
+dist/
+*.egg-info/
+*.egg
+
+# Python cache
+__pycache__/
+**/__pycache__/
+*.py[cod]
+*$py.class
 **/*cpython*
+
+# Virtual environments
+.venv/
+venv/
+env/
+.env/
+
+# Compiled shared libraries
+*.so
+*.dylib
+
+# C extension build artifacts
 kbc/lib/bindings.cpp
-dist
-kbc/data
+
+# Dataset files (downloaded separately)
+kbc/data/
+
+# NFS lock files
 **/.nfs*
+
+# IDE / editor
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Jupyter
+.ipynb_checkpoints/
+
+# Testing
+.pytest_cache/
+htmlcov/
+.coverage
diff --git a/README.md b/README.md
@@ -1,16 +1,38 @@
 # Knowledge Base Completion (kbc)
+
 This code reproduces results in [Canonical Tensor Decomposition for Knowledge Base Completion](https://arxiv.org/abs/1806.07297) (ICML 2018).
 
+## Prerequisites
+
+- **Python** 3.10 or later
+- **Conda** (recommended) or pip
+- **PyTorch** 2.0+ — install separately from [pytorch.org](https://pytorch.org/get-started/locally/) for your platform
+- **GPU** — NVIDIA GPU with CUDA support. It speeds up training significantly, and evaluation currently requires one: `Dataset.eval` moves its inputs to the GPU with an unconditional `.cuda()` call
+
 ## Installation
-Create a conda environment with pytorch cython and scikit-learn :
+
+Create a conda environment with Python 3.10 and activate it:
+
+```bash
+conda create --name kbc_env python=3.10
+conda activate kbc_env
 ```
-conda create --name kbc_env python=3.7
-source activate kbc_env
-conda install --file requirements.txt -c pytorch
+
+Install PyTorch for your platform by following the instructions at [pytorch.org](https://pytorch.org/get-started/locally/). For example, for a CPU-only install (note that `Dataset.eval` calls `.cuda()`, so evaluation will fail without a CUDA-enabled build):
+
+```bash
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
 ```
 
-Then install the kbc package to this environment
+Then install the remaining dependencies:
+
+```bash
+pip install -r requirements.txt
 ```
+
+Finally, install the kbc package into the environment:
+
+```bash
 python setup.py install
 ```
 
@@ -47,7 +69,7 @@ regularized with the weighted N3 on several datasets, for several dimensions. We
 For rank 2000 : learning rate 1e-2, batch-size 100, max epochs 200.
 
 |   rank     | 5|25|50|100|500|2000|
-|------------|--|--|--|---|---|----|
+|------------|--|--|--|---|---|-----|
 |   MRR      | 0.36|0.61|0.78|0.83|0.84|0.86 |
 |   H@1      | 0.27|0.52|0.73|0.79|0.80|0.83 |
 |   H@3      | 0.41|0.67|0.81|0.85|0.87|0.87 |

diff --git a/kbc/datasets.py b/kbc/datasets.py
@@ -19,7 +19,29 @@
 
 
 class Dataset(object):
-    def __init__(self, name: str):
+    """Knowledge base dataset loader and evaluator.
+
+    Loads pre-processed train / valid / test splits from pickle files produced
+    by ``process_datasets.py`` and provides utilities for feeding data to a
+    model and computing filtered ranking metrics.
+
+    Args:
+        name: Name of the dataset subfolder inside ``kbc/data/``
+              (e.g. ``"FB15K"`` or ``"WN18RR"``).
+
+    Attributes:
+        root: Path to the dataset directory.
+        data: Dictionary mapping split name to a NumPy array of
+              ``(lhs, rel, rhs)`` triples.
+        n_entities: Total number of unique entities.
+        n_predicates: Total number of unique predicates (doubled to include
+                      reciprocal relations).
+        to_skip: Filtered-metric lookup table.  ``to_skip[side][(s, r)]``
+                 gives the list of objects that are considered correct and
+                 should be masked during ranking.
+    """
+
+    def __init__(self, name: str) -> None:
         self.root = DATA_PATH / name
 
         self.data = {}
@@ -36,10 +58,27 @@ def __init__(self, name: str):
         self.to_skip: Dict[str, Dict[Tuple[int, int], List[int]]] = pickle.load(inp_f)
         inp_f.close()
 
-    def get_examples(self, split):
+    def get_examples(self, split: str) -> np.ndarray:
+        """Return the raw NumPy array for a given data split.
+
+        Args:
+            split: One of ``"train"``, ``"valid"``, or ``"test"``.
+
+        Returns:
+            NumPy array of shape ``(n_triples, 3)`` with uint64 values.
+        """
         return self.data[split]
 
-    def get_train(self):
+    def get_train(self) -> np.ndarray:
+        """Return the augmented training set including reciprocal triples.
+
+        Each training triple ``(lhs, rel, rhs)`` is accompanied by its
+        reciprocal ``(rhs, rel + n_predicates//2, lhs)``, doubling the
+        training set size.
+
+        Returns:
+            NumPy array of shape ``(2 * n_train_triples, 3)``.
+        """
         copy = np.copy(self.data['train'])
         tmp = np.copy(copy[:, 0])
         copy[:, 0] = copy[:, 2]
@@ -49,16 +88,34 @@ def get_train(self):
 
     def eval(
             self, model: KBCModel, split: str, n_queries: int = -1, missing_eval: str = 'both',
-            at: Tuple[int] = (1, 3, 10)
-    ):
+            at: Tuple[int, ...] = (1, 3, 10)
+    ) -> Tuple[Dict[str, float], Dict[str, torch.FloatTensor]]:
+        """Evaluate a model on a given data split using filtered ranking metrics.
+
+        Args:
+            model: The KBC model to evaluate.
+            split: Data split to evaluate on (``"train"``, ``"valid"``,
+                   or ``"test"``).
+            n_queries: Number of queries to sample. Evaluates all queries
+                       when set to ``-1``.
+            missing_eval: Which side to evaluate: ``"lhs"``, ``"rhs"``,
+                          or ``"both"``.
+            at: Hits@k thresholds to compute.
+
+        Returns:
+            A tuple ``(mean_reciprocal_rank, hits_at)`` of dictionaries keyed
+            by each evaluated side (``"lhs"`` and/or ``"rhs"``).
+            ``mean_reciprocal_rank[side]`` is a float; ``hits_at[side]``
+            is a FloatTensor of length ``len(at)``.
+        """
         test = self.get_examples(split)
         examples = torch.from_numpy(test.astype('int64')).cuda()
         missing = [missing_eval]
         if missing_eval == 'both':
             missing = ['rhs', 'lhs']
 
-        mean_reciprocal_rank = {}
-        hits_at = {}
+        mean_reciprocal_rank: Dict[str, float] = {}
+        hits_at: Dict[str, torch.FloatTensor] = {}
 
         for m in missing:
             q = examples.clone()
@@ -79,5 +136,10 @@ def eval(
 
         return mean_reciprocal_rank, hits_at
 
-    def get_shape(self):
+    def get_shape(self) -> Tuple[int, int, int]:
+        """Return the shape of the entity-relation-entity space.
+
+        Returns:
+            Tuple ``(n_entities, n_predicates, n_entities)``.
+        """
         return self.n_entities, self.n_predicates, self.n_entities
diff --git a/kbc/learn.py b/kbc/learn.py
@@ -111,20 +111,26 @@
 optimizer = KBCOptimizer(model, regularizer, optim_method, args.batch_size)
 
 
-def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]):
-    """
-    aggregate metrics for missing lhs and rhs
-    :param mrrs: d
-    :param hits:
-    :return:
+def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]) -> Dict[str, object]:
+    """Aggregate MRR and Hits@k metrics by averaging over lhs and rhs directions.
+
+    Args:
+        mrrs: Dictionary mapping ``"lhs"`` and ``"rhs"`` to their respective
+              mean reciprocal rank values.
+        hits: Dictionary mapping ``"lhs"`` and ``"rhs"`` to FloatTensors of
+              Hits@k values.
+
+    Returns:
+        Dictionary with keys ``"MRR"`` (float) and ``"hits@[1,3,10]"``
+        (FloatTensor), each averaged over both evaluation directions.
     """
     m = (mrrs['lhs'] + mrrs['rhs']) / 2.
     h = (hits['lhs'] + hits['rhs']) / 2.
     return {'MRR': m, 'hits@[1,3,10]': h}
 
 
 cur_loss = 0
-curve = {'train': [], 'valid': [], 'test': []}
+curve: Dict[str, list] = {'train': [], 'valid': [], 'test': []}
 for e in range(args.max_epochs):
     cur_loss = optimizer.epoch(examples)