From 46ff651f7da40d340180dccc8f912ead4deca918 Mon Sep 17 00:00:00 2001
From: krish <karora07@student.ubc.ca>
Date: Mon, 23 Feb 2026 19:07:13 -0600
Subject: [PATCH] Modernize Python version, add type hints/docstrings, improve
 .gitignore and README

---
 .gitignore              |  51 +++++++++-
 README.md               |  34 +++++--
 kbc/datasets.py         |  78 ++++++++++++++--
 kbc/learn.py            |  20 ++--
 kbc/models.py           | 200 +++++++++++++++++++++++++++++++++++-----
 kbc/optimizers.py       |  37 +++++++-
 kbc/process_datasets.py |  43 ++++++---
 kbc/regularizers.py     |  62 ++++++++++++-
 requirements.txt        |  15 ++-
 setup.py                |  15 ++-
 10 files changed, 479 insertions(+), 76 deletions(-)

diff --git a/.gitignore b/.gitignore
index b97bc7b..2ecc605 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,49 @@
-build
-*.egg-info
-**/__pycache__
+# Build artifacts
+build/
+dist/
+*.egg-info/
+*.egg
+
+# Python cache
+__pycache__/
+**/__pycache__/
+*.py[cod]
+*$py.class
 **/*cpython*
+
+# Virtual environments
+.venv/
+venv/
+env/
+.env/
+
+# Compiled shared libraries
+*.so
+*.dylib
+
+# C extension build artifacts
 kbc/lib/bindings.cpp
-dist
-kbc/data
+
+# Dataset files (downloaded separately)
+kbc/data/
+
+# NFS lock files
 **/.nfs*
+
+# IDE / editor
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Jupyter
+.ipynb_checkpoints/
+
+# Testing
+.pytest_cache/
+htmlcov/
+.coverage
diff --git a/README.md b/README.md
index 571f5c7..8504926 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,38 @@
 # Knowledge Base Completion (kbc)
+
 This code reproduces results in [Canonical Tensor Decomposition for Knowledge Base Completion](https://arxiv.org/abs/1806.07297) (ICML 2018).
 
+## Prerequisites
+
+- **Python** 3.10 or later
+- **Conda** (recommended) or pip
+- **PyTorch** 2.0+ — install separately from [pytorch.org](https://pytorch.org/get-started/locally/) for your platform
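+- **GPU** — an NVIDIA GPU with CUDA support is needed to run evaluation as written (`Dataset.eval` in `kbc/datasets.py` moves its inputs to the GPU with an unconditional `.cuda()` call) and also speeds up training significantly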
+
 ## Installation
-Create a conda environment with pytorch cython and scikit-learn :
+
+Create a conda environment with Python 3.10 and activate it:
+
+```bash
+conda create --name kbc_env python=3.10
+conda activate kbc_env
 ```
-conda create --name kbc_env python=3.7
-source activate kbc_env
-conda install --file requirements.txt -c pytorch
+
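+Install PyTorch for your platform by following the instructions at [pytorch.org](https://pytorch.org/get-started/locally/). The command below is a CPU-only example; note that `Dataset.eval` in `kbc/datasets.py` calls `.cuda()` unconditionally, so running evaluation requires a CUDA-enabled build instead: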
+
+```bash
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
 ```
 
-Then install the kbc package to this environment
+Then install the remaining dependencies:
+
+```bash
+pip install -r requirements.txt
 ```
+
+Finally, install the kbc package into the environment:
+
+```bash
 python setup.py install
 ```
 
@@ -47,7 +69,7 @@ regularized with the weighted N3 on several datasets, for several dimensions. We
 For rank 2000 : learning rate 1e-2, batch-size 100, max epochs 200.
 
 |   rank     | 5|25|50|100|500|2000|
-|------------|--|--|--|---|---|----|
+|------------|--|--|--|---|---|-----|
 |   MRR      | 0.36|0.61|0.78|0.83|0.84|0.86 |
 |   H@1      | 0.27|0.52|0.73|0.79|0.80|0.83 |
 |   H@3      | 0.41|0.67|0.81|0.85|0.87|0.87 |
diff --git a/kbc/datasets.py b/kbc/datasets.py
index c4d32c2..001d64c 100644
--- a/kbc/datasets.py
+++ b/kbc/datasets.py
@@ -19,7 +19,29 @@
 
 
 class Dataset(object):
-    def __init__(self, name: str):
+    """Knowledge base dataset loader and evaluator.
+
+    Loads pre-processed train / valid / test splits from pickle files produced
+    by ``process_datasets.py`` and provides utilities for feeding data to a
+    model and computing filtered ranking metrics.
+
+    Args:
+        name: Name of the dataset subfolder inside ``kbc/data/``
+              (e.g. ``"FB15K"`` or ``"WN18RR"``).
+
+    Attributes:
+        root: Path to the dataset directory.
+        data: Dictionary mapping split name to a NumPy array of
+              ``(lhs, rel, rhs)`` triples.
+        n_entities: Total number of unique entities.
+        n_predicates: Total number of unique predicates (doubled to include
+                      reciprocal relations).
+        to_skip: Filtered-metric lookup table.  ``to_skip[side][(s, r)]``
+                 gives the list of objects that are considered correct and
+                 should be masked during ranking.
+    """
+
+    def __init__(self, name: str) -> None:
         self.root = DATA_PATH / name
 
         self.data = {}
@@ -36,10 +58,27 @@ def __init__(self, name: str):
         self.to_skip: Dict[str, Dict[Tuple[int, int], List[int]]] = pickle.load(inp_f)
         inp_f.close()
 
-    def get_examples(self, split):
+    def get_examples(self, split: str) -> np.ndarray:
+        """Return the raw NumPy array for a given data split.
+
+        Args:
+            split: One of ``"train"``, ``"valid"``, or ``"test"``.
+
+        Returns:
+            NumPy array of shape ``(n_triples, 3)`` with uint64 values.
+        """
         return self.data[split]
 
-    def get_train(self):
+    def get_train(self) -> np.ndarray:
+        """Return the augmented training set including reciprocal triples.
+
+        Each training triple ``(lhs, rel, rhs)`` is accompanied by its
+        reciprocal ``(rhs, rel + n_predicates//2, lhs)``, doubling the
+        training set size.
+
+        Returns:
+            NumPy array of shape ``(2 * n_train_triples, 3)``.
+        """
         copy = np.copy(self.data['train'])
         tmp = np.copy(copy[:, 0])
         copy[:, 0] = copy[:, 2]
@@ -49,16 +88,34 @@ def get_train(self):
 
     def eval(
             self, model: KBCModel, split: str, n_queries: int = -1, missing_eval: str = 'both',
-            at: Tuple[int] = (1, 3, 10)
-    ):
+            at: Tuple[int, ...] = (1, 3, 10)
+    ) -> Tuple[Dict[str, float], Dict[str, torch.FloatTensor]]:
+        """Evaluate a model on a given data split using filtered ranking metrics.
+
+        Args:
+            model: The KBC model to evaluate.
+            split: Data split to evaluate on (``"train"``, ``"valid"``,
+                   or ``"test"``).
+            n_queries: Number of queries to sample. Evaluates all queries
+                       when set to ``-1``.
+            missing_eval: Which side to evaluate: ``"lhs"``, ``"rhs"``,
+                          or ``"both"``.
+            at: Hits@k thresholds to compute.
+
+        Returns:
+            A tuple ``(mean_reciprocal_rank, hits_at)`` where both are
+            dictionaries keyed by ``"lhs"`` / ``"rhs"`` (or both).
+            ``mean_reciprocal_rank[side]`` is a float; ``hits_at[side]``
+            is a FloatTensor of length ``len(at)``.
+        """
         test = self.get_examples(split)
         examples = torch.from_numpy(test.astype('int64')).cuda()
         missing = [missing_eval]
         if missing_eval == 'both':
             missing = ['rhs', 'lhs']
 
-        mean_reciprocal_rank = {}
-        hits_at = {}
+        mean_reciprocal_rank: Dict[str, float] = {}
+        hits_at: Dict[str, torch.FloatTensor] = {}
 
         for m in missing:
             q = examples.clone()
@@ -79,5 +136,10 @@ def eval(
 
         return mean_reciprocal_rank, hits_at
 
-    def get_shape(self):
+    def get_shape(self) -> Tuple[int, int, int]:
+        """Return the shape of the entity-relation-entity space.
+
+        Returns:
+            Tuple ``(n_entities, n_predicates, n_entities)``.
+        """
         return self.n_entities, self.n_predicates, self.n_entities
diff --git a/kbc/learn.py b/kbc/learn.py
index 2355b82..8e8de62 100644
--- a/kbc/learn.py
+++ b/kbc/learn.py
@@ -111,12 +111,18 @@
 optimizer = KBCOptimizer(model, regularizer, optim_method, args.batch_size)
 
 
-def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]):
-    """
-    aggregate metrics for missing lhs and rhs
-    :param mrrs: d
-    :param hits:
-    :return:
+def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]) -> Dict[str, object]:
+    """Aggregate MRR and Hits@k metrics by averaging over lhs and rhs directions.
+
+    Args:
+        mrrs: Dictionary mapping ``"lhs"`` and ``"rhs"`` to their respective
+              mean reciprocal rank values.
+        hits: Dictionary mapping ``"lhs"`` and ``"rhs"`` to FloatTensors of
+              Hits@k values.
+
+    Returns:
+        Dictionary with keys ``"MRR"`` (float) and ``"hits@[1,3,10]"``
+        (FloatTensor), each averaged over both evaluation directions.
     """
     m = (mrrs['lhs'] + mrrs['rhs']) / 2.
     h = (hits['lhs'] + hits['rhs']) / 2.
@@ -124,7 +130,7 @@ def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]):
 
 
 cur_loss = 0
-curve = {'train': [], 'valid': [], 'test': []}
+curve: Dict[str, list] = {'train': [], 'valid': [], 'test': []}
 for e in range(args.max_epochs):
     cur_loss = optimizer.epoch(examples)
 
diff --git a/kbc/models.py b/kbc/models.py
index 9985a4a..6954a7a 100644
--- a/kbc/models.py
+++ b/kbc/models.py
@@ -6,36 +6,83 @@
 #
 
 from abc import ABC, abstractmethod
-from typing import Tuple, List, Dict
+from typing import Tuple, List, Dict, Optional
 import torch
 from torch import nn
 
 
 class KBCModel(nn.Module, ABC):
+    """Abstract base class for Knowledge Base Completion models.
+
+    Subclasses must implement ``get_rhs``, ``get_queries``, and ``score``
+    so that they can be used with the shared ``get_ranking`` evaluation loop.
+    """
+
     @abstractmethod
-    def get_rhs(self, chunk_begin: int, chunk_size: int):
+    def get_rhs(self, chunk_begin: int, chunk_size: int) -> torch.Tensor:
+        """Return right-hand-side entity embeddings for a chunk of entities.
+
+        Args:
+            chunk_begin: Index of the first entity in this chunk.
+            chunk_size: Number of entities to include in the chunk.
+
+        Returns:
+            A tensor of shape ``(embedding_dim, chunk_size)`` ready for
+            dot-product scoring against query representations.
+        """
         pass
 
     @abstractmethod
-    def get_queries(self, queries: torch.Tensor):
+    def get_queries(self, queries: torch.Tensor) -> torch.Tensor:
+        """Return query representations for a batch of (lhs, rel, rhs) triples.
+
+        Args:
+            queries: LongTensor of shape ``(batch_size, 3)`` containing
+                     (lhs, rel, rhs) indices.
+
+        Returns:
+            A float tensor of shape ``(batch_size, embedding_dim)``
+            representing each query in the scoring space.
+        """
         pass
 
     @abstractmethod
-    def score(self, x: torch.Tensor):
+    def score(self, x: torch.Tensor) -> torch.Tensor:
+        """Compute the scalar score for each (lhs, rel, rhs) triple.
+
+        Args:
+            x: LongTensor of shape ``(batch_size, 3)`` containing
+               (lhs, rel, rhs) indices.
+
+        Returns:
+            A float tensor of shape ``(batch_size, 1)`` with one score per triple.
+        """
         pass
 
     def get_ranking(
             self, queries: torch.Tensor,
             filters: Dict[Tuple[int, int], List[int]],
             batch_size: int = 1000, chunk_size: int = -1
-    ):
-        """
-        Returns filtered ranking for each queries.
-        :param queries: a torch.LongTensor of triples (lhs, rel, rhs)
-        :param filters: filters[(lhs, rel)] gives the rhs to filter from ranking
-        :param batch_size: maximum number of queries processed at once
-        :param chunk_size: maximum number of candidates processed at once
-        :return:
+    ) -> torch.Tensor:
+        """Return filtered ranking for each query triple.
+
+        For each query ``(lhs, rel, rhs)`` the method computes, over all
+        candidate entities, how many score at least as high as the true answer
+        after masking out known true answers (filtered setting).
+
+        Args:
+            queries: LongTensor of shape ``(n_queries, 3)`` with
+                     (lhs, rel, rhs) indices.
+            filters: Mapping from ``(lhs, rel)`` to a list of rhs entities that
+                     should be filtered out of the ranking (all known correct
+                     answers in the dataset).
+            batch_size: Maximum number of queries processed simultaneously.
+            chunk_size: Maximum number of candidate entities scored at once.
+                        Defaults to all entities when set to ``-1``.
+
+        Returns:
+            A float tensor of shape ``(n_queries,)`` where each entry is the
+            filtered rank of the correct answer (1 = best).
         """
         if chunk_size < 0:
             chunk_size = self.sizes[2]
@@ -76,10 +123,24 @@ def get_ranking(
 
 
 class CP(KBCModel):
+    """Canonical Polyadic (CP) decomposition model for knowledge base completion.
+
+    Represents each entity and relation as a separate real-valued embedding
+    and scores a triple ``(lhs, rel, rhs)`` with the three-way dot product
+    ``lhs · rel · rhs``.
+
+    Args:
+        sizes: Triple ``(n_lhs_entities, n_relations, n_rhs_entities)``.
+               For most KBC datasets lhs and rhs share the same entity vocab,
+               so ``sizes[0] == sizes[2]``.
+        rank: Dimensionality of the entity and relation embeddings.
+        init_size: Scale factor applied to the initial random weights.
+    """
+
     def __init__(
             self, sizes: Tuple[int, int, int], rank: int,
             init_size: float = 1e-3
-    ):
+    ) -> None:
         super(CP, self).__init__()
         self.sizes = sizes
         self.rank = rank
@@ -92,33 +153,89 @@ def __init__(
         self.rel.weight.data *= init_size
         self.rhs.weight.data *= init_size
 
-    def score(self, x):
+    def score(self, x: torch.Tensor) -> torch.Tensor:
+        """Compute the CP score for each triple.
+
+        Args:
+            x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids.
+
+        Returns:
+            Float tensor of shape ``(batch_size, 1)``.
+        """
         lhs = self.lhs(x[:, 0])
         rel = self.rel(x[:, 1])
         rhs = self.rhs(x[:, 2])
 
         return torch.sum(lhs * rel * rhs, 1, keepdim=True)
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        """Forward pass returning scores over all entities and embedding factors.
+
+        Args:
+            x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids.
+
+        Returns:
+            A tuple ``(scores, factors)`` where:
+            - ``scores`` is a float tensor of shape ``(batch_size, n_entities)``
+              giving the score of each candidate rhs entity for each query.
+            - ``factors`` is a tuple ``(lhs_emb, rel_emb, rhs_emb)`` used by
+              the regularizer.
+        """
         lhs = self.lhs(x[:, 0])
         rel = self.rel(x[:, 1])
         rhs = self.rhs(x[:, 2])
         return (lhs * rel) @ self.rhs.weight.t(), (lhs, rel, rhs)
 
-    def get_rhs(self, chunk_begin: int, chunk_size: int):
+    def get_rhs(self, chunk_begin: int, chunk_size: int) -> torch.Tensor:
+        """Return transposed rhs embeddings for a chunk of entities.
+
+        Args:
+            chunk_begin: Index of the first entity in the chunk.
+            chunk_size: Number of entities in the chunk.
+
+        Returns:
+            Float tensor of shape ``(rank, chunk_size)``.
+        """
         return self.rhs.weight.data[
             chunk_begin:chunk_begin + chunk_size
         ].transpose(0, 1)
 
-    def get_queries(self, queries: torch.Tensor):
+    def get_queries(self, queries: torch.Tensor) -> torch.Tensor:
+        """Return lhs * rel query vectors for a batch of triples.
+
+        Args:
+            queries: LongTensor of shape ``(batch_size, 3)``.
+
+        Returns:
+            Float tensor of shape ``(batch_size, rank)``.
+        """
         return self.lhs(queries[:, 0]).data * self.rel(queries[:, 1]).data
 
 
 class ComplEx(KBCModel):
+    """ComplEx decomposition model for knowledge base completion.
+
+    Uses complex-valued embeddings where each entity/relation embedding of
+    dimension ``rank`` is stored as a ``2*rank`` real vector (first half = real
+    part, second half = imaginary part).  Scoring follows the Hermitian inner
+    product of the form ``Re(<lhs, rel, conj(rhs)>)``.
+
+    Reference: Trouillon et al., "Complex Embeddings for Simple Link
+    Prediction", ICML 2016.
+
+    Args:
+        sizes: Triple ``(n_lhs_entities, n_relations, n_rhs_entities)``.
+               Entities share a single embedding table (``sizes[0] == sizes[2]``
+               is assumed).
+        rank: Half the total embedding width (real and imaginary parts each
+              have ``rank`` dimensions).
+        init_size: Scale factor applied to the initial random weights.
+    """
+
     def __init__(
             self, sizes: Tuple[int, int, int], rank: int,
             init_size: float = 1e-3
-    ):
+    ) -> None:
         super(ComplEx, self).__init__()
         self.sizes = sizes
         self.rank = rank
@@ -130,7 +247,15 @@ def __init__(
         self.embeddings[0].weight.data *= init_size
         self.embeddings[1].weight.data *= init_size
 
-    def score(self, x):
+    def score(self, x: torch.Tensor) -> torch.Tensor:
+        """Compute the ComplEx score for each triple.
+
+        Args:
+            x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids.
+
+        Returns:
+            Float tensor of shape ``(batch_size, 1)``.
+        """
         lhs = self.embeddings[0](x[:, 0])
         rel = self.embeddings[1](x[:, 1])
         rhs = self.embeddings[0](x[:, 2])
@@ -145,7 +270,18 @@ def score(self, x):
             1, keepdim=True
         )
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        """Forward pass returning scores over all entities and embedding moduli.
+
+        Args:
+            x: LongTensor of shape ``(batch_size, 3)`` with (lhs, rel, rhs) ids.
+
+        Returns:
+            A tuple ``(scores, factors)`` where:
+            - ``scores`` is a float tensor of shape ``(batch_size, n_entities)``.
+            - ``factors`` is a tuple ``(|lhs|, |rel|, |rhs|)`` (element-wise
+              modulus) used by the N3 regularizer.
+        """
         lhs = self.embeddings[0](x[:, 0])
         rel = self.embeddings[1](x[:, 1])
         rhs = self.embeddings[0](x[:, 2])
@@ -165,12 +301,32 @@ def forward(self, x):
             torch.sqrt(rhs[0] ** 2 + rhs[1] ** 2)
         )
 
-    def get_rhs(self, chunk_begin: int, chunk_size: int):
+    def get_rhs(self, chunk_begin: int, chunk_size: int) -> torch.Tensor:
+        """Return transposed entity embeddings for a chunk.
+
+        Args:
+            chunk_begin: Index of the first entity in the chunk.
+            chunk_size: Number of entities in the chunk.
+
+        Returns:
+            Float tensor of shape ``(2 * rank, chunk_size)``.
+        """
         return self.embeddings[0].weight.data[
             chunk_begin:chunk_begin + chunk_size
         ].transpose(0, 1)
 
-    def get_queries(self, queries: torch.Tensor):
+    def get_queries(self, queries: torch.Tensor) -> torch.Tensor:
+        """Return complex query vectors for a batch of triples.
+
+        Computes the complex product ``lhs * rel`` (no conjugation) and concatenates
+        real and imaginary parts into a single ``2*rank`` vector per query.
+
+        Args:
+            queries: LongTensor of shape ``(batch_size, 3)``.
+
+        Returns:
+            Float tensor of shape ``(batch_size, 2 * rank)``.
+        """
         lhs = self.embeddings[0](queries[:, 0])
         rel = self.embeddings[1](queries[:, 1])
         lhs = lhs[:, :self.rank], lhs[:, self.rank:]
diff --git a/kbc/optimizers.py b/kbc/optimizers.py
index c1ad05a..9f40d84 100644
--- a/kbc/optimizers.py
+++ b/kbc/optimizers.py
@@ -15,17 +15,49 @@
 
 
 class KBCOptimizer(object):
+    """Training optimizer wrapper for knowledge base completion models.
+
+    Wraps a KBC model, a regularizer, and a PyTorch optimizer to provide a
+    single ``epoch`` call that iterates over shuffled mini-batches, computes
+    the combined cross-entropy + regularization loss, and performs a gradient
+    update.
+
+    Args:
+        model: The KBC model to train (must expose a ``forward`` method that
+               returns ``(scores, factors)``).
+        regularizer: Regularizer applied to the embedding factors returned by
+                     ``model.forward``.
+        optimizer: A PyTorch optimizer (e.g. ``torch.optim.Adagrad``) already
+                   constructed for ``model.parameters()``.
+        batch_size: Number of training triples per gradient update step.
+        verbose: If ``True``, display a tqdm progress bar during training.
+    """
+
     def __init__(
             self, model: KBCModel, regularizer: Regularizer, optimizer: optim.Optimizer, batch_size: int = 256,
             verbose: bool = True
-    ):
+    ) -> None:
         self.model = model
         self.regularizer = regularizer
         self.optimizer = optimizer
         self.batch_size = batch_size
         self.verbose = verbose
 
-    def epoch(self, examples: torch.LongTensor):
+    def epoch(self, examples: torch.LongTensor) -> float:
+        """Run one full training epoch over the provided triples.
+
+        Shuffles ``examples``, then iterates in mini-batches.  For each batch
+        the method computes cross-entropy loss on right-hand-side prediction
+        plus the regularization term, back-propagates, and updates parameters.
+
+        Args:
+            examples: LongTensor of shape ``(n_triples, 3)`` containing
+                      (lhs, rel, rhs) training triples.
+
+        Returns:
+            The total (fit + regularization) loss of the final mini-batch, for
+            logging.  NOTE(review): confirm ``l`` is bound when ``examples`` is empty.
+        """
         actual_examples = examples[torch.randperm(examples.shape[0]), :]
         loss = nn.CrossEntropyLoss(reduction='mean')
         with tqdm.tqdm(total=examples.shape[0], unit='ex', disable=not self.verbose) as bar:
@@ -49,3 +81,4 @@ def epoch(self, examples: torch.LongTensor):
                 b_begin += self.batch_size
                 bar.update(input_batch.shape[0])
                 bar.set_postfix(loss=f'{l.item():.0f}')
+        return l.item()
diff --git a/kbc/process_datasets.py b/kbc/process_datasets.py
index 7d8631a..d16301e 100644
--- a/kbc/process_datasets.py
+++ b/kbc/process_datasets.py
@@ -9,6 +9,7 @@
 import errno
 from pathlib import Path
 import pickle
+from typing import Dict, List
 
 import numpy as np
 
@@ -17,16 +18,31 @@
 DATA_PATH = pkg_resources.resource_filename('kbc', 'data/')
 
 
-def prepare_dataset(path, name):
-    """
-    Given a path to a folder containing tab separated files :
-     train, test, valid
-    In the format :
-    (lhs)\t(rel)\t(rhs)\n
-    Maps each entity and relation to a unique id, create corresponding folder
-    name in pkg/data, with mapped train/test/valid files.
-    Also create to_skip_lhs / to_skip_rhs for filtered metrics and
-    rel_id / ent_id for analysis.
+def prepare_dataset(path: str, name: str) -> None:
+    """Process raw tab-separated KBC data files and save them as pickles.
+
+    Given a directory containing ``train``, ``valid``, and ``test`` files in
+    TSV format (one ``lhs\\trel\\trhs`` triple per line), this function:
+
+    1. Builds entity and relation vocabularies and writes ``ent_id`` and
+       ``rel_id`` mapping files.
+    2. Re-encodes each split as a NumPy ``uint64`` array and saves it as a
+       pickle inside ``kbc/data/<name>/``.
+    3. Builds filtered-metric lookup tables (``to_skip.pickle``) mapping
+       ``(subject, relation)`` pairs to the full list of correct objects in
+       the dataset.
+    4. Computes entity frequency distributions and saves them as
+       ``probas.pickle`` for optional importance-weighted sampling.
+
+    Args:
+        path: Path to the folder containing the raw ``train``, ``valid``,
+              and ``test`` TSV files.
+        name: Name of the dataset (e.g. ``"FB15K"``).  Used as the output
+              subdirectory name under ``kbc/data/``.
+
+    Raises:
+        OSError: If the output directory already exists (errno.EEXIST) or
+                 another filesystem error occurs.
     """
     files = ['train', 'valid', 'test']
     entities, relations = set(), set()
@@ -71,14 +87,14 @@ def prepare_dataset(path, name):
     print("creating filtering lists")
 
     # create filtering files
-    to_skip = {'lhs': defaultdict(set), 'rhs': defaultdict(set)}
+    to_skip: Dict[str, Dict] = {'lhs': defaultdict(set), 'rhs': defaultdict(set)}
     for f in files:
         examples = pickle.load(open(Path(DATA_PATH) / name / (f + '.pickle'), 'rb'))
         for lhs, rel, rhs in examples:
             to_skip['lhs'][(rhs, rel + n_relations)].add(lhs)  # reciprocals
             to_skip['rhs'][(lhs, rel)].add(rhs)
 
-    to_skip_final = {'lhs': {}, 'rhs': {}}
+    to_skip_final: Dict[str, Dict] = {'lhs': {}, 'rhs': {}}
     for kk, skip in to_skip.items():
         for k, v in skip.items():
             to_skip_final[kk][k] = sorted(list(v))
@@ -107,7 +123,7 @@ def prepare_dataset(path, name):
 
 
 if __name__ == "__main__":
-    datasets = ['FB15K', 'WN', 'WN18RR', 'FB237', 'YAGO3-10']
+    datasets: List[str] = ['FB15K', 'WN', 'WN18RR', 'FB237', 'YAGO3-10']
     for d in datasets:
         print("Preparing dataset {}".format(d))
         try:
@@ -123,4 +139,3 @@ def prepare_dataset(path, name):
                 print("File exists. skipping...")
             else:
                 raise
-
diff --git a/kbc/regularizers.py b/kbc/regularizers.py
index c617f36..ea3e6b1 100644
--- a/kbc/regularizers.py
+++ b/kbc/regularizers.py
@@ -12,17 +12,50 @@
 
 
 class Regularizer(nn.Module, ABC):
+    """Abstract base class for knowledge base embedding regularizers.
+
+    Subclasses implement ``forward`` to compute a scalar regularization loss
+    from a tuple of embedding tensors returned by a model's ``forward`` pass.
+    """
+
     @abstractmethod
-    def forward(self, factors: Tuple[torch.Tensor]):
+    def forward(self, factors: Tuple[torch.Tensor, ...]) -> torch.Tensor:
+        """Compute the regularization term for the given embedding factors.
+
+        Args:
+            factors: A tuple of embedding tensors, typically
+                     ``(lhs_emb, rel_emb, rhs_emb)`` or their moduli.
+
+        Returns:
+            A scalar tensor representing the regularization loss.
+        """
         pass
 
 
 class F2(Regularizer):
-    def __init__(self, weight: float):
+    """Squared Frobenius (L2) regularizer.
+
+    Penalizes the sum of squared embedding values across all factors,
+    normalized by batch size.
+
+    Args:
+        weight: Regularization strength (lambda). Higher values impose
+                stronger regularization.
+    """
+
+    def __init__(self, weight: float) -> None:
         super(F2, self).__init__()
         self.weight = weight
 
-    def forward(self, factors):
+    def forward(self, factors: Tuple[torch.Tensor, ...]) -> torch.Tensor:
+        """Compute mean squared L2 penalty across all embedding factors.
+
+        Args:
+            factors: Tuple of embedding tensors of shape ``(batch_size, dim)``.
+
+        Returns:
+            Scalar regularization loss.
+        """
         norm = 0
         for f in factors:
             norm += self.weight * torch.sum(f ** 2)
@@ -30,11 +63,30 @@ def forward(self, factors):
 
 
 class N3(Regularizer):
-    def __init__(self, weight: float):
+    """Nuclear 3-norm (N3) regularizer.
+
+    Penalizes the sum of cubed absolute embedding values across all factors,
+    normalized by batch size.  This regularizer was introduced in the ICML 2018
+    paper referenced in the README; for ComplEx models it is applied to the
+    element-wise moduli that ``ComplEx.forward`` returns as its factors.
+
+    Args:
+        weight: Regularization strength (lambda).
+    """
+
+    def __init__(self, weight: float) -> None:
         super(N3, self).__init__()
         self.weight = weight
 
-    def forward(self, factors):
+    def forward(self, factors: Tuple[torch.Tensor, ...]) -> torch.Tensor:
+        """Compute mean N3 penalty across all embedding factors.
+
+        Args:
+            factors: Tuple of embedding tensors of shape ``(batch_size, dim)``.
+
+        Returns:
+            Scalar regularization loss.
+        """
         norm = 0
         for f in factors:
             norm += self.weight * torch.sum(
diff --git a/requirements.txt b/requirements.txt
index a93676f..0bb9577 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,10 @@
-tqdm
-pytorch
-numpy
-scikit-learn
-scipy
\ No newline at end of file
+# Core deep learning framework — install pytorch separately via https://pytorch.org
+# for your platform (CPU or CUDA). Example:
+#   conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
+# or for CPU-only:
+#   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+
+tqdm>=4.66
+numpy>=1.26
+scikit-learn>=1.4
+scipy>=1.12
\ No newline at end of file
diff --git a/setup.py b/setup.py
index dabe37b..9405897 100644
--- a/setup.py
+++ b/setup.py
@@ -5,11 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 #
 
-from setuptools import setup
+from setuptools import setup, find_packages
 
 setup(
     name='kbc',
+    version='0.1.0',
     ext_package='',
-    packages=['kbc'],
+    packages=find_packages(),
     package_data={'kbc': ['data/**/*']},
+    python_requires='>=3.10',
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
+        'License :: Other/Proprietary License',
+        'Operating System :: OS Independent',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+    ],
 )