Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,49 @@
build
*.egg-info
**/__pycache__
# Build artifacts
build/
dist/
*.egg-info/
*.egg

# Python cache
__pycache__/
**/__pycache__/
*.py[cod]
*$py.class
**/*cpython*

# Virtual environments
.venv/
venv/
env/
.env/

# Compiled shared libraries
*.so
*.dylib

# C extension build artifacts
kbc/lib/bindings.cpp
dist
kbc/data

# Dataset files (downloaded separately)
kbc/data/

# NFS lock files
**/.nfs*

# IDE / editor
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Jupyter
.ipynb_checkpoints/

# Testing
.pytest_cache/
htmlcov/
.coverage
34 changes: 28 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,38 @@
# Knowledge Base Completion (kbc)

This code reproduces results in [Canonical Tensor Decomposition for Knowledge Base Completion](https://arxiv.org/abs/1806.07297) (ICML 2018).

## Prerequisites

- **Python** 3.10 or later
- **Conda** (recommended) or pip
- **PyTorch** 2.0+ — install separately from [pytorch.org](https://pytorch.org/get-started/locally/) for your platform
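- **GPU** (recommended) — NVIDIA GPU with CUDA support speeds up training significantly; note that `Dataset.eval` in `kbc/datasets.py` calls `.cuda()` unconditionally, so evaluation currently requires a CUDA device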

## Installation
Create a conda environment with pytorch cython and scikit-learn :

Create a conda environment with Python 3.10 and activate it:

```bash
conda create --name kbc_env python=3.10
conda activate kbc_env
```
conda create --name kbc_env python=3.7
source activate kbc_env
conda install --file requirements.txt -c pytorch

Install PyTorch for your platform by following the instructions at [pytorch.org](https://pytorch.org/get-started/locally/). For example, for a CPU-only install:

```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
```

Then install the kbc package to this environment
Then install the remaining dependencies:

```bash
pip install -r requirements.txt
```

Finally, install the kbc package into the environment:

```bash
python setup.py install
```

Expand Down Expand Up @@ -47,7 +69,7 @@ regularized with the weighted N3 on several datasets, for several dimensions. We
For rank 2000 : learning rate 1e-2, batch-size 100, max epochs 200.

| rank | 5|25|50|100|500|2000|
|------------|--|--|--|---|---|----|
|------------|--|--|--|---|---|-----|
| MRR | 0.36|0.61|0.78|0.83|0.84|0.86 |
| H@1 | 0.27|0.52|0.73|0.79|0.80|0.83 |
| H@3 | 0.41|0.67|0.81|0.85|0.87|0.87 |
Expand Down
78 changes: 70 additions & 8 deletions kbc/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,29 @@


class Dataset(object):
def __init__(self, name: str):
"""Knowledge base dataset loader and evaluator.

Loads pre-processed train / valid / test splits from pickle files produced
by ``process_datasets.py`` and provides utilities for feeding data to a
model and computing filtered ranking metrics.

Args:
name: Name of the dataset subfolder inside ``kbc/data/``
(e.g. ``"FB15K"`` or ``"WN18RR"``).

Attributes:
root: Path to the dataset directory.
data: Dictionary mapping split name to a NumPy array of
``(lhs, rel, rhs)`` triples.
n_entities: Total number of unique entities.
n_predicates: Total number of unique predicates (doubled to include
reciprocal relations).
to_skip: Filtered-metric lookup table. ``to_skip[side][(s, r)]``
gives the list of objects that are considered correct and
should be masked during ranking.
"""

def __init__(self, name: str) -> None:
self.root = DATA_PATH / name

self.data = {}
Expand All @@ -36,10 +58,27 @@ def __init__(self, name: str):
self.to_skip: Dict[str, Dict[Tuple[int, int], List[int]]] = pickle.load(inp_f)
inp_f.close()

def get_examples(self, split):
def get_examples(self, split: str) -> np.ndarray:
"""Return the raw NumPy array for a given data split.

Args:
split: One of ``"train"``, ``"valid"``, or ``"test"``.

Returns:
NumPy array of shape ``(n_triples, 3)`` with uint64 values.
"""
return self.data[split]

def get_train(self):
def get_train(self) -> np.ndarray:
"""Return the augmented training set including reciprocal triples.

Each training triple ``(lhs, rel, rhs)`` is accompanied by its
reciprocal ``(rhs, rel + n_predicates//2, lhs)``, doubling the
training set size.

Returns:
NumPy array of shape ``(2 * n_train_triples, 3)``.
"""
copy = np.copy(self.data['train'])
tmp = np.copy(copy[:, 0])
copy[:, 0] = copy[:, 2]
Expand All @@ -49,16 +88,34 @@ def get_train(self):

def eval(
self, model: KBCModel, split: str, n_queries: int = -1, missing_eval: str = 'both',
at: Tuple[int] = (1, 3, 10)
):
at: Tuple[int, ...] = (1, 3, 10)
) -> Tuple[Dict[str, float], Dict[str, torch.FloatTensor]]:
"""Evaluate a model on a given data split using filtered ranking metrics.

Args:
model: The KBC model to evaluate.
split: Data split to evaluate on (``"train"``, ``"valid"``,
or ``"test"``).
n_queries: Number of queries to sample. Evaluates all queries
when set to ``-1``.
missing_eval: Which side to evaluate: ``"lhs"``, ``"rhs"``,
or ``"both"``.
at: Hits@k thresholds to compute.

Returns:
A tuple ``(mean_reciprocal_rank, hits_at)`` of dictionaries keyed
by the evaluated side: ``"lhs"``, ``"rhs"``, or both keys when
``missing_eval="both"``. ``mean_reciprocal_rank[side]`` is a
float; ``hits_at[side]`` is a FloatTensor of length ``len(at)``.
"""
test = self.get_examples(split)
examples = torch.from_numpy(test.astype('int64')).cuda()
missing = [missing_eval]
if missing_eval == 'both':
missing = ['rhs', 'lhs']

mean_reciprocal_rank = {}
hits_at = {}
mean_reciprocal_rank: Dict[str, float] = {}
hits_at: Dict[str, torch.FloatTensor] = {}

for m in missing:
q = examples.clone()
Expand All @@ -79,5 +136,10 @@ def eval(

return mean_reciprocal_rank, hits_at

def get_shape(self):
def get_shape(self) -> Tuple[int, int, int]:
"""Return the shape of the entity-relation-entity space.

Returns:
Tuple ``(n_entities, n_predicates, n_entities)``.
"""
return self.n_entities, self.n_predicates, self.n_entities
20 changes: 13 additions & 7 deletions kbc/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,26 @@
optimizer = KBCOptimizer(model, regularizer, optim_method, args.batch_size)


def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]):
"""
aggregate metrics for missing lhs and rhs
:param mrrs: d
:param hits:
:return:
def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]) -> Dict[str, object]:
"""Aggregate MRR and Hits@k metrics by averaging over lhs and rhs directions.

Args:
mrrs: Dictionary mapping ``"lhs"`` and ``"rhs"`` to their respective
mean reciprocal rank values.
hits: Dictionary mapping ``"lhs"`` and ``"rhs"`` to FloatTensors of
Hits@k values.

Returns:
Dictionary with keys ``"MRR"`` (float) and ``"hits@[1,3,10]"``
(FloatTensor), each averaged over both evaluation directions.
"""
m = (mrrs['lhs'] + mrrs['rhs']) / 2.
h = (hits['lhs'] + hits['rhs']) / 2.
return {'MRR': m, 'hits@[1,3,10]': h}


cur_loss = 0
curve = {'train': [], 'valid': [], 'test': []}
curve: Dict[str, list] = {'train': [], 'valid': [], 'test': []}
for e in range(args.max_epochs):
cur_loss = optimizer.epoch(examples)

Expand Down
Loading