From b88cb768a8f00a6d69e70866fa9ad69c12a7b52e Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 01/55] add graphormer file Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 1 + gridfm_graphkit/models/gps_transformer.py | 2 ++ gridfm_graphkit/models/graphormer.py | 0 gridfm_graphkit/tasks/feature_reconstruction_task.py | 2 ++ 4 files changed, 5 insertions(+) create mode 100644 gridfm_graphkit/models/graphormer.py diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 026d9a8..b1e4bd0 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -204,6 +204,7 @@ def get(self, idx): data = torch.load(file_name, weights_only=False) if self.transform: data = self.transform(data) + # print('data>>>>>>>',data) # TODO remove return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index cc8b648..b807ff3 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -121,7 +121,9 @@ def forward(self, x, pe, edge_index, edge_attr, batch): """ x_pe = self.pe_norm(pe) + print('enc>>>', x.size()) # TODO remove x = self.encoder(x) + print('post>>>', x.size()) # TODO remove x = self.input_norm(x) x = torch.cat((x, x_pe), 1) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py new file mode 100644 index 0000000..e69de29 diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index cb6963b..e42d09d 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -129,6 +129,8 @@ def shared_step(self, batch): return output, loss_dict def training_step(self, batch): + # print('trainbatch>>>>', batch.size()) # TODO remove + # print(batch) _, loss_dict = self.shared_step(batch) current_lr = self.optimizer.param_groups[0]["lr"] metrics = {} From dde35deb77629a5e24dbb0d77e7e2f6129c557fe Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 02/55] graphormer data formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gmae_collator.py | 127 +++++++ gridfm_graphkit/models/gmae_wrapper.py | 91 +++++ gridfm_graphkit/models/graphormer.py | 483 ++++++++++++++++++++++++ 3 files changed, 701 insertions(+) create mode 100644 gridfm_graphkit/models/gmae_collator.py create mode 100644 gridfm_graphkit/models/gmae_wrapper.py diff --git a/gridfm_graphkit/models/gmae_collator.py b/gridfm_graphkit/models/gmae_collator.py new file mode 100644 index 0000000..f4bc532 --- /dev/null +++ b/gridfm_graphkit/models/gmae_collator.py @@ -0,0 +1,127 @@ +import torch + + +def pad_1d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen], dtype=x.dtype) + new_x[:xlen] = x + x = new_x + return x.unsqueeze(0) + + +def pad_2d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + # print('-------->', x.size()) + xlen, xdim = x.size() + if xlen < padlen: + new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) + new_x[:xlen, :] = x + x = new_x + return x.unsqueeze(0) 
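+
+# NOTE: the pad_* helpers above shift real entries by +1 so that the value 0 is
+# reserved as the padding id; the collator below then pads every graph up to the
+# largest node count in the batch, and downstream code appears to rely on
+# all-zero rows to recognise the padded positions.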
+ + +def pad_attn_bias_unsqueeze(x, padlen): + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros( + [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) + new_x[:xlen, :xlen] = x + new_x[xlen:, :xlen] = 0 + x = new_x + return x.unsqueeze(0) + + +def pad_spatial_pos_unsqueeze(x, padlen): + x = x + 1 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) + new_x[:xlen, :xlen] = x + x = new_x + return x.unsqueeze(0) + + +class Batch(): + def __init__(self, + min_node_num, + attn_bias, + spatial_pos, + in_degree, + out_degree, + x, + y, + orig_id + ): + super(Batch, self).__init__() + self.min_node_num = int(min_node_num) + self.in_degree, self.out_degree = in_degree, out_degree + self.x, self.y = x, y + self.attn_bias, self.spatial_pos = attn_bias, spatial_pos + self.orig_id = orig_id + + def to(self, device): + self.in_degree, self.out_degree = self.in_degree.to( + device), self.out_degree.to(device) + self.x = self.x.to(device) + self.y = self.y.to(device) + self.attn_bias, self.spatial_pos = self.attn_bias.to( + device), self.spatial_pos.to(device) + return self + + def __len__(self): + return self.in_degree.size(0) + + +def collator(items, spatial_pos_max=20): + """ + custom collator, among other transformations... + + unequal input graphs are padded to all have the same size + + adds 1 to the input x via pad_2d_unsqueeze and similar functions + """ + items = [ + item for item in items if item is not None] + items = [ + (item[0], item[1], item[2], item[3], item[4], item[5], item[6], item[7]) + for item in items + ] + + # at this step all graphs in batch have their input size + xs, ys, adjs, attn_biases, spatial_poses, in_degrees, out_degrees, orig_ids = zip(*items) + + for idx, _ in enumerate(attn_biases): + attn_biases[idx][spatial_poses[idx] >= spatial_pos_max] = float('-inf') + max_node_num = max(i.size(0) for i in xs) + min_node_num = min(i.size(0) for i in xs) + + if all([torch.all(xx == yy) for xx,yy in zip(xs, ys)]): # then this is for and encoder-decoder setup + y = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in ys]) + else: + y = torch.stack(ys) + + # following steps pad the smaller graphs to match the largest for batching + # incidentally a constant value of 1 is added as well + x = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in xs]) + attn_bias = torch.cat([pad_attn_bias_unsqueeze( + i, max_node_num) for i in attn_biases]) + spatial_pos = torch.cat([pad_spatial_pos_unsqueeze(i, max_node_num) + for i in spatial_poses]) + in_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) + for i in in_degrees]) + out_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) + for i in out_degrees]) + + + return Batch( + min_node_num=min_node_num, + attn_bias=attn_bias, + spatial_pos=spatial_pos, + in_degree=in_degree, + out_degree=out_degree, + x=x, + y=y, + orig_id=orig_ids + ) diff --git a/gridfm_graphkit/models/gmae_wrapper.py b/gridfm_graphkit/models/gmae_wrapper.py new file mode 100644 index 0000000..b76c915 --- /dev/null +++ b/gridfm_graphkit/models/gmae_wrapper.py @@ -0,0 +1,91 @@ +import torch + +import numpy as np +import pyximport +pyximport.install(setup_args={'include_dirs': np.get_include()}) +import algos + +from torch_geometric.loader import NeighborSampler +from torch_geometric.utils import to_undirected + + + +def process_samples(batch_size, n_id, edge_index, dataset): + """ + transformation of sampled nodes to: + - node features of sampled set, + - y, + - edges tensor + + # TODO reconcile redundance of using 
edge_index and dataset + # in the case where the full graph is used + """ + + # print(edge_index) + # print('<------->') + if edge_index.size(1) != 0: + edge_index = to_undirected(edge_index) + n_nodes = len(n_id) + edge_sp_adj = torch.sparse.FloatTensor(edge_index, + torch.ones(edge_index.shape[1]), + [n_nodes, n_nodes]) + edge_adj = edge_sp_adj + + # print('<<---------------->>') + # print(n_id) + # print(dataset.x.size()) + # print(dataset.y.size()) + + return [dataset.x[n_id], dataset.y[n_id], edge_adj] + + +# GMAE_graph positional encoding +class MyDataset(torch.utils.data.Dataset): + def __init__(self, items, settype=''): + super(MyDataset, self).__init__() + + self.items = items + self.type = settype + + + def __len__(self): + return len(self.items) + + def __getitem__(self, idx): + item = self.items[idx] + + if self.type=='csv': + graphdata = torch.load(item[1]) + num_nodes = graphdata.num_nodes + + # padding and mask creation should happend here + ns0 = 1 # batch size + ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids + ns2 = graphdata.edge_index + data_item = process_samples( + ns0, + ns1, + ns2, + graphdata) + [0] # TODO completely remove the appended [0] + else: + data_item = item # in memory dataset in use + + return preprocess_item(data_item) + + +def preprocess_item(item): + """ + """ + x, y, adj, orig_id = item[0], item[1], item[2].to_dense(), item[3] + N = x.size(0) + + # node adj matrix [N, N] bool + adj = adj.bool() + + shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + spatial_pos = torch.from_numpy((shortest_path_result)).long() + attn_bias = torch.zeros([N, N], dtype=torch.float) + + in_degree = adj.long().sum(dim=1).view(-1) + out_degree = adj.long().sum(dim=0).view(-1) + return x, y, adj, attn_bias, spatial_pos, in_degree, out_degree, orig_id diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index e69de29..6aa3df7 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -0,0 +1,483 @@ +from lr import PolynomialDecayLR +import torch +import math +import numpy as np +import torch.nn as nn +import pytorch_lightning as pl + +from torch.nn import functional as F +from losses import active_power_loss + +def init_params(module, n_layers): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.02 / math.sqrt(n_layers)) + if module.bias is not None: + module.bias.data.zero_() + if isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=0.02) + + + +class GMAE_node(pl.LightningModule): + def __init__( + self, + n_encoder_layers, + n_decoder_layers, + num_heads, + hidden_dim, + dropout_rate, + intput_dropout_rate, + weight_decay, + ffn_dim, + dataset_name, + warmup_updates, + tot_updates, + peak_lr, + end_lr, + attention_dropout_rate, + n_node_features, + mask_ratio, + n_val_sampler, + ): + super().__init__() + self.save_hyperparameters() + self.n_node_features = n_node_features + self.n_val_sampler = n_val_sampler + self.mask_ratio = mask_ratio + self.num_heads = num_heads + self.input_proj = nn.Linear(n_node_features, hidden_dim) + + self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.in_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + self.out_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + + self.input_dropout = nn.Dropout(intput_dropout_rate) + encoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) + for _ in 
range(n_encoder_layers)] + self.encoder_layers = nn.ModuleList(encoders) + self.encoder_final_ln = nn.LayerNorm(hidden_dim) + + self.encoder_to_decoder = nn.Linear(hidden_dim, hidden_dim) + + decoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) + for _ in range(n_decoder_layers)] + self.decoder_layers = nn.ModuleList(decoders) + self.decoder_final_ln = nn.LayerNorm(hidden_dim) + + self.out_proj = nn.Linear(hidden_dim, self.n_node_features) + self.loss_fn = F.mse_loss + self.masking_value = -4 + self.loss_phys1 = active_power_loss + self.alpha = 1.0/50.0 # weight for loss_phys1 + + self.dataset_name = dataset_name + + self.warmup_updates = warmup_updates + self.tot_updates = tot_updates + self.peak_lr = peak_lr + self.end_lr = end_lr + self.weight_decay = weight_decay + + self.hidden_dim = hidden_dim + self.automatic_optimization = True + self.apply(lambda module: init_params(module, n_layers=n_encoder_layers)) + + + + def compute_pos_embeddings(self, batched_data): + attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + # graph_attn_bias + graph_attn_bias = attn_bias.clone() + graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( + 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # spatial pos + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + spatial_pos_bias + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset + + node_feature = self.input_proj(x) + node_feature = node_feature + \ + self.in_degree_encoder(in_degree) + \ + self.out_degree_encoder(out_degree) + graph_node_feature = node_feature + + return graph_node_feature, graph_attn_bias + + def encoder(self, graph_node_feature, graph_attn_bias, mask=None): + + graph_node_feature_masked = graph_node_feature + graph_attn_bias_masked = graph_attn_bias + + # transfomrer encoder + output = self.input_dropout(graph_node_feature_masked) + for enc_layer in self.encoder_layers: + output = enc_layer(output, graph_attn_bias_masked, mask) + output = self.encoder_final_ln(output) + return output + + def decoder(self, output, in_degree, out_degree, graph_attn_bias, mask=None): + + pos_embed = self.in_degree_encoder(in_degree) + self.out_degree_encoder(out_degree) + output = output + pos_embed + + for enc_layer in self.decoder_layers: + output = enc_layer(output, graph_attn_bias, mask) + + output = self.decoder_final_ln(output) + output = self.out_proj(output) # [n_graph, n_node, n_feature] + + return output + + def forward(self, batched_data, mask=None): + """ + process a batch of data, applying the input mask, while + excluding non-valid values that arrise during processing + + mask: incoming values to mask for prediction + """ + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) + in_degree = batched_data.in_degree + out_degree = batched_data.out_degree + + graph_mask = None # TODO this could be removed eventually + + output = self.encoder(graph_node_feature, graph_attn_bias, mask) + output = self.encoder_to_decoder(output) + output = self.decoder(output, in_degree, out_degree, graph_attn_bias, mask) + return output, graph_mask + + def generate_pretrain_embeddings_for_downstream_task(self, batched_data): + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) + output = 
self.encoder(graph_node_feature, graph_attn_bias) + output = output.reshape(-1, self.n_val_sampler, output.size(1), self.hidden_dim)[:, :, 0, :].mean(1) + output = output # [n_graph(n_central_node), n_feature] + return output + + def generate_node_pred(self, batched_data): + """ + for a batch of nodes, return the masked node array and + the predicted arrays for those nodes + + note: + + mask: nodes that are to be predicted (and are thus masked) in each + graph (constant for the batch) + graph_mask: graphs in batch that have valid results + """ + num_nodes = batched_data.x.size(1) + + mask = None + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = batched_data.x[graph_mask].float() + + else: + y_gt = batched_data.x.float() + graph_mask = torch.from_numpy(np.array([])) + no_feat = y_hat.size(2) + y_hat = y_hat.reshape(-1, y_hat.size(2)) # [n_graph*n_masked_node, n_feature] + + y_gt = y_gt.reshape(-1, y_gt.size(2)) # [n_graph*n_masked_node, n_feature] + pad_mask = torch.nonzero(y_gt.sum(-1)) + + # final shaping + y_gt = torch.squeeze(y_gt) + y_hat = torch.squeeze(y_hat) + + return y_gt, y_hat, graph_mask + + + def training_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + + # create a boolean mask where padding was added + # note that this assumes all input data had features with + # values >= 0 + mask = None + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + strategy = '' + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + + # print('pre loss shapes', y_gt.size(), y_hat.size()) + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('train_loss', loss) + self.log('activ_loss', loss_actv) + + return loss + loss_actv + + def validation_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + mask = None + + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random 
noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + no_features = y_hat.size(2) + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] + y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] + pad_mask = torch.nonzero(y_gt.sum(-1)) + + y_gt = y_gt[pad_mask, :] + y_hat = y_hat[pad_mask, :] + + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('val_loss', loss, batch_size=1) + + # loss per feature, for logging only + for ii in range(no_features): + self.log( + 'val_loss_{}'.format(ii), + self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), + batch_size=1 + ) + + return loss + loss_actv + + + def configure_optimizers(self): + optimizer = torch.optim.AdamW( + self.parameters(), lr=self.peak_lr, weight_decay=self.weight_decay) + lr_scheduler = { + 'scheduler': PolynomialDecayLR( + optimizer, + warmup_updates=self.warmup_updates, + tot_updates=self.tot_updates, + lr=self.peak_lr, + end_lr=self.end_lr, + power=1.0, + ), + 'name': 'learning_rate', + 'interval': 'step', + 'frequency': 1, + } + return [optimizer], [lr_scheduler] + + @staticmethod + def add_model_specific_args(parent_parser): + parser = parent_parser.add_argument_group("GMAE_node") + parser.add_argument('--n_encoder_layers', type=int, default=3) + parser.add_argument('--n_decoder_layers', type=int, default=3) + parser.add_argument('--num_heads', type=int, default=8) + parser.add_argument('--hidden_dim', type=int, default=64) + parser.add_argument('--ffn_dim', type=int, default=64) + parser.add_argument('--intput_dropout_rate', type=float, default=0.1) + parser.add_argument('--dropout_rate', type=float, default=0.5) + parser.add_argument('--weight_decay', type=float, default=1e-5) + parser.add_argument('--attention_dropout_rate',type=float, default=0.1) + parser.add_argument('--checkpoint_path', type=str, default='') + parser.add_argument('--warmup_updates', type=int, default=40000) + parser.add_argument('--tot_updates', type=int, default=400000) + parser.add_argument('--peak_lr', type=float, default=0.0001) + 
parser.add_argument('--end_lr', type=float, default=1e-9) + parser.add_argument('--mask_ratio', type=float, default=0.5) + parser.add_argument('--validate', action='store_true', default=False) + parser.add_argument('--test', action='store_true', default=False) + + return parent_parser + + +class FeedForwardNetwork(nn.Module): + def __init__(self, hidden_size, ffn_size, dropout_rate): + super(FeedForwardNetwork, self).__init__() + + self.layer1 = nn.Linear(hidden_size, ffn_size) + self.gelu = nn.GELU() + self.layer2 = nn.Linear(ffn_size, hidden_size) + + def forward(self, x): + x = self.layer1(x) + x = self.gelu(x) + x = self.layer2(x) + return x + + +class MultiHeadAttention(nn.Module): + def __init__(self, hidden_size, attention_dropout_rate, num_heads): + super(MultiHeadAttention, self).__init__() + + self.num_heads = num_heads + + self.att_size = att_size = hidden_size // num_heads + self.scale = att_size ** -0.5 + + self.linear_q = nn.Linear(hidden_size, num_heads * att_size) + self.linear_k = nn.Linear(hidden_size, num_heads * att_size) + self.linear_v = nn.Linear(hidden_size, num_heads * att_size) + self.att_dropout = nn.Dropout(attention_dropout_rate) + + self.output_layer = nn.Linear(num_heads * att_size, hidden_size) + + def forward(self, q, k, v, attn_bias=None, mask=None): + orig_q_size = q.size() + + d_k = self.att_size + d_v = self.att_size + batch_size = q.size(0) + + # head_i = Attention(Q(W^Q)_i, K(W^K)_i, V(W^V)_i) + q = self.linear_q(q).view(batch_size, -1, self.num_heads, d_k) + k = self.linear_k(k).view(batch_size, -1, self.num_heads, d_k) + v = self.linear_v(v).view(batch_size, -1, self.num_heads, d_v) + + q = q.transpose(1, 2) # [b, h, q_len, d_k] + v = v.transpose(1, 2) # [b, h, v_len, d_v] + k = k.transpose(1, 2).transpose(2, 3) # [b, h, d_k, k_len] + + # Scaled Dot-Product Attention. 
+ # Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V + q = q * self.scale + x = torch.matmul(q, k) # [b, h, q_len, k_len] + # print('**********', + # x.size(), q.size(), + # k.size(), v.size(), + # attn_bias.size(), mask.size() + # ) + if attn_bias is not None: + if mask is not None: + usm0 = mask.unsqueeze(1).unsqueeze(3) + usm1 = mask.unsqueeze(1).unsqueeze(2) + + attn_bias = attn_bias.masked_fill(usm0 == 1, 0.0) + attn_bias = attn_bias.masked_fill(usm1 == 1, 0.0) + x = x + attn_bias + + # mask the data before the softmax + if mask is not None: + usm0 = mask.unsqueeze(1).unsqueeze(2) + x = x.masked_fill(usm0 == 1, -1e9) + + x = torch.softmax(x, dim=3) + x = self.att_dropout(x) + x = x.matmul(v) # [b, h, q_len, attn] + + x = x.transpose(1, 2).contiguous() # [b, q_len, h, attn] + x = x.view(batch_size, -1, self.num_heads * d_v) + + x = self.output_layer(x) + + assert x.size() == orig_q_size + return x + + +class EncoderLayer(nn.Module): + def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, num_heads): + super(EncoderLayer, self).__init__() + + self.self_attention_norm = nn.LayerNorm(hidden_size) + self.self_attention = MultiHeadAttention( + hidden_size, attention_dropout_rate, num_heads) + self.self_attention_dropout = nn.Dropout(dropout_rate) + + self.ffn_norm = nn.LayerNorm(hidden_size) + self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate) + self.ffn_dropout = nn.Dropout(dropout_rate) + + def forward(self, x, attn_bias=None, mask=None): + """ + It is assumed that the mask is 1 where values are to be ignored + and then 0 where there are valid data + """ + y = self.self_attention_norm(x) + y = self.self_attention(y, y, y, attn_bias, mask) + y = self.self_attention_dropout(y) + x = x + y + + y = self.ffn_norm(x) + y = self.ffn(y) + y = self.ffn_dropout(y) + x = x + y + return x From 0e46f0212b3c5ce331af932b25cdb94d2f03fdbf Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 03/55] graphormer data formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gmae_data.py | 165 ++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 gridfm_graphkit/models/gmae_data.py diff --git a/gridfm_graphkit/models/gmae_data.py b/gridfm_graphkit/models/gmae_data.py new file mode 100644 index 0000000..1be7279 --- /dev/null +++ b/gridfm_graphkit/models/gmae_data.py @@ -0,0 +1,165 @@ +from collator import collator +from pytorch_lightning import LightningDataModule +from torch.utils.data import DataLoader, random_split +from functools import partial +import random +import torch +from wrapper import MyDataset, process_samples +from torch_geometric.utils import to_undirected + +from torch_geometric.datasets import Planetoid, WikiCS, Amazon +from torch_geometric.loader import NeighborSampler +import torch_geometric.transforms as T +import hqdata + + +dataset = None + + +def get_dataset(dataset_name='Cora', nodefile='', edgefile=''): + global dataset + path = 'dataset/' + dataset_name + if dataset is not None: + return dataset + + elif dataset_name in ['Cora', 'CiteSeer', 'PubMed']: + return Planetoid(root=path, name=dataset_name, transform=T.NormalizeFeatures()) + elif dataset_name == 'WikiCS': + return WikiCS(root=path, transform=T.NormalizeFeatures()) + elif dataset_name == 'Amazon-Computers': + return Amazon(root=path, name='computers', transform=T.NormalizeFeatures()) + elif dataset_name == 
'Amazon-Photo': + return Amazon(root=path, name='photo', transform=T.NormalizeFeatures()) + elif dataset_name == 'hqdata': + return hqdata.simple_batch(nodefile, edgefile) + else: + raise NotImplementedError + +def read_csv(infile): + """ + assume two columns: instances number, file location and name + """ + + lines = [] + with open(infile, 'r') as ff: + for line in ff: + lines.append([xx.strip() for xx in line.split(',')]) + + return lines + +class GraphDataModule(LightningDataModule): + name = "Cora" + + def __init__( + self, + dataset_name: str = 'Cora', + num_workers: int = 8, + batch_size: int = 64, + seed: int = 42, + edgefile: str = '', + nodefile: str = '', + processedfile: str = '', # preprocessed dataset file in pt format + n_val_sampler: int = 10, + num_node_features: int = 25, + test=False, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.dataset_name = dataset_name + if nodefile and edgefile: + self.dataset = get_dataset(dataset_name, nodefile, edgefile) + else: + self.dataset = read_csv(processedfile) + self.num_node_features = num_node_features + self.seed = seed + self.n_val_sampler = n_val_sampler + + self.num_workers = num_workers + self.batch_size = batch_size + self.dataset_full = ... + self.dataset_train = ... + self.dataset_val = ... + self.dataset_test = ... # not currently in use + self.train_frac = 0.8 # train-val split only + self.istest = test + + + def setup(self, stage: str = None): + """ + automatically called, if prepare_data() is defined, then the latter + is called first + + during testing this section is not needed + """ + + if self.istest: + pass + else: + items = self.dataset # for disk data the dataset is in items form + self.dataset_full = MyDataset( + items, + settype='csv', + ) + + # split the train and validation data + train_set_size = int(self.train_frac*len(self.dataset_full)) + valid_set_size = len(self.dataset_full) - train_set_size + seed = torch.Generator().manual_seed(self.seed) + train_set, valid_set = random_split( + self.dataset_full, + [train_set_size, valid_set_size], + generator=seed + ) + print('**train and val dataset sizes**',len(train_set),len(valid_set)) + self.dataset_train = train_set + self.dataset_val = valid_set + + + def train_dataloader(self): + loader = DataLoader(self.dataset_train, batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers, + collate_fn=partial(collator), + ) + return loader + + def val_dataloader(self): + loader = DataLoader(self.dataset_val, batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + collate_fn=partial(collator), + ) + return loader + + def eval_dataloader(self): + """ + for downstream evaluation + """ + # do not wish to shuffle for evaluation + graphs_to_process = self.dataset.datalist + + + items = [] # from in mem dataset + + for graphdata in graphs_to_process: + # padding and mask creation should happend here + num_nodes = graphdata.num_nodes + ns0 = 1 # batch size + ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids + ns2 = graphdata.edge_index + data_item = process_samples( + ns0, + ns1, + ns2, + graphdata) + [0] # TODO completely remove the appended [0] + items.append(data_item) + + self.dataset_eval = MyDataset(items) + loader = DataLoader(self.dataset_eval, + batch_size=self.batch_size*self.n_val_sampler, + shuffle=False, + num_workers=self.num_workers, + collate_fn=partial(collator), + ) + return loader From 0f07900800863d3557ab59455ae4dcd3e302c602 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst 
<99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 04/55] graphormer data formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/temp_leftovers.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 gridfm_graphkit/models/temp_leftovers.py diff --git a/gridfm_graphkit/models/temp_leftovers.py b/gridfm_graphkit/models/temp_leftovers.py new file mode 100644 index 0000000..e69de29 From 00d0987f1b1dab57a730795ddf30b2538c2b4d4f Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 05/55] basic reworking of model to match formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 351 +++-------------------- gridfm_graphkit/models/temp_leftovers.py | 168 +++++++++++ 2 files changed, 213 insertions(+), 306 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 6aa3df7..a351c81 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -1,6 +1,5 @@ -from lr import PolynomialDecayLR + import torch -import math import numpy as np import torch.nn as nn import pytorch_lightning as pl @@ -8,105 +7,60 @@ from torch.nn import functional as F from losses import active_power_loss -def init_params(module, n_layers): - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=0.02 / math.sqrt(n_layers)) - if module.bias is not None: - module.bias.data.zero_() - if isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=0.02) - +@MODELS_REGISTRY.register("Graphormer") class GMAE_node(pl.LightningModule): + """ + TODO fill in description + """ def __init__( self, - n_encoder_layers, - n_decoder_layers, - num_heads, - hidden_dim, - dropout_rate, - intput_dropout_rate, - weight_decay, - ffn_dim, - dataset_name, - warmup_updates, - tot_updates, - peak_lr, - end_lr, - attention_dropout_rate, - n_node_features, - mask_ratio, - n_val_sampler, + # n_encoder_layers, + # n_decoder_layers, + # num_heads, + # hidden_dim, + # dropout_rate, + # intput_dropout_rate, + # weight_decay, + # ffn_dim, + # dataset_name, + # warmup_updates, + # tot_updates, + # peak_lr, + # end_lr, + # attention_dropout_rate, + # n_node_features, + # mask_ratio, + # n_val_sampler, + args ): super().__init__() self.save_hyperparameters() - self.n_node_features = n_node_features - self.n_val_sampler = n_val_sampler - self.mask_ratio = mask_ratio - self.num_heads = num_heads - self.input_proj = nn.Linear(n_node_features, hidden_dim) - - self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) - self.in_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) - self.out_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + self.n_node_features = args.model.input_dim + self.num_heads = 8 # TODO make this configurable or to match their structure + self.hidden_dim = args.model.hidden_size + intput_dropout_rate = 0.3 + dropout_rate = 0.3 + attention_dropout_rate = 0.3 + self.input_proj = nn.Linear(n_node_features, hidden_dim) self.input_dropout = nn.Dropout(intput_dropout_rate) - encoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) + encoders = [EncoderLayer(hidden_dim, hidden_dim, dropout_rate, attention_dropout_rate, num_heads) for _ in range(n_encoder_layers)] 
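+        # NOTE: the separate ffn_dim hyperparameter of the original module is folded
+        # into hidden_dim here (EncoderLayer receives hidden_dim for both its hidden
+        # and feed-forward sizes); the hard-coded dropout rates and num_heads above
+        # are placeholders flagged by the TODOs.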
self.encoder_layers = nn.ModuleList(encoders) self.encoder_final_ln = nn.LayerNorm(hidden_dim) - self.encoder_to_decoder = nn.Linear(hidden_dim, hidden_dim) - - decoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) - for _ in range(n_decoder_layers)] - self.decoder_layers = nn.ModuleList(decoders) - self.decoder_final_ln = nn.LayerNorm(hidden_dim) - - self.out_proj = nn.Linear(hidden_dim, self.n_node_features) - self.loss_fn = F.mse_loss - self.masking_value = -4 - self.loss_phys1 = active_power_loss - self.alpha = 1.0/50.0 # weight for loss_phys1 - - self.dataset_name = dataset_name - - self.warmup_updates = warmup_updates - self.tot_updates = tot_updates - self.peak_lr = peak_lr - self.end_lr = end_lr - self.weight_decay = weight_decay - - self.hidden_dim = hidden_dim - self.automatic_optimization = True - self.apply(lambda module: init_params(module, n_layers=n_encoder_layers)) + self.decoder = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LeakyReLU(), + nn.Linear(hidden_dim, self.n_node_features) + ) + + # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere + # self.masking_value = -4 - - def compute_pos_embeddings(self, batched_data): - attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - in_degree, out_degree = batched_data.in_degree, batched_data.in_degree - # graph_attn_bias - graph_attn_bias = attn_bias.clone() - graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( - 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] - # spatial pos - # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) - graph_attn_bias = graph_attn_bias + spatial_pos_bias - graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset - - node_feature = self.input_proj(x) - node_feature = node_feature + \ - self.in_degree_encoder(in_degree) + \ - self.out_degree_encoder(out_degree) - graph_node_feature = node_feature - - return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, mask=None): graph_node_feature_masked = graph_node_feature @@ -139,236 +93,20 @@ def forward(self, batched_data, mask=None): mask: incoming values to mask for prediction """ + + # TODO in the baseline code the PE is an input here and passes through + # a normalization before being concatenated to the features + + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) in_degree = batched_data.in_degree out_degree = batched_data.out_degree - graph_mask = None # TODO this could be removed eventually - output = self.encoder(graph_node_feature, graph_attn_bias, mask) output = self.encoder_to_decoder(output) output = self.decoder(output, in_degree, out_degree, graph_attn_bias, mask) - return output, graph_mask - - def generate_pretrain_embeddings_for_downstream_task(self, batched_data): - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) - output = self.encoder(graph_node_feature, graph_attn_bias) - output = output.reshape(-1, self.n_val_sampler, output.size(1), self.hidden_dim)[:, :, 0, :].mean(1) - output = output # [n_graph(n_central_node), n_feature] return output - def generate_node_pred(self, batched_data): - """ - for a batch of nodes, return the masked node array and - the predicted arrays for those nodes - - note: - - mask: nodes that are to be predicted (and are thus masked) in each - graph (constant for 
the batch) - graph_mask: graphs in batch that have valid results - """ - num_nodes = batched_data.x.size(1) - - mask = None - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = batched_data.x[graph_mask].float() - - else: - y_gt = batched_data.x.float() - graph_mask = torch.from_numpy(np.array([])) - no_feat = y_hat.size(2) - y_hat = y_hat.reshape(-1, y_hat.size(2)) # [n_graph*n_masked_node, n_feature] - - y_gt = y_gt.reshape(-1, y_gt.size(2)) # [n_graph*n_masked_node, n_feature] - pad_mask = torch.nonzero(y_gt.sum(-1)) - - # final shaping - y_gt = torch.squeeze(y_gt) - y_hat = torch.squeeze(y_hat) - - return y_gt, y_hat, graph_mask - - - def training_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - - # create a boolean mask where padding was added - # note that this assumes all input data had features with - # values >= 0 - mask = None - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - strategy = '' - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - - # print('pre loss shapes', y_gt.size(), y_hat.size()) - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('train_loss', loss) - self.log('activ_loss', loss_actv) - - return loss + loss_actv - - def validation_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - mask = None - - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = 
torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - no_features = y_hat.size(2) - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] - y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] - pad_mask = torch.nonzero(y_gt.sum(-1)) - - y_gt = y_gt[pad_mask, :] - y_hat = y_hat[pad_mask, :] - - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('val_loss', loss, batch_size=1) - - # loss per feature, for logging only - for ii in range(no_features): - self.log( - 'val_loss_{}'.format(ii), - self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), - batch_size=1 - ) - - return loss + loss_actv - - - def configure_optimizers(self): - optimizer = torch.optim.AdamW( - self.parameters(), lr=self.peak_lr, weight_decay=self.weight_decay) - lr_scheduler = { - 'scheduler': PolynomialDecayLR( - optimizer, - warmup_updates=self.warmup_updates, - tot_updates=self.tot_updates, - lr=self.peak_lr, - end_lr=self.end_lr, - power=1.0, - ), - 'name': 'learning_rate', - 'interval': 'step', - 'frequency': 1, - } - return [optimizer], [lr_scheduler] - - @staticmethod - def add_model_specific_args(parent_parser): - parser = parent_parser.add_argument_group("GMAE_node") - parser.add_argument('--n_encoder_layers', type=int, default=3) - parser.add_argument('--n_decoder_layers', type=int, default=3) - parser.add_argument('--num_heads', type=int, default=8) - parser.add_argument('--hidden_dim', type=int, default=64) - parser.add_argument('--ffn_dim', type=int, default=64) - parser.add_argument('--intput_dropout_rate', type=float, default=0.1) - parser.add_argument('--dropout_rate', type=float, default=0.5) - parser.add_argument('--weight_decay', type=float, default=1e-5) - parser.add_argument('--attention_dropout_rate',type=float, default=0.1) - parser.add_argument('--checkpoint_path', type=str, default='') - parser.add_argument('--warmup_updates', type=int, default=40000) - parser.add_argument('--tot_updates', type=int, default=400000) - parser.add_argument('--peak_lr', type=float, default=0.0001) - parser.add_argument('--end_lr', type=float, default=1e-9) - parser.add_argument('--mask_ratio', type=float, default=0.5) - parser.add_argument('--validate', action='store_true', default=False) - parser.add_argument('--test', action='store_true', default=False) - - return parent_parser - class FeedForwardNetwork(nn.Module): def __init__(self, hidden_size, ffn_size, dropout_rate): @@ -402,6 +140,7 @@ def __init__(self, hidden_size, attention_dropout_rate, 
num_heads): self.output_layer = nn.Linear(num_heads * att_size, hidden_size) def forward(self, q, k, v, attn_bias=None, mask=None): + orig_q_size = q.size() d_k = self.att_size diff --git a/gridfm_graphkit/models/temp_leftovers.py b/gridfm_graphkit/models/temp_leftovers.py index e69de29..87e07be 100644 --- a/gridfm_graphkit/models/temp_leftovers.py +++ b/gridfm_graphkit/models/temp_leftovers.py @@ -0,0 +1,168 @@ +# temporary file to hold functions while they wait to be +# transferred to other modules + + + self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.in_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + self.out_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + + + def compute_pos_embeddings(self, batched_data): + attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + # graph_attn_bias + graph_attn_bias = attn_bias.clone() + graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( + 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # spatial pos + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + spatial_pos_bias + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset + + node_feature = self.input_proj(x) + node_feature = node_feature + \ + self.in_degree_encoder(in_degree) + \ + self.out_degree_encoder(out_degree) + graph_node_feature = node_feature + + return graph_node_feature, graph_attn_bias + + + def training_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + + # create a boolean mask where padding was added + # note that this assumes all input data had features with + # values >= 0 + mask = None + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + strategy = '' + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + + # print('pre loss 
shapes', y_gt.size(), y_hat.size()) + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('train_loss', loss) + self.log('activ_loss', loss_actv) + + return loss + loss_actv + + def validation_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + mask = None + + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + no_features = y_hat.size(2) + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] + y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] + pad_mask = torch.nonzero(y_gt.sum(-1)) + + y_gt = y_gt[pad_mask, :] + y_hat = y_hat[pad_mask, :] + + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('val_loss', loss, batch_size=1) + + # loss per feature, for logging only + for ii in range(no_features): + self.log( + 'val_loss_{}'.format(ii), + self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), + batch_size=1 + ) + + return loss + loss_actv \ No newline at end of file From 7da6b281613cb3d2778a36d1388e1d520a5767fd Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 06/55] rearrange the Data preprocessing to match existing Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 9 ++- gridfm_graphkit/datasets/transforms.py | 59 +++++++++++++++++++ gridfm_graphkit/models/gmae_wrapper.py | 3 - gridfm_graphkit/models/graphormer.py | 35 ++++++++++- gridfm_graphkit/models/temp_leftovers.py | 27 +-------- 5 files changed, 101 insertions(+), 32 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index b1e4bd0..2a70519 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -2,6 +2,7 @@ from 
gridfm_graphkit.datasets.transforms import ( AddEdgeWeights, AddNormalizedRandomWalkPE, + AddGraphormerEncodings ) import os.path as osp @@ -204,7 +205,13 @@ def get(self, idx): data = torch.load(file_name, weights_only=False) if self.transform: data = self.transform(data) - # print('data>>>>>>>',data) # TODO remove + + # TODO move this to the pretreatment when validated + gr_transform = AddGraphormerEncodings( + attr_name="gr", + ) + data = gr_transform(data) + print('data>>>>>>>', data) # TODO remove return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index fb770d3..744fa30 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,6 +15,10 @@ to_torch_csr_tensor, ) +import pyximport +pyximport.install(setup_args={'include_dirs': np.get_include()}) +import algos + class AddNormalizedRandomWalkPE(BaseTransform): r"""Adds the random walk positional encoding from the @@ -83,6 +87,61 @@ def get_pe(out: Tensor) -> Tensor: return data +def preprocess_item(data): + """ + TODO fill in header for the function + """ + edge_index = data.edge_index + N = data.num_nodes + edge_adj = torch.sparse.FloatTensor( + edge_index, + torch.ones(edge_index.shape[1]), + [N, N] + ) + + adj = edge_adj.to_dense() + + # node adj matrix [N, N] bool + adj = adj.bool() + + shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + spatial_pos = torch.from_numpy((shortest_path_result)).long() + attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated + + in_degree = adj.long().sum(dim=1).view(-1) + out_degree = adj.long().sum(dim=0).view(-1) + return attn_bias, spatial_pos, in_degree, out_degree + +class AddGraphormerEncodings(BaseTransform): + r"""... 
+ TODO update with encoding info + """ + + def __init__( + self, + attr_name: Optional[str] = "gres", # TODO remove if not needed + ) -> None: + self.attr_name = attr_name + + def forward(self, data: Data) -> Data: + if data.edge_index is None: + raise ValueError("Expected data.edge_index to be not None") + + N = data.num_nodes + if N is None: + raise ValueError("Expected data.num_nodes to be not None") + + + attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) + + # data[self.attr_name] = pe + data['attn_bias'] = attn_bias + data['spatial_pos'] = spatial_pos + data['in_degree'] = in_degree + data['in_degree'] = out_degree + + return data + class AddEdgeWeights(BaseTransform): """ diff --git a/gridfm_graphkit/models/gmae_wrapper.py b/gridfm_graphkit/models/gmae_wrapper.py index b76c915..dfc1367 100644 --- a/gridfm_graphkit/models/gmae_wrapper.py +++ b/gridfm_graphkit/models/gmae_wrapper.py @@ -1,9 +1,6 @@ import torch import numpy as np -import pyximport -pyximport.install(setup_args={'include_dirs': np.get_include()}) -import algos from torch_geometric.loader import NeighborSampler from torch_geometric.utils import to_undirected diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index a351c81..85355f8 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -1,4 +1,5 @@ +from gridfm_graphkit.io.registries import MODELS_REGISTRY import torch import numpy as np import torch.nn as nn @@ -10,7 +11,7 @@ @MODELS_REGISTRY.register("Graphormer") -class GMAE_node(pl.LightningModule): +class GMAE_node(nn.Module): """ TODO fill in description """ @@ -57,9 +58,38 @@ def __init__( nn.Linear(hidden_dim, self.n_node_features) ) + + # for pos embeddings + self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.in_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + self.out_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere # self.masking_value = -4 + def compute_pos_embeddings(self, batched_data): + attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + # graph_attn_bias + graph_attn_bias = attn_bias.clone() + graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( + 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # spatial pos + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + spatial_pos_bias + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset + + node_feature = self.input_proj(x) + node_feature = node_feature + \ + self.in_degree_encoder(in_degree) + \ + self.out_degree_encoder(out_degree) + graph_node_feature = node_feature + + return graph_node_feature, graph_attn_bias + def encoder(self, graph_node_feature, graph_attn_bias, mask=None): @@ -95,8 +125,7 @@ def forward(self, batched_data, mask=None): """ # TODO in the baseline code the PE is an input here and passes through - # a normalization before being concatenated to the features - + # a normalization before being concatenated to the features, follow this in final version graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) in_degree = batched_data.in_degree diff --git a/gridfm_graphkit/models/temp_leftovers.py 
b/gridfm_graphkit/models/temp_leftovers.py index 87e07be..f46a8c5 100644 --- a/gridfm_graphkit/models/temp_leftovers.py +++ b/gridfm_graphkit/models/temp_leftovers.py @@ -2,33 +2,10 @@ # transferred to other modules - self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) - self.in_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) - self.out_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + - def compute_pos_embeddings(self, batched_data): - attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - in_degree, out_degree = batched_data.in_degree, batched_data.in_degree - # graph_attn_bias - graph_attn_bias = attn_bias.clone() - graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( - 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] - # spatial pos - # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) - graph_attn_bias = graph_attn_bias + spatial_pos_bias - graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset - - node_feature = self.input_proj(x) - node_feature = node_feature + \ - self.in_degree_encoder(in_degree) + \ - self.out_degree_encoder(out_degree) - graph_node_feature = node_feature - - return graph_node_feature, graph_attn_bias + def training_step(self, batched_data, batch_idx): From b125061e0ccda45857457cbbc6dd83820ea79af0 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 07/55] changes up to decision between collator or to_dense_batch, will try to_dense batch first Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gps_transformer.py | 4 ++-- gridfm_graphkit/models/graphormer.py | 29 ++++++++--------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index b807ff3..50e7db9 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -121,9 +121,9 @@ def forward(self, x, pe, edge_index, edge_attr, batch): """ x_pe = self.pe_norm(pe) - print('enc>>>', x.size()) # TODO remove + # print('enc>>>', x.size()) # TODO remove x = self.encoder(x) - print('post>>>', x.size()) # TODO remove + # print('post>>>', x.size()) # TODO remove x = self.input_norm(x) x = torch.cat((x, x_pe), 1) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 85355f8..f4f997b 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -91,7 +91,7 @@ def compute_pos_embeddings(self, batched_data): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, mask=None): + def encoder(self, graph_node_feature, graph_attn_bias): graph_node_feature_masked = graph_node_feature graph_attn_bias_masked = graph_attn_bias @@ -99,30 +99,20 @@ def encoder(self, graph_node_feature, graph_attn_bias, mask=None): # transfomrer encoder output = self.input_dropout(graph_node_feature_masked) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked, mask) + output = enc_layer(output, graph_attn_bias_masked) output = self.encoder_final_ln(output) return output - def decoder(self, output, in_degree, out_degree, graph_attn_bias, mask=None): - - pos_embed = self.in_degree_encoder(in_degree) 
+ self.out_degree_encoder(out_degree) - output = output + pos_embed - - for enc_layer in self.decoder_layers: - output = enc_layer(output, graph_attn_bias, mask) - - output = self.decoder_final_ln(output) - output = self.out_proj(output) # [n_graph, n_node, n_feature] - - return output - - def forward(self, batched_data, mask=None): + def forward(self, x, pe, edge_index, edge_attr, batched_data): """ process a batch of data, applying the input mask, while excluding non-valid values that arrise during processing mask: incoming values to mask for prediction """ + mask=None # TODO remove + + # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through # a normalization before being concatenated to the features, follow this in final version @@ -131,12 +121,13 @@ def forward(self, batched_data, mask=None): in_degree = batched_data.in_degree out_degree = batched_data.out_degree - output = self.encoder(graph_node_feature, graph_attn_bias, mask) - output = self.encoder_to_decoder(output) - output = self.decoder(output, in_degree, out_degree, graph_attn_bias, mask) + output = self.encoder(graph_node_feature, graph_attn_bias) + output = self.decoder(output) + return output +# TODO maybe set this as the decoder class FeedForwardNetwork(nn.Module): def __init__(self, hidden_size, ffn_size, dropout_rate): super(FeedForwardNetwork, self).__init__() From 7fa15080ea59c5103a19c2e26f73bd80a7dc8f05 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 08/55] changes up to decision between collator or to_dense_batch, will try to_dense batch first Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index f4f997b..48d2cf6 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -118,8 +118,6 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data): # a normalization before being concatenated to the features, follow this in final version graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) - in_degree = batched_data.in_degree - out_degree = batched_data.out_degree output = self.encoder(graph_node_feature, graph_attn_bias) output = self.decoder(output) From 8125e0468822058e7eb2e123b614698c1a643ad2 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 09/55] cython - replace long by int for python 3 compat Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 3 ++- gridfm_graphkit/models/graphormer.py | 2 +- pyproject.toml | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 744fa30..04ac4ba 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,9 +15,10 @@ to_torch_csr_tensor, ) +import numpy as np import pyximport pyximport.install(setup_args={'include_dirs': np.get_include()}) -import algos +import gridfm_graphkit.models.algos as algos class AddNormalizedRandomWalkPE(BaseTransform): diff --git a/gridfm_graphkit/models/graphormer.py 
b/gridfm_graphkit/models/graphormer.py index 48d2cf6..06e2264 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -110,7 +110,7 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data): mask: incoming values to mask for prediction """ - mask=None # TODO remove + print('batch', batched_data) # TODO note that the x, pe are redundant or not needed, so clean up at the end diff --git a/pyproject.toml b/pyproject.toml index 51c8665..10719f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "pyyaml", "lightning", "seaborn", + "cython" ] [project.optional-dependencies] From a513b1ae96084ad549b376955b27c6ba6d96a006 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 10/55] cython - adjust version to use long Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/__init__.py | 3 ++- gridfm_graphkit/models/graphormer.py | 33 ++++++++++++++++++---------- pyproject.toml | 2 +- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/gridfm_graphkit/models/__init__.py b/gridfm_graphkit/models/__init__.py index de355d3..ce5432e 100644 --- a/gridfm_graphkit/models/__init__.py +++ b/gridfm_graphkit/models/__init__.py @@ -1,4 +1,5 @@ from gridfm_graphkit.models.gps_transformer import GPSTransformer from gridfm_graphkit.models.gnn_transformer import GNN_TransformerConv +from gridfm_graphkit.models.graphormer import Graphormer -__all__ = ["GPSTransformer", "GNN_TransformerConv"] +__all__ = ["GPSTransformer", "GNN_TransformerConv", "Graphormer"] diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 06e2264..7bb4f99 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -6,12 +6,12 @@ import pytorch_lightning as pl from torch.nn import functional as F -from losses import active_power_loss + @MODELS_REGISTRY.register("Graphormer") -class GMAE_node(nn.Module): +class Graphormer(nn.Module): """ TODO fill in description """ @@ -37,34 +37,43 @@ def __init__( args ): super().__init__() - self.save_hyperparameters() + self.n_node_features = args.model.input_dim self.num_heads = 8 # TODO make this configurable or to match their structure self.hidden_dim = args.model.hidden_size + self.n_encoder_layers = args.model.num_layers intput_dropout_rate = 0.3 dropout_rate = 0.3 attention_dropout_rate = 0.3 - self.input_proj = nn.Linear(n_node_features, hidden_dim) + self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) self.input_dropout = nn.Dropout(intput_dropout_rate) - encoders = [EncoderLayer(hidden_dim, hidden_dim, dropout_rate, attention_dropout_rate, num_heads) - for _ in range(n_encoder_layers)] + encoders = [ + EncoderLayer( + self.hidden_dim, + self.hidden_dim, + dropout_rate, + attention_dropout_rate, + self.num_heads + ) + for _ in range(self.n_encoder_layers) + ] self.encoder_layers = nn.ModuleList(encoders) - self.encoder_final_ln = nn.LayerNorm(hidden_dim) + self.encoder_final_ln = nn.LayerNorm(self.hidden_dim) self.decoder = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), + nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), - nn.Linear(hidden_dim, self.n_node_features) + nn.Linear(self.hidden_dim, self.n_node_features) ) # for pos embeddings - self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.spatial_pos_encoder = nn.Embedding(512, 
self.num_heads, padding_idx=0) self.in_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + 512, self.hidden_dim, padding_idx=0) # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere # self.masking_value = -4 diff --git a/pyproject.toml b/pyproject.toml index 10719f1..0ddbe13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ "pyyaml", "lightning", "seaborn", - "cython" + "cython<3.1" ] [project.optional-dependencies] From d7a6193cdf9edfdb53bdec388c0de5f8aa09bb21 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 11/55] replace cython by networkx version of floyd_warshall Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 19 +++++++++++++------ pyproject.toml | 3 +-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 04ac4ba..c9313db 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,10 +15,13 @@ to_torch_csr_tensor, ) -import numpy as np -import pyximport -pyximport.install(setup_args={'include_dirs': np.get_include()}) -import gridfm_graphkit.models.algos as algos +# import numpy as np +# import pyximport +# pyximport.install(setup_args={'include_dirs': np.get_include()}) +# import gridfm_graphkit.models.algos as algos + +from networkx import floyd_warshall_numpy +from torch_geometric.utils import to_networkx class AddNormalizedRandomWalkPE(BaseTransform): @@ -105,7 +108,11 @@ def preprocess_item(data): # node adj matrix [N, N] bool adj = adj.bool() - shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + # shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + gg = to_networkx(data) + shortest_path_result = floyd_warshall_numpy(gg) + print('sp>>>', shortest_path_result) + print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long() attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated @@ -114,7 +121,7 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree class AddGraphormerEncodings(BaseTransform): - r"""... + """... 
TODO update with encoding info """ diff --git a/pyproject.toml b/pyproject.toml index 0ddbe13..0c09d17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,8 +50,7 @@ dependencies = [ "plotly", "pyyaml", "lightning", - "seaborn", - "cython<3.1" + "seaborn" ] [project.optional-dependencies] From 10bc942875c4e0da3281077c01ee06817a00efc4 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 12/55] put in place holder for pos embed to speed up development Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 3 ++- gridfm_graphkit/datasets/transforms.py | 25 ++++++++--------- gridfm_graphkit/models/graphormer.py | 27 ++++++++++++++++--- .../tasks/feature_reconstruction_task.py | 5 ++-- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 2a70519..cfe63e9 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -207,11 +207,12 @@ def get(self, idx): data = self.transform(data) # TODO move this to the pretreatment when validated + # print('datab>>>>>>>', data) gr_transform = AddGraphormerEncodings( attr_name="gr", ) data = gr_transform(data) - print('data>>>>>>>', data) # TODO remove + # print('dataa>>>>>>>', data) # TODO remove return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index c9313db..1929386 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,7 +15,7 @@ to_torch_csr_tensor, ) -# import numpy as np +import numpy as np # import pyximport # pyximport.install(setup_args={'include_dirs': np.get_include()}) # import gridfm_graphkit.models.algos as algos @@ -105,14 +105,15 @@ def preprocess_item(data): adj = edge_adj.to_dense() - # node adj matrix [N, N] bool - adj = adj.bool() - + # TODO replace the placeholder with actual algorithm + shortest_path_result = np.ones((N,N)) # shortest_path_result, path = algos.floyd_warshall(adj.numpy()) - gg = to_networkx(data) - shortest_path_result = floyd_warshall_numpy(gg) - print('sp>>>', shortest_path_result) - print(shortest_path_result.shape) + #gg = to_networkx(data) + #shortest_path_result = floyd_warshall_numpy(gg) + + # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. 
+ # print('sp>>>', shortest_path_result) + # print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long() attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated @@ -143,10 +144,10 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) # data[self.attr_name] = pe - data['attn_bias'] = attn_bias - data['spatial_pos'] = spatial_pos - data['in_degree'] = in_degree - data['in_degree'] = out_degree + data['attn_bias'] = attn_bias.unsqueeze(0) + data['spatial_pos'] = spatial_pos.unsqueeze(0) + data['in_degree'] = in_degree # assume undirected ie in == out + # data['out_degree'] = out_degree.unsqueeze(0) return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 7bb4f99..39fe60a 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -46,6 +46,22 @@ def __init__( dropout_rate = 0.3 attention_dropout_rate = 0.3 + # variables flown over from GPS TODO check + self.mask_dim = getattr(args.data, "mask_dim", 6) + self.mask_value = getattr(args.data, "mask_value", -1.0) + self.learn_mask = getattr(args.data, "learn_mask", True) + + if self.learn_mask: + self.mask_value = nn.Parameter( + torch.randn(self.mask_dim) + self.mask_value, + requires_grad=True, + ) + else: + self.mask_value = nn.Parameter( + torch.zeros(self.mask_dim) + self.mask_value, + requires_grad=False, + ) + self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) self.input_dropout = nn.Dropout(intput_dropout_rate) encoders = [ @@ -87,11 +103,13 @@ def compute_pos_embeddings(self, batched_data): 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + # print('xxxxxx', graph_attn_bias.size()) spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) + # print('nf>>', node_feature.size(), in_degree.size(), out_degree.size(), self.in_degree_encoder(in_degree).size()) node_feature = node_feature + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) @@ -112,22 +130,23 @@ def encoder(self, graph_node_feature, graph_attn_bias): output = self.encoder_final_ln(output) return output - def forward(self, x, pe, edge_index, edge_attr, batched_data): + def forward(self, x, pe, edge_index, edge_attr, batched_data, data): """ process a batch of data, applying the input mask, while excluding non-valid values that arrise during processing mask: incoming values to mask for prediction """ - print('batch', batched_data) + print('batch', data) + print(x.size()) # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through # a normalization before being concatenated to the features, follow this in final version - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) - + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) output = self.encoder(graph_node_feature, graph_attn_bias) output = self.decoder(output) diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index e42d09d..0092805 100644 --- 
a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -74,11 +74,11 @@ def __init__(self, args, node_normalizers, edge_normalizers): self.edge_normalizers = edge_normalizers self.save_hyperparameters() - def forward(self, x, pe, edge_index, edge_attr, batch, mask=None): + def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch) + return self.model(x, pe, edge_index, edge_attr, batch, data) @rank_zero_only def on_fit_start(self): @@ -117,6 +117,7 @@ def shared_step(self, batch): edge_attr=batch.edge_attr, batch=batch.batch, mask=batch.mask, + data=batch ) loss_dict = self.loss_fn( From e1f8cd50c8ad022a6dea0d3a40d5d8e52c2fdd69 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 13/55] passed positional embedding Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 39fe60a..c10c742 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -103,7 +103,6 @@ def compute_pos_embeddings(self, batched_data): 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - # print('xxxxxx', graph_attn_bias.size()) spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From 4bd084526f892a2f6d264f130cfd5a12b9f88585 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 14/55] pass to loss Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index c10c742..ea55460 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -7,7 +7,7 @@ from torch.nn import functional as F - +from torch_geometric.utils import to_dense_batch @MODELS_REGISTRY.register("Graphormer") @@ -117,7 +117,7 @@ def compute_pos_embeddings(self, batched_data): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias): + def encoder(self, graph_node_feature, graph_attn_bias, batch=1): graph_node_feature_masked = graph_node_feature graph_attn_bias_masked = graph_attn_bias @@ -125,7 +125,7 @@ def encoder(self, graph_node_feature, graph_attn_bias): # transfomrer encoder output = self.input_dropout(graph_node_feature_masked) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked) + output = enc_layer(output, graph_attn_bias_masked, batch=batch) output = self.encoder_final_ln(output) return output @@ -137,7 +137,7 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ print('batch', data) - print(x.size()) + 
print(x.size(), batched_data) # TODO note that the x, pe are redundant or not needed, so clean up at the end @@ -145,8 +145,8 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): # a normalization before being concatenated to the features, follow this in final version graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) - print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) - output = self.encoder(graph_node_feature, graph_attn_bias) + # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) + output = self.encoder(graph_node_feature, graph_attn_bias, batch=batched_data) output = self.decoder(output) return output @@ -250,15 +250,19 @@ def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate) self.ffn_dropout = nn.Dropout(dropout_rate) - def forward(self, x, attn_bias=None, mask=None): + def forward(self, x, attn_bias=None, mask=None, batch=1): """ It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ y = self.self_attention_norm(x) - y = self.self_attention(y, y, y, attn_bias, mask) + # print(y.size(), attn_bias.size(), batch) + y, mask = to_dense_batch(y, batch) + # print('dense>>>', y.size(), mask.size()) + # print('msum>>>', mask.sum(dim=-1)) + y = self.self_attention(y, y, y, attn_bias, ~mask) y = self.self_attention_dropout(y) - x = x + y + x = x + torch.reshape(y, x.size()) y = self.ffn_norm(x) y = self.ffn(y) From 2fff2317fd6de82955b67ff191e11850e6d5ea81 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 15/55] pass loss, but note that AddGrEnc breaks multi-graph batch Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 3 ++- gridfm_graphkit/tasks/feature_reconstruction_task.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index ea55460..b45124f 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -50,6 +50,7 @@ def __init__( self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", True) + self.output_dim = args.model.output_dim if self.learn_mask: self.mask_value = nn.Parameter( @@ -80,7 +81,7 @@ def __init__( self.decoder = nn.Sequential( nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), - nn.Linear(self.hidden_dim, self.n_node_features) + nn.Linear(self.hidden_dim, self.output_dim) ) diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index 0092805..c4714b2 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch, data) + return self.model(x, pe, edge_index, edge_attr, batch) #, data @rank_zero_only def on_fit_start(self): From 4e995e925679748e38df762439eed515e1bf49be Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst 
<99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 16/55] wrap up before testing new batching Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index cfe63e9..465140f 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -172,6 +172,12 @@ def process(self): attr_name="pe", ) graph_data = pe_transform(graph_data) + + gr_transform = AddGraphormerEncodings( + attr_name="gr", + ) + graph_data = gr_transform(graph_data) + torch.save( graph_data, osp.join( @@ -208,10 +214,10 @@ def get(self, idx): # TODO move this to the pretreatment when validated # print('datab>>>>>>>', data) - gr_transform = AddGraphormerEncodings( - attr_name="gr", - ) - data = gr_transform(data) + # gr_transform = AddGraphormerEncodings( + # attr_name="gr", + # ) + # data = gr_transform(data) # print('dataa>>>>>>>', data) # TODO remove return data From f9da3c0af6140185bc7c00e59165d23199104d55 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 17/55] confirmation that flat tensors do not break batching Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 16 ++++++++-------- gridfm_graphkit/datasets/transforms.py | 9 +++++---- gridfm_graphkit/models/graphormer.py | 3 ++- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 465140f..32aaae3 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -173,10 +173,10 @@ def process(self): ) graph_data = pe_transform(graph_data) - gr_transform = AddGraphormerEncodings( - attr_name="gr", - ) - graph_data = gr_transform(graph_data) + # gr_transform = AddGraphormerEncodings( + # attr_name="gr", + # ) + # graph_data = gr_transform(graph_data) torch.save( graph_data, @@ -214,10 +214,10 @@ def get(self, idx): # TODO move this to the pretreatment when validated # print('datab>>>>>>>', data) - # gr_transform = AddGraphormerEncodings( - # attr_name="gr", - # ) - # data = gr_transform(data) + gr_transform = AddGraphormerEncodings( + attr_name="gr", + ) + data = gr_transform(data) # print('dataa>>>>>>>', data) # TODO remove return data diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 1929386..c5b5c5d 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -122,13 +122,13 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree class AddGraphormerEncodings(BaseTransform): - """... 
+ """ TODO update with encoding info """ def __init__( self, - attr_name: Optional[str] = "gres", # TODO remove if not needed + attr_name: Optional[str] = "gres" # TODO remove if not needed ) -> None: self.attr_name = attr_name @@ -144,8 +144,9 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) # data[self.attr_name] = pe - data['attn_bias'] = attn_bias.unsqueeze(0) - data['spatial_pos'] = spatial_pos.unsqueeze(0) + # print('******', attn_bias.size(), spatial_pos.size(), in_degree.size()) + data['attn_bias'] = attn_bias.flatten() + data['spatial_pos'] = spatial_pos.flatten() data['in_degree'] = in_degree # assume undirected ie in == out # data['out_degree'] = out_degree.unsqueeze(0) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index b45124f..125ad6e 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -137,8 +137,9 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ - print('batch', data) + print('***batch***', data) print(x.size(), batched_data) + print(batched_data.attn_bias.size(), batched_data.spatial_pos.size()) # TODO note that the x, pe are redundant or not needed, so clean up at the end From 0976cbaea9cde654a8cb19556cec4e7be8c6d557 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 18/55] add padding of attributes Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 5 ++ gridfm_graphkit/datasets/transforms.py | 88 +++++++++++++++++-- gridfm_graphkit/models/graphormer.py | 59 +++++++++++-- .../tasks/feature_reconstruction_task.py | 2 +- 4 files changed, 136 insertions(+), 18 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 32aaae3..0f4299e 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -201,6 +201,11 @@ def len(self): self.length = len(files) return self.length + def __cat_dim__(self, key, value, *args, **kwargs): + if key in ['attn_bias', 'spatial_pos', 'in_degree']: + return None + return super().__cat_dim__(key, value, *args, **kwargs) + def get(self, idx): file_name = osp.join( self.processed_dir, diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index c5b5c5d..64db08f 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -4,7 +4,7 @@ import torch from torch import Tensor from torch_geometric.transforms import BaseTransform -from typing import Optional +from typing import Optional, Any import torch_geometric.typing from torch_geometric.data import Data from torch_geometric.utils import ( @@ -91,15 +91,28 @@ def get_pe(out: Tensor) -> Tensor: return data +def add_node_attr(data: Data, value: Any, + attr_name: Optional[str] = None) -> Data: + if attr_name is None: + if 'x' in data: + x = data.x.view(-1, 1) if data.x.dim() == 1 else data.x + data.x = torch.cat([x, value.to(x.device, x.dtype)], dim=-1) + else: + data.x = value + else: + data[attr_name] = value + + return data + def preprocess_item(data): """ TODO fill in header for the function """ edge_index = data.edge_index N = data.num_nodes - edge_adj = torch.sparse.FloatTensor( + edge_adj = torch.sparse_coo_tensor( 
edge_index, - torch.ones(edge_index.shape[1]), + torch.ones(edge_index.shape[1]).to(data.x.device), [N, N] ) @@ -114,13 +127,54 @@ def preprocess_item(data): # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. # print('sp>>>', shortest_path_result) # print(shortest_path_result.shape) - spatial_pos = torch.from_numpy((shortest_path_result)).long() - attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated + spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) + attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated in_degree = adj.long().sum(dim=1).view(-1) out_degree = adj.long().sum(dim=0).view(-1) return attn_bias, spatial_pos, in_degree, out_degree +def pad_1d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen], dtype=x.dtype) + new_x[:xlen] = x + x = new_x + return x.unsqueeze(0) + + +def pad_2d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + # print('-------->', x.size()) + xlen, xdim = x.size() + if xlen < padlen: + new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) + new_x[:xlen, :] = x + x = new_x + return x.unsqueeze(0) + + +def pad_attn_bias_unsqueeze(x, padlen): + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros( + [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) + new_x[:xlen, :xlen] = x + new_x[xlen:, :xlen] = 0 + x = new_x + return x.unsqueeze(0) + + +def pad_spatial_pos_unsqueeze(x, padlen): + x = x + 1 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) + new_x[:xlen, :xlen] = x + x = new_x + return x.unsqueeze(0) + class AddGraphormerEncodings(BaseTransform): """ TODO update with encoding info @@ -144,10 +198,26 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) # data[self.attr_name] = pe - # print('******', attn_bias.size(), spatial_pos.size(), in_degree.size()) - data['attn_bias'] = attn_bias.flatten() - data['spatial_pos'] = spatial_pos.flatten() - data['in_degree'] = in_degree # assume undirected ie in == out + # print('******>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) + # print(data) + # data[] = attn_bias.unsqueeze(0) #.flatten() + # data[] = spatial_pos.unsqueeze(0) #.flatten() + # data[] = in_degree # assume undirected ie in == out + # data['nodeslice'] = torch.from_numpy(np.array([N])) + + max_node_num = 2000 + attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node_num) + spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, max_node_num) + in_degree = pad_1d_unsqueeze(in_degree, max_node_num).squeeze() + + data = add_node_attr(data, attn_bias, attr_name='attn_bias') + data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') + data = add_node_attr(data, in_degree, attr_name='in_degree') + + data.x = pad_2d_unsqueeze(data.x, max_node_num).squeeze() + data.y = pad_2d_unsqueeze(data.y, max_node_num).squeeze() + + # print(data) # data['out_degree'] = out_degree.unsqueeze(0) return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 125ad6e..8dc79f6 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -9,6 +9,8 @@ from torch_geometric.utils import to_dense_batch +from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings + @MODELS_REGISTRY.register("Graphormer") class Graphormer(nn.Module): @@ -95,16 +97,48 @@ def __init__( # self.loss_fn = 
F.mse_loss # TODO remove eventually as they are specd elsewhere # self.masking_value = -4 - def compute_pos_embeddings(self, batched_data): + def compute_pos_embeddings(self, batched_data, batch): attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + + # gr_transform = AddGraphormerEncodings( + # attr_name="gr", + # ) + # batched_data = gr_transform(batched_data) + # attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + # in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + + + # print('--->', attn_bias.size(), attn_bias.device, batch.size()) + + # yy0, mask = to_dense_batch(attn_bias, batch=batch, max_num_nodes=2000, batch_size=8) + # yy1, mask = to_dense_batch(spatial_pos, batch=batch, max_num_nodes=2000, batch_size=8) + + # attn_bias = yy0 + # spatial_pos = yy1 + + # print('yyyyyy', yy0.size(), yy1.size(), x.size()) + + # attn_bias = attn_bias.reshape(8,-1) + # spatial_pos = spatial_pos.reshape(8,-1) + # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) + # odim = int(torch.sqrt(torch.as_tensor(attn_bias.size(-1))).item()) + # print('oooo', odim) + # attn_bias = attn_bias.reshape(-1,odim,odim) + # spatial_pos = spatial_pos.reshape(-1,odim,odim) + # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) + # graph_attn_bias graph_attn_bias = attn_bias.clone() graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # print('aaaaaaaaaa', graph_attn_bias.size(), graph_attn_bias.device) + # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + # print('sssssssssss', spatial_pos_bias.size(), spatial_pos_bias.device) + graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset @@ -137,16 +171,16 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ - print('***batch***', data) - print(x.size(), batched_data) - print(batched_data.attn_bias.size(), batched_data.spatial_pos.size()) + # print('***batch***', data) + # print(x.size(), batched_data) + # print(data.attn_bias.size(), data.spatial_pos.size()) # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through # a normalization before being concatenated to the features, follow this in final version - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, batched_data) # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) output = self.encoder(graph_node_feature, graph_attn_bias, batch=batched_data) output = self.decoder(output) @@ -257,17 +291,26 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ + # print('xxxxxxxxxxxxx', x.size(), batch.size()) + x, mask = to_dense_batch(x, batch) + y = self.self_attention_norm(x) # print(y.size(), attn_bias.size(), batch) - y, mask = to_dense_batch(y, batch) - # print('dense>>>', y.size(), mask.size()) - # print('msum>>>', mask.sum(dim=-1)) + + attn_bias = attn_bias.squeeze() + # attn_bias = attn_bias.permute(1, 
2, 0) + # attn_bias, maska = to_dense_batch(attn_bias, batch) + # print('dense>>>', y.size(), mask.size(), attn_bias.size()) + # print('msum>>>', mask.sum(dim=-1), ) y = self.self_attention(y, y, y, attn_bias, ~mask) y = self.self_attention_dropout(y) + # print('<<<<<>>>>', x.size(), y.size()) x = x + torch.reshape(y, x.size()) y = self.ffn_norm(x) y = self.ffn(y) y = self.ffn_dropout(y) x = x + y + x=x.flatten(0,1) + # print('222222222222222', x.size()) return x diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index c4714b2..117596c 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch) #, data + return self.model(x, pe, edge_index, edge_attr, batch, data) # @rank_zero_only def on_fit_start(self): From de141d7fd7644bfcce9ca8e8864f480898c89916 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 19/55] corrected cython integer Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 19 +++++++++++-------- gridfm_graphkit/models/graphormer.py | 2 +- pyproject.toml | 3 ++- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 64db08f..ec5f558 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -16,12 +16,12 @@ ) import numpy as np -# import pyximport -# pyximport.install(setup_args={'include_dirs': np.get_include()}) -# import gridfm_graphkit.models.algos as algos +import pyximport +pyximport.install(setup_args={'include_dirs': np.get_include()}) +import gridfm_graphkit.models.algos as algos -from networkx import floyd_warshall_numpy -from torch_geometric.utils import to_networkx +# from networkx import floyd_warshall_numpy +# from torch_geometric.utils import to_networkx class AddNormalizedRandomWalkPE(BaseTransform): @@ -116,16 +116,19 @@ def preprocess_item(data): [N, N] ) - adj = edge_adj.to_dense() + adj = edge_adj.to_dense().to(torch.int16) # TODO replace the placeholder with actual algorithm - shortest_path_result = np.ones((N,N)) - # shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + # shortest_path_result = np.ones((N,N)) + + # print('+++++++',adj.dtype, adj.numpy().dtype) + shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32)) #gg = to_networkx(data) #shortest_path_result = floyd_warshall_numpy(gg) # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. 
# print('sp>>>', shortest_path_result) + # print('sp>>>', shortest_path_result.shape) # print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 8dc79f6..de24580 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -9,7 +9,7 @@ from torch_geometric.utils import to_dense_batch -from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings +# from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings @MODELS_REGISTRY.register("Graphormer") diff --git a/pyproject.toml b/pyproject.toml index 0c09d17..10719f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,8 @@ dependencies = [ "plotly", "pyyaml", "lightning", - "seaborn" + "seaborn", + "cython" ] [project.optional-dependencies] From 9d834a3a7313ed58287005e373f71f7530b6ba76 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 20/55] confirmation that route with cython and masking functions Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 5 +++-- gridfm_graphkit/models/graphormer.py | 24 ++++++++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index ec5f558..9aef961 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -138,7 +138,7 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree def pad_1d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 + # x = x + 1 # pad id = 0 #TODO remove all +1s xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros([padlen], dtype=x.dtype) @@ -148,11 +148,12 @@ def pad_1d_unsqueeze(x, padlen): def pad_2d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 + # x = x + 1 # pad id = 0 # print('-------->', x.size()) xlen, xdim = x.size() if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) + new_x[:,:] = -1e9 new_x[:xlen, :] = x x = new_x return x.unsqueeze(0) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index de24580..11b6c7c 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -152,15 +152,15 @@ def compute_pos_embeddings(self, batched_data, batch): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, batch=1): + def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): - graph_node_feature_masked = graph_node_feature + graph_node_feature_masked = graph_node_feature #TODO simplify this graph_attn_bias_masked = graph_attn_bias # transfomrer encoder output = self.input_dropout(graph_node_feature_masked) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked, batch=batch) + output = enc_layer(output, graph_attn_bias_masked, mask=mask, batch=batch) output = self.encoder_final_ln(output) return output @@ -175,6 +175,11 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): # print(x.size(), batched_data) # print(data.attn_bias.size(), data.spatial_pos.size()) + mask = None + masked_entries = torch.sum(x < -100, axis=-1) #TODO make this mesh with 
normalizn + mask = masked_entries == x.size(-1) + print('pad mask >>>', mask.size(), mask.sum()) + # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through @@ -182,7 +187,7 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, batched_data) # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) - output = self.encoder(graph_node_feature, graph_attn_bias, batch=batched_data) + output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batched_data) output = self.decoder(output) return output @@ -291,8 +296,10 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ + # print('xxxxxxxxxxxxx', x.size(), batch.size()) - x, mask = to_dense_batch(x, batch) + x, bmask = to_dense_batch(x, batch) # TODO remove bmask if padding remains in final + mask, _ = to_dense_batch(mask, batch) y = self.self_attention_norm(x) # print(y.size(), attn_bias.size(), batch) @@ -300,9 +307,10 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): attn_bias = attn_bias.squeeze() # attn_bias = attn_bias.permute(1, 2, 0) # attn_bias, maska = to_dense_batch(attn_bias, batch) - # print('dense>>>', y.size(), mask.size(), attn_bias.size()) - # print('msum>>>', mask.sum(dim=-1), ) - y = self.self_attention(y, y, y, attn_bias, ~mask) + # print('dense>>>', y.size(), bmask.size(), attn_bias.size()) + # print('msum>>>', bmask.sum(dim=-1), ) + # print('msum2>>', mask.size(),mask.sum(dim=-1)) + y = self.self_attention(y, y, y, attn_bias, mask) y = self.self_attention_dropout(y) # print('<<<<<>>>>', x.size(), y.size()) x = x + torch.reshape(y, x.size()) From 903708c6ac7b1127454a357f143a8170c2d674c0 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 21/55] propogate mask to loss calculation for all models Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 ++ gridfm_graphkit/datasets/transforms.py | 13 +------------ gridfm_graphkit/models/gnn_transformer.py | 6 +++++- gridfm_graphkit/models/gps_transformer.py | 4 +++- gridfm_graphkit/models/graphormer.py | 15 ++++++++++----- .../tasks/feature_reconstruction_task.py | 8 ++++---- 6 files changed, 25 insertions(+), 23 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 0f4299e..09374ba 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -219,10 +219,12 @@ def get(self, idx): # TODO move this to the pretreatment when validated # print('datab>>>>>>>', data) + # print('qqqqqq', data.x.min(), data.x.max()) gr_transform = AddGraphormerEncodings( attr_name="gr", ) data = gr_transform(data) + # print('aaaaaaaaaaaaaaa', data.x.min(), data.x.max()) # print('dataa>>>>>>>', data) # TODO remove return data diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 9aef961..c3f039f 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -201,15 +201,7 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) - # 
data[self.attr_name] = pe - # print('******>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) - # print(data) - # data[] = attn_bias.unsqueeze(0) #.flatten() - # data[] = spatial_pos.unsqueeze(0) #.flatten() - # data[] = in_degree # assume undirected ie in == out - # data['nodeslice'] = torch.from_numpy(np.array([N])) - - max_node_num = 2000 + max_node_num = 118 # TODO extract from batch attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, max_node_num) in_degree = pad_1d_unsqueeze(in_degree, max_node_num).squeeze() @@ -221,9 +213,6 @@ def forward(self, data: Data) -> Data: data.x = pad_2d_unsqueeze(data.x, max_node_num).squeeze() data.y = pad_2d_unsqueeze(data.y, max_node_num).squeeze() - # print(data) - # data['out_degree'] = out_degree.unsqueeze(0) - return data diff --git a/gridfm_graphkit/models/gnn_transformer.py b/gridfm_graphkit/models/gnn_transformer.py index 9e1ab23..627cd49 100644 --- a/gridfm_graphkit/models/gnn_transformer.py +++ b/gridfm_graphkit/models/gnn_transformer.py @@ -93,4 +93,8 @@ def forward(self, x, pe, edge_index, edge_attr, batch): x = nn.LeakyReLU()(x) x = self.mlps(x) - return x + + masked_entries = torch.sum(x < -1e8, axis=-1) + mask = masked_entries >= 3 + + return x, ~mask diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index 50e7db9..2bae93e 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -139,4 +139,6 @@ def forward(self, x, pe, edge_index, edge_attr, batch): x = self.pre_decoder_norm(x) x = self.decoder(x) - return x + masked_entries = torch.sum(x < -1e8, axis=-1) + mask = masked_entries >= 3 + return x, ~mask diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 11b6c7c..382bb53 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -172,13 +172,15 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ # print('***batch***', data) - # print(x.size(), batched_data) + # print('====', x.size(), batched_data) # print(data.attn_bias.size(), data.spatial_pos.size()) mask = None - masked_entries = torch.sum(x < -100, axis=-1) #TODO make this mesh with normalizn - mask = masked_entries == x.size(-1) - print('pad mask >>>', mask.size(), mask.sum()) + masked_entries = torch.sum(x < -1e8, axis=-1) #TODO make this mesh with normalizn + # print('>>', masked_entries.size()) + # TODO key to make this more general to handle other masking objectives + mask = masked_entries >= 3 # due to masking # x.size(-1) + # print('pad mask >>>', mask.size(), mask.sum()) # TODO note that the x, pe are redundant or not needed, so clean up at the end @@ -190,7 +192,10 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batched_data) output = self.decoder(output) - return output + # evaluate where mask is True, so update it TODO + # print('ooooooooo', output[~mask].size()) + # print('bbbbbbbb', data.mask.size(), data.mask, data.mask.sum()/len(data.mask.flatten())) + return output, ~mask # TODO maybe set this as the decoder diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index 117596c..e7d5d79 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ 
b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -110,7 +110,7 @@ def on_fit_start(self): ) def shared_step(self, batch): - output = self.forward( + output, valid = self.forward( x=batch.x, pe=batch.pe, edge_index=batch.edge_index, @@ -121,11 +121,11 @@ def shared_step(self, batch): ) loss_dict = self.loss_fn( - output, - batch.y, + output[valid], + batch.y[valid], batch.edge_index, batch.edge_attr, - batch.mask, + batch.mask[valid], ) return output, loss_dict From 40140349b03f5d1ed10e991e917ee1fff3af2116 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 22/55] clean up and include cython code for encoding Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/algos.pyx | 91 +++++++++++++ gridfm_graphkit/models/gmae_collator.py | 127 ------------------ gridfm_graphkit/models/gmae_data.py | 165 ------------------------ gridfm_graphkit/models/gmae_wrapper.py | 88 ------------- gridfm_graphkit/models/graphormer.py | 11 +- 5 files changed, 94 insertions(+), 388 deletions(-) create mode 100644 gridfm_graphkit/models/algos.pyx delete mode 100644 gridfm_graphkit/models/gmae_collator.py delete mode 100644 gridfm_graphkit/models/gmae_data.py delete mode 100644 gridfm_graphkit/models/gmae_wrapper.py diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx new file mode 100644 index 0000000..8600367 --- /dev/null +++ b/gridfm_graphkit/models/algos.pyx @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import cython +from cython.parallel cimport prange, parallel +cimport numpy +import numpy + +def floyd_warshall(adjacency_matrix): + + (nrows, ncols) = adjacency_matrix.shape + assert nrows == ncols + cdef unsigned int n = nrows + + adj_mat_copy = adjacency_matrix.astype(numpy.int32, order='C', casting='safe', copy=True) + assert adj_mat_copy.flags['C_CONTIGUOUS'] + cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy + cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=numpy.int32) + + cdef unsigned int i, j, k + cdef long M_ij, M_ik, cost_ikkj + cdef long* M_ptr = &M[0,0] + cdef long* M_i_ptr + cdef long* M_k_ptr + + # set unreachable nodes distance to 510 + for i in range(n): + for j in range(n): + if i == j: + M[i][j] = 0 + elif M[i][j] == 0: + M[i][j] = 510 + + # floyed algo + for k in range(n): + M_k_ptr = M_ptr + n*k + for i in range(n): + M_i_ptr = M_ptr + n*i + M_ik = M_i_ptr[k] + for j in range(n): + cost_ikkj = M_ik + M_k_ptr[j] + M_ij = M_i_ptr[j] + if M_ij > cost_ikkj: + M_i_ptr[j] = cost_ikkj + path[i][j] = k + + # set unreachable path to 510 + for i in range(n): + for j in range(n): + if M[i][j] >= 510: + path[i][j] = 510 + M[i][j] = 510 + + return M, path + + +def get_all_edges(path, i, j): + cdef unsigned int k = path[i][j] + if k == 0: + return [] + else: + return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j) + + +def gen_edge_input(max_dist, path, edge_feat): + + (nrows, ncols) = path.shape + assert nrows == ncols + cdef unsigned int n = nrows + cdef unsigned int max_dist_copy = max_dist + + path_copy = path.astype(long, order='C', casting='safe', copy=True) + edge_feat_copy = edge_feat.astype(long, order='C', casting='safe', copy=True) + assert path_copy.flags['C_CONTIGUOUS'] + assert edge_feat_copy.flags['C_CONTIGUOUS'] + + cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, 
edge_feat.shape[-1]], dtype=numpy.int64) + cdef unsigned int i, j, k, num_path, cur + + for i in range(n): + for j in range(n): + if i == j: + continue + if path_copy[i][j] == 510: + continue + path = [i] + get_all_edges(path_copy, i, j) + [j] + num_path = len(path) - 1 + for k in range(num_path): + edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :] + + return edge_fea_all diff --git a/gridfm_graphkit/models/gmae_collator.py b/gridfm_graphkit/models/gmae_collator.py deleted file mode 100644 index f4bc532..0000000 --- a/gridfm_graphkit/models/gmae_collator.py +++ /dev/null @@ -1,127 +0,0 @@ -import torch - - -def pad_1d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 - xlen = x.size(0) - if xlen < padlen: - new_x = x.new_zeros([padlen], dtype=x.dtype) - new_x[:xlen] = x - x = new_x - return x.unsqueeze(0) - - -def pad_2d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 - # print('-------->', x.size()) - xlen, xdim = x.size() - if xlen < padlen: - new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) - new_x[:xlen, :] = x - x = new_x - return x.unsqueeze(0) - - -def pad_attn_bias_unsqueeze(x, padlen): - xlen = x.size(0) - if xlen < padlen: - new_x = x.new_zeros( - [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) - new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 - x = new_x - return x.unsqueeze(0) - - -def pad_spatial_pos_unsqueeze(x, padlen): - x = x + 1 - xlen = x.size(0) - if xlen < padlen: - new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) - new_x[:xlen, :xlen] = x - x = new_x - return x.unsqueeze(0) - - -class Batch(): - def __init__(self, - min_node_num, - attn_bias, - spatial_pos, - in_degree, - out_degree, - x, - y, - orig_id - ): - super(Batch, self).__init__() - self.min_node_num = int(min_node_num) - self.in_degree, self.out_degree = in_degree, out_degree - self.x, self.y = x, y - self.attn_bias, self.spatial_pos = attn_bias, spatial_pos - self.orig_id = orig_id - - def to(self, device): - self.in_degree, self.out_degree = self.in_degree.to( - device), self.out_degree.to(device) - self.x = self.x.to(device) - self.y = self.y.to(device) - self.attn_bias, self.spatial_pos = self.attn_bias.to( - device), self.spatial_pos.to(device) - return self - - def __len__(self): - return self.in_degree.size(0) - - -def collator(items, spatial_pos_max=20): - """ - custom collator, among other transformations... 
- - unequal input graphs are padded to all have the same size - - adds 1 to the input x via pad_2d_unsqueeze and similar functions - """ - items = [ - item for item in items if item is not None] - items = [ - (item[0], item[1], item[2], item[3], item[4], item[5], item[6], item[7]) - for item in items - ] - - # at this step all graphs in batch have their input size - xs, ys, adjs, attn_biases, spatial_poses, in_degrees, out_degrees, orig_ids = zip(*items) - - for idx, _ in enumerate(attn_biases): - attn_biases[idx][spatial_poses[idx] >= spatial_pos_max] = float('-inf') - max_node_num = max(i.size(0) for i in xs) - min_node_num = min(i.size(0) for i in xs) - - if all([torch.all(xx == yy) for xx,yy in zip(xs, ys)]): # then this is for and encoder-decoder setup - y = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in ys]) - else: - y = torch.stack(ys) - - # following steps pad the smaller graphs to match the largest for batching - # incidentally a constant value of 1 is added as well - x = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in xs]) - attn_bias = torch.cat([pad_attn_bias_unsqueeze( - i, max_node_num) for i in attn_biases]) - spatial_pos = torch.cat([pad_spatial_pos_unsqueeze(i, max_node_num) - for i in spatial_poses]) - in_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) - for i in in_degrees]) - out_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) - for i in out_degrees]) - - - return Batch( - min_node_num=min_node_num, - attn_bias=attn_bias, - spatial_pos=spatial_pos, - in_degree=in_degree, - out_degree=out_degree, - x=x, - y=y, - orig_id=orig_ids - ) diff --git a/gridfm_graphkit/models/gmae_data.py b/gridfm_graphkit/models/gmae_data.py deleted file mode 100644 index 1be7279..0000000 --- a/gridfm_graphkit/models/gmae_data.py +++ /dev/null @@ -1,165 +0,0 @@ -from collator import collator -from pytorch_lightning import LightningDataModule -from torch.utils.data import DataLoader, random_split -from functools import partial -import random -import torch -from wrapper import MyDataset, process_samples -from torch_geometric.utils import to_undirected - -from torch_geometric.datasets import Planetoid, WikiCS, Amazon -from torch_geometric.loader import NeighborSampler -import torch_geometric.transforms as T -import hqdata - - -dataset = None - - -def get_dataset(dataset_name='Cora', nodefile='', edgefile=''): - global dataset - path = 'dataset/' + dataset_name - if dataset is not None: - return dataset - - elif dataset_name in ['Cora', 'CiteSeer', 'PubMed']: - return Planetoid(root=path, name=dataset_name, transform=T.NormalizeFeatures()) - elif dataset_name == 'WikiCS': - return WikiCS(root=path, transform=T.NormalizeFeatures()) - elif dataset_name == 'Amazon-Computers': - return Amazon(root=path, name='computers', transform=T.NormalizeFeatures()) - elif dataset_name == 'Amazon-Photo': - return Amazon(root=path, name='photo', transform=T.NormalizeFeatures()) - elif dataset_name == 'hqdata': - return hqdata.simple_batch(nodefile, edgefile) - else: - raise NotImplementedError - -def read_csv(infile): - """ - assume two columns: instances number, file location and name - """ - - lines = [] - with open(infile, 'r') as ff: - for line in ff: - lines.append([xx.strip() for xx in line.split(',')]) - - return lines - -class GraphDataModule(LightningDataModule): - name = "Cora" - - def __init__( - self, - dataset_name: str = 'Cora', - num_workers: int = 8, - batch_size: int = 64, - seed: int = 42, - edgefile: str = '', - nodefile: str = '', - processedfile: str = '', # 
preprocessed dataset file in pt format - n_val_sampler: int = 10, - num_node_features: int = 25, - test=False, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.dataset_name = dataset_name - if nodefile and edgefile: - self.dataset = get_dataset(dataset_name, nodefile, edgefile) - else: - self.dataset = read_csv(processedfile) - self.num_node_features = num_node_features - self.seed = seed - self.n_val_sampler = n_val_sampler - - self.num_workers = num_workers - self.batch_size = batch_size - self.dataset_full = ... - self.dataset_train = ... - self.dataset_val = ... - self.dataset_test = ... # not currently in use - self.train_frac = 0.8 # train-val split only - self.istest = test - - - def setup(self, stage: str = None): - """ - automatically called, if prepare_data() is defined, then the latter - is called first - - during testing this section is not needed - """ - - if self.istest: - pass - else: - items = self.dataset # for disk data the dataset is in items form - self.dataset_full = MyDataset( - items, - settype='csv', - ) - - # split the train and validation data - train_set_size = int(self.train_frac*len(self.dataset_full)) - valid_set_size = len(self.dataset_full) - train_set_size - seed = torch.Generator().manual_seed(self.seed) - train_set, valid_set = random_split( - self.dataset_full, - [train_set_size, valid_set_size], - generator=seed - ) - print('**train and val dataset sizes**',len(train_set),len(valid_set)) - self.dataset_train = train_set - self.dataset_val = valid_set - - - def train_dataloader(self): - loader = DataLoader(self.dataset_train, batch_size=self.batch_size, - shuffle=True, - num_workers=self.num_workers, - collate_fn=partial(collator), - ) - return loader - - def val_dataloader(self): - loader = DataLoader(self.dataset_val, batch_size=self.batch_size, - shuffle=False, - num_workers=self.num_workers, - collate_fn=partial(collator), - ) - return loader - - def eval_dataloader(self): - """ - for downstream evaluation - """ - # do not wish to shuffle for evaluation - graphs_to_process = self.dataset.datalist - - - items = [] # from in mem dataset - - for graphdata in graphs_to_process: - # padding and mask creation should happend here - num_nodes = graphdata.num_nodes - ns0 = 1 # batch size - ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids - ns2 = graphdata.edge_index - data_item = process_samples( - ns0, - ns1, - ns2, - graphdata) + [0] # TODO completely remove the appended [0] - items.append(data_item) - - self.dataset_eval = MyDataset(items) - loader = DataLoader(self.dataset_eval, - batch_size=self.batch_size*self.n_val_sampler, - shuffle=False, - num_workers=self.num_workers, - collate_fn=partial(collator), - ) - return loader diff --git a/gridfm_graphkit/models/gmae_wrapper.py b/gridfm_graphkit/models/gmae_wrapper.py deleted file mode 100644 index dfc1367..0000000 --- a/gridfm_graphkit/models/gmae_wrapper.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch - -import numpy as np - -from torch_geometric.loader import NeighborSampler -from torch_geometric.utils import to_undirected - - - -def process_samples(batch_size, n_id, edge_index, dataset): - """ - transformation of sampled nodes to: - - node features of sampled set, - - y, - - edges tensor - - # TODO reconcile redundance of using edge_index and dataset - # in the case where the full graph is used - """ - - # print(edge_index) - # print('<------->') - if edge_index.size(1) != 0: - edge_index = to_undirected(edge_index) - n_nodes = len(n_id) - edge_sp_adj = 
torch.sparse.FloatTensor(edge_index, - torch.ones(edge_index.shape[1]), - [n_nodes, n_nodes]) - edge_adj = edge_sp_adj - - # print('<<---------------->>') - # print(n_id) - # print(dataset.x.size()) - # print(dataset.y.size()) - - return [dataset.x[n_id], dataset.y[n_id], edge_adj] - - -# GMAE_graph positional encoding -class MyDataset(torch.utils.data.Dataset): - def __init__(self, items, settype=''): - super(MyDataset, self).__init__() - - self.items = items - self.type = settype - - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - item = self.items[idx] - - if self.type=='csv': - graphdata = torch.load(item[1]) - num_nodes = graphdata.num_nodes - - # padding and mask creation should happend here - ns0 = 1 # batch size - ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids - ns2 = graphdata.edge_index - data_item = process_samples( - ns0, - ns1, - ns2, - graphdata) + [0] # TODO completely remove the appended [0] - else: - data_item = item # in memory dataset in use - - return preprocess_item(data_item) - - -def preprocess_item(item): - """ - """ - x, y, adj, orig_id = item[0], item[1], item[2].to_dense(), item[3] - N = x.size(0) - - # node adj matrix [N, N] bool - adj = adj.bool() - - shortest_path_result, path = algos.floyd_warshall(adj.numpy()) - spatial_pos = torch.from_numpy((shortest_path_result)).long() - attn_bias = torch.zeros([N, N], dtype=torch.float) - - in_degree = adj.long().sum(dim=1).view(-1) - out_degree = adj.long().sum(dim=0).view(-1) - return x, y, adj, attn_bias, spatial_pos, in_degree, out_degree, orig_id diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 382bb53..27c7da9 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -171,16 +171,11 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ - # print('***batch***', data) - # print('====', x.size(), batched_data) - # print(data.attn_bias.size(), data.spatial_pos.size()) mask = None - masked_entries = torch.sum(x < -1e8, axis=-1) #TODO make this mesh with normalizn - # print('>>', masked_entries.size()) - # TODO key to make this more general to handle other masking objectives - mask = masked_entries >= 3 # due to masking # x.size(-1) - # print('pad mask >>>', mask.size(), mask.sum()) + masked_entries = torch.sum(x < -1e8, axis=-1) + mask = masked_entries >= 3 # due to masking up to feature 6 of 9 # x.size(-1) + # TODO note that the x, pe are redundant or not needed, so clean up at the end From 6cc690ecf43240b82c76c6c7154318e0a58c70ae Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 23/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/temp_leftovers.py | 145 ----------------------- 1 file changed, 145 deletions(-) delete mode 100644 gridfm_graphkit/models/temp_leftovers.py diff --git a/gridfm_graphkit/models/temp_leftovers.py b/gridfm_graphkit/models/temp_leftovers.py deleted file mode 100644 index f46a8c5..0000000 --- a/gridfm_graphkit/models/temp_leftovers.py +++ /dev/null @@ -1,145 +0,0 @@ -# temporary file to hold functions while they wait to be -# transferred to other modules - - - - - - - - - def training_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - - # create a boolean mask where padding was added - # note that this assumes all 
input data had features with - # values >= 0 - mask = None - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - strategy = '' - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - - # print('pre loss shapes', y_gt.size(), y_hat.size()) - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('train_loss', loss) - self.log('activ_loss', loss_actv) - - return loss + loss_actv - - def validation_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - mask = None - - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt 
= orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - no_features = y_hat.size(2) - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] - y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] - pad_mask = torch.nonzero(y_gt.sum(-1)) - - y_gt = y_gt[pad_mask, :] - y_hat = y_hat[pad_mask, :] - - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('val_loss', loss, batch_size=1) - - # loss per feature, for logging only - for ii in range(no_features): - self.log( - 'val_loss_{}'.format(ii), - self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), - batch_size=1 - ) - - return loss + loss_actv \ No newline at end of file From 87ef06510060c3d7cb922d11688e5bd81d9fc79b Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 24/55] rework function head and parameters Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gps_transformer.py | 2 - gridfm_graphkit/models/graphormer.py | 66 ++++++++++++----------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index 2bae93e..b3f1043 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -121,9 +121,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch): """ x_pe = self.pe_norm(pe) - # print('enc>>>', x.size()) # TODO remove x = self.encoder(x) - # print('post>>>', x.size()) # TODO remove x = self.input_norm(x) x = torch.cat((x, x_pe), 1) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 27c7da9..77a5ba2 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -15,45 +15,48 @@ @MODELS_REGISTRY.register("Graphormer") class Graphormer(nn.Module): """ - TODO fill in description + A Graph Transformer model based on the Graphormer architecture + + This model directly modifies the attention between nodes based on + its graph encodings. This requires padding the input nodes and propogating + the associated mask as needed. + + Args: + args (NestedNamespace): Parameters + + Attributes: + input_dim (int): Dimension of input node features. From ``args.model.input_dim``. + hidden_size (int): Hidden dimension size for all layers. From ``args.model.hidden_size``. + output_dim (int): Dimension of the output node features. From ``args.model.output_dim``. + edge_dim (int): Dimension of edge features. From ``args.model.edge_dim``. + pe_dim (int): Dimension of the positional encoding. Must be less than ``hidden_dim``. From ``args.model.pe_dim``. + num_layers (int): Number of GPSConv layers. From ``args.model.num_layers``. + heads (int, optional): Number of attention heads in GPSConv. From ``args.model.attention_head``. Defaults to 1. + dropout (float, optional): Dropout rate in GPSConv. From ``args.model.dropout``. Defaults to 0.0. + mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. + mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. + learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to True. 
+ """ - def __init__( - self, - # n_encoder_layers, - # n_decoder_layers, - # num_heads, - # hidden_dim, - # dropout_rate, - # intput_dropout_rate, - # weight_decay, - # ffn_dim, - # dataset_name, - # warmup_updates, - # tot_updates, - # peak_lr, - # end_lr, - # attention_dropout_rate, - # n_node_features, - # mask_ratio, - # n_val_sampler, - args - ): + def __init__(self, args): super().__init__() self.n_node_features = args.model.input_dim - self.num_heads = 8 # TODO make this configurable or to match their structure self.hidden_dim = args.model.hidden_size + self.output_dim = args.model.output_dim self.n_encoder_layers = args.model.num_layers - intput_dropout_rate = 0.3 - dropout_rate = 0.3 + self.num_heads = args.model.attention_head + + # TODO move these to config or calculate + self.dropout = getattr(args.model, "dropout", 0.0) # TODO propagate attention_dropout_rate = 0.3 # variables flown over from GPS TODO check self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", True) - self.output_dim = args.model.output_dim - + + # TODO verify function of mask if self.learn_mask: self.mask_value = nn.Parameter( torch.randn(self.mask_dim) + self.mask_value, @@ -66,13 +69,12 @@ def __init__( ) self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) - self.input_dropout = nn.Dropout(intput_dropout_rate) + self.input_dropout = nn.Dropout(self.dropout) encoders = [ EncoderLayer( self.hidden_dim, self.hidden_dim, - dropout_rate, - attention_dropout_rate, + self.dropout, self.num_heads ) for _ in range(self.n_encoder_layers) @@ -279,12 +281,12 @@ def forward(self, q, k, v, attn_bias=None, mask=None): class EncoderLayer(nn.Module): - def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, num_heads): + def __init__(self, hidden_size, ffn_size, dropout_rate, num_heads): super(EncoderLayer, self).__init__() self.self_attention_norm = nn.LayerNorm(hidden_size) self.self_attention = MultiHeadAttention( - hidden_size, attention_dropout_rate, num_heads) + hidden_size, dropout_rate, num_heads) self.self_attention_dropout = nn.Dropout(dropout_rate) self.ffn_norm = nn.LayerNorm(hidden_size) From 74486b3c5fdd71f6fad981c60a349f91d2b28847 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 25/55] clean up Graphormer code Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 149 ++++++------------ .../tasks/feature_reconstruction_task.py | 2 +- 2 files changed, 51 insertions(+), 100 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 77a5ba2..1188d2e 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -1,15 +1,10 @@ from gridfm_graphkit.io.registries import MODELS_REGISTRY import torch -import numpy as np import torch.nn as nn -import pytorch_lightning as pl - -from torch.nn import functional as F from torch_geometric.utils import to_dense_batch -# from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings @MODELS_REGISTRY.register("Graphormer") @@ -25,17 +20,15 @@ class Graphormer(nn.Module): args (NestedNamespace): Parameters Attributes: - input_dim (int): Dimension of input node features. From ``args.model.input_dim``. - hidden_size (int): Hidden dimension size for all layers. 
From ``args.model.hidden_size``. + n_node_features (int): Dimension of input node features. From ``args.model.input_dim``. + hidden_dim (int): Hidden dimension size for all layers. From ``args.model.hidden_size``. output_dim (int): Dimension of the output node features. From ``args.model.output_dim``. - edge_dim (int): Dimension of edge features. From ``args.model.edge_dim``. - pe_dim (int): Dimension of the positional encoding. Must be less than ``hidden_dim``. From ``args.model.pe_dim``. - num_layers (int): Number of GPSConv layers. From ``args.model.num_layers``. - heads (int, optional): Number of attention heads in GPSConv. From ``args.model.attention_head``. Defaults to 1. - dropout (float, optional): Dropout rate in GPSConv. From ``args.model.dropout``. Defaults to 0.0. + n_encoder_layers (int): Number of transformer blocks. From ``args.model.num_layers``. + num_heads (int): Number of attention heads. From ``args.model.attention_head``. Defaults to 1. + dropout (float, optional): Dropout rate in attention blocks. From ``args.model.dropout``. Defaults to 0.0. mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. - learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to True. + learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to False. """ def __init__(self, args): @@ -46,17 +39,11 @@ def __init__(self, args): self.output_dim = args.model.output_dim self.n_encoder_layers = args.model.num_layers self.num_heads = args.model.attention_head - - # TODO move these to config or calculate - self.dropout = getattr(args.model, "dropout", 0.0) # TODO propagate - attention_dropout_rate = 0.3 - - # variables flown over from GPS TODO check + self.dropout = getattr(args.model, "dropout", 0.0) self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) - self.learn_mask = getattr(args.data, "learn_mask", True) + self.learn_mask = getattr(args.data, "learn_mask", False) - # TODO verify function of mask if self.learn_mask: self.mask_value = nn.Parameter( torch.randn(self.mask_dim) + self.mask_value, @@ -68,6 +55,7 @@ def __init__(self, args): requires_grad=False, ) + # model layers self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) self.input_dropout = nn.Dropout(self.dropout) encoders = [ @@ -88,64 +76,41 @@ def __init__(self, args): nn.Linear(self.hidden_dim, self.output_dim) ) - - # for pos embeddings + # for positional embeddings self.spatial_pos_encoder = nn.Embedding(512, self.num_heads, padding_idx=0) self.in_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) - # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere - # self.masking_value = -4 - def compute_pos_embeddings(self, batched_data, batch): - attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - in_degree, out_degree = batched_data.in_degree, batched_data.in_degree - - # gr_transform = AddGraphormerEncodings( - # attr_name="gr", - # ) - # batched_data = gr_transform(batched_data) - # attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - # in_degree, out_degree = batched_data.in_degree, 
batched_data.in_degree - - - # print('--->', attn_bias.size(), attn_bias.device, batch.size()) - - # yy0, mask = to_dense_batch(attn_bias, batch=batch, max_num_nodes=2000, batch_size=8) - # yy1, mask = to_dense_batch(spatial_pos, batch=batch, max_num_nodes=2000, batch_size=8) - - # attn_bias = yy0 - # spatial_pos = yy1 + def compute_pos_embeddings(self, data): + """ + Calculate Graphormer positional encodings, and attention biases - # print('yyyyyy', yy0.size(), yy1.size(), x.size()) + Args: + data (Data): Input node features of shape [num_nodes, input_dim]. - # attn_bias = attn_bias.reshape(8,-1) - # spatial_pos = spatial_pos.reshape(8,-1) - # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) - # odim = int(torch.sqrt(torch.as_tensor(attn_bias.size(-1))).item()) - # print('oooo', odim) - # attn_bias = attn_bias.reshape(-1,odim,odim) - # spatial_pos = spatial_pos.reshape(-1,odim,odim) - # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) + Returns: + graph_node_feature (Tensor): data.x with positional encoding appended. + graph_attn_bias (Tensor): attention bais terms. + """ + attn_bias, spatial_pos, x = data.attn_bias, data.spatial_pos, data.x + in_degree, out_degree = data.in_degree, data.in_degree # graph_attn_bias graph_attn_bias = attn_bias.clone() graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] - # print('aaaaaaaaaa', graph_attn_bias.size(), graph_attn_bias.device) # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) - # print('sssssssssss', spatial_pos_bias.size(), spatial_pos_bias.device) graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - # print('nf>>', node_feature.size(), in_degree.size(), out_degree.size(), self.in_degree_encoder(in_degree).size()) node_feature = node_feature + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) @@ -156,46 +121,42 @@ def compute_pos_embeddings(self, batched_data, batch): def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): - graph_node_feature_masked = graph_node_feature #TODO simplify this - graph_attn_bias_masked = graph_attn_bias - # transfomrer encoder - output = self.input_dropout(graph_node_feature_masked) + output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked, mask=mask, batch=batch) + output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) output = self.encoder_final_ln(output) return output - def forward(self, x, pe, edge_index, edge_attr, batched_data, data): - """ - process a batch of data, applying the input mask, while - excluding non-valid values that arrise during processing - mask: incoming values to mask for prediction + def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data=None): + """ + Forward pass for Graphormer. + + Args: + x (Tensor): Input node features of shape [num_nodes, input_dim]. + pe (Tensor): Positional encoding of shape [num_nodes, pe_dim]. + edge_index (Tensor): Edge indices for graph convolution. + edge_attr (Tensor): Edge feature tensor. + batch (Tensor): Batch vector assigning nodes to graphs. + data (Data): Pytorch Geometric Batch() object. + + Returns: + output (Tensor): Output node features of shape [num_nodes, output_dim]. 
""" - mask = None + # identify buffer nodes, and create a mask for them masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 # due to masking up to feature 6 of 9 # x.size(-1) - - - # TODO note that the x, pe are redundant or not needed, so clean up at the end - - # TODO in the baseline code the PE is an input here and passes through - # a normalization before being concatenated to the features, follow this in final version + mask = masked_entries >= 3 # due to masking up to feature 6 of 9 - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, batched_data) - # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) - output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batched_data) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) output = self.decoder(output) - # evaluate where mask is True, so update it TODO - # print('ooooooooo', output[~mask].size()) - # print('bbbbbbbb', data.mask.size(), data.mask, data.mask.sum()/len(data.mask.flatten())) + # return the negative of the buffer mask to select data for loss calculation return output, ~mask -# TODO maybe set this as the decoder class FeedForwardNetwork(nn.Module): def __init__(self, hidden_size, ffn_size, dropout_rate): super(FeedForwardNetwork, self).__init__() @@ -212,6 +173,10 @@ def forward(self, x): class MultiHeadAttention(nn.Module): + """ + This is a slight modification of vanilla attention, to allow masking + of buffer nodes, and the addition of biasses to the attention mechanism. + """ def __init__(self, hidden_size, attention_dropout_rate, num_heads): super(MultiHeadAttention, self).__init__() @@ -228,7 +193,7 @@ def __init__(self, hidden_size, attention_dropout_rate, num_heads): self.output_layer = nn.Linear(num_heads * att_size, hidden_size) def forward(self, q, k, v, attn_bias=None, mask=None): - + orig_q_size = q.size() d_k = self.att_size @@ -248,11 +213,7 @@ def forward(self, q, k, v, attn_bias=None, mask=None): # Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V q = q * self.scale x = torch.matmul(q, k) # [b, h, q_len, k_len] - # print('**********', - # x.size(), q.size(), - # k.size(), v.size(), - # attn_bias.size(), mask.size() - # ) + if attn_bias is not None: if mask is not None: usm0 = mask.unsqueeze(1).unsqueeze(3) @@ -298,23 +259,13 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ - - # print('xxxxxxxxxxxxx', x.size(), batch.size()) - x, bmask = to_dense_batch(x, batch) # TODO remove bmask if padding remains in final + x, _ = to_dense_batch(x, batch) mask, _ = to_dense_batch(mask, batch) y = self.self_attention_norm(x) - # print(y.size(), attn_bias.size(), batch) - attn_bias = attn_bias.squeeze() - # attn_bias = attn_bias.permute(1, 2, 0) - # attn_bias, maska = to_dense_batch(attn_bias, batch) - # print('dense>>>', y.size(), bmask.size(), attn_bias.size()) - # print('msum>>>', bmask.sum(dim=-1), ) - # print('msum2>>', mask.size(),mask.sum(dim=-1)) y = self.self_attention(y, y, y, attn_bias, mask) y = self.self_attention_dropout(y) - # print('<<<<<>>>>', x.size(), y.size()) x = x + torch.reshape(y, x.size()) y = self.ffn_norm(x) @@ -322,5 +273,5 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): y = self.ffn_dropout(y) x = x + y x=x.flatten(0,1) - # print('222222222222222', x.size()) + return x diff 
--git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index e7d5d79..e3bd215 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch, data) # + return self.model(x, pe, edge_index, edge_attr, batch, data) # TODO prop args to GPS @rank_zero_only def on_fit_start(self): From e9a4d0435e249deab5827086ff243abcb0e60eae Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 26/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gps_transformer.py | 3 ++- gridfm_graphkit/models/graphormer.py | 4 ++-- gridfm_graphkit/tasks/feature_reconstruction_task.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index b3f1043..178570b 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -105,7 +105,7 @@ def __init__(self, args): requires_grad=False, ) - def forward(self, x, pe, edge_index, edge_attr, batch): + def forward(self, x, pe, edge_index, edge_attr, batch, data=None): """ Forward pass for the GPSTransformer. @@ -115,6 +115,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch): edge_index (Tensor): Edge indices for graph convolution. edge_attr (Tensor): Edge feature tensor. batch (Tensor): Batch vector assigning nodes to graphs. + data (Data): Pytorch Geometric Data/Batch object. Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 1188d2e..d861755 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -89,7 +89,7 @@ def compute_pos_embeddings(self, data): Calculate Graphormer positional encodings, and attention biases Args: - data (Data): Input node features of shape [num_nodes, input_dim]. + data (Data): Pytorch geometric Data/Batch object Returns: graph_node_feature (Tensor): data.x with positional encoding appended. @@ -139,7 +139,7 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= edge_index (Tensor): Edge indices for graph convolution. edge_attr (Tensor): Edge feature tensor. batch (Tensor): Batch vector assigning nodes to graphs. - data (Data): Pytorch Geometric Batch() object. + data (Data): Pytorch Geometric Data/Batch object. Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
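A minimal usage sketch of the contract documented above, assuming `model` is a Graphormer instance and `batch_data` is a torch_geometric Batch that already carries the attn_bias, spatial_pos and in_degree attributes added by the AddGraphormerEncodings transform; the variable names here are illustrative and not part of the patch.

    import torch.nn.functional as F

    # forward returns per-node predictions plus a boolean mask that is True for
    # real (non-buffer) nodes; the loss is evaluated only on those rows
    output, keep = model(
        batch_data.x, None, batch_data.edge_index,
        batch_data.edge_attr, batch_data.batch, batch_data,
    )
    # assumes batch_data.y holds per-node targets with output_dim columns
    loss = F.mse_loss(output[keep], batch_data.y[keep])
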
diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index e3bd215..96d79bd 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch, data) # TODO prop args to GPS + return self.model(x, pe, edge_index, edge_attr, batch, data) @rank_zero_only def on_fit_start(self): @@ -130,8 +130,6 @@ def shared_step(self, batch): return output, loss_dict def training_step(self, batch): - # print('trainbatch>>>>', batch.size()) # TODO remove - # print(batch) _, loss_dict = self.shared_step(batch) current_lr = self.optimizer.param_groups[0]["lr"] metrics = {} From e3430492a8206f845c5ced0be8eb40a62f5b44b5 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 27/55] flow dataset parameters from the config Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- .../datasets/powergrid_datamodule.py | 1 + gridfm_graphkit/datasets/powergrid_dataset.py | 23 ++++------ gridfm_graphkit/datasets/transforms.py | 43 +++++-------------- 3 files changed, 21 insertions(+), 46 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_datamodule.py b/gridfm_graphkit/datasets/powergrid_datamodule.py index c18c360..ff796a8 100644 --- a/gridfm_graphkit/datasets/powergrid_datamodule.py +++ b/gridfm_graphkit/datasets/powergrid_datamodule.py @@ -128,6 +128,7 @@ def setup(self, stage: str): pe_dim=self.args.model.pe_dim, mask_dim=self.args.data.mask_dim, transform=get_transform(args=self.args), + args=self.args.data ) self.datasets.append(dataset) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 09374ba..b58289a 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -44,6 +44,7 @@ def __init__( transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, pre_filter: Optional[Callable] = None, + args: Optional[dict] = None, ): self.norm_method = norm_method self.node_normalizer = node_normalizer @@ -52,6 +53,10 @@ def __init__( self.mask_dim = mask_dim self.length = None + if args.add_graphormer_encoding: + self.add_graphormer_encoding = args.add_graphormer_encoding + self.max_node_num = args.max_node_num + super().__init__(root, transform, pre_transform, pre_filter) # Load normalization stats if available @@ -173,11 +178,6 @@ def process(self): ) graph_data = pe_transform(graph_data) - # gr_transform = AddGraphormerEncodings( - # attr_name="gr", - # ) - # graph_data = gr_transform(graph_data) - torch.save( graph_data, osp.join( @@ -217,15 +217,10 @@ def get(self, idx): if self.transform: data = self.transform(data) - # TODO move this to the pretreatment when validated - # print('datab>>>>>>>', data) - # print('qqqqqq', data.x.min(), data.x.max()) - gr_transform = AddGraphormerEncodings( - attr_name="gr", - ) - data = gr_transform(data) - # print('aaaaaaaaaaaaaaa', data.x.min(), data.x.max()) - # print('dataa>>>>>>>', data) # TODO remove + if self.add_graphormer_encoding: + gr_transform = AddGraphormerEncodings(self.max_node_num) + data = 
gr_transform(data) + return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index c3f039f..654d3fa 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -20,9 +20,6 @@ pyximport.install(setup_args={'include_dirs': np.get_include()}) import gridfm_graphkit.models.algos as algos -# from networkx import floyd_warshall_numpy -# from torch_geometric.utils import to_networkx - class AddNormalizedRandomWalkPE(BaseTransform): r"""Adds the random walk positional encoding from the @@ -118,18 +115,7 @@ def preprocess_item(data): adj = edge_adj.to_dense().to(torch.int16) - # TODO replace the placeholder with actual algorithm - # shortest_path_result = np.ones((N,N)) - - # print('+++++++',adj.dtype, adj.numpy().dtype) shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32)) - #gg = to_networkx(data) - #shortest_path_result = floyd_warshall_numpy(gg) - - # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. - # print('sp>>>', shortest_path_result) - # print('sp>>>', shortest_path_result.shape) - # print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated @@ -138,7 +124,6 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree def pad_1d_unsqueeze(x, padlen): - # x = x + 1 # pad id = 0 #TODO remove all +1s xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros([padlen], dtype=x.dtype) @@ -146,10 +131,7 @@ def pad_1d_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) - def pad_2d_unsqueeze(x, padlen): - # x = x + 1 # pad id = 0 - # print('-------->', x.size()) xlen, xdim = x.size() if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) @@ -158,7 +140,6 @@ def pad_2d_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) - def pad_attn_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: @@ -169,9 +150,7 @@ def pad_attn_bias_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) - def pad_spatial_pos_unsqueeze(x, padlen): - x = x + 1 xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) @@ -179,16 +158,18 @@ def pad_spatial_pos_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) + class AddGraphormerEncodings(BaseTransform): - """ - TODO update with encoding info + """Adds a positional encoding (node centrallity) to the given graph, as + well as the attention biases, as described in: Do transformers really + perform badly for graph representation?, C. Ying et al., 2021. 
""" def __init__( self, - attr_name: Optional[str] = "gres" # TODO remove if not needed + max_node_num: int, ) -> None: - self.attr_name = attr_name + self.max_node_num = max_node_num def forward(self, data: Data) -> Data: if data.edge_index is None: @@ -198,20 +179,18 @@ def forward(self, data: Data) -> Data: if N is None: raise ValueError("Expected data.num_nodes to be not None") - attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) - max_node_num = 118 # TODO extract from batch - attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node_num) - spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, max_node_num) - in_degree = pad_1d_unsqueeze(in_degree, max_node_num).squeeze() + attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) + spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) + in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') - data.x = pad_2d_unsqueeze(data.x, max_node_num).squeeze() - data.y = pad_2d_unsqueeze(data.y, max_node_num).squeeze() + data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() + data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() return data From ddb0ff806317500014e27feb40af9ff2e83f1498 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 28/55] baseline dataset finalized Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 +- gridfm_graphkit/datasets/transforms.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index b58289a..309a155 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -44,7 +44,7 @@ def __init__( transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, pre_filter: Optional[Callable] = None, - args: Optional[dict] = None, + args: Optional = None, ): self.norm_method = norm_method self.node_normalizer = node_normalizer diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 654d3fa..4221246 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -103,7 +103,7 @@ def add_node_attr(data: Data, value: Any, def preprocess_item(data): """ - TODO fill in header for the function + Calculation of the attention bias, and positional/structural data """ edge_index = data.edge_index N = data.num_nodes From 6124403e0da13d7f3c5a8c70bb992f56817af647 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 29/55] flow over baseline logic for edge encodings Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 +- gridfm_graphkit/datasets/transforms.py | 27 +++++++++++++-- gridfm_graphkit/models/graphormer.py | 34 +++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 309a155..f67d3cd 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ 
b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -202,7 +202,7 @@ def len(self): return self.length def __cat_dim__(self, key, value, *args, **kwargs): - if key in ['attn_bias', 'spatial_pos', 'in_degree']: + if key in ['attn_bias', 'spatial_pos', 'in_degree', 'edge_input']: return None return super().__cat_dim__(key, value, *args, **kwargs) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 4221246..1e7273b 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -101,11 +101,22 @@ def add_node_attr(data: Data, value: Any, return data +def get_edge_encoding(edge_attr): + if len(edge_attr.size()) == 1: + edge_attr = edge_attr[:, None] + attn_edge_type = torch.zeros([N, N, edge_attr.size(-1)], dtype=torch.long) + attn_edge_type[edge_index[0, :], edge_index[1, :] + ] = convert_to_single_emb(edge_attr.long()) + 1 + edge_input = algos.gen_edge_input(max_dist, path, attn_edge_type.numpy()) + + return attn_edge_type, torch.from_numpy(edge_input).long() + def preprocess_item(data): """ Calculation of the attention bias, and positional/structural data """ edge_index = data.edge_index + edge_attr = data.edge_attr N = data.num_nodes edge_adj = torch.sparse_coo_tensor( edge_index, @@ -119,9 +130,15 @@ def preprocess_item(data): spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated + if edge_attr is not None: + attn_edge_type, edge_input = get_edge_encoding(edge_attr) + else: + edge_input = None + attn_edge_type = None + in_degree = adj.long().sum(dim=1).view(-1) out_degree = adj.long().sum(dim=0).view(-1) - return attn_bias, spatial_pos, in_degree, out_degree + return attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input def pad_1d_unsqueeze(x, padlen): xlen = x.size(0) @@ -179,15 +196,21 @@ def forward(self, data: Data) -> Data: if N is None: raise ValueError("Expected data.num_nodes to be not None") - attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) + attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() + print('eeeeee>', edge_input.size()) # TODO remove + edge_input = pad_attn_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name + # TODO need to verify padding for attn_edge_type + print('etetetet>', attn_edge_type.size()) data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') + data = add_node_attr(data, edge_input, attr_name='edge_input') + data = add_node_attr(data, edge_input, attr_name='attn_edge_type') data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index d861755..4522344 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -82,6 +82,8 @@ def __init__(self, args): 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) + self.edge_encoder = nn.Embedding( + 512 * 
self.n_edge_features + 1, num_heads, padding_idx=0) def compute_pos_embeddings(self, data): @@ -108,6 +110,38 @@ def compute_pos_embeddings(self, data): spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + spatial_pos_bias + + ########### + if data.edge_input is not None: + edge_input, attn_edge_type = data.edge_input, data.attn_edge_type + # edge feature + # TODO flow over the upstream logic for edge_types... + if self.edge_type == 'multi_hop': + spatial_pos_ = spatial_pos.clone() + spatial_pos_[spatial_pos_ == 0] = 1 # set pad to 1 + # set 1 to 1, x > 1 to x - 1 + spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_) + if self.multi_hop_max_dist > 0: + spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) + edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] + # [n_graph, n_node, n_node, max_dist, n_head] + edge_input = self.edge_encoder(edge_input).mean(-2) + max_dist = edge_input.size(-2) + edge_input_flat = edge_input.permute( + 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) + edge_input_flat = torch.bmm(edge_input_flat, self.edge_dis_encoder.weight.reshape( + -1, self.num_heads, self.num_heads)[:max_dist, :, :]) + edge_input = edge_input_flat.reshape( + max_dist, n_graph, n_node, n_node, self.num_heads).permute(1, 2, 3, 0, 4) + edge_input = (edge_input.sum(-2) / + (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2) + else: + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + edge_input = self.edge_encoder( + attn_edge_type).mean(-2).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + edge_input + ########### + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) From 61fd07edf7ff7507febd50cb79055e58a883ecb8 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 30/55] added model parameters for managing edge data Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 4522344..bca816a 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -29,6 +29,8 @@ class Graphormer(nn.Module): mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to False. + edge_type (string, optional): Type of edge to consider multi_hop or not. From ``args.data.edge_type``. Defaults to multi_hop. + multi_hop_max_dist (int, optional): Maximum number of hops to consider at edges. From ``args.data.multi_hop_max_dist``. Defaults to 20. 
""" def __init__(self, args): @@ -43,6 +45,8 @@ def __init__(self, args): self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", False) + self.edge_type = getattr(args.model, "edge_type", "multi_hop") + self.multi_hop_max_dist = getattr(args.model, "multi_hop_max_dist", 20) if self.learn_mask: self.mask_value = nn.Parameter( @@ -111,7 +115,6 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + spatial_pos_bias - ########### if data.edge_input is not None: edge_input, attn_edge_type = data.edge_input, data.attn_edge_type # edge feature @@ -140,7 +143,6 @@ def compute_pos_embeddings(self, data): edge_input = self.edge_encoder( attn_edge_type).mean(-2).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + edge_input - ########### graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From 32c19f324dea998dd2d9d3e97fcfb8753ca57e44 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 31/55] work in progress for incorporating edge data Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 33 +++++++++++++++++++------- gridfm_graphkit/models/algos.pyx | 2 +- gridfm_graphkit/models/graphormer.py | 21 +++++++++++----- gridfm_graphkit/training/loss.py | 4 ++++ 4 files changed, 44 insertions(+), 16 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 1e7273b..e4ae56f 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -101,7 +101,15 @@ def add_node_attr(data: Data, value: Any, return data -def get_edge_encoding(edge_attr): +# TODO verify how this meshes with the node features, as compared to orig version +def convert_to_single_emb(x, offset=512): + feature_num = x.size(1) if len(x.size()) > 1 else 1 + feature_offset = 1 + \ + torch.arange(0, feature_num * offset, offset, dtype=torch.long) + x = x + feature_offset + return x + +def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): if len(edge_attr.size()) == 1: edge_attr = edge_attr[:, None] attn_edge_type = torch.zeros([N, N, edge_attr.size(-1)], dtype=torch.long) @@ -131,7 +139,8 @@ def preprocess_item(data): attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated if edge_attr is not None: - attn_edge_type, edge_input = get_edge_encoding(edge_attr) + max_dist = np.amax(shortest_path_result) + attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_dist, path) else: edge_input = None attn_edge_type = None @@ -161,7 +170,17 @@ def pad_attn_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros( - [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) + [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) # TODO verify if masking is needed given this is at -inf... + new_x[:xlen, :xlen] = x + new_x[xlen:, :xlen] = 0 # TODO verify if masking is needed given this is at -inf... 
+ x = new_x + return x.unsqueeze(0) + +def pad_edge_bias_unsqueeze(x, padlen): + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros( + (padlen, padlen) + x.size()[-2:], dtype=x.dtype).fill_(int(0)) new_x[:xlen, :xlen] = x new_x[xlen:, :xlen] = 0 x = new_x @@ -197,20 +216,16 @@ def forward(self, data: Data) -> Data: raise ValueError("Expected data.num_nodes to be not None") attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) - attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() - print('eeeeee>', edge_input.size()) # TODO remove - edge_input = pad_attn_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name - # TODO need to verify padding for attn_edge_type - print('etetetet>', attn_edge_type.size()) + edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') data = add_node_attr(data, edge_input, attr_name='edge_input') - data = add_node_attr(data, edge_input, attr_name='attn_edge_type') + data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index 8600367..d25b99c 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -74,7 +74,7 @@ def gen_edge_input(max_dist, path, edge_feat): assert path_copy.flags['C_CONTIGUOUS'] assert edge_feat_copy.flags['C_CONTIGUOUS'] - cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=numpy.int64) + cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=numpy.int32) cdef unsigned int i, j, k, num_path, cur for i in range(n): diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index bca816a..45bf63b 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -25,6 +25,7 @@ class Graphormer(nn.Module): output_dim (int): Dimension of the output node features. From ``args.model.output_dim``. n_encoder_layers (int): Number of transformer blocks. From ``args.model.num_layers``. num_heads (int): Number of attention heads. From ``args.model.attention_head``. Defaults to 1. + n_edge_features (int): Dimension of edge features. From ``args.model.edge_dim``. dropout (float, optional): Dropout rate in attention blocks. From ``args.model.dropout``. Defaults to 0.0. mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. 
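A minimal sketch, separate from the patch, of the multi-hop edge encoding these hunks work toward: the integer edge features collected along each shortest path are embedded per attention head, averaged over the feature dimension, summed over the hops and normalized by the path length, giving a per-pair bias that is added to the attention scores. Shapes, sizes and variable names below are illustrative assumptions, not the library's API.

    import torch
    import torch.nn as nn

    num_heads, num_edge_feat, n_node, max_dist = 8, 2, 4, 3
    edge_encoder = nn.Embedding(512 * num_edge_feat + 1, num_heads, padding_idx=0)

    # [n_node, n_node, max_dist, num_edge_feat] integer edge features gathered
    # along each shortest path (the role played by gen_edge_input)
    edge_input = torch.randint(1, 10, (n_node, n_node, max_dist, num_edge_feat))
    path_len = torch.full((n_node, n_node, 1), float(max_dist))

    # embed each hop, average over edge features, sum over hops, normalize by
    # path length -> one bias value per node pair and attention head
    edge_bias = edge_encoder(edge_input).mean(-2).sum(-2) / path_len
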
@@ -41,6 +42,7 @@ def __init__(self, args): self.output_dim = args.model.output_dim self.n_encoder_layers = args.model.num_layers self.num_heads = args.model.attention_head + self.n_edge_features = args.model.edge_dim self.dropout = getattr(args.model, "dropout", 0.0) self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) @@ -86,8 +88,12 @@ def __init__(self, args): 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) - self.edge_encoder = nn.Embedding( - 512 * self.n_edge_features + 1, num_heads, padding_idx=0) + if self.n_edge_features is not None: + self.edge_encoder = nn.Embedding( + 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0) + if self.edge_type == 'multi_hop': + self.edge_dis_encoder = nn.Embedding( + 128 * self.num_heads * self.num_heads, 1) def compute_pos_embeddings(self, data): @@ -118,8 +124,8 @@ def compute_pos_embeddings(self, data): if data.edge_input is not None: edge_input, attn_edge_type = data.edge_input, data.attn_edge_type # edge feature - # TODO flow over the upstream logic for edge_types... if self.edge_type == 'multi_hop': + n_graph, n_node = edge_input.size()[:2] spatial_pos_ = spatial_pos.clone() spatial_pos_[spatial_pos_ == 0] = 1 # set pad to 1 # set 1 to 1, x > 1 to x - 1 @@ -128,7 +134,10 @@ def compute_pos_embeddings(self, data): spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] # [n_graph, n_node, n_node, max_dist, n_head] - edge_input = self.edge_encoder(edge_input).mean(-2) + print('!!!!!!', edge_input.size()) + print('mmmmm', edge_input.max(), edge_input.min()) + edge_input = self.edge_encoder(edge_input+1).mean(-2) # TODO determine source of -1 and correct + print('22222', edge_input.size()) max_dist = edge_input.size(-2) edge_input_flat = edge_input.permute( 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) @@ -140,9 +149,9 @@ def compute_pos_embeddings(self, data): (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2) else: # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - edge_input = self.edge_encoder( + edge_input = self.edge_encoder( # TODO test this path attn_edge_type).mean(-2).permute(0, 3, 1, 2) - graph_attn_bias = graph_attn_bias + edge_input + #graph_attn_bias = graph_attn_bias + edge_input # TODO uncomment graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset diff --git a/gridfm_graphkit/training/loss.py b/gridfm_graphkit/training/loss.py index f90953b..e705232 100644 --- a/gridfm_graphkit/training/loss.py +++ b/gridfm_graphkit/training/loss.py @@ -176,6 +176,10 @@ def forward(self, pred, target, edge_index=None, edge_attr=None, mask=None): loss_details = {} for i, loss_fn in enumerate(self.loss_functions): + print('---x', pred.size(), pred.min(), pred.max()) + print('---y', target.size(), target.min(), target.max()) + print('---ei', edge_index.size(), edge_index.min(), edge_index.max()) + print('----ea', edge_attr.size(), edge_attr.min(), edge_attr.max()) loss_output = loss_fn( pred, target, From bf1e221aeeb66f0813b7adbd2143307ff1a35923 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 32/55] multi-hop functional with N nodes upper limit Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 9 +++++++-- 
gridfm_graphkit/models/graphormer.py | 10 ++++++---- gridfm_graphkit/training/loss.py | 5 +---- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index e4ae56f..89df689 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -139,7 +139,8 @@ def preprocess_item(data): attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated if edge_attr is not None: - max_dist = np.amax(shortest_path_result) + max_dist = N # fix this to allow multiple graphs # np.amax(shortest_path_result) + # print('----->', max_dist) # TODO remove attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_dist, path) else: edge_input = None @@ -216,11 +217,15 @@ def forward(self, data: Data) -> Data: raise ValueError("Expected data.num_nodes to be not None") attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) + + # print('e>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name - + # TODO check padding of attn_edge_type, and num steps to sort out batching issue + # print('ffe>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) + data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 45bf63b..f38159c 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -134,10 +134,10 @@ def compute_pos_embeddings(self, data): spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] # [n_graph, n_node, n_node, max_dist, n_head] - print('!!!!!!', edge_input.size()) - print('mmmmm', edge_input.max(), edge_input.min()) + # print('!!!!!!', edge_input.size()) + # print('mmmmm', edge_input.max(), edge_input.min()) edge_input = self.edge_encoder(edge_input+1).mean(-2) # TODO determine source of -1 and correct - print('22222', edge_input.size()) + # print('22222', edge_input.size()) max_dist = edge_input.size(-2) edge_input_flat = edge_input.permute( 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) @@ -149,9 +149,11 @@ def compute_pos_embeddings(self, data): (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2) else: # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + # TODO pad attn_edge_type for this path edge_input = self.edge_encoder( # TODO test this path attn_edge_type).mean(-2).permute(0, 3, 1, 2) - #graph_attn_bias = graph_attn_bias + edge_input # TODO uncomment + print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) + graph_attn_bias = graph_attn_bias + edge_input graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset diff --git a/gridfm_graphkit/training/loss.py b/gridfm_graphkit/training/loss.py index e705232..34664ee 100644 --- a/gridfm_graphkit/training/loss.py +++ b/gridfm_graphkit/training/loss.py @@ -176,10 +176,7 @@ def forward(self, pred, target, edge_index=None, edge_attr=None, 
mask=None): loss_details = {} for i, loss_fn in enumerate(self.loss_functions): - print('---x', pred.size(), pred.min(), pred.max()) - print('---y', target.size(), target.min(), target.max()) - print('---ei', edge_index.size(), edge_index.min(), edge_index.max()) - print('----ea', edge_attr.size(), edge_attr.min(), edge_attr.max()) + loss_output = loss_fn( pred, target, From fc5a93e25d28eea36559d59f245840bf1bc4d3ec Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 33/55] set max hops for edge encoding Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 6 +++++- gridfm_graphkit/datasets/transforms.py | 19 +++++++++++++------ gridfm_graphkit/models/algos.pyx | 5 ++++- gridfm_graphkit/models/graphormer.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index f67d3cd..3691500 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -56,6 +56,7 @@ def __init__( if args.add_graphormer_encoding: self.add_graphormer_encoding = args.add_graphormer_encoding self.max_node_num = args.max_node_num + self.max_hops = args.max_hops super().__init__(root, transform, pre_transform, pre_filter) @@ -218,7 +219,10 @@ def get(self, idx): data = self.transform(data) if self.add_graphormer_encoding: - gr_transform = AddGraphormerEncodings(self.max_node_num) + gr_transform = AddGraphormerEncodings( + self.max_node_num, + self.max_hops + ) data = gr_transform(data) return data diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 89df689..f7f8799 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -119,7 +119,7 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): return attn_edge_type, torch.from_numpy(edge_input).long() -def preprocess_item(data): +def preprocess_item(data, max_hops): """ Calculation of the attention bias, and positional/structural data """ @@ -134,14 +134,18 @@ def preprocess_item(data): adj = edge_adj.to_dense().to(torch.int16) - shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32)) + # get shortest paths in number of hops (shortest_path_result) and intermediate nodes + # for those shortest paths (path) + shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32), max_hops) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated if edge_attr is not None: - max_dist = N # fix this to allow multiple graphs # np.amax(shortest_path_result) - # print('----->', max_dist) # TODO remove - attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_dist, path) + # print(path) + # print(path.shape) + # print(shortest_path_result) + # print(shortest_path_result.shape) + attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_hops, path) else: edge_input = None attn_edge_type = None @@ -205,8 +209,10 @@ class AddGraphormerEncodings(BaseTransform): def __init__( self, max_node_num: int, + max_hops: int, ) -> None: self.max_node_num = max_node_num + self.max_hops = max_hops def forward(self, data: Data) -> Data: if data.edge_index is None: @@ -216,7 +222,8 @@ def 
forward(self, data: Data) -> Data: if N is None: raise ValueError("Expected data.num_nodes to be not None") - attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) + attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = \ + preprocess_item(data, self.max_hops) # print('e>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index d25b99c..7ab5851 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -6,11 +6,12 @@ from cython.parallel cimport prange, parallel cimport numpy import numpy -def floyd_warshall(adjacency_matrix): +def floyd_warshall(adjacency_matrix, max_hops): (nrows, ncols) = adjacency_matrix.shape assert nrows == ncols cdef unsigned int n = nrows + cdef unsigned int max_hops_copy = max_hops adj_mat_copy = adjacency_matrix.astype(numpy.int32, order='C', casting='safe', copy=True) assert adj_mat_copy.flags['C_CONTIGUOUS'] @@ -40,6 +41,8 @@ def floyd_warshall(adjacency_matrix): for j in range(n): cost_ikkj = M_ik + M_k_ptr[j] M_ij = M_i_ptr[j] + if cost_ikkj > max_hops_copy: # TODO flow from above + continue if M_ij > cost_ikkj: M_i_ptr[j] = cost_ikkj path[i][j] = k diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index f38159c..8005863 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -48,7 +48,7 @@ def __init__(self, args): self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", False) self.edge_type = getattr(args.model, "edge_type", "multi_hop") - self.multi_hop_max_dist = getattr(args.model, "multi_hop_max_dist", 20) + self.multi_hop_max_dist = getattr(args.data, "max_hops", 20) if self.learn_mask: self.mask_value = nn.Parameter( From 04aed1033fa54ab2a164ba3e1222ab70e2c3a307 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 34/55] add buffer for single hop case Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 3 ++- gridfm_graphkit/models/graphormer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index f7f8799..916a2ea 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -185,7 +185,7 @@ def pad_edge_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros( - (padlen, padlen) + x.size()[-2:], dtype=x.dtype).fill_(int(0)) + (padlen, padlen) + x.size()[2:], dtype=x.dtype).fill_(int(0)) new_x[:xlen, :xlen] = x new_x[xlen:, :xlen] = 0 x = new_x @@ -230,6 +230,7 @@ def forward(self, data: Data) -> Data: spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name + attn_edge_type = pad_edge_bias_unsqueeze(attn_edge_type, self.max_node_num) # TODO check padding of attn_edge_type, and num steps to sort out batching issue # print('ffe>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 
8005863..dbed710 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -152,7 +152,7 @@ def compute_pos_embeddings(self, data): # TODO pad attn_edge_type for this path edge_input = self.edge_encoder( # TODO test this path attn_edge_type).mean(-2).permute(0, 3, 1, 2) - print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) + # print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) graph_attn_bias = graph_attn_bias + edge_input graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From b9e9efd3b5bafc868352d23a1fcc8f6c438abf2d Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 35/55] checkpoint before cleanup and testing Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index 7ab5851..003eae9 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -41,7 +41,7 @@ def floyd_warshall(adjacency_matrix, max_hops): for j in range(n): cost_ikkj = M_ik + M_k_ptr[j] M_ij = M_i_ptr[j] - if cost_ikkj > max_hops_copy: # TODO flow from above + if cost_ikkj > max_hops_copy: continue if M_ij > cost_ikkj: M_i_ptr[j] = cost_ikkj From 98e1b123c0af6b7d6d66eac66565688a141e3d42 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 36/55] TODOs cleared from transforms Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 916a2ea..452a265 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -101,7 +101,7 @@ def add_node_attr(data: Data, value: Any, return data -# TODO verify how this meshes with the node features, as compared to orig version + def convert_to_single_emb(x, offset=512): feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ @@ -109,6 +109,7 @@ def convert_to_single_emb(x, offset=512): x = x + feature_offset return x + def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): if len(edge_attr.size()) == 1: edge_attr = edge_attr[:, None] @@ -119,6 +120,7 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): return attn_edge_type, torch.from_numpy(edge_input).long() + def preprocess_item(data, max_hops): """ Calculation of the attention bias, and positional/structural data @@ -138,7 +140,7 @@ def preprocess_item(data, max_hops): # for those shortest paths (path) shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32), max_hops) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) - attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated + attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) if edge_attr is not None: # print(path) @@ -175,9 +177,9 @@ def pad_attn_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros( - [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) # TODO verify if masking is needed given this is at -inf... 
+ [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 # TODO verify if masking is needed given this is at -inf... + new_x[xlen:, :xlen] = 0 x = new_x return x.unsqueeze(0) @@ -225,14 +227,11 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = \ preprocess_item(data, self.max_hops) - # print('e>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() - edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name + edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) attn_edge_type = pad_edge_bias_unsqueeze(attn_edge_type, self.max_node_num) - # TODO check padding of attn_edge_type, and num steps to sort out batching issue - # print('ffe>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') From a3b5b61b87090c51ad74b37f0e00d7f9f0a68a69 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 37/55] TODOs cleared from transforms Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 452a265..86a98c8 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -143,10 +143,6 @@ def preprocess_item(data, max_hops): attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) if edge_attr is not None: - # print(path) - # print(path.shape) - # print(shortest_path_result) - # print(shortest_path_result.shape) attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_hops, path) else: edge_input = None From 498709bfd5fbf5b574dbe180c3a888ce436c37d9 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 38/55] TODOs cleared from graphormer Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index dbed710..42d5f4d 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -134,10 +134,7 @@ def compute_pos_embeddings(self, data): spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] # [n_graph, n_node, n_node, max_dist, n_head] - # print('!!!!!!', edge_input.size()) - # print('mmmmm', edge_input.max(), edge_input.min()) - edge_input = self.edge_encoder(edge_input+1).mean(-2) # TODO determine source of -1 and correct - # print('22222', edge_input.size()) + edge_input = self.edge_encoder(edge_input+1).mean(-2) max_dist = edge_input.size(-2) edge_input_flat = edge_input.permute( 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) @@ -149,10 +146,9 @@ def compute_pos_embeddings(self, data): (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 
1, 2) else: # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - # TODO pad attn_edge_type for this path - edge_input = self.edge_encoder( # TODO test this path + edge_input = self.edge_encoder( attn_edge_type).mean(-2).permute(0, 3, 1, 2) - # print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) + graph_attn_bias = graph_attn_bias + edge_input graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From 7f0e2851290e47c4eaa9fc26fe2cad7e5dc10ecf Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 39/55] add raw graphormer config Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- .../config/my_gridFMv0.2_pretraining.yaml | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 examples/config/my_gridFMv0.2_pretraining.yaml diff --git a/examples/config/my_gridFMv0.2_pretraining.yaml b/examples/config/my_gridFMv0.2_pretraining.yaml new file mode 100644 index 0000000..d740d34 --- /dev/null +++ b/examples/config/my_gridFMv0.2_pretraining.yaml @@ -0,0 +1,56 @@ +callbacks: + patience: 100 + tol: 0 +data: + baseMVA: 100 + learn_mask: false + mask_dim: 6 + mask_ratio: 0.5 + mask_type: rnd + mask_value: -1.0 + networks: + # - Texas2k_case1_2016summerpeak + - case24_ieee_rts + - case118_ieee + - case300_ieee + normalization: baseMVAnorm + scenarios: + - 5000 + - 5000 + - 5000 + test_ratio: 0.1 + val_ratio: 0.1 + workers: 4 + add_graphormer_encoding: true + max_node_num: 300 # necessary for Graphormer + max_hops: 6 # for the edge encoding, should match + edge_type: multi_hop # singlehop +model: + attention_head: 8 + dropout: 0.1 + edge_dim: 2 + hidden_size: 123 + input_dim: 9 + num_layers: 14 + output_dim: 6 + pe_dim: 20 + type: Graphormer #GPSTransformer # +optimizer: + beta1: 0.9 + beta2: 0.999 + learning_rate: 0.0001 + lr_decay: 0.7 + lr_patience: 10 +seed: 0 +training: + batch_size: 8 + epochs: 500 + loss_weights: + - 0.01 + - 0.99 + losses: + - MaskedMSE + - PBE + accelerator: auto + devices: auto + strategy: auto From d3abe1f77d87f1061debcaf2836ba8e257529e99 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 40/55] rename graphormer config Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- ...my_gridFMv0.2_pretraining.yaml => graphormer_pretraining.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/config/{my_gridFMv0.2_pretraining.yaml => graphormer_pretraining.yaml} (100%) diff --git a/examples/config/my_gridFMv0.2_pretraining.yaml b/examples/config/graphormer_pretraining.yaml similarity index 100% rename from examples/config/my_gridFMv0.2_pretraining.yaml rename to examples/config/graphormer_pretraining.yaml From f63a760a8bbf0f3533b4e274e8602e9bb3662472 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 41/55] adjust calc of mask Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 16 +++++++--------- gridfm_graphkit/models/graphormer.py | 4 +++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 86a98c8..5d14a46 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ 
b/gridfm_graphkit/datasets/transforms.py @@ -88,14 +88,12 @@ def get_pe(out: Tensor) -> Tensor: return data -def add_node_attr(data: Data, value: Any, - attr_name: Optional[str] = None) -> Data: +def add_node_attr(data: Data, + value: Any, + attr_name: Optional[str] = None + ) -> Data: if attr_name is None: - if 'x' in data: - x = data.x.view(-1, 1) if data.x.dim() == 1 else data.x - data.x = torch.cat([x, value.to(x.device, x.dtype)], dim=-1) - else: - data.x = value + raise ValueError("Expected attr_name to be not None") else: data[attr_name] = value @@ -165,7 +163,7 @@ def pad_2d_unsqueeze(x, padlen): if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) new_x[:,:] = -1e9 - new_x[:xlen, :] = x + new_x[:xlen, :] = x # TODO verify this step as well with x shape x = new_x return x.unsqueeze(0) @@ -175,7 +173,7 @@ def pad_attn_bias_unsqueeze(x, padlen): new_x = x.new_zeros( [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 + new_x[xlen:, :xlen] = 0 # TODO verify this step x = new_x return x.unsqueeze(0) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 42d5f4d..ea3e419 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -189,8 +189,10 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= """ # identify buffer nodes, and create a mask for them + # note masking will be done up to feature mask_dim of n_node_features masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 # due to masking up to feature 6 of 9 + mask = masked_entries >= (self.n_node_features - self.mask_dim) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) From 47a7cef8588219d53d03f8d2dc914656bf533aee Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 42/55] clean up comments Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 25 ++++++++++++++++--------- gridfm_graphkit/models/graphormer.py | 7 +++---- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 5d14a46..9bae59b 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -88,9 +88,10 @@ def get_pe(out: Tensor) -> Tensor: return data + def add_node_attr(data: Data, value: Any, - attr_name: Optional[str] = None + attr_name: str ) -> Data: if attr_name is None: raise ValueError("Expected attr_name to be not None") @@ -99,7 +100,6 @@ def add_node_attr(data: Data, return data - def convert_to_single_emb(x, offset=512): feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ @@ -107,7 +107,6 @@ def convert_to_single_emb(x, offset=512): x = x + feature_offset return x - def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): if len(edge_attr.size()) == 1: edge_attr = edge_attr[:, None] @@ -118,10 +117,13 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): return attn_edge_type, torch.from_numpy(edge_input).long() - def preprocess_item(data, max_hops): """ - Calculation of the attention bias, and positional/structural data + Calculation of the Graphormer attention bias, and positional/structural + variables. 
From a Data-like object the shortest paths in number of hops + between nodes are calculated, being cut off at max_hops. In addition to the + centrality (assume undirected graphs) and attention bias, these are the + inputs to the model structural and positional encodings. """ edge_index = data.edge_index edge_attr = data.edge_attr @@ -163,7 +165,7 @@ def pad_2d_unsqueeze(x, padlen): if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) new_x[:,:] = -1e9 - new_x[:xlen, :] = x # TODO verify this step as well with x shape + new_x[:xlen, :] = x x = new_x return x.unsqueeze(0) @@ -173,7 +175,7 @@ def pad_attn_bias_unsqueeze(x, padlen): new_x = x.new_zeros( [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 # TODO verify this step + new_x[xlen:, :xlen] = 0 x = new_x return x.unsqueeze(0) @@ -198,8 +200,13 @@ def pad_spatial_pos_unsqueeze(x, padlen): class AddGraphormerEncodings(BaseTransform): """Adds a positional encoding (node centrallity) to the given graph, as - well as the attention biases, as described in: Do transformers really - perform badly for graph representation?, C. Ying et al., 2021. + well as the attention and edge biases, as described in: Do transformers + really perform badly for graph representation?, C. Ying et al., 2021. + + Args: + max_node_num (int): The number of nodes in the largest graph considered. + max_hops (int): The maximum path length between nodes to consider for + the edge encodings. """ def __init__( diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index ea3e419..342186f 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -10,7 +10,7 @@ @MODELS_REGISTRY.register("Graphormer") class Graphormer(nn.Module): """ - A Graph Transformer model based on the Graphormer architecture + A Graph Transformer model based on the Graphormer architecture. This model directly modifies the attention between nodes based on its graph encodings. 
This requires padding the input nodes and propogating @@ -83,7 +83,8 @@ def __init__(self, args): ) # for positional embeddings - self.spatial_pos_encoder = nn.Embedding(512, self.num_heads, padding_idx=0) + self.spatial_pos_encoder = nn.Embedding( + 512, self.num_heads, padding_idx=0) self.in_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( @@ -163,8 +164,6 @@ def compute_pos_embeddings(self, data): def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): - - # transfomrer encoder output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) From 893146be83e773c1e62f01bd89fd26ff157eb51f Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 43/55] add swith for windows and linux for cython algos Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 24 ++++++++++++++++++++++-- gridfm_graphkit/models/algos.pyx | 10 +++++----- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 9bae59b..535b70c 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -16,6 +16,7 @@ ) import numpy as np +import os import pyximport pyximport.install(setup_args={'include_dirs': np.get_include()}) import gridfm_graphkit.models.algos as algos @@ -113,7 +114,15 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): attn_edge_type = torch.zeros([N, N, edge_attr.size(-1)], dtype=torch.long) attn_edge_type[edge_index[0, :], edge_index[1, :] ] = convert_to_single_emb(edge_attr.long()) + 1 - edge_input = algos.gen_edge_input(max_dist, path, attn_edge_type.numpy()) + if os.name == 'nt': + edge_input = algos.gen_edge_input( + max_dist, + path, + attn_edge_type.numpy(), + localtype=np.int32 + ) + else: + edge_input = algos.gen_edge_input(max_dist, path, attn_edge_type.numpy()) return attn_edge_type, torch.from_numpy(edge_input).long() @@ -138,7 +147,18 @@ def preprocess_item(data, max_hops): # get shortest paths in number of hops (shortest_path_result) and intermediate nodes # for those shortest paths (path) - shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32), max_hops) + if os.name == 'nt': + shortest_path_result, path = algos.floyd_warshall( + adj.numpy().astype(np.int32), + max_hops, + localtype=np.int32 + ) + else: + shortest_path_result, path = algos.floyd_warshall( + adj.numpy().astype(np.int32), + max_hops + ) + spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index 003eae9..6701740 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -6,17 +6,17 @@ from cython.parallel cimport prange, parallel cimport numpy import numpy -def floyd_warshall(adjacency_matrix, max_hops): +def floyd_warshall(adjacency_matrix, max_hops, localtype=long): (nrows, ncols) = adjacency_matrix.shape assert nrows == ncols cdef unsigned int n = nrows cdef unsigned int max_hops_copy = max_hops - adj_mat_copy = adjacency_matrix.astype(numpy.int32, order='C', casting='safe', copy=True) + adj_mat_copy = 
adjacency_matrix.astype(localtype, order='C', casting='safe', copy=True) assert adj_mat_copy.flags['C_CONTIGUOUS'] cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy - cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=numpy.int32) + cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=localtype) cdef unsigned int i, j, k cdef long M_ij, M_ik, cost_ikkj @@ -65,7 +65,7 @@ def get_all_edges(path, i, j): return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j) -def gen_edge_input(max_dist, path, edge_feat): +def gen_edge_input(max_dist, path, edge_feat, localtype=long): (nrows, ncols) = path.shape assert nrows == ncols @@ -77,7 +77,7 @@ def gen_edge_input(max_dist, path, edge_feat): assert path_copy.flags['C_CONTIGUOUS'] assert edge_feat_copy.flags['C_CONTIGUOUS'] - cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=numpy.int32) + cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=localtype) cdef unsigned int i, j, k, num_path, cur for i in range(n): From 8c44e25a4556324c7465a4e941a16ad1c9fc8494 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 44/55] add layer norm to decoder Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 342186f..b1dd263 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -79,6 +79,7 @@ def __init__(self, args): self.decoder = nn.Sequential( nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), + nn.LayerNorm(self.hidden_dim), nn.Linear(self.hidden_dim, self.output_dim) ) @@ -122,7 +123,7 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + spatial_pos_bias - if data.edge_input is not None: + if (data.edge_input is not None) and (self.edge_type is not None): edge_input, attn_edge_type = data.edge_input, data.attn_edge_type # edge feature if self.edge_type == 'multi_hop': From 1478b3bfaeaedd71571aff6ca04b3fab7ea1ddfc Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 45/55] adjust edge encoding to accomodate better switching and negative values Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 4 +++- gridfm_graphkit/datasets/transforms.py | 9 +++++++-- gridfm_graphkit/models/graphormer.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 3691500..17b205c 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -53,10 +53,12 @@ def __init__( self.mask_dim = mask_dim self.length = None - if args.add_graphormer_encoding: + if ("add_graphormer_encoding" in args) and args.add_graphormer_encoding: self.add_graphormer_encoding = args.add_graphormer_encoding self.max_node_num = args.max_node_num self.max_hops = args.max_hops + else: + self.add_graphormer_encoding = False super().__init__(root, transform, pre_transform, pre_filter) diff --git 
a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py
index 535b70c..e8573d8 100644
--- a/gridfm_graphkit/datasets/transforms.py
+++ b/gridfm_graphkit/datasets/transforms.py
@@ -102,10 +102,15 @@ def add_node_attr(data: Data,
     return data
 
 def convert_to_single_emb(x, offset=512):
+    """
+    The edge feature embedding range is set to start at 512 to accomodate
+    negative branch feature values in PF data.
""" + x = torch.clamp(x, 0, 512) feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ - torch.arange(offset, (feature_num + 1) * offset, offset, dtype=torch.long) + torch.arange( + 0, + (feature_num) * offset, + offset, + dtype=torch.long + ) # start range at offset to accomodate -ve values TODO finalize + # torch.arange(offset, (feature_num + 1) * offset, offset, dtype=torch.long) + x = x + feature_offset return x diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index af2396e..b1dd263 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -92,7 +92,7 @@ def __init__(self, args): 512, self.hidden_dim, padding_idx=0) if self.n_edge_features is not None: self.edge_encoder = nn.Embedding( - 512 * (self.n_edge_features+1) + 1, self.num_heads, padding_idx=0) + 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0) if self.edge_type == 'multi_hop': self.edge_dis_encoder = nn.Embedding( 128 * self.num_heads * self.num_heads, 1) From b4c1a6f2f2501101e0ce5762e6811254a861eb29 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 47/55] verified fix to edge encoding range Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 15 ++++++++------- gridfm_graphkit/models/graphormer.py | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index e0aa9a6..7a0d6cd 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -103,10 +103,12 @@ def add_node_attr(data: Data, def convert_to_single_emb(x, offset=512): """ - The edge feature embedding range is set to start at 512 to accomodate - negative branch feature values in PF data. + The edge feature embedding range is set to 512, with the futher assumption + that the input range is from -512 to 512. This may need to change in the future. 
""" - x = torch.clamp(x, 0, 512) + x = torch.clamp(x, -512, 512) + x = ( 512*(x+512)/1024 ).long() + feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ torch.arange( @@ -114,11 +116,10 @@ def convert_to_single_emb(x, offset=512): (feature_num) * offset, offset, dtype=torch.long - ) # start range at offset to accomodate -ve values TODO finalize - # torch.arange(offset, (feature_num + 1) * offset, offset, dtype=torch.long) - + ) + x = x + feature_offset - + return x def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index b1dd263..3d92498 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -93,6 +93,7 @@ def __init__(self, args): if self.n_edge_features is not None: self.edge_encoder = nn.Embedding( 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0) + # 1024, self.num_heads, padding_idx=0) if self.edge_type == 'multi_hop': self.edge_dis_encoder = nn.Embedding( 128 * self.num_heads * self.num_heads, 1) From 1b63f2b8b9dd76a52dab203ab29d04c333d8b64c Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 48/55] checkpoint before rework of tensor shapes Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 2 +- gridfm_graphkit/models/graphormer.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 7a0d6cd..bd24e5d 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -275,7 +275,7 @@ def forward(self, data: Data) -> Data: data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() - data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() + # data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 3d92498..d1b6eab 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -112,6 +112,7 @@ def compute_pos_embeddings(self, data): """ attn_bias, spatial_pos, x = data.attn_bias, data.spatial_pos, data.x in_degree, out_degree = data.in_degree, data.in_degree + print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) # graph_attn_bias graph_attn_bias = attn_bias.clone() @@ -157,10 +158,9 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - node_feature = node_feature + \ + graph_node_feature = node_feature + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) - graph_node_feature = node_feature return graph_node_feature, graph_attn_bias @@ -188,12 +188,15 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
""" + print('0***', x.size(), data.y.size()) + print('<<<', x.min(), x.max()) + # x, _ = to_dense_batch(x, batch, max_num_nodes=30) + # print('1***', x.size()) # identify buffer nodes, and create a mask for them # note masking will be done up to feature mask_dim of n_node_features masked_entries = torch.sum(x < -1e8, axis=-1) mask = masked_entries >= (self.n_node_features - self.mask_dim) - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) @@ -308,6 +311,18 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): x, _ = to_dense_batch(x, batch) mask, _ = to_dense_batch(mask, batch) + # print('***', x[~mask].min(), x[~mask].max()) + + print('-----') + # print(x.size()) + print(mask.size(), attn_bias.size(), batch) + # print(mask.sum(axis=1)) + # print((x[1:2,:,:] < -1e7).sum(axis=(1,2))) + print(x.min(), x.max()) + # print((attn_bias[2:3,2:3,:,:]).sum(axis=2)) + # print((attn_bias[2:3,2:3,:,:]).sum(axis=3)) + # print(mask.sum()) + y = self.self_attention_norm(x) attn_bias = attn_bias.squeeze() y = self.self_attention(y, y, y, attn_bias, mask) From 1f6fe592604e4860ca6b56251e551e87d7a6021f Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 49/55] mask based on dense batch tested Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 7 +- gridfm_graphkit/models/graphormer.py | 70 ++++++++++++------- .../tasks/feature_reconstruction_task.py | 8 +-- 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index bd24e5d..fbe61de 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -274,8 +274,13 @@ def forward(self, data: Data) -> Data: data = add_node_attr(data, edge_input, attr_name='edge_input') data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') - data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() + # data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() # TODO finalize # data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() + + # TODO remove testing lines + # masked_entries = torch.sum(data.x < -1e8, axis=-1) + # mask = masked_entries >= (9 - 3) + # print('ssssss_orig', mask.sum()) return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index d1b6eab..6c7d383 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -49,6 +49,7 @@ def __init__(self, args): self.learn_mask = getattr(args.data, "learn_mask", False) self.edge_type = getattr(args.model, "edge_type", "multi_hop") self.multi_hop_max_dist = getattr(args.data, "max_hops", 20) + self.max_node_num = getattr(args.data, "max_node_num", 24) if self.learn_mask: self.mask_value = nn.Parameter( @@ -76,12 +77,13 @@ def __init__(self, args): self.encoder_layers = nn.ModuleList(encoders) self.encoder_final_ln = nn.LayerNorm(self.hidden_dim) - self.decoder = nn.Sequential( + self.decoder_layers = nn.Sequential( nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), nn.LayerNorm(self.hidden_dim), nn.Linear(self.hidden_dim, self.output_dim) ) + # for positional embeddings self.spatial_pos_encoder = nn.Embedding( @@ -99,7 +101,7 @@ def __init__(self, args): 128 * self.num_heads * self.num_heads, 1) - def 
compute_pos_embeddings(self, data): + def compute_pos_embeddings(self, data, x): """ Calculate Graphormer positional encodings, and attention biases @@ -110,9 +112,9 @@ def compute_pos_embeddings(self, data): graph_node_feature (Tensor): data.x with positional encoding appended. graph_attn_bias (Tensor): attention bais terms. """ - attn_bias, spatial_pos, x = data.attn_bias, data.spatial_pos, data.x + attn_bias, spatial_pos = data.attn_bias, data.spatial_pos #, data.x in_degree, out_degree = data.in_degree, data.in_degree - print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) + # print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) # graph_attn_bias graph_attn_bias = attn_bias.clone() @@ -158,9 +160,11 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - graph_node_feature = node_feature + \ + # print('zzz', node_feature.flatten(0,1).size()) + graph_node_feature = node_feature.flatten(0,1) + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) + graph_node_feature = graph_node_feature.reshape(node_feature.size()) return graph_node_feature, graph_attn_bias @@ -169,9 +173,12 @@ def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) - output = self.encoder_final_ln(output) + output[~mask] = self.encoder_final_ln(output[~mask]) + return output + + def decoder(self, x): + output = self.decoder_layers(x) return output - def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data=None): """ @@ -188,26 +195,30 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
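As a small worked example (not part of the patch) of the quantization that convert_to_single_emb performs after the fix in PATCH 47 above, with made-up branch feature values: clamp to [-512, 512], rescale to integer bins in [0, 512], then shift each feature column into its own embedding index range.

    import torch

    def convert_to_single_emb(x, offset=512):
        # Mirrors the fixed version above: quantize each edge feature and give
        # every feature column a disjoint index range for a single nn.Embedding.
        x = torch.clamp(x, -512, 512)
        x = (512 * (x + 512) / 1024).long()
        feature_num = x.size(1) if len(x.size()) > 1 else 1
        feature_offset = 1 + torch.arange(0, feature_num * offset, offset, dtype=torch.long)
        return x + feature_offset

    edge_attr = torch.tensor([[0.05, -0.30], [512.0, 0.0]])  # two edges, two features
    print(convert_to_single_emb(edge_attr))
    # tensor([[257, 768],
    #         [513, 769]])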
""" - print('0***', x.size(), data.y.size()) - print('<<<', x.min(), x.max()) - # x, _ = to_dense_batch(x, batch, max_num_nodes=30) + # print('0***', x.size(), data.y.size()) + # print('<<<', x.min(), x.max()) + x, valid_nodes = to_dense_batch(x, batch, max_num_nodes=self.max_node_num) + mask = ~valid_nodes # print('1***', x.size()) + # TODO remove prints # identify buffer nodes, and create a mask for them # note masking will be done up to feature mask_dim of n_node_features - masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= (self.n_node_features - self.mask_dim) + # masked_entries = torch.sum(x < -1e8, axis=-1) + # mask = masked_entries >= (self.n_node_features - self.mask_dim) + # print('mmmmmm', mask.size(), mask.size()) + # print('ssssss', (~mask).sum()) - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, x) output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) - output = self.decoder(output) + output = self.decoder(output[valid_nodes]) # return the negative of the buffer mask to select data for loss calculation - return output, ~mask + return output class FeedForwardNetwork(nn.Module): - def __init__(self, hidden_size, ffn_size, dropout_rate): + def __init__(self, hidden_size, ffn_size): super(FeedForwardNetwork, self).__init__() self.layer1 = nn.Linear(hidden_size, ffn_size) @@ -300,39 +311,44 @@ def __init__(self, hidden_size, ffn_size, dropout_rate, num_heads): self.self_attention_dropout = nn.Dropout(dropout_rate) self.ffn_norm = nn.LayerNorm(hidden_size) - self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate) + self.ffn = FeedForwardNetwork(hidden_size, ffn_size) self.ffn_dropout = nn.Dropout(dropout_rate) - def forward(self, x, attn_bias=None, mask=None, batch=1): + def forward(self, x, attn_bias=None, mask=None, batch=1): #TODO remove batch if not needed """ It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ - x, _ = to_dense_batch(x, batch) - mask, _ = to_dense_batch(mask, batch) + # print('enin****', x.size()) + # x, _ = to_dense_batch(x, batch) + # mask, _ = to_dense_batch(mask, batch) - # print('***', x[~mask].min(), x[~mask].max()) + # print('enc***', x[~mask].min(), x[~mask].max()) + # print('enc***', x.size()) - print('-----') + # print('-----') # print(x.size()) - print(mask.size(), attn_bias.size(), batch) + # print(mask.size(), attn_bias.size(), batch) # print(mask.sum(axis=1)) # print((x[1:2,:,:] < -1e7).sum(axis=(1,2))) - print(x.min(), x.max()) + # print(x.min(), x.max()) + # print('vvvvv', x[~mask].size(), x[~mask].min(), x[~mask].max()) # print((attn_bias[2:3,2:3,:,:]).sum(axis=2)) # print((attn_bias[2:3,2:3,:,:]).sum(axis=3)) # print(mask.sum()) + # print('>>>', x[~mask].min(), x[~mask].max(), '-', x.min(), x.max()) - y = self.self_attention_norm(x) + y = x + y[~mask] = self.self_attention_norm(x[~mask]) attn_bias = attn_bias.squeeze() y = self.self_attention(y, y, y, attn_bias, mask) y = self.self_attention_dropout(y) x = x + torch.reshape(y, x.size()) - y = self.ffn_norm(x) + y[~mask] = self.ffn_norm(x[~mask]) y = self.ffn(y) y = self.ffn_dropout(y) x = x + y - x=x.flatten(0,1) + # x=x.flatten(0,1) return x diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index 96d79bd..902acd0 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ 
b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -110,7 +110,7 @@ def on_fit_start(self): ) def shared_step(self, batch): - output, valid = self.forward( + output = self.forward( x=batch.x, pe=batch.pe, edge_index=batch.edge_index, @@ -121,11 +121,11 @@ def shared_step(self, batch): ) loss_dict = self.loss_fn( - output[valid], - batch.y[valid], + output, + batch.y, batch.edge_index, batch.edge_attr, - batch.mask[valid], + batch.mask, ) return output, loss_dict From 03f4f7ac13e2266ce67322c72cd88aedd43fefb8 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 50/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 8 ----- gridfm_graphkit/models/graphormer.py | 44 ++++---------------------- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index fbe61de..a24edcd 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -274,14 +274,6 @@ def forward(self, data: Data) -> Data: data = add_node_attr(data, edge_input, attr_name='edge_input') data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') - # data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() # TODO finalize - # data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() - - # TODO remove testing lines - # masked_entries = torch.sum(data.x < -1e8, axis=-1) - # mask = masked_entries >= (9 - 3) - # print('ssssss_orig', mask.sum()) - return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 6c7d383..fb6288a 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -112,9 +112,8 @@ def compute_pos_embeddings(self, data, x): graph_node_feature (Tensor): data.x with positional encoding appended. graph_attn_bias (Tensor): attention bais terms. """ - attn_bias, spatial_pos = data.attn_bias, data.spatial_pos #, data.x + attn_bias, spatial_pos = data.attn_bias, data.spatial_pos in_degree, out_degree = data.in_degree, data.in_degree - # print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) # graph_attn_bias graph_attn_bias = attn_bias.clone() @@ -160,7 +159,6 @@ def compute_pos_embeddings(self, data, x): graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - # print('zzz', node_feature.flatten(0,1).size()) graph_node_feature = node_feature.flatten(0,1) + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) @@ -169,10 +167,10 @@ def compute_pos_embeddings(self, data, x): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): + def encoder(self, graph_node_feature, graph_attn_bias, mask=None): output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) + output = enc_layer(output, graph_attn_bias, mask=mask) output[~mask] = self.encoder_final_ln(output[~mask]) return output @@ -195,25 +193,14 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
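For reference, a minimal sketch (not from the repository) of the torch_geometric to_dense_batch behaviour that the dense-batch masking introduced in PATCH 49 relies on: each graph in the batch is padded up to max_num_nodes and a boolean mask marks the real nodes, so padded rows can be excluded from attention and dropped again before the loss.

    import torch
    from torch_geometric.utils import to_dense_batch

    # Two graphs in one batch: 3 nodes and 2 nodes, 4 features each.
    x = torch.arange(5 * 4, dtype=torch.float).view(5, 4)
    batch = torch.tensor([0, 0, 0, 1, 1])

    dense_x, valid_nodes = to_dense_batch(x, batch, max_num_nodes=6)
    print(dense_x.shape)               # torch.Size([2, 6, 4])
    print(valid_nodes.sum().item())    # 5 real nodes, the rest is padding
    print(dense_x[valid_nodes].shape)  # torch.Size([5, 4]) -> back to per-node rows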
""" - # print('0***', x.size(), data.y.size()) - # print('<<<', x.min(), x.max()) + x, valid_nodes = to_dense_batch(x, batch, max_num_nodes=self.max_node_num) mask = ~valid_nodes - # print('1***', x.size()) - # TODO remove prints - - # identify buffer nodes, and create a mask for them - # note masking will be done up to feature mask_dim of n_node_features - # masked_entries = torch.sum(x < -1e8, axis=-1) - # mask = masked_entries >= (self.n_node_features - self.mask_dim) - # print('mmmmmm', mask.size(), mask.size()) - # print('ssssss', (~mask).sum()) graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, x) - output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) + output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask) output = self.decoder(output[valid_nodes]) - # return the negative of the buffer mask to select data for loss calculation return output @@ -314,29 +301,11 @@ def __init__(self, hidden_size, ffn_size, dropout_rate, num_heads): self.ffn = FeedForwardNetwork(hidden_size, ffn_size) self.ffn_dropout = nn.Dropout(dropout_rate) - def forward(self, x, attn_bias=None, mask=None, batch=1): #TODO remove batch if not needed + def forward(self, x, attn_bias=None, mask=None): """ It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ - # print('enin****', x.size()) - # x, _ = to_dense_batch(x, batch) - # mask, _ = to_dense_batch(mask, batch) - - # print('enc***', x[~mask].min(), x[~mask].max()) - # print('enc***', x.size()) - - # print('-----') - # print(x.size()) - # print(mask.size(), attn_bias.size(), batch) - # print(mask.sum(axis=1)) - # print((x[1:2,:,:] < -1e7).sum(axis=(1,2))) - # print(x.min(), x.max()) - # print('vvvvv', x[~mask].size(), x[~mask].min(), x[~mask].max()) - # print((attn_bias[2:3,2:3,:,:]).sum(axis=2)) - # print((attn_bias[2:3,2:3,:,:]).sum(axis=3)) - # print(mask.sum()) - # print('>>>', x[~mask].min(), x[~mask].max(), '-', x.min(), x.max()) y = x y[~mask] = self.self_attention_norm(x[~mask]) @@ -349,6 +318,5 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): #TODO remove batch i y = self.ffn(y) y = self.ffn_dropout(y) x = x + y - # x=x.flatten(0,1) return x From c988fbb42361dd68f862a662b603d8dd3c8be251 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 51/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gnn_transformer.py | 5 +---- gridfm_graphkit/models/gps_transformer.py | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/models/gnn_transformer.py b/gridfm_graphkit/models/gnn_transformer.py index 627cd49..7747f59 100644 --- a/gridfm_graphkit/models/gnn_transformer.py +++ b/gridfm_graphkit/models/gnn_transformer.py @@ -94,7 +94,4 @@ def forward(self, x, pe, edge_index, edge_attr, batch): x = self.mlps(x) - masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 - - return x, ~mask + return x diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index 178570b..e99188a 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -138,6 +138,4 @@ def forward(self, x, pe, edge_index, edge_attr, batch, data=None): x = self.pre_decoder_norm(x) x = self.decoder(x) - masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 - return 
x, ~mask
+        return x

From f3c0f60d259a541f6a402fd5269c11dc1c650229 Mon Sep 17 00:00:00 2001
From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
Date: Fri, 31 Oct 2025 09:16:49 -0400
Subject: [PATCH 52/55] clean up

Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
---
 gridfm_graphkit/datasets/transforms.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py
index a24edcd..832087b 100644
--- a/gridfm_graphkit/datasets/transforms.py
+++ b/gridfm_graphkit/datasets/transforms.py
@@ -194,15 +194,6 @@ def pad_1d_unsqueeze(x, padlen):
         x = new_x
     return x.unsqueeze(0)

-def pad_2d_unsqueeze(x, padlen):
-    xlen, xdim = x.size()
-    if xlen < padlen:
-        new_x = x.new_zeros([padlen, xdim], dtype=x.dtype)
-        new_x[:,:] = -1e9
-        new_x[:xlen, :] = x
-        x = new_x
-    return x.unsqueeze(0)
-
 def pad_attn_bias_unsqueeze(x, padlen):
     xlen = x.size(0)
     if xlen < padlen:

From e8ec0437614c79a7791ecdd129c99bcb34a4046d Mon Sep 17 00:00:00 2001
From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
Date: Fri, 31 Oct 2025 09:16:49 -0400
Subject: [PATCH 53/55] update comments

Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
---
 gridfm_graphkit/models/graphormer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py
index fb6288a..b36610d 100644
--- a/gridfm_graphkit/models/graphormer.py
+++ b/gridfm_graphkit/models/graphormer.py
@@ -32,7 +32,7 @@ class Graphormer(nn.Module):
         learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to False.
         edge_type (string, optional): Type of edge to consider multi_hop or not. From ``args.data.edge_type``. Defaults to multi_hop.
         multi_hop_max_dist (int, optional): Maximum number of hops to consider at edges. From ``args.data.multi_hop_max_dist``. Defaults to 20.
-
+        max_node_num (int, optional): Maximum number of nodes in the input graphs. From ``args.data.max_node_num``. Defaults to 24.
     """
     def __init__(self, args):
         super().__init__()
@@ -83,7 +83,6 @@ def __init__(self, args):
             nn.LayerNorm(self.hidden_dim),
             nn.Linear(self.hidden_dim, self.output_dim)
         )
-
         # for positional embeddings

         self.spatial_pos_encoder = nn.Embedding(
@@ -95,7 +94,6 @@ def __init__(self, args):
         if self.n_edge_features is not None:
             self.edge_encoder = nn.Embedding(
                 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0)
-                # 1024, self.num_heads, padding_idx=0)
             if self.edge_type == 'multi_hop':
                 self.edge_dis_encoder = nn.Embedding(
                     128 * self.num_heads * self.num_heads, 1)
@@ -107,6 +105,7 @@ def compute_pos_embeddings(self, data, x):

         Args:
             data (Data): Pytorch geometric Data/Batch object
+            x (Tensor): The node feature tensor from data

         Returns:
             graph_node_feature (Tensor): data.x with positional encoding appended.
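
For reference, a minimal sketch of the padding-mask convention the reworked Graphormer forward pass relies on: torch_geometric.utils.to_dense_batch pads every graph in a batch to max_num_nodes (self.max_node_num in the model) and returns a boolean valid_nodes mask, whose negation marks the padded slots the encoder layers are told to ignore. The tensors below are toy values for illustration only; they are not taken from the patches.

    import torch
    from torch_geometric.utils import to_dense_batch

    # Two graphs in one batch: 3 nodes and 2 nodes, 4 features per node.
    x = torch.randn(5, 4)
    batch = torch.tensor([0, 0, 0, 1, 1])

    # Pad both graphs to the same length; max_num_nodes plays the role of
    # self.max_node_num in Graphormer.forward.
    x_dense, valid_nodes = to_dense_batch(x, batch, max_num_nodes=6)
    print(x_dense.shape)   # torch.Size([2, 6, 4])
    print(valid_nodes)     # True for real nodes, False for padding

    # The encoder layers use the opposite convention: 1 marks entries to
    # ignore, 0 marks valid data, hence mask = ~valid_nodes in forward().
    mask = ~valid_nodes
    print(mask.sum())      # 7 padded slots: 2 * 6 - 5

This mask replaces the earlier sentinel scheme removed by the clean-up patches, where padded rows were filled with -1e9 in pad_2d_unsqueeze and recovered downstream with x < -1e8 comparisons.
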
From 23f0119266cbd60b3c29729fb9ee198e7147144e Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 54/55] specify cython version in toml Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 10719f1..caafe70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ "pyyaml", "lightning", "seaborn", - "cython" + "cython==3.0.11" ] [project.optional-dependencies] From 43a5e3c8aa270fab94defa0de3a5e564e7239d14 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 55/55] changed default of args for dataset Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 17b205c..309800a 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -44,7 +44,7 @@ def __init__( transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, pre_filter: Optional[Callable] = None, - args: Optional = None, + args: Optional = {}, ): self.norm_method = norm_method self.node_normalizer = node_normalizer