From b88cb768a8f00a6d69e70866fa9ad69c12a7b52e Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 01/55] add graphormer file Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 1 + gridfm_graphkit/models/gps_transformer.py | 2 ++ gridfm_graphkit/models/graphormer.py | 0 gridfm_graphkit/tasks/feature_reconstruction_task.py | 2 ++ 4 files changed, 5 insertions(+) create mode 100644 gridfm_graphkit/models/graphormer.py diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 026d9a8..b1e4bd0 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -204,6 +204,7 @@ def get(self, idx): data = torch.load(file_name, weights_only=False) if self.transform: data = self.transform(data) + # print('data>>>>>>>',data) # TODO remove return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index cc8b648..b807ff3 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -121,7 +121,9 @@ def forward(self, x, pe, edge_index, edge_attr, batch): """ x_pe = self.pe_norm(pe) + print('enc>>>', x.size()) # TODO remove x = self.encoder(x) + print('post>>>', x.size()) # TODO remove x = self.input_norm(x) x = torch.cat((x, x_pe), 1) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py new file mode 100644 index 0000000..e69de29 diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index cb6963b..e42d09d 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -129,6 +129,8 @@ def shared_step(self, batch): return output, loss_dict def training_step(self, batch): + # print('trainbatch>>>>', batch.size()) # TODO remove + # print(batch) _, loss_dict = self.shared_step(batch) current_lr = self.optimizer.param_groups[0]["lr"] metrics = {} From dde35deb77629a5e24dbb0d77e7e2f6129c557fe Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 02/55] graphormer data formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gmae_collator.py | 127 +++++++ gridfm_graphkit/models/gmae_wrapper.py | 91 +++++ gridfm_graphkit/models/graphormer.py | 483 ++++++++++++++++++++++++ 3 files changed, 701 insertions(+) create mode 100644 gridfm_graphkit/models/gmae_collator.py create mode 100644 gridfm_graphkit/models/gmae_wrapper.py diff --git a/gridfm_graphkit/models/gmae_collator.py b/gridfm_graphkit/models/gmae_collator.py new file mode 100644 index 0000000..f4bc532 --- /dev/null +++ b/gridfm_graphkit/models/gmae_collator.py @@ -0,0 +1,127 @@ +import torch + + +def pad_1d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen], dtype=x.dtype) + new_x[:xlen] = x + x = new_x + return x.unsqueeze(0) + + +def pad_2d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + # print('-------->', x.size()) + xlen, xdim = x.size() + if xlen < padlen: + new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) + new_x[:xlen, :] = x + x = new_x + return x.unsqueeze(0) 
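+
+# NOTE: the pad_* helpers above shift real entries by +1 so that the value 0 is
+# reserved as the padding id; the collator below then pads every graph up to the
+# largest node count in the batch, and downstream code appears to rely on
+# all-zero rows to recognise the padded positions.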
+ + +def pad_attn_bias_unsqueeze(x, padlen): + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros( + [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) + new_x[:xlen, :xlen] = x + new_x[xlen:, :xlen] = 0 + x = new_x + return x.unsqueeze(0) + + +def pad_spatial_pos_unsqueeze(x, padlen): + x = x + 1 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) + new_x[:xlen, :xlen] = x + x = new_x + return x.unsqueeze(0) + + +class Batch(): + def __init__(self, + min_node_num, + attn_bias, + spatial_pos, + in_degree, + out_degree, + x, + y, + orig_id + ): + super(Batch, self).__init__() + self.min_node_num = int(min_node_num) + self.in_degree, self.out_degree = in_degree, out_degree + self.x, self.y = x, y + self.attn_bias, self.spatial_pos = attn_bias, spatial_pos + self.orig_id = orig_id + + def to(self, device): + self.in_degree, self.out_degree = self.in_degree.to( + device), self.out_degree.to(device) + self.x = self.x.to(device) + self.y = self.y.to(device) + self.attn_bias, self.spatial_pos = self.attn_bias.to( + device), self.spatial_pos.to(device) + return self + + def __len__(self): + return self.in_degree.size(0) + + +def collator(items, spatial_pos_max=20): + """ + custom collator, among other transformations... + + unequal input graphs are padded to all have the same size + + adds 1 to the input x via pad_2d_unsqueeze and similar functions + """ + items = [ + item for item in items if item is not None] + items = [ + (item[0], item[1], item[2], item[3], item[4], item[5], item[6], item[7]) + for item in items + ] + + # at this step all graphs in batch have their input size + xs, ys, adjs, attn_biases, spatial_poses, in_degrees, out_degrees, orig_ids = zip(*items) + + for idx, _ in enumerate(attn_biases): + attn_biases[idx][spatial_poses[idx] >= spatial_pos_max] = float('-inf') + max_node_num = max(i.size(0) for i in xs) + min_node_num = min(i.size(0) for i in xs) + + if all([torch.all(xx == yy) for xx,yy in zip(xs, ys)]): # then this is for and encoder-decoder setup + y = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in ys]) + else: + y = torch.stack(ys) + + # following steps pad the smaller graphs to match the largest for batching + # incidentally a constant value of 1 is added as well + x = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in xs]) + attn_bias = torch.cat([pad_attn_bias_unsqueeze( + i, max_node_num) for i in attn_biases]) + spatial_pos = torch.cat([pad_spatial_pos_unsqueeze(i, max_node_num) + for i in spatial_poses]) + in_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) + for i in in_degrees]) + out_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) + for i in out_degrees]) + + + return Batch( + min_node_num=min_node_num, + attn_bias=attn_bias, + spatial_pos=spatial_pos, + in_degree=in_degree, + out_degree=out_degree, + x=x, + y=y, + orig_id=orig_ids + ) diff --git a/gridfm_graphkit/models/gmae_wrapper.py b/gridfm_graphkit/models/gmae_wrapper.py new file mode 100644 index 0000000..b76c915 --- /dev/null +++ b/gridfm_graphkit/models/gmae_wrapper.py @@ -0,0 +1,91 @@ +import torch + +import numpy as np +import pyximport +pyximport.install(setup_args={'include_dirs': np.get_include()}) +import algos + +from torch_geometric.loader import NeighborSampler +from torch_geometric.utils import to_undirected + + + +def process_samples(batch_size, n_id, edge_index, dataset): + """ + transformation of sampled nodes to: + - node features of sampled set, + - y, + - edges tensor + + # TODO reconcile redundance of using 
edge_index and dataset + # in the case where the full graph is used + """ + + # print(edge_index) + # print('<------->') + if edge_index.size(1) != 0: + edge_index = to_undirected(edge_index) + n_nodes = len(n_id) + edge_sp_adj = torch.sparse.FloatTensor(edge_index, + torch.ones(edge_index.shape[1]), + [n_nodes, n_nodes]) + edge_adj = edge_sp_adj + + # print('<<---------------->>') + # print(n_id) + # print(dataset.x.size()) + # print(dataset.y.size()) + + return [dataset.x[n_id], dataset.y[n_id], edge_adj] + + +# GMAE_graph positional encoding +class MyDataset(torch.utils.data.Dataset): + def __init__(self, items, settype=''): + super(MyDataset, self).__init__() + + self.items = items + self.type = settype + + + def __len__(self): + return len(self.items) + + def __getitem__(self, idx): + item = self.items[idx] + + if self.type=='csv': + graphdata = torch.load(item[1]) + num_nodes = graphdata.num_nodes + + # padding and mask creation should happend here + ns0 = 1 # batch size + ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids + ns2 = graphdata.edge_index + data_item = process_samples( + ns0, + ns1, + ns2, + graphdata) + [0] # TODO completely remove the appended [0] + else: + data_item = item # in memory dataset in use + + return preprocess_item(data_item) + + +def preprocess_item(item): + """ + """ + x, y, adj, orig_id = item[0], item[1], item[2].to_dense(), item[3] + N = x.size(0) + + # node adj matrix [N, N] bool + adj = adj.bool() + + shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + spatial_pos = torch.from_numpy((shortest_path_result)).long() + attn_bias = torch.zeros([N, N], dtype=torch.float) + + in_degree = adj.long().sum(dim=1).view(-1) + out_degree = adj.long().sum(dim=0).view(-1) + return x, y, adj, attn_bias, spatial_pos, in_degree, out_degree, orig_id diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index e69de29..6aa3df7 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -0,0 +1,483 @@ +from lr import PolynomialDecayLR +import torch +import math +import numpy as np +import torch.nn as nn +import pytorch_lightning as pl + +from torch.nn import functional as F +from losses import active_power_loss + +def init_params(module, n_layers): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.02 / math.sqrt(n_layers)) + if module.bias is not None: + module.bias.data.zero_() + if isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=0.02) + + + +class GMAE_node(pl.LightningModule): + def __init__( + self, + n_encoder_layers, + n_decoder_layers, + num_heads, + hidden_dim, + dropout_rate, + intput_dropout_rate, + weight_decay, + ffn_dim, + dataset_name, + warmup_updates, + tot_updates, + peak_lr, + end_lr, + attention_dropout_rate, + n_node_features, + mask_ratio, + n_val_sampler, + ): + super().__init__() + self.save_hyperparameters() + self.n_node_features = n_node_features + self.n_val_sampler = n_val_sampler + self.mask_ratio = mask_ratio + self.num_heads = num_heads + self.input_proj = nn.Linear(n_node_features, hidden_dim) + + self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.in_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + self.out_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + + self.input_dropout = nn.Dropout(intput_dropout_rate) + encoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) + for _ in 
range(n_encoder_layers)] + self.encoder_layers = nn.ModuleList(encoders) + self.encoder_final_ln = nn.LayerNorm(hidden_dim) + + self.encoder_to_decoder = nn.Linear(hidden_dim, hidden_dim) + + decoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) + for _ in range(n_decoder_layers)] + self.decoder_layers = nn.ModuleList(decoders) + self.decoder_final_ln = nn.LayerNorm(hidden_dim) + + self.out_proj = nn.Linear(hidden_dim, self.n_node_features) + self.loss_fn = F.mse_loss + self.masking_value = -4 + self.loss_phys1 = active_power_loss + self.alpha = 1.0/50.0 # weight for loss_phys1 + + self.dataset_name = dataset_name + + self.warmup_updates = warmup_updates + self.tot_updates = tot_updates + self.peak_lr = peak_lr + self.end_lr = end_lr + self.weight_decay = weight_decay + + self.hidden_dim = hidden_dim + self.automatic_optimization = True + self.apply(lambda module: init_params(module, n_layers=n_encoder_layers)) + + + + def compute_pos_embeddings(self, batched_data): + attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + # graph_attn_bias + graph_attn_bias = attn_bias.clone() + graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( + 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # spatial pos + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + spatial_pos_bias + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset + + node_feature = self.input_proj(x) + node_feature = node_feature + \ + self.in_degree_encoder(in_degree) + \ + self.out_degree_encoder(out_degree) + graph_node_feature = node_feature + + return graph_node_feature, graph_attn_bias + + def encoder(self, graph_node_feature, graph_attn_bias, mask=None): + + graph_node_feature_masked = graph_node_feature + graph_attn_bias_masked = graph_attn_bias + + # transfomrer encoder + output = self.input_dropout(graph_node_feature_masked) + for enc_layer in self.encoder_layers: + output = enc_layer(output, graph_attn_bias_masked, mask) + output = self.encoder_final_ln(output) + return output + + def decoder(self, output, in_degree, out_degree, graph_attn_bias, mask=None): + + pos_embed = self.in_degree_encoder(in_degree) + self.out_degree_encoder(out_degree) + output = output + pos_embed + + for enc_layer in self.decoder_layers: + output = enc_layer(output, graph_attn_bias, mask) + + output = self.decoder_final_ln(output) + output = self.out_proj(output) # [n_graph, n_node, n_feature] + + return output + + def forward(self, batched_data, mask=None): + """ + process a batch of data, applying the input mask, while + excluding non-valid values that arrise during processing + + mask: incoming values to mask for prediction + """ + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) + in_degree = batched_data.in_degree + out_degree = batched_data.out_degree + + graph_mask = None # TODO this could be removed eventually + + output = self.encoder(graph_node_feature, graph_attn_bias, mask) + output = self.encoder_to_decoder(output) + output = self.decoder(output, in_degree, out_degree, graph_attn_bias, mask) + return output, graph_mask + + def generate_pretrain_embeddings_for_downstream_task(self, batched_data): + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) + output = 
self.encoder(graph_node_feature, graph_attn_bias) + output = output.reshape(-1, self.n_val_sampler, output.size(1), self.hidden_dim)[:, :, 0, :].mean(1) + output = output # [n_graph(n_central_node), n_feature] + return output + + def generate_node_pred(self, batched_data): + """ + for a batch of nodes, return the masked node array and + the predicted arrays for those nodes + + note: + + mask: nodes that are to be predicted (and are thus masked) in each + graph (constant for the batch) + graph_mask: graphs in batch that have valid results + """ + num_nodes = batched_data.x.size(1) + + mask = None + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = batched_data.x[graph_mask].float() + + else: + y_gt = batched_data.x.float() + graph_mask = torch.from_numpy(np.array([])) + no_feat = y_hat.size(2) + y_hat = y_hat.reshape(-1, y_hat.size(2)) # [n_graph*n_masked_node, n_feature] + + y_gt = y_gt.reshape(-1, y_gt.size(2)) # [n_graph*n_masked_node, n_feature] + pad_mask = torch.nonzero(y_gt.sum(-1)) + + # final shaping + y_gt = torch.squeeze(y_gt) + y_hat = torch.squeeze(y_hat) + + return y_gt, y_hat, graph_mask + + + def training_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + + # create a boolean mask where padding was added + # note that this assumes all input data had features with + # values >= 0 + mask = None + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + strategy = '' + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + + # print('pre loss shapes', y_gt.size(), y_hat.size()) + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('train_loss', loss) + self.log('activ_loss', loss_actv) + + return loss + loss_actv + + def validation_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + mask = None + + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random 
noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + no_features = y_hat.size(2) + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] + y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] + pad_mask = torch.nonzero(y_gt.sum(-1)) + + y_gt = y_gt[pad_mask, :] + y_hat = y_hat[pad_mask, :] + + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('val_loss', loss, batch_size=1) + + # loss per feature, for logging only + for ii in range(no_features): + self.log( + 'val_loss_{}'.format(ii), + self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), + batch_size=1 + ) + + return loss + loss_actv + + + def configure_optimizers(self): + optimizer = torch.optim.AdamW( + self.parameters(), lr=self.peak_lr, weight_decay=self.weight_decay) + lr_scheduler = { + 'scheduler': PolynomialDecayLR( + optimizer, + warmup_updates=self.warmup_updates, + tot_updates=self.tot_updates, + lr=self.peak_lr, + end_lr=self.end_lr, + power=1.0, + ), + 'name': 'learning_rate', + 'interval': 'step', + 'frequency': 1, + } + return [optimizer], [lr_scheduler] + + @staticmethod + def add_model_specific_args(parent_parser): + parser = parent_parser.add_argument_group("GMAE_node") + parser.add_argument('--n_encoder_layers', type=int, default=3) + parser.add_argument('--n_decoder_layers', type=int, default=3) + parser.add_argument('--num_heads', type=int, default=8) + parser.add_argument('--hidden_dim', type=int, default=64) + parser.add_argument('--ffn_dim', type=int, default=64) + parser.add_argument('--intput_dropout_rate', type=float, default=0.1) + parser.add_argument('--dropout_rate', type=float, default=0.5) + parser.add_argument('--weight_decay', type=float, default=1e-5) + parser.add_argument('--attention_dropout_rate',type=float, default=0.1) + parser.add_argument('--checkpoint_path', type=str, default='') + parser.add_argument('--warmup_updates', type=int, default=40000) + parser.add_argument('--tot_updates', type=int, default=400000) + parser.add_argument('--peak_lr', type=float, default=0.0001) + 
parser.add_argument('--end_lr', type=float, default=1e-9) + parser.add_argument('--mask_ratio', type=float, default=0.5) + parser.add_argument('--validate', action='store_true', default=False) + parser.add_argument('--test', action='store_true', default=False) + + return parent_parser + + +class FeedForwardNetwork(nn.Module): + def __init__(self, hidden_size, ffn_size, dropout_rate): + super(FeedForwardNetwork, self).__init__() + + self.layer1 = nn.Linear(hidden_size, ffn_size) + self.gelu = nn.GELU() + self.layer2 = nn.Linear(ffn_size, hidden_size) + + def forward(self, x): + x = self.layer1(x) + x = self.gelu(x) + x = self.layer2(x) + return x + + +class MultiHeadAttention(nn.Module): + def __init__(self, hidden_size, attention_dropout_rate, num_heads): + super(MultiHeadAttention, self).__init__() + + self.num_heads = num_heads + + self.att_size = att_size = hidden_size // num_heads + self.scale = att_size ** -0.5 + + self.linear_q = nn.Linear(hidden_size, num_heads * att_size) + self.linear_k = nn.Linear(hidden_size, num_heads * att_size) + self.linear_v = nn.Linear(hidden_size, num_heads * att_size) + self.att_dropout = nn.Dropout(attention_dropout_rate) + + self.output_layer = nn.Linear(num_heads * att_size, hidden_size) + + def forward(self, q, k, v, attn_bias=None, mask=None): + orig_q_size = q.size() + + d_k = self.att_size + d_v = self.att_size + batch_size = q.size(0) + + # head_i = Attention(Q(W^Q)_i, K(W^K)_i, V(W^V)_i) + q = self.linear_q(q).view(batch_size, -1, self.num_heads, d_k) + k = self.linear_k(k).view(batch_size, -1, self.num_heads, d_k) + v = self.linear_v(v).view(batch_size, -1, self.num_heads, d_v) + + q = q.transpose(1, 2) # [b, h, q_len, d_k] + v = v.transpose(1, 2) # [b, h, v_len, d_v] + k = k.transpose(1, 2).transpose(2, 3) # [b, h, d_k, k_len] + + # Scaled Dot-Product Attention. 
+ # Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V + q = q * self.scale + x = torch.matmul(q, k) # [b, h, q_len, k_len] + # print('**********', + # x.size(), q.size(), + # k.size(), v.size(), + # attn_bias.size(), mask.size() + # ) + if attn_bias is not None: + if mask is not None: + usm0 = mask.unsqueeze(1).unsqueeze(3) + usm1 = mask.unsqueeze(1).unsqueeze(2) + + attn_bias = attn_bias.masked_fill(usm0 == 1, 0.0) + attn_bias = attn_bias.masked_fill(usm1 == 1, 0.0) + x = x + attn_bias + + # mask the data before the softmax + if mask is not None: + usm0 = mask.unsqueeze(1).unsqueeze(2) + x = x.masked_fill(usm0 == 1, -1e9) + + x = torch.softmax(x, dim=3) + x = self.att_dropout(x) + x = x.matmul(v) # [b, h, q_len, attn] + + x = x.transpose(1, 2).contiguous() # [b, q_len, h, attn] + x = x.view(batch_size, -1, self.num_heads * d_v) + + x = self.output_layer(x) + + assert x.size() == orig_q_size + return x + + +class EncoderLayer(nn.Module): + def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, num_heads): + super(EncoderLayer, self).__init__() + + self.self_attention_norm = nn.LayerNorm(hidden_size) + self.self_attention = MultiHeadAttention( + hidden_size, attention_dropout_rate, num_heads) + self.self_attention_dropout = nn.Dropout(dropout_rate) + + self.ffn_norm = nn.LayerNorm(hidden_size) + self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate) + self.ffn_dropout = nn.Dropout(dropout_rate) + + def forward(self, x, attn_bias=None, mask=None): + """ + It is assumed that the mask is 1 where values are to be ignored + and then 0 where there are valid data + """ + y = self.self_attention_norm(x) + y = self.self_attention(y, y, y, attn_bias, mask) + y = self.self_attention_dropout(y) + x = x + y + + y = self.ffn_norm(x) + y = self.ffn(y) + y = self.ffn_dropout(y) + x = x + y + return x From 0e46f0212b3c5ce331af932b25cdb94d2f03fdbf Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 03/55] graphormer data formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gmae_data.py | 165 ++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 gridfm_graphkit/models/gmae_data.py diff --git a/gridfm_graphkit/models/gmae_data.py b/gridfm_graphkit/models/gmae_data.py new file mode 100644 index 0000000..1be7279 --- /dev/null +++ b/gridfm_graphkit/models/gmae_data.py @@ -0,0 +1,165 @@ +from collator import collator +from pytorch_lightning import LightningDataModule +from torch.utils.data import DataLoader, random_split +from functools import partial +import random +import torch +from wrapper import MyDataset, process_samples +from torch_geometric.utils import to_undirected + +from torch_geometric.datasets import Planetoid, WikiCS, Amazon +from torch_geometric.loader import NeighborSampler +import torch_geometric.transforms as T +import hqdata + + +dataset = None + + +def get_dataset(dataset_name='Cora', nodefile='', edgefile=''): + global dataset + path = 'dataset/' + dataset_name + if dataset is not None: + return dataset + + elif dataset_name in ['Cora', 'CiteSeer', 'PubMed']: + return Planetoid(root=path, name=dataset_name, transform=T.NormalizeFeatures()) + elif dataset_name == 'WikiCS': + return WikiCS(root=path, transform=T.NormalizeFeatures()) + elif dataset_name == 'Amazon-Computers': + return Amazon(root=path, name='computers', transform=T.NormalizeFeatures()) + elif dataset_name == 
'Amazon-Photo': + return Amazon(root=path, name='photo', transform=T.NormalizeFeatures()) + elif dataset_name == 'hqdata': + return hqdata.simple_batch(nodefile, edgefile) + else: + raise NotImplementedError + +def read_csv(infile): + """ + assume two columns: instances number, file location and name + """ + + lines = [] + with open(infile, 'r') as ff: + for line in ff: + lines.append([xx.strip() for xx in line.split(',')]) + + return lines + +class GraphDataModule(LightningDataModule): + name = "Cora" + + def __init__( + self, + dataset_name: str = 'Cora', + num_workers: int = 8, + batch_size: int = 64, + seed: int = 42, + edgefile: str = '', + nodefile: str = '', + processedfile: str = '', # preprocessed dataset file in pt format + n_val_sampler: int = 10, + num_node_features: int = 25, + test=False, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.dataset_name = dataset_name + if nodefile and edgefile: + self.dataset = get_dataset(dataset_name, nodefile, edgefile) + else: + self.dataset = read_csv(processedfile) + self.num_node_features = num_node_features + self.seed = seed + self.n_val_sampler = n_val_sampler + + self.num_workers = num_workers + self.batch_size = batch_size + self.dataset_full = ... + self.dataset_train = ... + self.dataset_val = ... + self.dataset_test = ... # not currently in use + self.train_frac = 0.8 # train-val split only + self.istest = test + + + def setup(self, stage: str = None): + """ + automatically called, if prepare_data() is defined, then the latter + is called first + + during testing this section is not needed + """ + + if self.istest: + pass + else: + items = self.dataset # for disk data the dataset is in items form + self.dataset_full = MyDataset( + items, + settype='csv', + ) + + # split the train and validation data + train_set_size = int(self.train_frac*len(self.dataset_full)) + valid_set_size = len(self.dataset_full) - train_set_size + seed = torch.Generator().manual_seed(self.seed) + train_set, valid_set = random_split( + self.dataset_full, + [train_set_size, valid_set_size], + generator=seed + ) + print('**train and val dataset sizes**',len(train_set),len(valid_set)) + self.dataset_train = train_set + self.dataset_val = valid_set + + + def train_dataloader(self): + loader = DataLoader(self.dataset_train, batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers, + collate_fn=partial(collator), + ) + return loader + + def val_dataloader(self): + loader = DataLoader(self.dataset_val, batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + collate_fn=partial(collator), + ) + return loader + + def eval_dataloader(self): + """ + for downstream evaluation + """ + # do not wish to shuffle for evaluation + graphs_to_process = self.dataset.datalist + + + items = [] # from in mem dataset + + for graphdata in graphs_to_process: + # padding and mask creation should happend here + num_nodes = graphdata.num_nodes + ns0 = 1 # batch size + ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids + ns2 = graphdata.edge_index + data_item = process_samples( + ns0, + ns1, + ns2, + graphdata) + [0] # TODO completely remove the appended [0] + items.append(data_item) + + self.dataset_eval = MyDataset(items) + loader = DataLoader(self.dataset_eval, + batch_size=self.batch_size*self.n_val_sampler, + shuffle=False, + num_workers=self.num_workers, + collate_fn=partial(collator), + ) + return loader From 0f07900800863d3557ab59455ae4dcd3e302c602 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst 
<99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:42 -0400 Subject: [PATCH 04/55] graphormer data formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/temp_leftovers.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 gridfm_graphkit/models/temp_leftovers.py diff --git a/gridfm_graphkit/models/temp_leftovers.py b/gridfm_graphkit/models/temp_leftovers.py new file mode 100644 index 0000000..e69de29 From 00d0987f1b1dab57a730795ddf30b2538c2b4d4f Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 05/55] basic reworking of model to match formatting Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 351 +++-------------------- gridfm_graphkit/models/temp_leftovers.py | 168 +++++++++++ 2 files changed, 213 insertions(+), 306 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 6aa3df7..a351c81 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -1,6 +1,5 @@ -from lr import PolynomialDecayLR + import torch -import math import numpy as np import torch.nn as nn import pytorch_lightning as pl @@ -8,105 +7,60 @@ from torch.nn import functional as F from losses import active_power_loss -def init_params(module, n_layers): - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=0.02 / math.sqrt(n_layers)) - if module.bias is not None: - module.bias.data.zero_() - if isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=0.02) - +@MODELS_REGISTRY.register("Graphormer") class GMAE_node(pl.LightningModule): + """ + TODO fill in description + """ def __init__( self, - n_encoder_layers, - n_decoder_layers, - num_heads, - hidden_dim, - dropout_rate, - intput_dropout_rate, - weight_decay, - ffn_dim, - dataset_name, - warmup_updates, - tot_updates, - peak_lr, - end_lr, - attention_dropout_rate, - n_node_features, - mask_ratio, - n_val_sampler, + # n_encoder_layers, + # n_decoder_layers, + # num_heads, + # hidden_dim, + # dropout_rate, + # intput_dropout_rate, + # weight_decay, + # ffn_dim, + # dataset_name, + # warmup_updates, + # tot_updates, + # peak_lr, + # end_lr, + # attention_dropout_rate, + # n_node_features, + # mask_ratio, + # n_val_sampler, + args ): super().__init__() self.save_hyperparameters() - self.n_node_features = n_node_features - self.n_val_sampler = n_val_sampler - self.mask_ratio = mask_ratio - self.num_heads = num_heads - self.input_proj = nn.Linear(n_node_features, hidden_dim) - - self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) - self.in_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) - self.out_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + self.n_node_features = args.model.input_dim + self.num_heads = 8 # TODO make this configurable or to match their structure + self.hidden_dim = args.model.hidden_size + intput_dropout_rate = 0.3 + dropout_rate = 0.3 + attention_dropout_rate = 0.3 + self.input_proj = nn.Linear(n_node_features, hidden_dim) self.input_dropout = nn.Dropout(intput_dropout_rate) - encoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) + encoders = [EncoderLayer(hidden_dim, hidden_dim, dropout_rate, attention_dropout_rate, num_heads) for _ in range(n_encoder_layers)] 
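+        # NOTE: the separate ffn_dim hyperparameter of the original module is folded
+        # into hidden_dim here (EncoderLayer receives hidden_dim for both its hidden
+        # and feed-forward sizes); the hard-coded dropout rates and num_heads above
+        # are placeholders flagged by the TODOs.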
self.encoder_layers = nn.ModuleList(encoders) self.encoder_final_ln = nn.LayerNorm(hidden_dim) - self.encoder_to_decoder = nn.Linear(hidden_dim, hidden_dim) - - decoders = [EncoderLayer(hidden_dim, ffn_dim, dropout_rate, attention_dropout_rate, num_heads) - for _ in range(n_decoder_layers)] - self.decoder_layers = nn.ModuleList(decoders) - self.decoder_final_ln = nn.LayerNorm(hidden_dim) - - self.out_proj = nn.Linear(hidden_dim, self.n_node_features) - self.loss_fn = F.mse_loss - self.masking_value = -4 - self.loss_phys1 = active_power_loss - self.alpha = 1.0/50.0 # weight for loss_phys1 - - self.dataset_name = dataset_name - - self.warmup_updates = warmup_updates - self.tot_updates = tot_updates - self.peak_lr = peak_lr - self.end_lr = end_lr - self.weight_decay = weight_decay - - self.hidden_dim = hidden_dim - self.automatic_optimization = True - self.apply(lambda module: init_params(module, n_layers=n_encoder_layers)) + self.decoder = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LeakyReLU(), + nn.Linear(hidden_dim, self.n_node_features) + ) + + # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere + # self.masking_value = -4 - - def compute_pos_embeddings(self, batched_data): - attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - in_degree, out_degree = batched_data.in_degree, batched_data.in_degree - # graph_attn_bias - graph_attn_bias = attn_bias.clone() - graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( - 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] - # spatial pos - # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) - graph_attn_bias = graph_attn_bias + spatial_pos_bias - graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset - - node_feature = self.input_proj(x) - node_feature = node_feature + \ - self.in_degree_encoder(in_degree) + \ - self.out_degree_encoder(out_degree) - graph_node_feature = node_feature - - return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, mask=None): graph_node_feature_masked = graph_node_feature @@ -139,236 +93,20 @@ def forward(self, batched_data, mask=None): mask: incoming values to mask for prediction """ + + # TODO in the baseline code the PE is an input here and passes through + # a normalization before being concatenated to the features + + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) in_degree = batched_data.in_degree out_degree = batched_data.out_degree - graph_mask = None # TODO this could be removed eventually - output = self.encoder(graph_node_feature, graph_attn_bias, mask) output = self.encoder_to_decoder(output) output = self.decoder(output, in_degree, out_degree, graph_attn_bias, mask) - return output, graph_mask - - def generate_pretrain_embeddings_for_downstream_task(self, batched_data): - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) - output = self.encoder(graph_node_feature, graph_attn_bias) - output = output.reshape(-1, self.n_val_sampler, output.size(1), self.hidden_dim)[:, :, 0, :].mean(1) - output = output # [n_graph(n_central_node), n_feature] return output - def generate_node_pred(self, batched_data): - """ - for a batch of nodes, return the masked node array and - the predicted arrays for those nodes - - note: - - mask: nodes that are to be predicted (and are thus masked) in each - graph (constant for 
the batch) - graph_mask: graphs in batch that have valid results - """ - num_nodes = batched_data.x.size(1) - - mask = None - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = batched_data.x[graph_mask].float() - - else: - y_gt = batched_data.x.float() - graph_mask = torch.from_numpy(np.array([])) - no_feat = y_hat.size(2) - y_hat = y_hat.reshape(-1, y_hat.size(2)) # [n_graph*n_masked_node, n_feature] - - y_gt = y_gt.reshape(-1, y_gt.size(2)) # [n_graph*n_masked_node, n_feature] - pad_mask = torch.nonzero(y_gt.sum(-1)) - - # final shaping - y_gt = torch.squeeze(y_gt) - y_hat = torch.squeeze(y_hat) - - return y_gt, y_hat, graph_mask - - - def training_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - - # create a boolean mask where padding was added - # note that this assumes all input data had features with - # values >= 0 - mask = None - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - strategy = '' - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - - # print('pre loss shapes', y_gt.size(), y_hat.size()) - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('train_loss', loss) - self.log('activ_loss', loss_actv) - - return loss + loss_actv - - def validation_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - mask = None - - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = 
torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - no_features = y_hat.size(2) - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] - y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] - pad_mask = torch.nonzero(y_gt.sum(-1)) - - y_gt = y_gt[pad_mask, :] - y_hat = y_hat[pad_mask, :] - - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('val_loss', loss, batch_size=1) - - # loss per feature, for logging only - for ii in range(no_features): - self.log( - 'val_loss_{}'.format(ii), - self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), - batch_size=1 - ) - - return loss + loss_actv - - - def configure_optimizers(self): - optimizer = torch.optim.AdamW( - self.parameters(), lr=self.peak_lr, weight_decay=self.weight_decay) - lr_scheduler = { - 'scheduler': PolynomialDecayLR( - optimizer, - warmup_updates=self.warmup_updates, - tot_updates=self.tot_updates, - lr=self.peak_lr, - end_lr=self.end_lr, - power=1.0, - ), - 'name': 'learning_rate', - 'interval': 'step', - 'frequency': 1, - } - return [optimizer], [lr_scheduler] - - @staticmethod - def add_model_specific_args(parent_parser): - parser = parent_parser.add_argument_group("GMAE_node") - parser.add_argument('--n_encoder_layers', type=int, default=3) - parser.add_argument('--n_decoder_layers', type=int, default=3) - parser.add_argument('--num_heads', type=int, default=8) - parser.add_argument('--hidden_dim', type=int, default=64) - parser.add_argument('--ffn_dim', type=int, default=64) - parser.add_argument('--intput_dropout_rate', type=float, default=0.1) - parser.add_argument('--dropout_rate', type=float, default=0.5) - parser.add_argument('--weight_decay', type=float, default=1e-5) - parser.add_argument('--attention_dropout_rate',type=float, default=0.1) - parser.add_argument('--checkpoint_path', type=str, default='') - parser.add_argument('--warmup_updates', type=int, default=40000) - parser.add_argument('--tot_updates', type=int, default=400000) - parser.add_argument('--peak_lr', type=float, default=0.0001) - parser.add_argument('--end_lr', type=float, default=1e-9) - parser.add_argument('--mask_ratio', type=float, default=0.5) - parser.add_argument('--validate', action='store_true', default=False) - parser.add_argument('--test', action='store_true', default=False) - - return parent_parser - class FeedForwardNetwork(nn.Module): def __init__(self, hidden_size, ffn_size, dropout_rate): @@ -402,6 +140,7 @@ def __init__(self, hidden_size, attention_dropout_rate, 
num_heads): self.output_layer = nn.Linear(num_heads * att_size, hidden_size) def forward(self, q, k, v, attn_bias=None, mask=None): + orig_q_size = q.size() d_k = self.att_size diff --git a/gridfm_graphkit/models/temp_leftovers.py b/gridfm_graphkit/models/temp_leftovers.py index e69de29..87e07be 100644 --- a/gridfm_graphkit/models/temp_leftovers.py +++ b/gridfm_graphkit/models/temp_leftovers.py @@ -0,0 +1,168 @@ +# temporary file to hold functions while they wait to be +# transferred to other modules + + + self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.in_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + self.out_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + + + def compute_pos_embeddings(self, batched_data): + attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + # graph_attn_bias + graph_attn_bias = attn_bias.clone() + graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( + 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # spatial pos + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + spatial_pos_bias + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset + + node_feature = self.input_proj(x) + node_feature = node_feature + \ + self.in_degree_encoder(in_degree) + \ + self.out_degree_encoder(out_degree) + graph_node_feature = node_feature + + return graph_node_feature, graph_attn_bias + + + def training_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + + # create a boolean mask where padding was added + # note that this assumes all input data had features with + # values >= 0 + mask = None + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + strategy = '' + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + + # print('pre loss 
shapes', y_gt.size(), y_hat.size()) + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('train_loss', loss) + self.log('activ_loss', loss_actv) + + return loss + loss_actv + + def validation_step(self, batched_data, batch_idx): + num_nodes = batched_data.x.size(1) + mask = None + + masked_entries = torch.sum(batched_data.x == 0, axis=2) + mask = masked_entries == batched_data.x.size(2) + + # add low-level random noise to input X + noise = np.random.normal( + loc=0.0, + scale=0.00001, # TODO make configurable + size=batched_data.x.size() + ) + device = batched_data.x.device + orig_data = batched_data.x + batched_data.x = batched_data.x + torch.Tensor(noise).to(device) + + # fifty-fifty split between random masking and power-flow solution + if np.random.uniform() > 0.5: + # find location of all nozero entries for masking and shuffle, select, mask + inds = torch.where(orig_data.flatten() != 0) + num_mask = int(self.mask_ratio * len(inds[0])) + shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + else: # assume only voltage and power variables to be masked + inds = torch.cat([ + # to pred + torch.range(xx,len(orig_data.flatten()), 25, dtype=int) + for xx in [ii for ii in range(17,25)] + ]) + + shuf_inds = inds[torch.randperm(len(inds))] + + nshape = batched_data.x.size() + batched_data.x = batched_data.x.flatten() + batched_data.x[shuf_inds.to(device)] = self.masking_value + batched_data.x = torch.reshape(batched_data.x, nshape) + + y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] + if graph_mask is not None: + y_gt = orig_data[graph_mask].float() + else: + y_gt = orig_data.float() + + no_features = y_hat.size(2) + y_gt = y_gt[~mask] + y_hat = y_hat[~mask] + y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] + y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] + pad_mask = torch.nonzero(y_gt.sum(-1)) + + y_gt = y_gt[pad_mask, :] + y_hat = y_hat[pad_mask, :] + + loss = self.loss_fn(y_hat, y_gt) + loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) + self.log('val_loss', loss, batch_size=1) + + # loss per feature, for logging only + for ii in range(no_features): + self.log( + 'val_loss_{}'.format(ii), + self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), + batch_size=1 + ) + + return loss + loss_actv \ No newline at end of file From 7da6b281613cb3d2778a36d1388e1d520a5767fd Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 06/55] rearrange the Data preprocessing to match existing Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 9 ++- gridfm_graphkit/datasets/transforms.py | 59 +++++++++++++++++++ gridfm_graphkit/models/gmae_wrapper.py | 3 - gridfm_graphkit/models/graphormer.py | 35 ++++++++++- gridfm_graphkit/models/temp_leftovers.py | 27 +-------- 5 files changed, 101 insertions(+), 32 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index b1e4bd0..2a70519 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -2,6 +2,7 @@ from 
gridfm_graphkit.datasets.transforms import ( AddEdgeWeights, AddNormalizedRandomWalkPE, + AddGraphormerEncodings ) import os.path as osp @@ -204,7 +205,13 @@ def get(self, idx): data = torch.load(file_name, weights_only=False) if self.transform: data = self.transform(data) - # print('data>>>>>>>',data) # TODO remove + + # TODO move this to the pretreatment when validated + gr_transform = AddGraphormerEncodings( + attr_name="gr", + ) + data = gr_transform(data) + print('data>>>>>>>', data) # TODO remove return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index fb770d3..744fa30 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,6 +15,10 @@ to_torch_csr_tensor, ) +import pyximport +pyximport.install(setup_args={'include_dirs': np.get_include()}) +import algos + class AddNormalizedRandomWalkPE(BaseTransform): r"""Adds the random walk positional encoding from the @@ -83,6 +87,61 @@ def get_pe(out: Tensor) -> Tensor: return data +def preprocess_item(data): + """ + TODO fill in header for the function + """ + edge_index = data.edge_index + N = data.num_nodes + edge_adj = torch.sparse.FloatTensor( + edge_index, + torch.ones(edge_index.shape[1]), + [N, N] + ) + + adj = edge_adj.to_dense() + + # node adj matrix [N, N] bool + adj = adj.bool() + + shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + spatial_pos = torch.from_numpy((shortest_path_result)).long() + attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated + + in_degree = adj.long().sum(dim=1).view(-1) + out_degree = adj.long().sum(dim=0).view(-1) + return attn_bias, spatial_pos, in_degree, out_degree + +class AddGraphormerEncodings(BaseTransform): + r"""... 
+ TODO update with encoding info + """ + + def __init__( + self, + attr_name: Optional[str] = "gres", # TODO remove if not needed + ) -> None: + self.attr_name = attr_name + + def forward(self, data: Data) -> Data: + if data.edge_index is None: + raise ValueError("Expected data.edge_index to be not None") + + N = data.num_nodes + if N is None: + raise ValueError("Expected data.num_nodes to be not None") + + + attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) + + # data[self.attr_name] = pe + data['attn_bias'] = attn_bias + data['spatial_pos'] = spatial_pos + data['in_degree'] = in_degree + data['in_degree'] = out_degree + + return data + class AddEdgeWeights(BaseTransform): """ diff --git a/gridfm_graphkit/models/gmae_wrapper.py b/gridfm_graphkit/models/gmae_wrapper.py index b76c915..dfc1367 100644 --- a/gridfm_graphkit/models/gmae_wrapper.py +++ b/gridfm_graphkit/models/gmae_wrapper.py @@ -1,9 +1,6 @@ import torch import numpy as np -import pyximport -pyximport.install(setup_args={'include_dirs': np.get_include()}) -import algos from torch_geometric.loader import NeighborSampler from torch_geometric.utils import to_undirected diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index a351c81..85355f8 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -1,4 +1,5 @@ +from gridfm_graphkit.io.registries import MODELS_REGISTRY import torch import numpy as np import torch.nn as nn @@ -10,7 +11,7 @@ @MODELS_REGISTRY.register("Graphormer") -class GMAE_node(pl.LightningModule): +class GMAE_node(nn.Module): """ TODO fill in description """ @@ -57,9 +58,38 @@ def __init__( nn.Linear(hidden_dim, self.n_node_features) ) + + # for pos embeddings + self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.in_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + self.out_degree_encoder = nn.Embedding( + 512, hidden_dim, padding_idx=0) + # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere # self.masking_value = -4 + def compute_pos_embeddings(self, batched_data): + attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + # graph_attn_bias + graph_attn_bias = attn_bias.clone() + graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( + 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # spatial pos + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + spatial_pos_bias + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset + + node_feature = self.input_proj(x) + node_feature = node_feature + \ + self.in_degree_encoder(in_degree) + \ + self.out_degree_encoder(out_degree) + graph_node_feature = node_feature + + return graph_node_feature, graph_attn_bias + def encoder(self, graph_node_feature, graph_attn_bias, mask=None): @@ -95,8 +125,7 @@ def forward(self, batched_data, mask=None): """ # TODO in the baseline code the PE is an input here and passes through - # a normalization before being concatenated to the features - + # a normalization before being concatenated to the features, follow this in final version graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) in_degree = batched_data.in_degree diff --git a/gridfm_graphkit/models/temp_leftovers.py 
b/gridfm_graphkit/models/temp_leftovers.py index 87e07be..f46a8c5 100644 --- a/gridfm_graphkit/models/temp_leftovers.py +++ b/gridfm_graphkit/models/temp_leftovers.py @@ -2,33 +2,10 @@ # transferred to other modules - self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) - self.in_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) - self.out_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + - def compute_pos_embeddings(self, batched_data): - attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - in_degree, out_degree = batched_data.in_degree, batched_data.in_degree - # graph_attn_bias - graph_attn_bias = attn_bias.clone() - graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( - 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] - # spatial pos - # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) - graph_attn_bias = graph_attn_bias + spatial_pos_bias - graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset - - node_feature = self.input_proj(x) - node_feature = node_feature + \ - self.in_degree_encoder(in_degree) + \ - self.out_degree_encoder(out_degree) - graph_node_feature = node_feature - - return graph_node_feature, graph_attn_bias + def training_step(self, batched_data, batch_idx): From b125061e0ccda45857457cbbc6dd83820ea79af0 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 07/55] changes up to decision between collator or to_dense_batch, will try to_dense batch first Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gps_transformer.py | 4 ++-- gridfm_graphkit/models/graphormer.py | 29 ++++++++--------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index b807ff3..50e7db9 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -121,9 +121,9 @@ def forward(self, x, pe, edge_index, edge_attr, batch): """ x_pe = self.pe_norm(pe) - print('enc>>>', x.size()) # TODO remove + # print('enc>>>', x.size()) # TODO remove x = self.encoder(x) - print('post>>>', x.size()) # TODO remove + # print('post>>>', x.size()) # TODO remove x = self.input_norm(x) x = torch.cat((x, x_pe), 1) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 85355f8..f4f997b 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -91,7 +91,7 @@ def compute_pos_embeddings(self, batched_data): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, mask=None): + def encoder(self, graph_node_feature, graph_attn_bias): graph_node_feature_masked = graph_node_feature graph_attn_bias_masked = graph_attn_bias @@ -99,30 +99,20 @@ def encoder(self, graph_node_feature, graph_attn_bias, mask=None): # transfomrer encoder output = self.input_dropout(graph_node_feature_masked) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked, mask) + output = enc_layer(output, graph_attn_bias_masked) output = self.encoder_final_ln(output) return output - def decoder(self, output, in_degree, out_degree, graph_attn_bias, mask=None): - - pos_embed = self.in_degree_encoder(in_degree) 
+ self.out_degree_encoder(out_degree) - output = output + pos_embed - - for enc_layer in self.decoder_layers: - output = enc_layer(output, graph_attn_bias, mask) - - output = self.decoder_final_ln(output) - output = self.out_proj(output) # [n_graph, n_node, n_feature] - - return output - - def forward(self, batched_data, mask=None): + def forward(self, x, pe, edge_index, edge_attr, batched_data): """ process a batch of data, applying the input mask, while excluding non-valid values that arrise during processing mask: incoming values to mask for prediction """ + mask=None # TODO remove + + # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through # a normalization before being concatenated to the features, follow this in final version @@ -131,12 +121,13 @@ def forward(self, batched_data, mask=None): in_degree = batched_data.in_degree out_degree = batched_data.out_degree - output = self.encoder(graph_node_feature, graph_attn_bias, mask) - output = self.encoder_to_decoder(output) - output = self.decoder(output, in_degree, out_degree, graph_attn_bias, mask) + output = self.encoder(graph_node_feature, graph_attn_bias) + output = self.decoder(output) + return output +# TODO maybe set this as the decoder class FeedForwardNetwork(nn.Module): def __init__(self, hidden_size, ffn_size, dropout_rate): super(FeedForwardNetwork, self).__init__() From 7fa15080ea59c5103a19c2e26f73bd80a7dc8f05 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 08/55] changes up to decision between collator or to_dense_batch, will try to_dense batch first Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index f4f997b..48d2cf6 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -118,8 +118,6 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data): # a normalization before being concatenated to the features, follow this in final version graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) - in_degree = batched_data.in_degree - out_degree = batched_data.out_degree output = self.encoder(graph_node_feature, graph_attn_bias) output = self.decoder(output) From 8125e0468822058e7eb2e123b614698c1a643ad2 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 09/55] cython - replace long by int for python 3 compat Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 3 ++- gridfm_graphkit/models/graphormer.py | 2 +- pyproject.toml | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 744fa30..04ac4ba 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,9 +15,10 @@ to_torch_csr_tensor, ) +import numpy as np import pyximport pyximport.install(setup_args={'include_dirs': np.get_include()}) -import algos +import gridfm_graphkit.models.algos as algos class AddNormalizedRandomWalkPE(BaseTransform): diff --git a/gridfm_graphkit/models/graphormer.py 
b/gridfm_graphkit/models/graphormer.py index 48d2cf6..06e2264 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -110,7 +110,7 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data): mask: incoming values to mask for prediction """ - mask=None # TODO remove + print('batch', batched_data) # TODO note that the x, pe are redundant or not needed, so clean up at the end diff --git a/pyproject.toml b/pyproject.toml index 51c8665..10719f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "pyyaml", "lightning", "seaborn", + "cython" ] [project.optional-dependencies] From a513b1ae96084ad549b376955b27c6ba6d96a006 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 10/55] cython - adjust version to use long Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/__init__.py | 3 ++- gridfm_graphkit/models/graphormer.py | 33 ++++++++++++++++++---------- pyproject.toml | 2 +- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/gridfm_graphkit/models/__init__.py b/gridfm_graphkit/models/__init__.py index de355d3..ce5432e 100644 --- a/gridfm_graphkit/models/__init__.py +++ b/gridfm_graphkit/models/__init__.py @@ -1,4 +1,5 @@ from gridfm_graphkit.models.gps_transformer import GPSTransformer from gridfm_graphkit.models.gnn_transformer import GNN_TransformerConv +from gridfm_graphkit.models.graphormer import Graphormer -__all__ = ["GPSTransformer", "GNN_TransformerConv"] +__all__ = ["GPSTransformer", "GNN_TransformerConv", "Graphormer"] diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 06e2264..7bb4f99 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -6,12 +6,12 @@ import pytorch_lightning as pl from torch.nn import functional as F -from losses import active_power_loss + @MODELS_REGISTRY.register("Graphormer") -class GMAE_node(nn.Module): +class Graphormer(nn.Module): """ TODO fill in description """ @@ -37,34 +37,43 @@ def __init__( args ): super().__init__() - self.save_hyperparameters() + self.n_node_features = args.model.input_dim self.num_heads = 8 # TODO make this configurable or to match their structure self.hidden_dim = args.model.hidden_size + self.n_encoder_layers = args.model.num_layers intput_dropout_rate = 0.3 dropout_rate = 0.3 attention_dropout_rate = 0.3 - self.input_proj = nn.Linear(n_node_features, hidden_dim) + self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) self.input_dropout = nn.Dropout(intput_dropout_rate) - encoders = [EncoderLayer(hidden_dim, hidden_dim, dropout_rate, attention_dropout_rate, num_heads) - for _ in range(n_encoder_layers)] + encoders = [ + EncoderLayer( + self.hidden_dim, + self.hidden_dim, + dropout_rate, + attention_dropout_rate, + self.num_heads + ) + for _ in range(self.n_encoder_layers) + ] self.encoder_layers = nn.ModuleList(encoders) - self.encoder_final_ln = nn.LayerNorm(hidden_dim) + self.encoder_final_ln = nn.LayerNorm(self.hidden_dim) self.decoder = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), + nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), - nn.Linear(hidden_dim, self.n_node_features) + nn.Linear(self.hidden_dim, self.n_node_features) ) # for pos embeddings - self.spatial_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0) + self.spatial_pos_encoder = nn.Embedding(512, 
self.num_heads, padding_idx=0) self.in_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( - 512, hidden_dim, padding_idx=0) + 512, self.hidden_dim, padding_idx=0) # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere # self.masking_value = -4 diff --git a/pyproject.toml b/pyproject.toml index 10719f1..0ddbe13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ "pyyaml", "lightning", "seaborn", - "cython" + "cython<3.1" ] [project.optional-dependencies] From d7a6193cdf9edfdb53bdec388c0de5f8aa09bb21 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:43 -0400 Subject: [PATCH 11/55] replace cython by networkx version of floyd_warshall Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 19 +++++++++++++------ pyproject.toml | 3 +-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 04ac4ba..c9313db 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,10 +15,13 @@ to_torch_csr_tensor, ) -import numpy as np -import pyximport -pyximport.install(setup_args={'include_dirs': np.get_include()}) -import gridfm_graphkit.models.algos as algos +# import numpy as np +# import pyximport +# pyximport.install(setup_args={'include_dirs': np.get_include()}) +# import gridfm_graphkit.models.algos as algos + +from networkx import floyd_warshall_numpy +from torch_geometric.utils import to_networkx class AddNormalizedRandomWalkPE(BaseTransform): @@ -105,7 +108,11 @@ def preprocess_item(data): # node adj matrix [N, N] bool adj = adj.bool() - shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + # shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + gg = to_networkx(data) + shortest_path_result = floyd_warshall_numpy(gg) + print('sp>>>', shortest_path_result) + print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long() attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated @@ -114,7 +121,7 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree class AddGraphormerEncodings(BaseTransform): - r"""... + """... 
TODO update with encoding info """ diff --git a/pyproject.toml b/pyproject.toml index 0ddbe13..0c09d17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,8 +50,7 @@ dependencies = [ "plotly", "pyyaml", "lightning", - "seaborn", - "cython<3.1" + "seaborn" ] [project.optional-dependencies] From 10bc942875c4e0da3281077c01ee06817a00efc4 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 12/55] put in place holder for pos embed to speed up development Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 3 ++- gridfm_graphkit/datasets/transforms.py | 25 ++++++++--------- gridfm_graphkit/models/graphormer.py | 27 ++++++++++++++++--- .../tasks/feature_reconstruction_task.py | 5 ++-- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 2a70519..cfe63e9 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -207,11 +207,12 @@ def get(self, idx): data = self.transform(data) # TODO move this to the pretreatment when validated + # print('datab>>>>>>>', data) gr_transform = AddGraphormerEncodings( attr_name="gr", ) data = gr_transform(data) - print('data>>>>>>>', data) # TODO remove + # print('dataa>>>>>>>', data) # TODO remove return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index c9313db..1929386 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -15,7 +15,7 @@ to_torch_csr_tensor, ) -# import numpy as np +import numpy as np # import pyximport # pyximport.install(setup_args={'include_dirs': np.get_include()}) # import gridfm_graphkit.models.algos as algos @@ -105,14 +105,15 @@ def preprocess_item(data): adj = edge_adj.to_dense() - # node adj matrix [N, N] bool - adj = adj.bool() - + # TODO replace the placeholder with actual algorithm + shortest_path_result = np.ones((N,N)) # shortest_path_result, path = algos.floyd_warshall(adj.numpy()) - gg = to_networkx(data) - shortest_path_result = floyd_warshall_numpy(gg) - print('sp>>>', shortest_path_result) - print(shortest_path_result.shape) + #gg = to_networkx(data) + #shortest_path_result = floyd_warshall_numpy(gg) + + # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. 
+ # print('sp>>>', shortest_path_result) + # print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long() attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated @@ -143,10 +144,10 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) # data[self.attr_name] = pe - data['attn_bias'] = attn_bias - data['spatial_pos'] = spatial_pos - data['in_degree'] = in_degree - data['in_degree'] = out_degree + data['attn_bias'] = attn_bias.unsqueeze(0) + data['spatial_pos'] = spatial_pos.unsqueeze(0) + data['in_degree'] = in_degree # assume undirected ie in == out + # data['out_degree'] = out_degree.unsqueeze(0) return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 7bb4f99..39fe60a 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -46,6 +46,22 @@ def __init__( dropout_rate = 0.3 attention_dropout_rate = 0.3 + # variables flown over from GPS TODO check + self.mask_dim = getattr(args.data, "mask_dim", 6) + self.mask_value = getattr(args.data, "mask_value", -1.0) + self.learn_mask = getattr(args.data, "learn_mask", True) + + if self.learn_mask: + self.mask_value = nn.Parameter( + torch.randn(self.mask_dim) + self.mask_value, + requires_grad=True, + ) + else: + self.mask_value = nn.Parameter( + torch.zeros(self.mask_dim) + self.mask_value, + requires_grad=False, + ) + self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) self.input_dropout = nn.Dropout(intput_dropout_rate) encoders = [ @@ -87,11 +103,13 @@ def compute_pos_embeddings(self, batched_data): 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + # print('xxxxxx', graph_attn_bias.size()) spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) + # print('nf>>', node_feature.size(), in_degree.size(), out_degree.size(), self.in_degree_encoder(in_degree).size()) node_feature = node_feature + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) @@ -112,22 +130,23 @@ def encoder(self, graph_node_feature, graph_attn_bias): output = self.encoder_final_ln(output) return output - def forward(self, x, pe, edge_index, edge_attr, batched_data): + def forward(self, x, pe, edge_index, edge_attr, batched_data, data): """ process a batch of data, applying the input mask, while excluding non-valid values that arrise during processing mask: incoming values to mask for prediction """ - print('batch', batched_data) + print('batch', data) + print(x.size()) # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through # a normalization before being concatenated to the features, follow this in final version - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(batched_data) - + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) output = self.encoder(graph_node_feature, graph_attn_bias) output = self.decoder(output) diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index e42d09d..0092805 100644 --- 
a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -74,11 +74,11 @@ def __init__(self, args, node_normalizers, edge_normalizers): self.edge_normalizers = edge_normalizers self.save_hyperparameters() - def forward(self, x, pe, edge_index, edge_attr, batch, mask=None): + def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch) + return self.model(x, pe, edge_index, edge_attr, batch, data) @rank_zero_only def on_fit_start(self): @@ -117,6 +117,7 @@ def shared_step(self, batch): edge_attr=batch.edge_attr, batch=batch.batch, mask=batch.mask, + data=batch ) loss_dict = self.loss_fn( From e1f8cd50c8ad022a6dea0d3a40d5d8e52c2fdd69 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 13/55] passed positional embedding Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 39fe60a..c10c742 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -103,7 +103,6 @@ def compute_pos_embeddings(self, batched_data): 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - # print('xxxxxx', graph_attn_bias.size()) spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From 4bd084526f892a2f6d264f130cfd5a12b9f88585 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 14/55] pass to loss Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index c10c742..ea55460 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -7,7 +7,7 @@ from torch.nn import functional as F - +from torch_geometric.utils import to_dense_batch @MODELS_REGISTRY.register("Graphormer") @@ -117,7 +117,7 @@ def compute_pos_embeddings(self, batched_data): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias): + def encoder(self, graph_node_feature, graph_attn_bias, batch=1): graph_node_feature_masked = graph_node_feature graph_attn_bias_masked = graph_attn_bias @@ -125,7 +125,7 @@ def encoder(self, graph_node_feature, graph_attn_bias): # transfomrer encoder output = self.input_dropout(graph_node_feature_masked) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked) + output = enc_layer(output, graph_attn_bias_masked, batch=batch) output = self.encoder_final_ln(output) return output @@ -137,7 +137,7 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ print('batch', data) - print(x.size()) + 
print(x.size(), batched_data) # TODO note that the x, pe are redundant or not needed, so clean up at the end @@ -145,8 +145,8 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): # a normalization before being concatenated to the features, follow this in final version graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) - print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) - output = self.encoder(graph_node_feature, graph_attn_bias) + # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) + output = self.encoder(graph_node_feature, graph_attn_bias, batch=batched_data) output = self.decoder(output) return output @@ -250,15 +250,19 @@ def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate) self.ffn_dropout = nn.Dropout(dropout_rate) - def forward(self, x, attn_bias=None, mask=None): + def forward(self, x, attn_bias=None, mask=None, batch=1): """ It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ y = self.self_attention_norm(x) - y = self.self_attention(y, y, y, attn_bias, mask) + # print(y.size(), attn_bias.size(), batch) + y, mask = to_dense_batch(y, batch) + # print('dense>>>', y.size(), mask.size()) + # print('msum>>>', mask.sum(dim=-1)) + y = self.self_attention(y, y, y, attn_bias, ~mask) y = self.self_attention_dropout(y) - x = x + y + x = x + torch.reshape(y, x.size()) y = self.ffn_norm(x) y = self.ffn(y) From 2fff2317fd6de82955b67ff191e11850e6d5ea81 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 15/55] pass loss, but note that AddGrEnc breaks multi-graph batch Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 3 ++- gridfm_graphkit/tasks/feature_reconstruction_task.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index ea55460..b45124f 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -50,6 +50,7 @@ def __init__( self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", True) + self.output_dim = args.model.output_dim if self.learn_mask: self.mask_value = nn.Parameter( @@ -80,7 +81,7 @@ def __init__( self.decoder = nn.Sequential( nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), - nn.Linear(self.hidden_dim, self.n_node_features) + nn.Linear(self.hidden_dim, self.output_dim) ) diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index 0092805..c4714b2 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch, data) + return self.model(x, pe, edge_index, edge_attr, batch) #, data @rank_zero_only def on_fit_start(self): From 4e995e925679748e38df762439eed515e1bf49be Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst 
<99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 16/55] wrap up before testing new batching Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index cfe63e9..465140f 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -172,6 +172,12 @@ def process(self): attr_name="pe", ) graph_data = pe_transform(graph_data) + + gr_transform = AddGraphormerEncodings( + attr_name="gr", + ) + graph_data = gr_transform(graph_data) + torch.save( graph_data, osp.join( @@ -208,10 +214,10 @@ def get(self, idx): # TODO move this to the pretreatment when validated # print('datab>>>>>>>', data) - gr_transform = AddGraphormerEncodings( - attr_name="gr", - ) - data = gr_transform(data) + # gr_transform = AddGraphormerEncodings( + # attr_name="gr", + # ) + # data = gr_transform(data) # print('dataa>>>>>>>', data) # TODO remove return data From f9da3c0af6140185bc7c00e59165d23199104d55 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 17/55] confirmation that flat tensors do not break batching Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 16 ++++++++-------- gridfm_graphkit/datasets/transforms.py | 9 +++++---- gridfm_graphkit/models/graphormer.py | 3 ++- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 465140f..32aaae3 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -173,10 +173,10 @@ def process(self): ) graph_data = pe_transform(graph_data) - gr_transform = AddGraphormerEncodings( - attr_name="gr", - ) - graph_data = gr_transform(graph_data) + # gr_transform = AddGraphormerEncodings( + # attr_name="gr", + # ) + # graph_data = gr_transform(graph_data) torch.save( graph_data, @@ -214,10 +214,10 @@ def get(self, idx): # TODO move this to the pretreatment when validated # print('datab>>>>>>>', data) - # gr_transform = AddGraphormerEncodings( - # attr_name="gr", - # ) - # data = gr_transform(data) + gr_transform = AddGraphormerEncodings( + attr_name="gr", + ) + data = gr_transform(data) # print('dataa>>>>>>>', data) # TODO remove return data diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 1929386..c5b5c5d 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -122,13 +122,13 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree class AddGraphormerEncodings(BaseTransform): - """... 
+ """ TODO update with encoding info """ def __init__( self, - attr_name: Optional[str] = "gres", # TODO remove if not needed + attr_name: Optional[str] = "gres" # TODO remove if not needed ) -> None: self.attr_name = attr_name @@ -144,8 +144,9 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) # data[self.attr_name] = pe - data['attn_bias'] = attn_bias.unsqueeze(0) - data['spatial_pos'] = spatial_pos.unsqueeze(0) + # print('******', attn_bias.size(), spatial_pos.size(), in_degree.size()) + data['attn_bias'] = attn_bias.flatten() + data['spatial_pos'] = spatial_pos.flatten() data['in_degree'] = in_degree # assume undirected ie in == out # data['out_degree'] = out_degree.unsqueeze(0) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index b45124f..125ad6e 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -137,8 +137,9 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ - print('batch', data) + print('***batch***', data) print(x.size(), batched_data) + print(batched_data.attn_bias.size(), batched_data.spatial_pos.size()) # TODO note that the x, pe are redundant or not needed, so clean up at the end From 0976cbaea9cde654a8cb19556cec4e7be8c6d557 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:44 -0400 Subject: [PATCH 18/55] add padding of attributes Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 5 ++ gridfm_graphkit/datasets/transforms.py | 88 +++++++++++++++++-- gridfm_graphkit/models/graphormer.py | 59 +++++++++++-- .../tasks/feature_reconstruction_task.py | 2 +- 4 files changed, 136 insertions(+), 18 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 32aaae3..0f4299e 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -201,6 +201,11 @@ def len(self): self.length = len(files) return self.length + def __cat_dim__(self, key, value, *args, **kwargs): + if key in ['attn_bias', 'spatial_pos', 'in_degree']: + return None + return super().__cat_dim__(key, value, *args, **kwargs) + def get(self, idx): file_name = osp.join( self.processed_dir, diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index c5b5c5d..64db08f 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -4,7 +4,7 @@ import torch from torch import Tensor from torch_geometric.transforms import BaseTransform -from typing import Optional +from typing import Optional, Any import torch_geometric.typing from torch_geometric.data import Data from torch_geometric.utils import ( @@ -91,15 +91,28 @@ def get_pe(out: Tensor) -> Tensor: return data +def add_node_attr(data: Data, value: Any, + attr_name: Optional[str] = None) -> Data: + if attr_name is None: + if 'x' in data: + x = data.x.view(-1, 1) if data.x.dim() == 1 else data.x + data.x = torch.cat([x, value.to(x.device, x.dtype)], dim=-1) + else: + data.x = value + else: + data[attr_name] = value + + return data + def preprocess_item(data): """ TODO fill in header for the function """ edge_index = data.edge_index N = data.num_nodes - edge_adj = torch.sparse.FloatTensor( + edge_adj = torch.sparse_coo_tensor( 
edge_index, - torch.ones(edge_index.shape[1]), + torch.ones(edge_index.shape[1]).to(data.x.device), [N, N] ) @@ -114,13 +127,54 @@ def preprocess_item(data): # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. # print('sp>>>', shortest_path_result) # print(shortest_path_result.shape) - spatial_pos = torch.from_numpy((shortest_path_result)).long() - attn_bias = torch.zeros([N, N], dtype=torch.float) # TODO verifie is updated + spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) + attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated in_degree = adj.long().sum(dim=1).view(-1) out_degree = adj.long().sum(dim=0).view(-1) return attn_bias, spatial_pos, in_degree, out_degree +def pad_1d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen], dtype=x.dtype) + new_x[:xlen] = x + x = new_x + return x.unsqueeze(0) + + +def pad_2d_unsqueeze(x, padlen): + x = x + 1 # pad id = 0 + # print('-------->', x.size()) + xlen, xdim = x.size() + if xlen < padlen: + new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) + new_x[:xlen, :] = x + x = new_x + return x.unsqueeze(0) + + +def pad_attn_bias_unsqueeze(x, padlen): + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros( + [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) + new_x[:xlen, :xlen] = x + new_x[xlen:, :xlen] = 0 + x = new_x + return x.unsqueeze(0) + + +def pad_spatial_pos_unsqueeze(x, padlen): + x = x + 1 + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) + new_x[:xlen, :xlen] = x + x = new_x + return x.unsqueeze(0) + class AddGraphormerEncodings(BaseTransform): """ TODO update with encoding info @@ -144,10 +198,26 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) # data[self.attr_name] = pe - # print('******', attn_bias.size(), spatial_pos.size(), in_degree.size()) - data['attn_bias'] = attn_bias.flatten() - data['spatial_pos'] = spatial_pos.flatten() - data['in_degree'] = in_degree # assume undirected ie in == out + # print('******>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) + # print(data) + # data[] = attn_bias.unsqueeze(0) #.flatten() + # data[] = spatial_pos.unsqueeze(0) #.flatten() + # data[] = in_degree # assume undirected ie in == out + # data['nodeslice'] = torch.from_numpy(np.array([N])) + + max_node_num = 2000 + attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node_num) + spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, max_node_num) + in_degree = pad_1d_unsqueeze(in_degree, max_node_num).squeeze() + + data = add_node_attr(data, attn_bias, attr_name='attn_bias') + data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') + data = add_node_attr(data, in_degree, attr_name='in_degree') + + data.x = pad_2d_unsqueeze(data.x, max_node_num).squeeze() + data.y = pad_2d_unsqueeze(data.y, max_node_num).squeeze() + + # print(data) # data['out_degree'] = out_degree.unsqueeze(0) return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 125ad6e..8dc79f6 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -9,6 +9,8 @@ from torch_geometric.utils import to_dense_batch +from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings + @MODELS_REGISTRY.register("Graphormer") class Graphormer(nn.Module): @@ -95,16 +97,48 @@ def __init__( # self.loss_fn = 
F.mse_loss # TODO remove eventually as they are specd elsewhere # self.masking_value = -4 - def compute_pos_embeddings(self, batched_data): + def compute_pos_embeddings(self, batched_data, batch): attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + + # gr_transform = AddGraphormerEncodings( + # attr_name="gr", + # ) + # batched_data = gr_transform(batched_data) + # attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x + # in_degree, out_degree = batched_data.in_degree, batched_data.in_degree + + + # print('--->', attn_bias.size(), attn_bias.device, batch.size()) + + # yy0, mask = to_dense_batch(attn_bias, batch=batch, max_num_nodes=2000, batch_size=8) + # yy1, mask = to_dense_batch(spatial_pos, batch=batch, max_num_nodes=2000, batch_size=8) + + # attn_bias = yy0 + # spatial_pos = yy1 + + # print('yyyyyy', yy0.size(), yy1.size(), x.size()) + + # attn_bias = attn_bias.reshape(8,-1) + # spatial_pos = spatial_pos.reshape(8,-1) + # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) + # odim = int(torch.sqrt(torch.as_tensor(attn_bias.size(-1))).item()) + # print('oooo', odim) + # attn_bias = attn_bias.reshape(-1,odim,odim) + # spatial_pos = spatial_pos.reshape(-1,odim,odim) + # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) + # graph_attn_bias graph_attn_bias = attn_bias.clone() graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] + # print('aaaaaaaaaa', graph_attn_bias.size(), graph_attn_bias.device) + # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + # print('sssssssssss', spatial_pos_bias.size(), spatial_pos_bias.device) + graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset @@ -137,16 +171,16 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ - print('***batch***', data) - print(x.size(), batched_data) - print(batched_data.attn_bias.size(), batched_data.spatial_pos.size()) + # print('***batch***', data) + # print(x.size(), batched_data) + # print(data.attn_bias.size(), data.spatial_pos.size()) # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through # a normalization before being concatenated to the features, follow this in final version - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, batched_data) # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) output = self.encoder(graph_node_feature, graph_attn_bias, batch=batched_data) output = self.decoder(output) @@ -257,17 +291,26 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ + # print('xxxxxxxxxxxxx', x.size(), batch.size()) + x, mask = to_dense_batch(x, batch) + y = self.self_attention_norm(x) # print(y.size(), attn_bias.size(), batch) - y, mask = to_dense_batch(y, batch) - # print('dense>>>', y.size(), mask.size()) - # print('msum>>>', mask.sum(dim=-1)) + + attn_bias = attn_bias.squeeze() + # attn_bias = attn_bias.permute(1, 
2, 0) + # attn_bias, maska = to_dense_batch(attn_bias, batch) + # print('dense>>>', y.size(), mask.size(), attn_bias.size()) + # print('msum>>>', mask.sum(dim=-1), ) y = self.self_attention(y, y, y, attn_bias, ~mask) y = self.self_attention_dropout(y) + # print('<<<<<>>>>', x.size(), y.size()) x = x + torch.reshape(y, x.size()) y = self.ffn_norm(x) y = self.ffn(y) y = self.ffn_dropout(y) x = x + y + x=x.flatten(0,1) + # print('222222222222222', x.size()) return x diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index c4714b2..117596c 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch) #, data + return self.model(x, pe, edge_index, edge_attr, batch, data) # @rank_zero_only def on_fit_start(self): From de141d7fd7644bfcce9ca8e8864f480898c89916 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 19/55] corrected cython integer Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 19 +++++++++++-------- gridfm_graphkit/models/graphormer.py | 2 +- pyproject.toml | 3 ++- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 64db08f..ec5f558 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -16,12 +16,12 @@ ) import numpy as np -# import pyximport -# pyximport.install(setup_args={'include_dirs': np.get_include()}) -# import gridfm_graphkit.models.algos as algos +import pyximport +pyximport.install(setup_args={'include_dirs': np.get_include()}) +import gridfm_graphkit.models.algos as algos -from networkx import floyd_warshall_numpy -from torch_geometric.utils import to_networkx +# from networkx import floyd_warshall_numpy +# from torch_geometric.utils import to_networkx class AddNormalizedRandomWalkPE(BaseTransform): @@ -116,16 +116,19 @@ def preprocess_item(data): [N, N] ) - adj = edge_adj.to_dense() + adj = edge_adj.to_dense().to(torch.int16) # TODO replace the placeholder with actual algorithm - shortest_path_result = np.ones((N,N)) - # shortest_path_result, path = algos.floyd_warshall(adj.numpy()) + # shortest_path_result = np.ones((N,N)) + + # print('+++++++',adj.dtype, adj.numpy().dtype) + shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32)) #gg = to_networkx(data) #shortest_path_result = floyd_warshall_numpy(gg) # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. 
# print('sp>>>', shortest_path_result) + # print('sp>>>', shortest_path_result.shape) # print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 8dc79f6..de24580 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -9,7 +9,7 @@ from torch_geometric.utils import to_dense_batch -from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings +# from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings @MODELS_REGISTRY.register("Graphormer") diff --git a/pyproject.toml b/pyproject.toml index 0c09d17..10719f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,8 @@ dependencies = [ "plotly", "pyyaml", "lightning", - "seaborn" + "seaborn", + "cython" ] [project.optional-dependencies] From 9d834a3a7313ed58287005e373f71f7530b6ba76 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 20/55] confirmation that route with cython and masking functions Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 5 +++-- gridfm_graphkit/models/graphormer.py | 24 ++++++++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index ec5f558..9aef961 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -138,7 +138,7 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree def pad_1d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 + # x = x + 1 # pad id = 0 #TODO remove all +1s xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros([padlen], dtype=x.dtype) @@ -148,11 +148,12 @@ def pad_1d_unsqueeze(x, padlen): def pad_2d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 + # x = x + 1 # pad id = 0 # print('-------->', x.size()) xlen, xdim = x.size() if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) + new_x[:,:] = -1e9 new_x[:xlen, :] = x x = new_x return x.unsqueeze(0) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index de24580..11b6c7c 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -152,15 +152,15 @@ def compute_pos_embeddings(self, batched_data, batch): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, batch=1): + def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): - graph_node_feature_masked = graph_node_feature + graph_node_feature_masked = graph_node_feature #TODO simplify this graph_attn_bias_masked = graph_attn_bias # transfomrer encoder output = self.input_dropout(graph_node_feature_masked) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked, batch=batch) + output = enc_layer(output, graph_attn_bias_masked, mask=mask, batch=batch) output = self.encoder_final_ln(output) return output @@ -175,6 +175,11 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): # print(x.size(), batched_data) # print(data.attn_bias.size(), data.spatial_pos.size()) + mask = None + masked_entries = torch.sum(x < -100, axis=-1) #TODO make this mesh with 
normalizn + mask = masked_entries == x.size(-1) + print('pad mask >>>', mask.size(), mask.sum()) + # TODO note that the x, pe are redundant or not needed, so clean up at the end # TODO in the baseline code the PE is an input here and passes through @@ -182,7 +187,7 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, batched_data) # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) - output = self.encoder(graph_node_feature, graph_attn_bias, batch=batched_data) + output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batched_data) output = self.decoder(output) return output @@ -291,8 +296,10 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ + # print('xxxxxxxxxxxxx', x.size(), batch.size()) - x, mask = to_dense_batch(x, batch) + x, bmask = to_dense_batch(x, batch) # TODO remove bmask if padding remains in final + mask, _ = to_dense_batch(mask, batch) y = self.self_attention_norm(x) # print(y.size(), attn_bias.size(), batch) @@ -300,9 +307,10 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): attn_bias = attn_bias.squeeze() # attn_bias = attn_bias.permute(1, 2, 0) # attn_bias, maska = to_dense_batch(attn_bias, batch) - # print('dense>>>', y.size(), mask.size(), attn_bias.size()) - # print('msum>>>', mask.sum(dim=-1), ) - y = self.self_attention(y, y, y, attn_bias, ~mask) + # print('dense>>>', y.size(), bmask.size(), attn_bias.size()) + # print('msum>>>', bmask.sum(dim=-1), ) + # print('msum2>>', mask.size(),mask.sum(dim=-1)) + y = self.self_attention(y, y, y, attn_bias, mask) y = self.self_attention_dropout(y) # print('<<<<<>>>>', x.size(), y.size()) x = x + torch.reshape(y, x.size()) From 903708c6ac7b1127454a357f143a8170c2d674c0 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 21/55] propogate mask to loss calculation for all models Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 ++ gridfm_graphkit/datasets/transforms.py | 13 +------------ gridfm_graphkit/models/gnn_transformer.py | 6 +++++- gridfm_graphkit/models/gps_transformer.py | 4 +++- gridfm_graphkit/models/graphormer.py | 15 ++++++++++----- .../tasks/feature_reconstruction_task.py | 8 ++++---- 6 files changed, 25 insertions(+), 23 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 0f4299e..09374ba 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -219,10 +219,12 @@ def get(self, idx): # TODO move this to the pretreatment when validated # print('datab>>>>>>>', data) + # print('qqqqqq', data.x.min(), data.x.max()) gr_transform = AddGraphormerEncodings( attr_name="gr", ) data = gr_transform(data) + # print('aaaaaaaaaaaaaaa', data.x.min(), data.x.max()) # print('dataa>>>>>>>', data) # TODO remove return data diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 9aef961..c3f039f 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -201,15 +201,7 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) - # 
data[self.attr_name] = pe - # print('******>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) - # print(data) - # data[] = attn_bias.unsqueeze(0) #.flatten() - # data[] = spatial_pos.unsqueeze(0) #.flatten() - # data[] = in_degree # assume undirected ie in == out - # data['nodeslice'] = torch.from_numpy(np.array([N])) - - max_node_num = 2000 + max_node_num = 118 # TODO extract from batch attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, max_node_num) in_degree = pad_1d_unsqueeze(in_degree, max_node_num).squeeze() @@ -221,9 +213,6 @@ def forward(self, data: Data) -> Data: data.x = pad_2d_unsqueeze(data.x, max_node_num).squeeze() data.y = pad_2d_unsqueeze(data.y, max_node_num).squeeze() - # print(data) - # data['out_degree'] = out_degree.unsqueeze(0) - return data diff --git a/gridfm_graphkit/models/gnn_transformer.py b/gridfm_graphkit/models/gnn_transformer.py index 9e1ab23..627cd49 100644 --- a/gridfm_graphkit/models/gnn_transformer.py +++ b/gridfm_graphkit/models/gnn_transformer.py @@ -93,4 +93,8 @@ def forward(self, x, pe, edge_index, edge_attr, batch): x = nn.LeakyReLU()(x) x = self.mlps(x) - return x + + masked_entries = torch.sum(x < -1e8, axis=-1) + mask = masked_entries >= 3 + + return x, ~mask diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index 50e7db9..2bae93e 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -139,4 +139,6 @@ def forward(self, x, pe, edge_index, edge_attr, batch): x = self.pre_decoder_norm(x) x = self.decoder(x) - return x + masked_entries = torch.sum(x < -1e8, axis=-1) + mask = masked_entries >= 3 + return x, ~mask diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 11b6c7c..382bb53 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -172,13 +172,15 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ # print('***batch***', data) - # print(x.size(), batched_data) + # print('====', x.size(), batched_data) # print(data.attn_bias.size(), data.spatial_pos.size()) mask = None - masked_entries = torch.sum(x < -100, axis=-1) #TODO make this mesh with normalizn - mask = masked_entries == x.size(-1) - print('pad mask >>>', mask.size(), mask.sum()) + masked_entries = torch.sum(x < -1e8, axis=-1) #TODO make this mesh with normalizn + # print('>>', masked_entries.size()) + # TODO key to make this more general to handle other masking objectives + mask = masked_entries >= 3 # due to masking # x.size(-1) + # print('pad mask >>>', mask.size(), mask.sum()) # TODO note that the x, pe are redundant or not needed, so clean up at the end @@ -190,7 +192,10 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batched_data) output = self.decoder(output) - return output + # evaluate where mask is True, so update it TODO + # print('ooooooooo', output[~mask].size()) + # print('bbbbbbbb', data.mask.size(), data.mask, data.mask.sum()/len(data.mask.flatten())) + return output, ~mask # TODO maybe set this as the decoder diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index 117596c..e7d5d79 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ 
b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -110,7 +110,7 @@ def on_fit_start(self): ) def shared_step(self, batch): - output = self.forward( + output, valid = self.forward( x=batch.x, pe=batch.pe, edge_index=batch.edge_index, @@ -121,11 +121,11 @@ def shared_step(self, batch): ) loss_dict = self.loss_fn( - output, - batch.y, + output[valid], + batch.y[valid], batch.edge_index, batch.edge_attr, - batch.mask, + batch.mask[valid], ) return output, loss_dict From 40140349b03f5d1ed10e991e917ee1fff3af2116 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 22/55] clean up and include cython code for encoding Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/algos.pyx | 91 +++++++++++++ gridfm_graphkit/models/gmae_collator.py | 127 ------------------ gridfm_graphkit/models/gmae_data.py | 165 ------------------------ gridfm_graphkit/models/gmae_wrapper.py | 88 ------------- gridfm_graphkit/models/graphormer.py | 11 +- 5 files changed, 94 insertions(+), 388 deletions(-) create mode 100644 gridfm_graphkit/models/algos.pyx delete mode 100644 gridfm_graphkit/models/gmae_collator.py delete mode 100644 gridfm_graphkit/models/gmae_data.py delete mode 100644 gridfm_graphkit/models/gmae_wrapper.py diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx new file mode 100644 index 0000000..8600367 --- /dev/null +++ b/gridfm_graphkit/models/algos.pyx @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import cython +from cython.parallel cimport prange, parallel +cimport numpy +import numpy + +def floyd_warshall(adjacency_matrix): + + (nrows, ncols) = adjacency_matrix.shape + assert nrows == ncols + cdef unsigned int n = nrows + + adj_mat_copy = adjacency_matrix.astype(numpy.int32, order='C', casting='safe', copy=True) + assert adj_mat_copy.flags['C_CONTIGUOUS'] + cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy + cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=numpy.int32) + + cdef unsigned int i, j, k + cdef long M_ij, M_ik, cost_ikkj + cdef long* M_ptr = &M[0,0] + cdef long* M_i_ptr + cdef long* M_k_ptr + + # set unreachable nodes distance to 510 + for i in range(n): + for j in range(n): + if i == j: + M[i][j] = 0 + elif M[i][j] == 0: + M[i][j] = 510 + + # floyed algo + for k in range(n): + M_k_ptr = M_ptr + n*k + for i in range(n): + M_i_ptr = M_ptr + n*i + M_ik = M_i_ptr[k] + for j in range(n): + cost_ikkj = M_ik + M_k_ptr[j] + M_ij = M_i_ptr[j] + if M_ij > cost_ikkj: + M_i_ptr[j] = cost_ikkj + path[i][j] = k + + # set unreachable path to 510 + for i in range(n): + for j in range(n): + if M[i][j] >= 510: + path[i][j] = 510 + M[i][j] = 510 + + return M, path + + +def get_all_edges(path, i, j): + cdef unsigned int k = path[i][j] + if k == 0: + return [] + else: + return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j) + + +def gen_edge_input(max_dist, path, edge_feat): + + (nrows, ncols) = path.shape + assert nrows == ncols + cdef unsigned int n = nrows + cdef unsigned int max_dist_copy = max_dist + + path_copy = path.astype(long, order='C', casting='safe', copy=True) + edge_feat_copy = edge_feat.astype(long, order='C', casting='safe', copy=True) + assert path_copy.flags['C_CONTIGUOUS'] + assert edge_feat_copy.flags['C_CONTIGUOUS'] + + cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, 
edge_feat.shape[-1]], dtype=numpy.int64) + cdef unsigned int i, j, k, num_path, cur + + for i in range(n): + for j in range(n): + if i == j: + continue + if path_copy[i][j] == 510: + continue + path = [i] + get_all_edges(path_copy, i, j) + [j] + num_path = len(path) - 1 + for k in range(num_path): + edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :] + + return edge_fea_all diff --git a/gridfm_graphkit/models/gmae_collator.py b/gridfm_graphkit/models/gmae_collator.py deleted file mode 100644 index f4bc532..0000000 --- a/gridfm_graphkit/models/gmae_collator.py +++ /dev/null @@ -1,127 +0,0 @@ -import torch - - -def pad_1d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 - xlen = x.size(0) - if xlen < padlen: - new_x = x.new_zeros([padlen], dtype=x.dtype) - new_x[:xlen] = x - x = new_x - return x.unsqueeze(0) - - -def pad_2d_unsqueeze(x, padlen): - x = x + 1 # pad id = 0 - # print('-------->', x.size()) - xlen, xdim = x.size() - if xlen < padlen: - new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) - new_x[:xlen, :] = x - x = new_x - return x.unsqueeze(0) - - -def pad_attn_bias_unsqueeze(x, padlen): - xlen = x.size(0) - if xlen < padlen: - new_x = x.new_zeros( - [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) - new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 - x = new_x - return x.unsqueeze(0) - - -def pad_spatial_pos_unsqueeze(x, padlen): - x = x + 1 - xlen = x.size(0) - if xlen < padlen: - new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) - new_x[:xlen, :xlen] = x - x = new_x - return x.unsqueeze(0) - - -class Batch(): - def __init__(self, - min_node_num, - attn_bias, - spatial_pos, - in_degree, - out_degree, - x, - y, - orig_id - ): - super(Batch, self).__init__() - self.min_node_num = int(min_node_num) - self.in_degree, self.out_degree = in_degree, out_degree - self.x, self.y = x, y - self.attn_bias, self.spatial_pos = attn_bias, spatial_pos - self.orig_id = orig_id - - def to(self, device): - self.in_degree, self.out_degree = self.in_degree.to( - device), self.out_degree.to(device) - self.x = self.x.to(device) - self.y = self.y.to(device) - self.attn_bias, self.spatial_pos = self.attn_bias.to( - device), self.spatial_pos.to(device) - return self - - def __len__(self): - return self.in_degree.size(0) - - -def collator(items, spatial_pos_max=20): - """ - custom collator, among other transformations... 
- - unequal input graphs are padded to all have the same size - - adds 1 to the input x via pad_2d_unsqueeze and similar functions - """ - items = [ - item for item in items if item is not None] - items = [ - (item[0], item[1], item[2], item[3], item[4], item[5], item[6], item[7]) - for item in items - ] - - # at this step all graphs in batch have their input size - xs, ys, adjs, attn_biases, spatial_poses, in_degrees, out_degrees, orig_ids = zip(*items) - - for idx, _ in enumerate(attn_biases): - attn_biases[idx][spatial_poses[idx] >= spatial_pos_max] = float('-inf') - max_node_num = max(i.size(0) for i in xs) - min_node_num = min(i.size(0) for i in xs) - - if all([torch.all(xx == yy) for xx,yy in zip(xs, ys)]): # then this is for and encoder-decoder setup - y = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in ys]) - else: - y = torch.stack(ys) - - # following steps pad the smaller graphs to match the largest for batching - # incidentally a constant value of 1 is added as well - x = torch.cat([pad_2d_unsqueeze(i, max_node_num) for i in xs]) - attn_bias = torch.cat([pad_attn_bias_unsqueeze( - i, max_node_num) for i in attn_biases]) - spatial_pos = torch.cat([pad_spatial_pos_unsqueeze(i, max_node_num) - for i in spatial_poses]) - in_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) - for i in in_degrees]) - out_degree = torch.cat([pad_1d_unsqueeze(i, max_node_num) - for i in out_degrees]) - - - return Batch( - min_node_num=min_node_num, - attn_bias=attn_bias, - spatial_pos=spatial_pos, - in_degree=in_degree, - out_degree=out_degree, - x=x, - y=y, - orig_id=orig_ids - ) diff --git a/gridfm_graphkit/models/gmae_data.py b/gridfm_graphkit/models/gmae_data.py deleted file mode 100644 index 1be7279..0000000 --- a/gridfm_graphkit/models/gmae_data.py +++ /dev/null @@ -1,165 +0,0 @@ -from collator import collator -from pytorch_lightning import LightningDataModule -from torch.utils.data import DataLoader, random_split -from functools import partial -import random -import torch -from wrapper import MyDataset, process_samples -from torch_geometric.utils import to_undirected - -from torch_geometric.datasets import Planetoid, WikiCS, Amazon -from torch_geometric.loader import NeighborSampler -import torch_geometric.transforms as T -import hqdata - - -dataset = None - - -def get_dataset(dataset_name='Cora', nodefile='', edgefile=''): - global dataset - path = 'dataset/' + dataset_name - if dataset is not None: - return dataset - - elif dataset_name in ['Cora', 'CiteSeer', 'PubMed']: - return Planetoid(root=path, name=dataset_name, transform=T.NormalizeFeatures()) - elif dataset_name == 'WikiCS': - return WikiCS(root=path, transform=T.NormalizeFeatures()) - elif dataset_name == 'Amazon-Computers': - return Amazon(root=path, name='computers', transform=T.NormalizeFeatures()) - elif dataset_name == 'Amazon-Photo': - return Amazon(root=path, name='photo', transform=T.NormalizeFeatures()) - elif dataset_name == 'hqdata': - return hqdata.simple_batch(nodefile, edgefile) - else: - raise NotImplementedError - -def read_csv(infile): - """ - assume two columns: instances number, file location and name - """ - - lines = [] - with open(infile, 'r') as ff: - for line in ff: - lines.append([xx.strip() for xx in line.split(',')]) - - return lines - -class GraphDataModule(LightningDataModule): - name = "Cora" - - def __init__( - self, - dataset_name: str = 'Cora', - num_workers: int = 8, - batch_size: int = 64, - seed: int = 42, - edgefile: str = '', - nodefile: str = '', - processedfile: str = '', # 
preprocessed dataset file in pt format - n_val_sampler: int = 10, - num_node_features: int = 25, - test=False, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.dataset_name = dataset_name - if nodefile and edgefile: - self.dataset = get_dataset(dataset_name, nodefile, edgefile) - else: - self.dataset = read_csv(processedfile) - self.num_node_features = num_node_features - self.seed = seed - self.n_val_sampler = n_val_sampler - - self.num_workers = num_workers - self.batch_size = batch_size - self.dataset_full = ... - self.dataset_train = ... - self.dataset_val = ... - self.dataset_test = ... # not currently in use - self.train_frac = 0.8 # train-val split only - self.istest = test - - - def setup(self, stage: str = None): - """ - automatically called, if prepare_data() is defined, then the latter - is called first - - during testing this section is not needed - """ - - if self.istest: - pass - else: - items = self.dataset # for disk data the dataset is in items form - self.dataset_full = MyDataset( - items, - settype='csv', - ) - - # split the train and validation data - train_set_size = int(self.train_frac*len(self.dataset_full)) - valid_set_size = len(self.dataset_full) - train_set_size - seed = torch.Generator().manual_seed(self.seed) - train_set, valid_set = random_split( - self.dataset_full, - [train_set_size, valid_set_size], - generator=seed - ) - print('**train and val dataset sizes**',len(train_set),len(valid_set)) - self.dataset_train = train_set - self.dataset_val = valid_set - - - def train_dataloader(self): - loader = DataLoader(self.dataset_train, batch_size=self.batch_size, - shuffle=True, - num_workers=self.num_workers, - collate_fn=partial(collator), - ) - return loader - - def val_dataloader(self): - loader = DataLoader(self.dataset_val, batch_size=self.batch_size, - shuffle=False, - num_workers=self.num_workers, - collate_fn=partial(collator), - ) - return loader - - def eval_dataloader(self): - """ - for downstream evaluation - """ - # do not wish to shuffle for evaluation - graphs_to_process = self.dataset.datalist - - - items = [] # from in mem dataset - - for graphdata in graphs_to_process: - # padding and mask creation should happend here - num_nodes = graphdata.num_nodes - ns0 = 1 # batch size - ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids - ns2 = graphdata.edge_index - data_item = process_samples( - ns0, - ns1, - ns2, - graphdata) + [0] # TODO completely remove the appended [0] - items.append(data_item) - - self.dataset_eval = MyDataset(items) - loader = DataLoader(self.dataset_eval, - batch_size=self.batch_size*self.n_val_sampler, - shuffle=False, - num_workers=self.num_workers, - collate_fn=partial(collator), - ) - return loader diff --git a/gridfm_graphkit/models/gmae_wrapper.py b/gridfm_graphkit/models/gmae_wrapper.py deleted file mode 100644 index dfc1367..0000000 --- a/gridfm_graphkit/models/gmae_wrapper.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch - -import numpy as np - -from torch_geometric.loader import NeighborSampler -from torch_geometric.utils import to_undirected - - - -def process_samples(batch_size, n_id, edge_index, dataset): - """ - transformation of sampled nodes to: - - node features of sampled set, - - y, - - edges tensor - - # TODO reconcile redundance of using edge_index and dataset - # in the case where the full graph is used - """ - - # print(edge_index) - # print('<------->') - if edge_index.size(1) != 0: - edge_index = to_undirected(edge_index) - n_nodes = len(n_id) - edge_sp_adj = 
torch.sparse.FloatTensor(edge_index, - torch.ones(edge_index.shape[1]), - [n_nodes, n_nodes]) - edge_adj = edge_sp_adj - - # print('<<---------------->>') - # print(n_id) - # print(dataset.x.size()) - # print(dataset.y.size()) - - return [dataset.x[n_id], dataset.y[n_id], edge_adj] - - -# GMAE_graph positional encoding -class MyDataset(torch.utils.data.Dataset): - def __init__(self, items, settype=''): - super(MyDataset, self).__init__() - - self.items = items - self.type = settype - - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - item = self.items[idx] - - if self.type=='csv': - graphdata = torch.load(item[1]) - num_nodes = graphdata.num_nodes - - # padding and mask creation should happend here - ns0 = 1 # batch size - ns1 = torch.arange(num_nodes, dtype=torch.int32) # node ids - ns2 = graphdata.edge_index - data_item = process_samples( - ns0, - ns1, - ns2, - graphdata) + [0] # TODO completely remove the appended [0] - else: - data_item = item # in memory dataset in use - - return preprocess_item(data_item) - - -def preprocess_item(item): - """ - """ - x, y, adj, orig_id = item[0], item[1], item[2].to_dense(), item[3] - N = x.size(0) - - # node adj matrix [N, N] bool - adj = adj.bool() - - shortest_path_result, path = algos.floyd_warshall(adj.numpy()) - spatial_pos = torch.from_numpy((shortest_path_result)).long() - attn_bias = torch.zeros([N, N], dtype=torch.float) - - in_degree = adj.long().sum(dim=1).view(-1) - out_degree = adj.long().sum(dim=0).view(-1) - return x, y, adj, attn_bias, spatial_pos, in_degree, out_degree, orig_id diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 382bb53..27c7da9 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -171,16 +171,11 @@ def forward(self, x, pe, edge_index, edge_attr, batched_data, data): mask: incoming values to mask for prediction """ - # print('***batch***', data) - # print('====', x.size(), batched_data) - # print(data.attn_bias.size(), data.spatial_pos.size()) mask = None - masked_entries = torch.sum(x < -1e8, axis=-1) #TODO make this mesh with normalizn - # print('>>', masked_entries.size()) - # TODO key to make this more general to handle other masking objectives - mask = masked_entries >= 3 # due to masking # x.size(-1) - # print('pad mask >>>', mask.size(), mask.sum()) + masked_entries = torch.sum(x < -1e8, axis=-1) + mask = masked_entries >= 3 # due to masking up to feature 6 of 9 # x.size(-1) + # TODO note that the x, pe are redundant or not needed, so clean up at the end From 6cc690ecf43240b82c76c6c7154318e0a58c70ae Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 23/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/temp_leftovers.py | 145 ----------------------- 1 file changed, 145 deletions(-) delete mode 100644 gridfm_graphkit/models/temp_leftovers.py diff --git a/gridfm_graphkit/models/temp_leftovers.py b/gridfm_graphkit/models/temp_leftovers.py deleted file mode 100644 index f46a8c5..0000000 --- a/gridfm_graphkit/models/temp_leftovers.py +++ /dev/null @@ -1,145 +0,0 @@ -# temporary file to hold functions while they wait to be -# transferred to other modules - - - - - - - - - def training_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - - # create a boolean mask where padding was added - # note that this assumes all 
input data had features with - # values >= 0 - mask = None - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - strategy = '' - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt = orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - - # print('pre loss shapes', y_gt.size(), y_hat.size()) - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('train_loss', loss) - self.log('activ_loss', loss_actv) - - return loss + loss_actv - - def validation_step(self, batched_data, batch_idx): - num_nodes = batched_data.x.size(1) - mask = None - - masked_entries = torch.sum(batched_data.x == 0, axis=2) - mask = masked_entries == batched_data.x.size(2) - - # add low-level random noise to input X - noise = np.random.normal( - loc=0.0, - scale=0.00001, # TODO make configurable - size=batched_data.x.size() - ) - device = batched_data.x.device - orig_data = batched_data.x - batched_data.x = batched_data.x + torch.Tensor(noise).to(device) - - # fifty-fifty split between random masking and power-flow solution - if np.random.uniform() > 0.5: - # find location of all nozero entries for masking and shuffle, select, mask - inds = torch.where(orig_data.flatten() != 0) - num_mask = int(self.mask_ratio * len(inds[0])) - shuf_inds = (inds[0][torch.randperm(len(inds[0]))],) - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds[0][:num_mask].to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - else: # assume only voltage and power variables to be masked - inds = torch.cat([ - # to pred - torch.range(xx,len(orig_data.flatten()), 25, dtype=int) - for xx in [ii for ii in range(17,25)] - ]) - - shuf_inds = inds[torch.randperm(len(inds))] - - nshape = batched_data.x.size() - batched_data.x = batched_data.x.flatten() - batched_data.x[shuf_inds.to(device)] = self.masking_value - batched_data.x = torch.reshape(batched_data.x, nshape) - - y_hat, graph_mask = self(batched_data, mask) # [n_graph, n_masked_node, n_feature] - if graph_mask is not None: - y_gt 
= orig_data[graph_mask].float() - else: - y_gt = orig_data.float() - - no_features = y_hat.size(2) - y_gt = y_gt[~mask] - y_hat = y_hat[~mask] - y_hat = y_hat.reshape(-1, y_hat.size(1)) # [n_graph*n_masked_node, n_feature] - y_gt = y_gt.reshape(-1, y_gt.size(1)) # [n_graph*n_masked_node, n_feature] - pad_mask = torch.nonzero(y_gt.sum(-1)) - - y_gt = y_gt[pad_mask, :] - y_hat = y_hat[pad_mask, :] - - loss = self.loss_fn(y_hat, y_gt) - loss_actv = self.alpha*self.loss_phys1(y_hat, y_gt, device) - self.log('val_loss', loss, batch_size=1) - - # loss per feature, for logging only - for ii in range(no_features): - self.log( - 'val_loss_{}'.format(ii), - self.loss_fn(y_hat[ii::no_features], y_gt[ii::no_features]), - batch_size=1 - ) - - return loss + loss_actv \ No newline at end of file From 87ef06510060c3d7cb922d11688e5bd81d9fc79b Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 24/55] rework function head and parameters Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gps_transformer.py | 2 - gridfm_graphkit/models/graphormer.py | 66 ++++++++++++----------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index 2bae93e..b3f1043 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -121,9 +121,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch): """ x_pe = self.pe_norm(pe) - # print('enc>>>', x.size()) # TODO remove x = self.encoder(x) - # print('post>>>', x.size()) # TODO remove x = self.input_norm(x) x = torch.cat((x, x_pe), 1) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 27c7da9..77a5ba2 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -15,45 +15,48 @@ @MODELS_REGISTRY.register("Graphormer") class Graphormer(nn.Module): """ - TODO fill in description + A Graph Transformer model based on the Graphormer architecture + + This model directly modifies the attention between nodes based on + its graph encodings. This requires padding the input nodes and propogating + the associated mask as needed. + + Args: + args (NestedNamespace): Parameters + + Attributes: + input_dim (int): Dimension of input node features. From ``args.model.input_dim``. + hidden_size (int): Hidden dimension size for all layers. From ``args.model.hidden_size``. + output_dim (int): Dimension of the output node features. From ``args.model.output_dim``. + edge_dim (int): Dimension of edge features. From ``args.model.edge_dim``. + pe_dim (int): Dimension of the positional encoding. Must be less than ``hidden_dim``. From ``args.model.pe_dim``. + num_layers (int): Number of GPSConv layers. From ``args.model.num_layers``. + heads (int, optional): Number of attention heads in GPSConv. From ``args.model.attention_head``. Defaults to 1. + dropout (float, optional): Dropout rate in GPSConv. From ``args.model.dropout``. Defaults to 0.0. + mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. + mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. + learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to True. 
+ """ - def __init__( - self, - # n_encoder_layers, - # n_decoder_layers, - # num_heads, - # hidden_dim, - # dropout_rate, - # intput_dropout_rate, - # weight_decay, - # ffn_dim, - # dataset_name, - # warmup_updates, - # tot_updates, - # peak_lr, - # end_lr, - # attention_dropout_rate, - # n_node_features, - # mask_ratio, - # n_val_sampler, - args - ): + def __init__(self, args): super().__init__() self.n_node_features = args.model.input_dim - self.num_heads = 8 # TODO make this configurable or to match their structure self.hidden_dim = args.model.hidden_size + self.output_dim = args.model.output_dim self.n_encoder_layers = args.model.num_layers - intput_dropout_rate = 0.3 - dropout_rate = 0.3 + self.num_heads = args.model.attention_head + + # TODO move these to config or calculate + self.dropout = getattr(args.model, "dropout", 0.0) # TODO propagate attention_dropout_rate = 0.3 # variables flown over from GPS TODO check self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", True) - self.output_dim = args.model.output_dim - + + # TODO verify function of mask if self.learn_mask: self.mask_value = nn.Parameter( torch.randn(self.mask_dim) + self.mask_value, @@ -66,13 +69,12 @@ def __init__( ) self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) - self.input_dropout = nn.Dropout(intput_dropout_rate) + self.input_dropout = nn.Dropout(self.dropout) encoders = [ EncoderLayer( self.hidden_dim, self.hidden_dim, - dropout_rate, - attention_dropout_rate, + self.dropout, self.num_heads ) for _ in range(self.n_encoder_layers) @@ -279,12 +281,12 @@ def forward(self, q, k, v, attn_bias=None, mask=None): class EncoderLayer(nn.Module): - def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, num_heads): + def __init__(self, hidden_size, ffn_size, dropout_rate, num_heads): super(EncoderLayer, self).__init__() self.self_attention_norm = nn.LayerNorm(hidden_size) self.self_attention = MultiHeadAttention( - hidden_size, attention_dropout_rate, num_heads) + hidden_size, dropout_rate, num_heads) self.self_attention_dropout = nn.Dropout(dropout_rate) self.ffn_norm = nn.LayerNorm(hidden_size) From 74486b3c5fdd71f6fad981c60a349f91d2b28847 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 25/55] clean up Graphormer code Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 149 ++++++------------ .../tasks/feature_reconstruction_task.py | 2 +- 2 files changed, 51 insertions(+), 100 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 77a5ba2..1188d2e 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -1,15 +1,10 @@ from gridfm_graphkit.io.registries import MODELS_REGISTRY import torch -import numpy as np import torch.nn as nn -import pytorch_lightning as pl - -from torch.nn import functional as F from torch_geometric.utils import to_dense_batch -# from gridfm_graphkit.datasets.transforms import AddGraphormerEncodings @MODELS_REGISTRY.register("Graphormer") @@ -25,17 +20,15 @@ class Graphormer(nn.Module): args (NestedNamespace): Parameters Attributes: - input_dim (int): Dimension of input node features. From ``args.model.input_dim``. - hidden_size (int): Hidden dimension size for all layers. 
From ``args.model.hidden_size``. + n_node_features (int): Dimension of input node features. From ``args.model.input_dim``. + hidden_dim (int): Hidden dimension size for all layers. From ``args.model.hidden_size``. output_dim (int): Dimension of the output node features. From ``args.model.output_dim``. - edge_dim (int): Dimension of edge features. From ``args.model.edge_dim``. - pe_dim (int): Dimension of the positional encoding. Must be less than ``hidden_dim``. From ``args.model.pe_dim``. - num_layers (int): Number of GPSConv layers. From ``args.model.num_layers``. - heads (int, optional): Number of attention heads in GPSConv. From ``args.model.attention_head``. Defaults to 1. - dropout (float, optional): Dropout rate in GPSConv. From ``args.model.dropout``. Defaults to 0.0. + n_encoder_layers (int): Number of transformer blocks. From ``args.model.num_layers``. + num_heads (int): Number of attention heads. From ``args.model.attention_head``. Defaults to 1. + dropout (float, optional): Dropout rate in attention blocks. From ``args.model.dropout``. Defaults to 0.0. mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. - learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to True. + learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to False. """ def __init__(self, args): @@ -46,17 +39,11 @@ def __init__(self, args): self.output_dim = args.model.output_dim self.n_encoder_layers = args.model.num_layers self.num_heads = args.model.attention_head - - # TODO move these to config or calculate - self.dropout = getattr(args.model, "dropout", 0.0) # TODO propagate - attention_dropout_rate = 0.3 - - # variables flown over from GPS TODO check + self.dropout = getattr(args.model, "dropout", 0.0) self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) - self.learn_mask = getattr(args.data, "learn_mask", True) + self.learn_mask = getattr(args.data, "learn_mask", False) - # TODO verify function of mask if self.learn_mask: self.mask_value = nn.Parameter( torch.randn(self.mask_dim) + self.mask_value, @@ -68,6 +55,7 @@ def __init__(self, args): requires_grad=False, ) + # model layers self.input_proj = nn.Linear(self.n_node_features, self.hidden_dim) self.input_dropout = nn.Dropout(self.dropout) encoders = [ @@ -88,64 +76,41 @@ def __init__(self, args): nn.Linear(self.hidden_dim, self.output_dim) ) - - # for pos embeddings + # for positional embeddings self.spatial_pos_encoder = nn.Embedding(512, self.num_heads, padding_idx=0) self.in_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) - # self.loss_fn = F.mse_loss # TODO remove eventually as they are specd elsewhere - # self.masking_value = -4 - def compute_pos_embeddings(self, batched_data, batch): - attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - in_degree, out_degree = batched_data.in_degree, batched_data.in_degree - - # gr_transform = AddGraphormerEncodings( - # attr_name="gr", - # ) - # batched_data = gr_transform(batched_data) - # attn_bias, spatial_pos, x = batched_data.attn_bias, batched_data.spatial_pos, batched_data.x - # in_degree, out_degree = batched_data.in_degree, 
batched_data.in_degree - - - # print('--->', attn_bias.size(), attn_bias.device, batch.size()) - - # yy0, mask = to_dense_batch(attn_bias, batch=batch, max_num_nodes=2000, batch_size=8) - # yy1, mask = to_dense_batch(spatial_pos, batch=batch, max_num_nodes=2000, batch_size=8) - - # attn_bias = yy0 - # spatial_pos = yy1 + def compute_pos_embeddings(self, data): + """ + Calculate Graphormer positional encodings, and attention biases - # print('yyyyyy', yy0.size(), yy1.size(), x.size()) + Args: + data (Data): Input node features of shape [num_nodes, input_dim]. - # attn_bias = attn_bias.reshape(8,-1) - # spatial_pos = spatial_pos.reshape(8,-1) - # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) - # odim = int(torch.sqrt(torch.as_tensor(attn_bias.size(-1))).item()) - # print('oooo', odim) - # attn_bias = attn_bias.reshape(-1,odim,odim) - # spatial_pos = spatial_pos.reshape(-1,odim,odim) - # print('-----', attn_bias.size(), spatial_pos.size(), x.size()) + Returns: + graph_node_feature (Tensor): data.x with positional encoding appended. + graph_attn_bias (Tensor): attention bais terms. + """ + attn_bias, spatial_pos, x = data.attn_bias, data.spatial_pos, data.x + in_degree, out_degree = data.in_degree, data.in_degree # graph_attn_bias graph_attn_bias = attn_bias.clone() graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( 1, self.num_heads, 1, 1) # [n_graph, n_head, n_node, n_node] - # print('aaaaaaaaaa', graph_attn_bias.size(), graph_attn_bias.device) # spatial pos # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) - # print('sssssssssss', spatial_pos_bias.size(), spatial_pos_bias.device) graph_attn_bias = graph_attn_bias + spatial_pos_bias graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - # print('nf>>', node_feature.size(), in_degree.size(), out_degree.size(), self.in_degree_encoder(in_degree).size()) node_feature = node_feature + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) @@ -156,46 +121,42 @@ def compute_pos_embeddings(self, batched_data, batch): def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): - graph_node_feature_masked = graph_node_feature #TODO simplify this - graph_attn_bias_masked = graph_attn_bias - # transfomrer encoder - output = self.input_dropout(graph_node_feature_masked) + output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias_masked, mask=mask, batch=batch) + output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) output = self.encoder_final_ln(output) return output - def forward(self, x, pe, edge_index, edge_attr, batched_data, data): - """ - process a batch of data, applying the input mask, while - excluding non-valid values that arrise during processing - mask: incoming values to mask for prediction + def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data=None): + """ + Forward pass for Graphormer. + + Args: + x (Tensor): Input node features of shape [num_nodes, input_dim]. + pe (Tensor): Positional encoding of shape [num_nodes, pe_dim]. + edge_index (Tensor): Edge indices for graph convolution. + edge_attr (Tensor): Edge feature tensor. + batch (Tensor): Batch vector assigning nodes to graphs. + data (Data): Pytorch Geometric Batch() object. + + Returns: + output (Tensor): Output node features of shape [num_nodes, output_dim]. 
""" - mask = None + # identify buffer nodes, and create a mask for them masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 # due to masking up to feature 6 of 9 # x.size(-1) - - - # TODO note that the x, pe are redundant or not needed, so clean up at the end - - # TODO in the baseline code the PE is an input here and passes through - # a normalization before being concatenated to the features, follow this in final version + mask = masked_entries >= 3 # due to masking up to feature 6 of 9 - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, batched_data) - # print('gnodes********', graph_node_feature.size(), graph_attn_bias.size()) - output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batched_data) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) output = self.decoder(output) - # evaluate where mask is True, so update it TODO - # print('ooooooooo', output[~mask].size()) - # print('bbbbbbbb', data.mask.size(), data.mask, data.mask.sum()/len(data.mask.flatten())) + # return the negative of the buffer mask to select data for loss calculation return output, ~mask -# TODO maybe set this as the decoder class FeedForwardNetwork(nn.Module): def __init__(self, hidden_size, ffn_size, dropout_rate): super(FeedForwardNetwork, self).__init__() @@ -212,6 +173,10 @@ def forward(self, x): class MultiHeadAttention(nn.Module): + """ + This is a slight modification of vanilla attention, to allow masking + of buffer nodes, and the addition of biasses to the attention mechanism. + """ def __init__(self, hidden_size, attention_dropout_rate, num_heads): super(MultiHeadAttention, self).__init__() @@ -228,7 +193,7 @@ def __init__(self, hidden_size, attention_dropout_rate, num_heads): self.output_layer = nn.Linear(num_heads * att_size, hidden_size) def forward(self, q, k, v, attn_bias=None, mask=None): - + orig_q_size = q.size() d_k = self.att_size @@ -248,11 +213,7 @@ def forward(self, q, k, v, attn_bias=None, mask=None): # Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V q = q * self.scale x = torch.matmul(q, k) # [b, h, q_len, k_len] - # print('**********', - # x.size(), q.size(), - # k.size(), v.size(), - # attn_bias.size(), mask.size() - # ) + if attn_bias is not None: if mask is not None: usm0 = mask.unsqueeze(1).unsqueeze(3) @@ -298,23 +259,13 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ - - # print('xxxxxxxxxxxxx', x.size(), batch.size()) - x, bmask = to_dense_batch(x, batch) # TODO remove bmask if padding remains in final + x, _ = to_dense_batch(x, batch) mask, _ = to_dense_batch(mask, batch) y = self.self_attention_norm(x) - # print(y.size(), attn_bias.size(), batch) - attn_bias = attn_bias.squeeze() - # attn_bias = attn_bias.permute(1, 2, 0) - # attn_bias, maska = to_dense_batch(attn_bias, batch) - # print('dense>>>', y.size(), bmask.size(), attn_bias.size()) - # print('msum>>>', bmask.sum(dim=-1), ) - # print('msum2>>', mask.size(),mask.sum(dim=-1)) y = self.self_attention(y, y, y, attn_bias, mask) y = self.self_attention_dropout(y) - # print('<<<<<>>>>', x.size(), y.size()) x = x + torch.reshape(y, x.size()) y = self.ffn_norm(x) @@ -322,5 +273,5 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): y = self.ffn_dropout(y) x = x + y x=x.flatten(0,1) - # print('222222222222222', x.size()) + return x diff 
--git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index e7d5d79..e3bd215 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch, data) # + return self.model(x, pe, edge_index, edge_attr, batch, data) # TODO prop args to GPS @rank_zero_only def on_fit_start(self): From e9a4d0435e249deab5827086ff243abcb0e60eae Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:45 -0400 Subject: [PATCH 26/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gps_transformer.py | 3 ++- gridfm_graphkit/models/graphormer.py | 4 ++-- gridfm_graphkit/tasks/feature_reconstruction_task.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index b3f1043..178570b 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -105,7 +105,7 @@ def __init__(self, args): requires_grad=False, ) - def forward(self, x, pe, edge_index, edge_attr, batch): + def forward(self, x, pe, edge_index, edge_attr, batch, data=None): """ Forward pass for the GPSTransformer. @@ -115,6 +115,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch): edge_index (Tensor): Edge indices for graph convolution. edge_attr (Tensor): Edge feature tensor. batch (Tensor): Batch vector assigning nodes to graphs. + data (Data): Pytorch Geometric Data/Batch object. Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 1188d2e..d861755 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -89,7 +89,7 @@ def compute_pos_embeddings(self, data): Calculate Graphormer positional encodings, and attention biases Args: - data (Data): Input node features of shape [num_nodes, input_dim]. + data (Data): Pytorch geometric Data/Batch object Returns: graph_node_feature (Tensor): data.x with positional encoding appended. @@ -139,7 +139,7 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= edge_index (Tensor): Edge indices for graph convolution. edge_attr (Tensor): Edge feature tensor. batch (Tensor): Batch vector assigning nodes to graphs. - data (Data): Pytorch Geometric Batch() object. + data (Data): Pytorch Geometric Data/Batch object. Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
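A minimal usage sketch of the contract documented above, assuming `model` is a Graphormer instance and `batch_data` is a torch_geometric Batch that already carries the attn_bias, spatial_pos and in_degree attributes added by the AddGraphormerEncodings transform; the variable names here are illustrative and not part of the patch.

    import torch.nn.functional as F

    # forward returns per-node predictions plus a boolean mask that is True for
    # real (non-buffer) nodes; the loss is evaluated only on those rows
    output, keep = model(
        batch_data.x, None, batch_data.edge_index,
        batch_data.edge_attr, batch_data.batch, batch_data,
    )
    # assumes batch_data.y holds per-node targets with output_dim columns
    loss = F.mse_loss(output[keep], batch_data.y[keep])
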
diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index e3bd215..96d79bd 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -78,7 +78,7 @@ def forward(self, x, pe, edge_index, edge_attr, batch, mask=None, data=None): if mask is not None: mask_value_expanded = self.model.mask_value.expand(x.shape[0], -1) x[:, : mask.shape[1]][mask] = mask_value_expanded[mask] - return self.model(x, pe, edge_index, edge_attr, batch, data) # TODO prop args to GPS + return self.model(x, pe, edge_index, edge_attr, batch, data) @rank_zero_only def on_fit_start(self): @@ -130,8 +130,6 @@ def shared_step(self, batch): return output, loss_dict def training_step(self, batch): - # print('trainbatch>>>>', batch.size()) # TODO remove - # print(batch) _, loss_dict = self.shared_step(batch) current_lr = self.optimizer.param_groups[0]["lr"] metrics = {} From e3430492a8206f845c5ced0be8eb40a62f5b44b5 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 27/55] flow dataset parameters from the config Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- .../datasets/powergrid_datamodule.py | 1 + gridfm_graphkit/datasets/powergrid_dataset.py | 23 ++++------ gridfm_graphkit/datasets/transforms.py | 43 +++++-------------- 3 files changed, 21 insertions(+), 46 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_datamodule.py b/gridfm_graphkit/datasets/powergrid_datamodule.py index c18c360..ff796a8 100644 --- a/gridfm_graphkit/datasets/powergrid_datamodule.py +++ b/gridfm_graphkit/datasets/powergrid_datamodule.py @@ -128,6 +128,7 @@ def setup(self, stage: str): pe_dim=self.args.model.pe_dim, mask_dim=self.args.data.mask_dim, transform=get_transform(args=self.args), + args=self.args.data ) self.datasets.append(dataset) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 09374ba..b58289a 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -44,6 +44,7 @@ def __init__( transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, pre_filter: Optional[Callable] = None, + args: Optional[dict] = None, ): self.norm_method = norm_method self.node_normalizer = node_normalizer @@ -52,6 +53,10 @@ def __init__( self.mask_dim = mask_dim self.length = None + if args.add_graphormer_encoding: + self.add_graphormer_encoding = args.add_graphormer_encoding + self.max_node_num = args.max_node_num + super().__init__(root, transform, pre_transform, pre_filter) # Load normalization stats if available @@ -173,11 +178,6 @@ def process(self): ) graph_data = pe_transform(graph_data) - # gr_transform = AddGraphormerEncodings( - # attr_name="gr", - # ) - # graph_data = gr_transform(graph_data) - torch.save( graph_data, osp.join( @@ -217,15 +217,10 @@ def get(self, idx): if self.transform: data = self.transform(data) - # TODO move this to the pretreatment when validated - # print('datab>>>>>>>', data) - # print('qqqqqq', data.x.min(), data.x.max()) - gr_transform = AddGraphormerEncodings( - attr_name="gr", - ) - data = gr_transform(data) - # print('aaaaaaaaaaaaaaa', data.x.min(), data.x.max()) - # print('dataa>>>>>>>', data) # TODO remove + if self.add_graphormer_encoding: + gr_transform = AddGraphormerEncodings(self.max_node_num) + data = 
gr_transform(data) + return data def change_transform(self, new_transform): diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index c3f039f..654d3fa 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -20,9 +20,6 @@ pyximport.install(setup_args={'include_dirs': np.get_include()}) import gridfm_graphkit.models.algos as algos -# from networkx import floyd_warshall_numpy -# from torch_geometric.utils import to_networkx - class AddNormalizedRandomWalkPE(BaseTransform): r"""Adds the random walk positional encoding from the @@ -118,18 +115,7 @@ def preprocess_item(data): adj = edge_adj.to_dense().to(torch.int16) - # TODO replace the placeholder with actual algorithm - # shortest_path_result = np.ones((N,N)) - - # print('+++++++',adj.dtype, adj.numpy().dtype) shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32)) - #gg = to_networkx(data) - #shortest_path_result = floyd_warshall_numpy(gg) - - # TODO the output of fw is integer number of hops in n x n, review if need to norm etc. - # print('sp>>>', shortest_path_result) - # print('sp>>>', shortest_path_result.shape) - # print(shortest_path_result.shape) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated @@ -138,7 +124,6 @@ def preprocess_item(data): return attn_bias, spatial_pos, in_degree, out_degree def pad_1d_unsqueeze(x, padlen): - # x = x + 1 # pad id = 0 #TODO remove all +1s xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros([padlen], dtype=x.dtype) @@ -146,10 +131,7 @@ def pad_1d_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) - def pad_2d_unsqueeze(x, padlen): - # x = x + 1 # pad id = 0 - # print('-------->', x.size()) xlen, xdim = x.size() if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) @@ -158,7 +140,6 @@ def pad_2d_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) - def pad_attn_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: @@ -169,9 +150,7 @@ def pad_attn_bias_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) - def pad_spatial_pos_unsqueeze(x, padlen): - x = x + 1 xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros([padlen, padlen], dtype=x.dtype) @@ -179,16 +158,18 @@ def pad_spatial_pos_unsqueeze(x, padlen): x = new_x return x.unsqueeze(0) + class AddGraphormerEncodings(BaseTransform): - """ - TODO update with encoding info + """Adds a positional encoding (node centrallity) to the given graph, as + well as the attention biases, as described in: Do transformers really + perform badly for graph representation?, C. Ying et al., 2021. 
""" def __init__( self, - attr_name: Optional[str] = "gres" # TODO remove if not needed + max_node_num: int, ) -> None: - self.attr_name = attr_name + self.max_node_num = max_node_num def forward(self, data: Data) -> Data: if data.edge_index is None: @@ -198,20 +179,18 @@ def forward(self, data: Data) -> Data: if N is None: raise ValueError("Expected data.num_nodes to be not None") - attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) - max_node_num = 118 # TODO extract from batch - attn_bias = pad_attn_bias_unsqueeze(attn_bias, max_node_num) - spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, max_node_num) - in_degree = pad_1d_unsqueeze(in_degree, max_node_num).squeeze() + attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) + spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) + in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') - data.x = pad_2d_unsqueeze(data.x, max_node_num).squeeze() - data.y = pad_2d_unsqueeze(data.y, max_node_num).squeeze() + data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() + data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() return data From ddb0ff806317500014e27feb40af9ff2e83f1498 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 28/55] baseline dataset finalized Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 +- gridfm_graphkit/datasets/transforms.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index b58289a..309a155 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -44,7 +44,7 @@ def __init__( transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, pre_filter: Optional[Callable] = None, - args: Optional[dict] = None, + args: Optional = None, ): self.norm_method = norm_method self.node_normalizer = node_normalizer diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 654d3fa..4221246 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -103,7 +103,7 @@ def add_node_attr(data: Data, value: Any, def preprocess_item(data): """ - TODO fill in header for the function + Calculation of the attention bias, and positional/structural data """ edge_index = data.edge_index N = data.num_nodes From 6124403e0da13d7f3c5a8c70bb992f56817af647 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 29/55] flow over baseline logic for edge encodings Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 +- gridfm_graphkit/datasets/transforms.py | 27 +++++++++++++-- gridfm_graphkit/models/graphormer.py | 34 +++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 309a155..f67d3cd 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ 
b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -202,7 +202,7 @@ def len(self): return self.length def __cat_dim__(self, key, value, *args, **kwargs): - if key in ['attn_bias', 'spatial_pos', 'in_degree']: + if key in ['attn_bias', 'spatial_pos', 'in_degree', 'edge_input']: return None return super().__cat_dim__(key, value, *args, **kwargs) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 4221246..1e7273b 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -101,11 +101,22 @@ def add_node_attr(data: Data, value: Any, return data +def get_edge_encoding(edge_attr): + if len(edge_attr.size()) == 1: + edge_attr = edge_attr[:, None] + attn_edge_type = torch.zeros([N, N, edge_attr.size(-1)], dtype=torch.long) + attn_edge_type[edge_index[0, :], edge_index[1, :] + ] = convert_to_single_emb(edge_attr.long()) + 1 + edge_input = algos.gen_edge_input(max_dist, path, attn_edge_type.numpy()) + + return attn_edge_type, torch.from_numpy(edge_input).long() + def preprocess_item(data): """ Calculation of the attention bias, and positional/structural data """ edge_index = data.edge_index + edge_attr = data.edge_attr N = data.num_nodes edge_adj = torch.sparse_coo_tensor( edge_index, @@ -119,9 +130,15 @@ def preprocess_item(data): spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated + if edge_attr is not None: + attn_edge_type, edge_input = get_edge_encoding(edge_attr) + else: + edge_input = None + attn_edge_type = None + in_degree = adj.long().sum(dim=1).view(-1) out_degree = adj.long().sum(dim=0).view(-1) - return attn_bias, spatial_pos, in_degree, out_degree + return attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input def pad_1d_unsqueeze(x, padlen): xlen = x.size(0) @@ -179,15 +196,21 @@ def forward(self, data: Data) -> Data: if N is None: raise ValueError("Expected data.num_nodes to be not None") - attn_bias, spatial_pos, in_degree, out_degree = preprocess_item(data) + attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() + print('eeeeee>', edge_input.size()) # TODO remove + edge_input = pad_attn_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name + # TODO need to verify padding for attn_edge_type + print('etetetet>', attn_edge_type.size()) data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') + data = add_node_attr(data, edge_input, attr_name='edge_input') + data = add_node_attr(data, edge_input, attr_name='attn_edge_type') data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index d861755..4522344 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -82,6 +82,8 @@ def __init__(self, args): 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) + self.edge_encoder = nn.Embedding( + 512 * 
self.n_edge_features + 1, num_heads, padding_idx=0) def compute_pos_embeddings(self, data): @@ -108,6 +110,38 @@ def compute_pos_embeddings(self, data): spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + spatial_pos_bias + + ########### + if data.edge_input is not None: + edge_input, attn_edge_type = data.edge_input, data.attn_edge_type + # edge feature + # TODO flow over the upstream logic for edge_types... + if self.edge_type == 'multi_hop': + spatial_pos_ = spatial_pos.clone() + spatial_pos_[spatial_pos_ == 0] = 1 # set pad to 1 + # set 1 to 1, x > 1 to x - 1 + spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_) + if self.multi_hop_max_dist > 0: + spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) + edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] + # [n_graph, n_node, n_node, max_dist, n_head] + edge_input = self.edge_encoder(edge_input).mean(-2) + max_dist = edge_input.size(-2) + edge_input_flat = edge_input.permute( + 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) + edge_input_flat = torch.bmm(edge_input_flat, self.edge_dis_encoder.weight.reshape( + -1, self.num_heads, self.num_heads)[:max_dist, :, :]) + edge_input = edge_input_flat.reshape( + max_dist, n_graph, n_node, n_node, self.num_heads).permute(1, 2, 3, 0, 4) + edge_input = (edge_input.sum(-2) / + (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2) + else: + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + edge_input = self.edge_encoder( + attn_edge_type).mean(-2).permute(0, 3, 1, 2) + graph_attn_bias = graph_attn_bias + edge_input + ########### + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) From 61fd07edf7ff7507febd50cb79055e58a883ecb8 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 30/55] added model parameters for managing edge data Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 4522344..bca816a 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -29,6 +29,8 @@ class Graphormer(nn.Module): mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to False. + edge_type (string, optional): Type of edge to consider multi_hop or not. From ``args.data.edge_type``. Defaults to multi_hop. + multi_hop_max_dist (int, optional): Maximum number of hops to consider at edges. From ``args.data.multi_hop_max_dist``. Defaults to 20. 
""" def __init__(self, args): @@ -43,6 +45,8 @@ def __init__(self, args): self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", False) + self.edge_type = getattr(args.model, "edge_type", "multi_hop") + self.multi_hop_max_dist = getattr(args.model, "multi_hop_max_dist", 20) if self.learn_mask: self.mask_value = nn.Parameter( @@ -111,7 +115,6 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + spatial_pos_bias - ########### if data.edge_input is not None: edge_input, attn_edge_type = data.edge_input, data.attn_edge_type # edge feature @@ -140,7 +143,6 @@ def compute_pos_embeddings(self, data): edge_input = self.edge_encoder( attn_edge_type).mean(-2).permute(0, 3, 1, 2) graph_attn_bias = graph_attn_bias + edge_input - ########### graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From 32c19f324dea998dd2d9d3e97fcfb8753ca57e44 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 31/55] work in progress for incorporating edge data Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 33 +++++++++++++++++++------- gridfm_graphkit/models/algos.pyx | 2 +- gridfm_graphkit/models/graphormer.py | 21 +++++++++++----- gridfm_graphkit/training/loss.py | 4 ++++ 4 files changed, 44 insertions(+), 16 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 1e7273b..e4ae56f 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -101,7 +101,15 @@ def add_node_attr(data: Data, value: Any, return data -def get_edge_encoding(edge_attr): +# TODO verify how this meshes with the node features, as compared to orig version +def convert_to_single_emb(x, offset=512): + feature_num = x.size(1) if len(x.size()) > 1 else 1 + feature_offset = 1 + \ + torch.arange(0, feature_num * offset, offset, dtype=torch.long) + x = x + feature_offset + return x + +def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): if len(edge_attr.size()) == 1: edge_attr = edge_attr[:, None] attn_edge_type = torch.zeros([N, N, edge_attr.size(-1)], dtype=torch.long) @@ -131,7 +139,8 @@ def preprocess_item(data): attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated if edge_attr is not None: - attn_edge_type, edge_input = get_edge_encoding(edge_attr) + max_dist = np.amax(shortest_path_result) + attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_dist, path) else: edge_input = None attn_edge_type = None @@ -161,7 +170,17 @@ def pad_attn_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros( - [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) + [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) # TODO verify if masking is needed given this is at -inf... + new_x[:xlen, :xlen] = x + new_x[xlen:, :xlen] = 0 # TODO verify if masking is needed given this is at -inf... 
+ x = new_x + return x.unsqueeze(0) + +def pad_edge_bias_unsqueeze(x, padlen): + xlen = x.size(0) + if xlen < padlen: + new_x = x.new_zeros( + (padlen, padlen) + x.size()[-2:], dtype=x.dtype).fill_(int(0)) new_x[:xlen, :xlen] = x new_x[xlen:, :xlen] = 0 x = new_x @@ -197,20 +216,16 @@ def forward(self, data: Data) -> Data: raise ValueError("Expected data.num_nodes to be not None") attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) - attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() - print('eeeeee>', edge_input.size()) # TODO remove - edge_input = pad_attn_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name - # TODO need to verify padding for attn_edge_type - print('etetetet>', attn_edge_type.size()) + edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') data = add_node_attr(data, edge_input, attr_name='edge_input') - data = add_node_attr(data, edge_input, attr_name='attn_edge_type') + data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index 8600367..d25b99c 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -74,7 +74,7 @@ def gen_edge_input(max_dist, path, edge_feat): assert path_copy.flags['C_CONTIGUOUS'] assert edge_feat_copy.flags['C_CONTIGUOUS'] - cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=numpy.int64) + cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=numpy.int32) cdef unsigned int i, j, k, num_path, cur for i in range(n): diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index bca816a..45bf63b 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -25,6 +25,7 @@ class Graphormer(nn.Module): output_dim (int): Dimension of the output node features. From ``args.model.output_dim``. n_encoder_layers (int): Number of transformer blocks. From ``args.model.num_layers``. num_heads (int): Number of attention heads. From ``args.model.attention_head``. Defaults to 1. + n_edge_features (int): Dimension of edge features. From ``args.model.edge_dim``. dropout (float, optional): Dropout rate in attention blocks. From ``args.model.dropout``. Defaults to 0.0. mask_dim (int, optional): Dimension of the mask vector. From ``args.data.mask_dim``. Defaults to 6. mask_value (float, optional): Initial value for learnable mask parameters. From ``args.data.mask_value``. Defaults to -1.0. 
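A minimal sketch, separate from the patch, of the multi-hop edge encoding these hunks work toward: the integer edge features collected along each shortest path are embedded per attention head, averaged over the feature dimension, summed over the hops and normalized by the path length, giving a per-pair bias that is added to the attention scores. Shapes, sizes and variable names below are illustrative assumptions, not the library's API.

    import torch
    import torch.nn as nn

    num_heads, num_edge_feat, n_node, max_dist = 8, 2, 4, 3
    edge_encoder = nn.Embedding(512 * num_edge_feat + 1, num_heads, padding_idx=0)

    # [n_node, n_node, max_dist, num_edge_feat] integer edge features gathered
    # along each shortest path (the role played by gen_edge_input)
    edge_input = torch.randint(1, 10, (n_node, n_node, max_dist, num_edge_feat))
    path_len = torch.full((n_node, n_node, 1), float(max_dist))

    # embed each hop, average over edge features, sum over hops, normalize by
    # path length -> one bias value per node pair and attention head
    edge_bias = edge_encoder(edge_input).mean(-2).sum(-2) / path_len
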
@@ -41,6 +42,7 @@ def __init__(self, args): self.output_dim = args.model.output_dim self.n_encoder_layers = args.model.num_layers self.num_heads = args.model.attention_head + self.n_edge_features = args.model.edge_dim self.dropout = getattr(args.model, "dropout", 0.0) self.mask_dim = getattr(args.data, "mask_dim", 6) self.mask_value = getattr(args.data, "mask_value", -1.0) @@ -86,8 +88,12 @@ def __init__(self, args): 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) - self.edge_encoder = nn.Embedding( - 512 * self.n_edge_features + 1, num_heads, padding_idx=0) + if self.n_edge_features is not None: + self.edge_encoder = nn.Embedding( + 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0) + if self.edge_type == 'multi_hop': + self.edge_dis_encoder = nn.Embedding( + 128 * self.num_heads * self.num_heads, 1) def compute_pos_embeddings(self, data): @@ -118,8 +124,8 @@ def compute_pos_embeddings(self, data): if data.edge_input is not None: edge_input, attn_edge_type = data.edge_input, data.attn_edge_type # edge feature - # TODO flow over the upstream logic for edge_types... if self.edge_type == 'multi_hop': + n_graph, n_node = edge_input.size()[:2] spatial_pos_ = spatial_pos.clone() spatial_pos_[spatial_pos_ == 0] = 1 # set pad to 1 # set 1 to 1, x > 1 to x - 1 @@ -128,7 +134,10 @@ def compute_pos_embeddings(self, data): spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] # [n_graph, n_node, n_node, max_dist, n_head] - edge_input = self.edge_encoder(edge_input).mean(-2) + print('!!!!!!', edge_input.size()) + print('mmmmm', edge_input.max(), edge_input.min()) + edge_input = self.edge_encoder(edge_input+1).mean(-2) # TODO determine source of -1 and correct + print('22222', edge_input.size()) max_dist = edge_input.size(-2) edge_input_flat = edge_input.permute( 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) @@ -140,9 +149,9 @@ def compute_pos_embeddings(self, data): (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2) else: # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - edge_input = self.edge_encoder( + edge_input = self.edge_encoder( # TODO test this path attn_edge_type).mean(-2).permute(0, 3, 1, 2) - graph_attn_bias = graph_attn_bias + edge_input + #graph_attn_bias = graph_attn_bias + edge_input # TODO uncomment graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset diff --git a/gridfm_graphkit/training/loss.py b/gridfm_graphkit/training/loss.py index f90953b..e705232 100644 --- a/gridfm_graphkit/training/loss.py +++ b/gridfm_graphkit/training/loss.py @@ -176,6 +176,10 @@ def forward(self, pred, target, edge_index=None, edge_attr=None, mask=None): loss_details = {} for i, loss_fn in enumerate(self.loss_functions): + print('---x', pred.size(), pred.min(), pred.max()) + print('---y', target.size(), target.min(), target.max()) + print('---ei', edge_index.size(), edge_index.min(), edge_index.max()) + print('----ea', edge_attr.size(), edge_attr.min(), edge_attr.max()) loss_output = loss_fn( pred, target, From bf1e221aeeb66f0813b7adbd2143307ff1a35923 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 32/55] multi-hop functional with N nodes upper limit Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 9 +++++++-- 
gridfm_graphkit/models/graphormer.py | 10 ++++++---- gridfm_graphkit/training/loss.py | 5 +---- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index e4ae56f..89df689 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -139,7 +139,8 @@ def preprocess_item(data): attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated if edge_attr is not None: - max_dist = np.amax(shortest_path_result) + max_dist = N # fix this to allow multiple graphs # np.amax(shortest_path_result) + # print('----->', max_dist) # TODO remove attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_dist, path) else: edge_input = None @@ -216,11 +217,15 @@ def forward(self, data: Data) -> Data: raise ValueError("Expected data.num_nodes to be not None") attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) + + # print('e>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name - + # TODO check padding of attn_edge_type, and num steps to sort out batching issue + # print('ffe>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) + data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') data = add_node_attr(data, in_degree, attr_name='in_degree') diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 45bf63b..f38159c 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -134,10 +134,10 @@ def compute_pos_embeddings(self, data): spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] # [n_graph, n_node, n_node, max_dist, n_head] - print('!!!!!!', edge_input.size()) - print('mmmmm', edge_input.max(), edge_input.min()) + # print('!!!!!!', edge_input.size()) + # print('mmmmm', edge_input.max(), edge_input.min()) edge_input = self.edge_encoder(edge_input+1).mean(-2) # TODO determine source of -1 and correct - print('22222', edge_input.size()) + # print('22222', edge_input.size()) max_dist = edge_input.size(-2) edge_input_flat = edge_input.permute( 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) @@ -149,9 +149,11 @@ def compute_pos_embeddings(self, data): (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2) else: # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + # TODO pad attn_edge_type for this path edge_input = self.edge_encoder( # TODO test this path attn_edge_type).mean(-2).permute(0, 3, 1, 2) - #graph_attn_bias = graph_attn_bias + edge_input # TODO uncomment + print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) + graph_attn_bias = graph_attn_bias + edge_input graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset diff --git a/gridfm_graphkit/training/loss.py b/gridfm_graphkit/training/loss.py index e705232..34664ee 100644 --- a/gridfm_graphkit/training/loss.py +++ b/gridfm_graphkit/training/loss.py @@ -176,10 +176,7 @@ def forward(self, pred, target, edge_index=None, edge_attr=None, 
mask=None): loss_details = {} for i, loss_fn in enumerate(self.loss_functions): - print('---x', pred.size(), pred.min(), pred.max()) - print('---y', target.size(), target.min(), target.max()) - print('---ei', edge_index.size(), edge_index.min(), edge_index.max()) - print('----ea', edge_attr.size(), edge_attr.min(), edge_attr.max()) + loss_output = loss_fn( pred, target, From fc5a93e25d28eea36559d59f245840bf1bc4d3ec Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:46 -0400 Subject: [PATCH 33/55] set max hops for edge encoding Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 6 +++++- gridfm_graphkit/datasets/transforms.py | 19 +++++++++++++------ gridfm_graphkit/models/algos.pyx | 5 ++++- gridfm_graphkit/models/graphormer.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index f67d3cd..3691500 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -56,6 +56,7 @@ def __init__( if args.add_graphormer_encoding: self.add_graphormer_encoding = args.add_graphormer_encoding self.max_node_num = args.max_node_num + self.max_hops = args.max_hops super().__init__(root, transform, pre_transform, pre_filter) @@ -218,7 +219,10 @@ def get(self, idx): data = self.transform(data) if self.add_graphormer_encoding: - gr_transform = AddGraphormerEncodings(self.max_node_num) + gr_transform = AddGraphormerEncodings( + self.max_node_num, + self.max_hops + ) data = gr_transform(data) return data diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 89df689..f7f8799 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -119,7 +119,7 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): return attn_edge_type, torch.from_numpy(edge_input).long() -def preprocess_item(data): +def preprocess_item(data, max_hops): """ Calculation of the attention bias, and positional/structural data """ @@ -134,14 +134,18 @@ def preprocess_item(data): adj = edge_adj.to_dense().to(torch.int16) - shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32)) + # get shortest paths in number of hops (shortest_path_result) and intermediate nodes + # for those shortest paths (path) + shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32), max_hops) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated if edge_attr is not None: - max_dist = N # fix this to allow multiple graphs # np.amax(shortest_path_result) - # print('----->', max_dist) # TODO remove - attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_dist, path) + # print(path) + # print(path.shape) + # print(shortest_path_result) + # print(shortest_path_result.shape) + attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_hops, path) else: edge_input = None attn_edge_type = None @@ -205,8 +209,10 @@ class AddGraphormerEncodings(BaseTransform): def __init__( self, max_node_num: int, + max_hops: int, ) -> None: self.max_node_num = max_node_num + self.max_hops = max_hops def forward(self, data: Data) -> Data: if data.edge_index is None: @@ -216,7 +222,8 @@ def 
forward(self, data: Data) -> Data: if N is None: raise ValueError("Expected data.num_nodes to be not None") - attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = preprocess_item(data) + attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = \ + preprocess_item(data, self.max_hops) # print('e>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index d25b99c..7ab5851 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -6,11 +6,12 @@ from cython.parallel cimport prange, parallel cimport numpy import numpy -def floyd_warshall(adjacency_matrix): +def floyd_warshall(adjacency_matrix, max_hops): (nrows, ncols) = adjacency_matrix.shape assert nrows == ncols cdef unsigned int n = nrows + cdef unsigned int max_hops_copy = max_hops adj_mat_copy = adjacency_matrix.astype(numpy.int32, order='C', casting='safe', copy=True) assert adj_mat_copy.flags['C_CONTIGUOUS'] @@ -40,6 +41,8 @@ def floyd_warshall(adjacency_matrix): for j in range(n): cost_ikkj = M_ik + M_k_ptr[j] M_ij = M_i_ptr[j] + if cost_ikkj > max_hops_copy: # TODO flow from above + continue if M_ij > cost_ikkj: M_i_ptr[j] = cost_ikkj path[i][j] = k diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index f38159c..8005863 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -48,7 +48,7 @@ def __init__(self, args): self.mask_value = getattr(args.data, "mask_value", -1.0) self.learn_mask = getattr(args.data, "learn_mask", False) self.edge_type = getattr(args.model, "edge_type", "multi_hop") - self.multi_hop_max_dist = getattr(args.model, "multi_hop_max_dist", 20) + self.multi_hop_max_dist = getattr(args.data, "max_hops", 20) if self.learn_mask: self.mask_value = nn.Parameter( From 04aed1033fa54ab2a164ba3e1222ab70e2c3a307 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 34/55] add buffer for single hop case Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 3 ++- gridfm_graphkit/models/graphormer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index f7f8799..916a2ea 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -185,7 +185,7 @@ def pad_edge_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros( - (padlen, padlen) + x.size()[-2:], dtype=x.dtype).fill_(int(0)) + (padlen, padlen) + x.size()[2:], dtype=x.dtype).fill_(int(0)) new_x[:xlen, :xlen] = x new_x[xlen:, :xlen] = 0 x = new_x @@ -230,6 +230,7 @@ def forward(self, data: Data) -> Data: spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name + attn_edge_type = pad_edge_bias_unsqueeze(attn_edge_type, self.max_node_num) # TODO check padding of attn_edge_type, and num steps to sort out batching issue # print('ffe>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 
8005863..dbed710 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -152,7 +152,7 @@ def compute_pos_embeddings(self, data): # TODO pad attn_edge_type for this path edge_input = self.edge_encoder( # TODO test this path attn_edge_type).mean(-2).permute(0, 3, 1, 2) - print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) + # print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) graph_attn_bias = graph_attn_bias + edge_input graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From b9e9efd3b5bafc868352d23a1fcc8f6c438abf2d Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 35/55] checkpoint before cleanup and testing Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index 7ab5851..003eae9 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -41,7 +41,7 @@ def floyd_warshall(adjacency_matrix, max_hops): for j in range(n): cost_ikkj = M_ik + M_k_ptr[j] M_ij = M_i_ptr[j] - if cost_ikkj > max_hops_copy: # TODO flow from above + if cost_ikkj > max_hops_copy: continue if M_ij > cost_ikkj: M_i_ptr[j] = cost_ikkj From 98e1b123c0af6b7d6d66eac66565688a141e3d42 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 36/55] TODOs cleared from transforms Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 916a2ea..452a265 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -101,7 +101,7 @@ def add_node_attr(data: Data, value: Any, return data -# TODO verify how this meshes with the node features, as compared to orig version + def convert_to_single_emb(x, offset=512): feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ @@ -109,6 +109,7 @@ def convert_to_single_emb(x, offset=512): x = x + feature_offset return x + def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): if len(edge_attr.size()) == 1: edge_attr = edge_attr[:, None] @@ -119,6 +120,7 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): return attn_edge_type, torch.from_numpy(edge_input).long() + def preprocess_item(data, max_hops): """ Calculation of the attention bias, and positional/structural data @@ -138,7 +140,7 @@ def preprocess_item(data, max_hops): # for those shortest paths (path) shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32), max_hops) spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) - attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) # TODO verifie is updated + attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) if edge_attr is not None: # print(path) @@ -175,9 +177,9 @@ def pad_attn_bias_unsqueeze(x, padlen): xlen = x.size(0) if xlen < padlen: new_x = x.new_zeros( - [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) # TODO verify if masking is needed given this is at -inf... 
+ [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 # TODO verify if masking is needed given this is at -inf... + new_x[xlen:, :xlen] = 0 x = new_x return x.unsqueeze(0) @@ -225,14 +227,11 @@ def forward(self, data: Data) -> Data: attn_bias, spatial_pos, in_degree, out_degree, attn_edge_type, edge_input = \ preprocess_item(data, self.max_hops) - # print('e>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) attn_bias = pad_attn_bias_unsqueeze(attn_bias, self.max_node_num) spatial_pos = pad_spatial_pos_unsqueeze(spatial_pos, self.max_node_num) in_degree = pad_1d_unsqueeze(in_degree, self.max_node_num).squeeze() - edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) # TODO if using change function name + edge_input = pad_edge_bias_unsqueeze(edge_input, self.max_node_num) attn_edge_type = pad_edge_bias_unsqueeze(attn_edge_type, self.max_node_num) - # TODO check padding of attn_edge_type, and num steps to sort out batching issue - # print('ffe>E>E>E>>E>E>', attn_edge_type.size(), edge_input.size()) data = add_node_attr(data, attn_bias, attr_name='attn_bias') data = add_node_attr(data, spatial_pos, attr_name='spatial_pos') From a3b5b61b87090c51ad74b37f0e00d7f9f0a68a69 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 37/55] TODOs cleared from transforms Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 452a265..86a98c8 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -143,10 +143,6 @@ def preprocess_item(data, max_hops): attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) if edge_attr is not None: - # print(path) - # print(path.shape) - # print(shortest_path_result) - # print(shortest_path_result.shape) attn_edge_type, edge_input = get_edge_encoding(edge_attr, N, edge_index, max_hops, path) else: edge_input = None From 498709bfd5fbf5b574dbe180c3a888ce436c37d9 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 38/55] TODOs cleared from graphormer Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index dbed710..42d5f4d 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -134,10 +134,7 @@ def compute_pos_embeddings(self, data): spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) edge_input = edge_input[:, :, :, :self.multi_hop_max_dist, :] # [n_graph, n_node, n_node, max_dist, n_head] - # print('!!!!!!', edge_input.size()) - # print('mmmmm', edge_input.max(), edge_input.min()) - edge_input = self.edge_encoder(edge_input+1).mean(-2) # TODO determine source of -1 and correct - # print('22222', edge_input.size()) + edge_input = self.edge_encoder(edge_input+1).mean(-2) max_dist = edge_input.size(-2) edge_input_flat = edge_input.permute( 3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) @@ -149,10 +146,9 @@ def compute_pos_embeddings(self, data): (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 
1, 2) else: # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] - # TODO pad attn_edge_type for this path - edge_input = self.edge_encoder( # TODO test this path + edge_input = self.edge_encoder( attn_edge_type).mean(-2).permute(0, 3, 1, 2) - # print('sum>>>', graph_attn_bias.size(), edge_input.size(), attn_edge_type.size()) + graph_attn_bias = graph_attn_bias + edge_input graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset From 7f0e2851290e47c4eaa9fc26fe2cad7e5dc10ecf Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 39/55] add raw graphormer config Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- .../config/my_gridFMv0.2_pretraining.yaml | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 examples/config/my_gridFMv0.2_pretraining.yaml diff --git a/examples/config/my_gridFMv0.2_pretraining.yaml b/examples/config/my_gridFMv0.2_pretraining.yaml new file mode 100644 index 0000000..d740d34 --- /dev/null +++ b/examples/config/my_gridFMv0.2_pretraining.yaml @@ -0,0 +1,56 @@ +callbacks: + patience: 100 + tol: 0 +data: + baseMVA: 100 + learn_mask: false + mask_dim: 6 + mask_ratio: 0.5 + mask_type: rnd + mask_value: -1.0 + networks: + # - Texas2k_case1_2016summerpeak + - case24_ieee_rts + - case118_ieee + - case300_ieee + normalization: baseMVAnorm + scenarios: + - 5000 + - 5000 + - 5000 + test_ratio: 0.1 + val_ratio: 0.1 + workers: 4 + add_graphormer_encoding: true + max_node_num: 300 # necessary for Graphormer + max_hops: 6 # for the edge encoding, should match + edge_type: multi_hop # singlehop +model: + attention_head: 8 + dropout: 0.1 + edge_dim: 2 + hidden_size: 123 + input_dim: 9 + num_layers: 14 + output_dim: 6 + pe_dim: 20 + type: Graphormer #GPSTransformer # +optimizer: + beta1: 0.9 + beta2: 0.999 + learning_rate: 0.0001 + lr_decay: 0.7 + lr_patience: 10 +seed: 0 +training: + batch_size: 8 + epochs: 500 + loss_weights: + - 0.01 + - 0.99 + losses: + - MaskedMSE + - PBE + accelerator: auto + devices: auto + strategy: auto From d3abe1f77d87f1061debcaf2836ba8e257529e99 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 40/55] rename graphormer config Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- ...my_gridFMv0.2_pretraining.yaml => graphormer_pretraining.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/config/{my_gridFMv0.2_pretraining.yaml => graphormer_pretraining.yaml} (100%) diff --git a/examples/config/my_gridFMv0.2_pretraining.yaml b/examples/config/graphormer_pretraining.yaml similarity index 100% rename from examples/config/my_gridFMv0.2_pretraining.yaml rename to examples/config/graphormer_pretraining.yaml From f63a760a8bbf0f3533b4e274e8602e9bb3662472 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:47 -0400 Subject: [PATCH 41/55] adjust calc of mask Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 16 +++++++--------- gridfm_graphkit/models/graphormer.py | 4 +++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 86a98c8..5d14a46 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ 
b/gridfm_graphkit/datasets/transforms.py @@ -88,14 +88,12 @@ def get_pe(out: Tensor) -> Tensor: return data -def add_node_attr(data: Data, value: Any, - attr_name: Optional[str] = None) -> Data: +def add_node_attr(data: Data, + value: Any, + attr_name: Optional[str] = None + ) -> Data: if attr_name is None: - if 'x' in data: - x = data.x.view(-1, 1) if data.x.dim() == 1 else data.x - data.x = torch.cat([x, value.to(x.device, x.dtype)], dim=-1) - else: - data.x = value + raise ValueError("Expected attr_name to be not None") else: data[attr_name] = value @@ -165,7 +163,7 @@ def pad_2d_unsqueeze(x, padlen): if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) new_x[:,:] = -1e9 - new_x[:xlen, :] = x + new_x[:xlen, :] = x # TODO verify this step as well with x shape x = new_x return x.unsqueeze(0) @@ -175,7 +173,7 @@ def pad_attn_bias_unsqueeze(x, padlen): new_x = x.new_zeros( [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 + new_x[xlen:, :xlen] = 0 # TODO verify this step x = new_x return x.unsqueeze(0) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 42d5f4d..ea3e419 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -189,8 +189,10 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= """ # identify buffer nodes, and create a mask for them + # note masking will be done up to feature mask_dim of n_node_features masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 # due to masking up to feature 6 of 9 + mask = masked_entries >= (self.n_node_features - self.mask_dim) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) From 47a7cef8588219d53d03f8d2dc914656bf533aee Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 42/55] clean up comments Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 25 ++++++++++++++++--------- gridfm_graphkit/models/graphormer.py | 7 +++---- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 5d14a46..9bae59b 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -88,9 +88,10 @@ def get_pe(out: Tensor) -> Tensor: return data + def add_node_attr(data: Data, value: Any, - attr_name: Optional[str] = None + attr_name: str ) -> Data: if attr_name is None: raise ValueError("Expected attr_name to be not None") @@ -99,7 +100,6 @@ def add_node_attr(data: Data, return data - def convert_to_single_emb(x, offset=512): feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ @@ -107,7 +107,6 @@ def convert_to_single_emb(x, offset=512): x = x + feature_offset return x - def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): if len(edge_attr.size()) == 1: edge_attr = edge_attr[:, None] @@ -118,10 +117,13 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): return attn_edge_type, torch.from_numpy(edge_input).long() - def preprocess_item(data, max_hops): """ - Calculation of the attention bias, and positional/structural data + Calculation of the Graphormer attention bias, and positional/structural + variables. 
From a Data-like object the shortest paths in number of hops + between nodes are calculated, being cut off at max_hops. In addition to the + centrality (assume undirected graphs) and attention bias, these are the + inputs to the model structural and positional encodings. """ edge_index = data.edge_index edge_attr = data.edge_attr @@ -163,7 +165,7 @@ def pad_2d_unsqueeze(x, padlen): if xlen < padlen: new_x = x.new_zeros([padlen, xdim], dtype=x.dtype) new_x[:,:] = -1e9 - new_x[:xlen, :] = x # TODO verify this step as well with x shape + new_x[:xlen, :] = x x = new_x return x.unsqueeze(0) @@ -173,7 +175,7 @@ def pad_attn_bias_unsqueeze(x, padlen): new_x = x.new_zeros( [padlen, padlen], dtype=x.dtype).fill_(float('-inf')) new_x[:xlen, :xlen] = x - new_x[xlen:, :xlen] = 0 # TODO verify this step + new_x[xlen:, :xlen] = 0 x = new_x return x.unsqueeze(0) @@ -198,8 +200,13 @@ def pad_spatial_pos_unsqueeze(x, padlen): class AddGraphormerEncodings(BaseTransform): """Adds a positional encoding (node centrallity) to the given graph, as - well as the attention biases, as described in: Do transformers really - perform badly for graph representation?, C. Ying et al., 2021. + well as the attention and edge biases, as described in: Do transformers + really perform badly for graph representation?, C. Ying et al., 2021. + + Args: + max_node_num (int): The number of nodes in the largest graph considered. + max_hops (int): The maximum path length between nodes to consider for + the edge encodings. """ def __init__( diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index ea3e419..342186f 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -10,7 +10,7 @@ @MODELS_REGISTRY.register("Graphormer") class Graphormer(nn.Module): """ - A Graph Transformer model based on the Graphormer architecture + A Graph Transformer model based on the Graphormer architecture. This model directly modifies the attention between nodes based on its graph encodings. 
This requires padding the input nodes and propogating @@ -83,7 +83,8 @@ def __init__(self, args): ) # for positional embeddings - self.spatial_pos_encoder = nn.Embedding(512, self.num_heads, padding_idx=0) + self.spatial_pos_encoder = nn.Embedding( + 512, self.num_heads, padding_idx=0) self.in_degree_encoder = nn.Embedding( 512, self.hidden_dim, padding_idx=0) self.out_degree_encoder = nn.Embedding( @@ -163,8 +164,6 @@ def compute_pos_embeddings(self, data): def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): - - # transfomrer encoder output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) From 893146be83e773c1e62f01bd89fd26ff157eb51f Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 43/55] add swith for windows and linux for cython algos Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 24 ++++++++++++++++++++++-- gridfm_graphkit/models/algos.pyx | 10 +++++----- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 9bae59b..535b70c 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -16,6 +16,7 @@ ) import numpy as np +import os import pyximport pyximport.install(setup_args={'include_dirs': np.get_include()}) import gridfm_graphkit.models.algos as algos @@ -113,7 +114,15 @@ def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): attn_edge_type = torch.zeros([N, N, edge_attr.size(-1)], dtype=torch.long) attn_edge_type[edge_index[0, :], edge_index[1, :] ] = convert_to_single_emb(edge_attr.long()) + 1 - edge_input = algos.gen_edge_input(max_dist, path, attn_edge_type.numpy()) + if os.name == 'nt': + edge_input = algos.gen_edge_input( + max_dist, + path, + attn_edge_type.numpy(), + localtype=np.int32 + ) + else: + edge_input = algos.gen_edge_input(max_dist, path, attn_edge_type.numpy()) return attn_edge_type, torch.from_numpy(edge_input).long() @@ -138,7 +147,18 @@ def preprocess_item(data, max_hops): # get shortest paths in number of hops (shortest_path_result) and intermediate nodes # for those shortest paths (path) - shortest_path_result, path = algos.floyd_warshall(adj.numpy().astype(np.int32), max_hops) + if os.name == 'nt': + shortest_path_result, path = algos.floyd_warshall( + adj.numpy().astype(np.int32), + max_hops, + localtype=np.int32 + ) + else: + shortest_path_result, path = algos.floyd_warshall( + adj.numpy().astype(np.int32), + max_hops + ) + spatial_pos = torch.from_numpy((shortest_path_result)).long().to(data.x.device) attn_bias = torch.zeros([N, N], dtype=torch.float).to(data.x.device) diff --git a/gridfm_graphkit/models/algos.pyx b/gridfm_graphkit/models/algos.pyx index 003eae9..6701740 100644 --- a/gridfm_graphkit/models/algos.pyx +++ b/gridfm_graphkit/models/algos.pyx @@ -6,17 +6,17 @@ from cython.parallel cimport prange, parallel cimport numpy import numpy -def floyd_warshall(adjacency_matrix, max_hops): +def floyd_warshall(adjacency_matrix, max_hops, localtype=long): (nrows, ncols) = adjacency_matrix.shape assert nrows == ncols cdef unsigned int n = nrows cdef unsigned int max_hops_copy = max_hops - adj_mat_copy = adjacency_matrix.astype(numpy.int32, order='C', casting='safe', copy=True) + adj_mat_copy = 
adjacency_matrix.astype(localtype, order='C', casting='safe', copy=True) assert adj_mat_copy.flags['C_CONTIGUOUS'] cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy - cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=numpy.int32) + cdef numpy.ndarray[long, ndim=2, mode='c'] path = numpy.zeros([n, n], dtype=localtype) cdef unsigned int i, j, k cdef long M_ij, M_ik, cost_ikkj @@ -65,7 +65,7 @@ def get_all_edges(path, i, j): return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j) -def gen_edge_input(max_dist, path, edge_feat): +def gen_edge_input(max_dist, path, edge_feat, localtype=long): (nrows, ncols) = path.shape assert nrows == ncols @@ -77,7 +77,7 @@ def gen_edge_input(max_dist, path, edge_feat): assert path_copy.flags['C_CONTIGUOUS'] assert edge_feat_copy.flags['C_CONTIGUOUS'] - cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=numpy.int32) + cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=localtype) cdef unsigned int i, j, k, num_path, cur for i in range(n): From 8c44e25a4556324c7465a4e941a16ad1c9fc8494 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 44/55] add layer norm to decoder Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/graphormer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 342186f..b1dd263 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -79,6 +79,7 @@ def __init__(self, args): self.decoder = nn.Sequential( nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), + nn.LayerNorm(self.hidden_dim), nn.Linear(self.hidden_dim, self.output_dim) ) @@ -122,7 +123,7 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + spatial_pos_bias - if data.edge_input is not None: + if (data.edge_input is not None) and (self.edge_type is not None): edge_input, attn_edge_type = data.edge_input, data.attn_edge_type # edge feature if self.edge_type == 'multi_hop': From 1478b3bfaeaedd71571aff6ca04b3fab7ea1ddfc Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 45/55] adjust edge encoding to accomodate better switching and negative values Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 4 +++- gridfm_graphkit/datasets/transforms.py | 9 +++++++-- gridfm_graphkit/models/graphormer.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 3691500..17b205c 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -53,10 +53,12 @@ def __init__( self.mask_dim = mask_dim self.length = None - if args.add_graphormer_encoding: + if ("add_graphormer_encoding" in args) and args.add_graphormer_encoding: self.add_graphormer_encoding = args.add_graphormer_encoding self.max_node_num = args.max_node_num self.max_hops = args.max_hops + else: + self.add_graphormer_encoding = False super().__init__(root, transform, pre_transform, pre_filter) diff --git 
a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py
index 535b70c..e8573d8 100644
--- a/gridfm_graphkit/datasets/transforms.py
+++ b/gridfm_graphkit/datasets/transforms.py
@@ -102,10 +102,15 @@ def add_node_attr(data: Data,
     return data
 
 def convert_to_single_emb(x, offset=512):
+    """
+    The edge feature embedding range is set to start at 512 to accomodate
+    negative branch feature values in PF data.
""" + x = torch.clamp(x, 0, 512) feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ - torch.arange(offset, (feature_num + 1) * offset, offset, dtype=torch.long) + torch.arange( + 0, + (feature_num) * offset, + offset, + dtype=torch.long + ) # start range at offset to accomodate -ve values TODO finalize + # torch.arange(offset, (feature_num + 1) * offset, offset, dtype=torch.long) + x = x + feature_offset return x diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index af2396e..b1dd263 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -92,7 +92,7 @@ def __init__(self, args): 512, self.hidden_dim, padding_idx=0) if self.n_edge_features is not None: self.edge_encoder = nn.Embedding( - 512 * (self.n_edge_features+1) + 1, self.num_heads, padding_idx=0) + 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0) if self.edge_type == 'multi_hop': self.edge_dis_encoder = nn.Embedding( 128 * self.num_heads * self.num_heads, 1) From b4c1a6f2f2501101e0ce5762e6811254a861eb29 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 47/55] verified fix to edge encoding range Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 15 ++++++++------- gridfm_graphkit/models/graphormer.py | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index e0aa9a6..7a0d6cd 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -103,10 +103,12 @@ def add_node_attr(data: Data, def convert_to_single_emb(x, offset=512): """ - The edge feature embedding range is set to start at 512 to accomodate - negative branch feature values in PF data. + The edge feature embedding range is set to 512, with the futher assumption + that the input range is from -512 to 512. This may need to change in the future. 
""" - x = torch.clamp(x, 0, 512) + x = torch.clamp(x, -512, 512) + x = ( 512*(x+512)/1024 ).long() + feature_num = x.size(1) if len(x.size()) > 1 else 1 feature_offset = 1 + \ torch.arange( @@ -114,11 +116,10 @@ def convert_to_single_emb(x, offset=512): (feature_num) * offset, offset, dtype=torch.long - ) # start range at offset to accomodate -ve values TODO finalize - # torch.arange(offset, (feature_num + 1) * offset, offset, dtype=torch.long) - + ) + x = x + feature_offset - + return x def get_edge_encoding(edge_attr, N, edge_index, max_dist, path): diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index b1dd263..3d92498 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -93,6 +93,7 @@ def __init__(self, args): if self.n_edge_features is not None: self.edge_encoder = nn.Embedding( 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0) + # 1024, self.num_heads, padding_idx=0) if self.edge_type == 'multi_hop': self.edge_dis_encoder = nn.Embedding( 128 * self.num_heads * self.num_heads, 1) From 1b63f2b8b9dd76a52dab203ab29d04c333d8b64c Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:48 -0400 Subject: [PATCH 48/55] checkpoint before rework of tensor shapes Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 2 +- gridfm_graphkit/models/graphormer.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index 7a0d6cd..bd24e5d 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -275,7 +275,7 @@ def forward(self, data: Data) -> Data: data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() - data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() + # data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 3d92498..d1b6eab 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -112,6 +112,7 @@ def compute_pos_embeddings(self, data): """ attn_bias, spatial_pos, x = data.attn_bias, data.spatial_pos, data.x in_degree, out_degree = data.in_degree, data.in_degree + print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) # graph_attn_bias graph_attn_bias = attn_bias.clone() @@ -157,10 +158,9 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - node_feature = node_feature + \ + graph_node_feature = node_feature + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) - graph_node_feature = node_feature return graph_node_feature, graph_attn_bias @@ -188,12 +188,15 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
""" + print('0***', x.size(), data.y.size()) + print('<<<', x.min(), x.max()) + # x, _ = to_dense_batch(x, batch, max_num_nodes=30) + # print('1***', x.size()) # identify buffer nodes, and create a mask for them # note masking will be done up to feature mask_dim of n_node_features masked_entries = torch.sum(x < -1e8, axis=-1) mask = masked_entries >= (self.n_node_features - self.mask_dim) - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) @@ -308,6 +311,18 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): x, _ = to_dense_batch(x, batch) mask, _ = to_dense_batch(mask, batch) + # print('***', x[~mask].min(), x[~mask].max()) + + print('-----') + # print(x.size()) + print(mask.size(), attn_bias.size(), batch) + # print(mask.sum(axis=1)) + # print((x[1:2,:,:] < -1e7).sum(axis=(1,2))) + print(x.min(), x.max()) + # print((attn_bias[2:3,2:3,:,:]).sum(axis=2)) + # print((attn_bias[2:3,2:3,:,:]).sum(axis=3)) + # print(mask.sum()) + y = self.self_attention_norm(x) attn_bias = attn_bias.squeeze() y = self.self_attention(y, y, y, attn_bias, mask) From 1f6fe592604e4860ca6b56251e551e87d7a6021f Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 49/55] mask based on dense batch tested Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 7 +- gridfm_graphkit/models/graphormer.py | 70 ++++++++++++------- .../tasks/feature_reconstruction_task.py | 8 +-- 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index bd24e5d..fbe61de 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -274,8 +274,13 @@ def forward(self, data: Data) -> Data: data = add_node_attr(data, edge_input, attr_name='edge_input') data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') - data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() + # data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() # TODO finalize # data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() + + # TODO remove testing lines + # masked_entries = torch.sum(data.x < -1e8, axis=-1) + # mask = masked_entries >= (9 - 3) + # print('ssssss_orig', mask.sum()) return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index d1b6eab..6c7d383 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -49,6 +49,7 @@ def __init__(self, args): self.learn_mask = getattr(args.data, "learn_mask", False) self.edge_type = getattr(args.model, "edge_type", "multi_hop") self.multi_hop_max_dist = getattr(args.data, "max_hops", 20) + self.max_node_num = getattr(args.data, "max_node_num", 24) if self.learn_mask: self.mask_value = nn.Parameter( @@ -76,12 +77,13 @@ def __init__(self, args): self.encoder_layers = nn.ModuleList(encoders) self.encoder_final_ln = nn.LayerNorm(self.hidden_dim) - self.decoder = nn.Sequential( + self.decoder_layers = nn.Sequential( nn.Linear(self.hidden_dim, self.hidden_dim), nn.LeakyReLU(), nn.LayerNorm(self.hidden_dim), nn.Linear(self.hidden_dim, self.output_dim) ) + # for positional embeddings self.spatial_pos_encoder = nn.Embedding( @@ -99,7 +101,7 @@ def __init__(self, args): 128 * self.num_heads * self.num_heads, 1) - def 
compute_pos_embeddings(self, data): + def compute_pos_embeddings(self, data, x): """ Calculate Graphormer positional encodings, and attention biases @@ -110,9 +112,9 @@ def compute_pos_embeddings(self, data): graph_node_feature (Tensor): data.x with positional encoding appended. graph_attn_bias (Tensor): attention bais terms. """ - attn_bias, spatial_pos, x = data.attn_bias, data.spatial_pos, data.x + attn_bias, spatial_pos = data.attn_bias, data.spatial_pos #, data.x in_degree, out_degree = data.in_degree, data.in_degree - print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) + # print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) # graph_attn_bias graph_attn_bias = attn_bias.clone() @@ -158,9 +160,11 @@ def compute_pos_embeddings(self, data): graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - graph_node_feature = node_feature + \ + # print('zzz', node_feature.flatten(0,1).size()) + graph_node_feature = node_feature.flatten(0,1) + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) + graph_node_feature = graph_node_feature.reshape(node_feature.size()) return graph_node_feature, graph_attn_bias @@ -169,9 +173,12 @@ def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) - output = self.encoder_final_ln(output) + output[~mask] = self.encoder_final_ln(output[~mask]) + return output + + def decoder(self, x): + output = self.decoder_layers(x) return output - def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data=None): """ @@ -188,26 +195,30 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
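As a small worked example (not part of the patch) of the quantization that convert_to_single_emb performs after the fix in PATCH 47 above, with made-up branch feature values: clamp to [-512, 512], rescale to integer bins in [0, 512], then shift each feature column into its own embedding index range.

    import torch

    def convert_to_single_emb(x, offset=512):
        # Mirrors the fixed version above: quantize each edge feature and give
        # every feature column a disjoint index range for a single nn.Embedding.
        x = torch.clamp(x, -512, 512)
        x = (512 * (x + 512) / 1024).long()
        feature_num = x.size(1) if len(x.size()) > 1 else 1
        feature_offset = 1 + torch.arange(0, feature_num * offset, offset, dtype=torch.long)
        return x + feature_offset

    edge_attr = torch.tensor([[0.05, -0.30], [512.0, 0.0]])  # two edges, two features
    print(convert_to_single_emb(edge_attr))
    # tensor([[257, 768],
    #         [513, 769]])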
""" - print('0***', x.size(), data.y.size()) - print('<<<', x.min(), x.max()) - # x, _ = to_dense_batch(x, batch, max_num_nodes=30) + # print('0***', x.size(), data.y.size()) + # print('<<<', x.min(), x.max()) + x, valid_nodes = to_dense_batch(x, batch, max_num_nodes=self.max_node_num) + mask = ~valid_nodes # print('1***', x.size()) + # TODO remove prints # identify buffer nodes, and create a mask for them # note masking will be done up to feature mask_dim of n_node_features - masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= (self.n_node_features - self.mask_dim) + # masked_entries = torch.sum(x < -1e8, axis=-1) + # mask = masked_entries >= (self.n_node_features - self.mask_dim) + # print('mmmmmm', mask.size(), mask.size()) + # print('ssssss', (~mask).sum()) - graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data) + graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, x) output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) - output = self.decoder(output) + output = self.decoder(output[valid_nodes]) # return the negative of the buffer mask to select data for loss calculation - return output, ~mask + return output class FeedForwardNetwork(nn.Module): - def __init__(self, hidden_size, ffn_size, dropout_rate): + def __init__(self, hidden_size, ffn_size): super(FeedForwardNetwork, self).__init__() self.layer1 = nn.Linear(hidden_size, ffn_size) @@ -300,39 +311,44 @@ def __init__(self, hidden_size, ffn_size, dropout_rate, num_heads): self.self_attention_dropout = nn.Dropout(dropout_rate) self.ffn_norm = nn.LayerNorm(hidden_size) - self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate) + self.ffn = FeedForwardNetwork(hidden_size, ffn_size) self.ffn_dropout = nn.Dropout(dropout_rate) - def forward(self, x, attn_bias=None, mask=None, batch=1): + def forward(self, x, attn_bias=None, mask=None, batch=1): #TODO remove batch if not needed """ It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ - x, _ = to_dense_batch(x, batch) - mask, _ = to_dense_batch(mask, batch) + # print('enin****', x.size()) + # x, _ = to_dense_batch(x, batch) + # mask, _ = to_dense_batch(mask, batch) - # print('***', x[~mask].min(), x[~mask].max()) + # print('enc***', x[~mask].min(), x[~mask].max()) + # print('enc***', x.size()) - print('-----') + # print('-----') # print(x.size()) - print(mask.size(), attn_bias.size(), batch) + # print(mask.size(), attn_bias.size(), batch) # print(mask.sum(axis=1)) # print((x[1:2,:,:] < -1e7).sum(axis=(1,2))) - print(x.min(), x.max()) + # print(x.min(), x.max()) + # print('vvvvv', x[~mask].size(), x[~mask].min(), x[~mask].max()) # print((attn_bias[2:3,2:3,:,:]).sum(axis=2)) # print((attn_bias[2:3,2:3,:,:]).sum(axis=3)) # print(mask.sum()) + # print('>>>', x[~mask].min(), x[~mask].max(), '-', x.min(), x.max()) - y = self.self_attention_norm(x) + y = x + y[~mask] = self.self_attention_norm(x[~mask]) attn_bias = attn_bias.squeeze() y = self.self_attention(y, y, y, attn_bias, mask) y = self.self_attention_dropout(y) x = x + torch.reshape(y, x.size()) - y = self.ffn_norm(x) + y[~mask] = self.ffn_norm(x[~mask]) y = self.ffn(y) y = self.ffn_dropout(y) x = x + y - x=x.flatten(0,1) + # x=x.flatten(0,1) return x diff --git a/gridfm_graphkit/tasks/feature_reconstruction_task.py b/gridfm_graphkit/tasks/feature_reconstruction_task.py index 96d79bd..902acd0 100644 --- a/gridfm_graphkit/tasks/feature_reconstruction_task.py +++ 
b/gridfm_graphkit/tasks/feature_reconstruction_task.py @@ -110,7 +110,7 @@ def on_fit_start(self): ) def shared_step(self, batch): - output, valid = self.forward( + output = self.forward( x=batch.x, pe=batch.pe, edge_index=batch.edge_index, @@ -121,11 +121,11 @@ def shared_step(self, batch): ) loss_dict = self.loss_fn( - output[valid], - batch.y[valid], + output, + batch.y, batch.edge_index, batch.edge_attr, - batch.mask[valid], + batch.mask, ) return output, loss_dict From 03f4f7ac13e2266ce67322c72cd88aedd43fefb8 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 50/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/transforms.py | 8 ----- gridfm_graphkit/models/graphormer.py | 44 ++++---------------------- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py index fbe61de..a24edcd 100644 --- a/gridfm_graphkit/datasets/transforms.py +++ b/gridfm_graphkit/datasets/transforms.py @@ -274,14 +274,6 @@ def forward(self, data: Data) -> Data: data = add_node_attr(data, edge_input, attr_name='edge_input') data = add_node_attr(data, attn_edge_type, attr_name='attn_edge_type') - # data.x = pad_2d_unsqueeze(data.x, self.max_node_num).squeeze() # TODO finalize - # data.y = pad_2d_unsqueeze(data.y, self.max_node_num).squeeze() - - # TODO remove testing lines - # masked_entries = torch.sum(data.x < -1e8, axis=-1) - # mask = masked_entries >= (9 - 3) - # print('ssssss_orig', mask.sum()) - return data diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py index 6c7d383..fb6288a 100644 --- a/gridfm_graphkit/models/graphormer.py +++ b/gridfm_graphkit/models/graphormer.py @@ -112,9 +112,8 @@ def compute_pos_embeddings(self, data, x): graph_node_feature (Tensor): data.x with positional encoding appended. graph_attn_bias (Tensor): attention bais terms. """ - attn_bias, spatial_pos = data.attn_bias, data.spatial_pos #, data.x + attn_bias, spatial_pos = data.attn_bias, data.spatial_pos in_degree, out_degree = data.in_degree, data.in_degree - # print('>>>>', attn_bias.size(), spatial_pos.size(), in_degree.size()) # graph_attn_bias graph_attn_bias = attn_bias.clone() @@ -160,7 +159,6 @@ def compute_pos_embeddings(self, data, x): graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset node_feature = self.input_proj(x) - # print('zzz', node_feature.flatten(0,1).size()) graph_node_feature = node_feature.flatten(0,1) + \ self.in_degree_encoder(in_degree) + \ self.out_degree_encoder(out_degree) @@ -169,10 +167,10 @@ def compute_pos_embeddings(self, data, x): return graph_node_feature, graph_attn_bias - def encoder(self, graph_node_feature, graph_attn_bias, mask=None, batch=1): + def encoder(self, graph_node_feature, graph_attn_bias, mask=None): output = self.input_dropout(graph_node_feature) for enc_layer in self.encoder_layers: - output = enc_layer(output, graph_attn_bias, mask=mask, batch=batch) + output = enc_layer(output, graph_attn_bias, mask=mask) output[~mask] = self.encoder_final_ln(output[~mask]) return output @@ -195,25 +193,14 @@ def forward(self, x, pe=None, edge_index=None, edge_attr=None, batch=None, data= Returns: output (Tensor): Output node features of shape [num_nodes, output_dim]. 
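For reference, a minimal sketch (not from the repository) of the torch_geometric to_dense_batch behaviour that the dense-batch masking introduced in PATCH 49 relies on: each graph in the batch is padded up to max_num_nodes and a boolean mask marks the real nodes, so padded rows can be excluded from attention and dropped again before the loss.

    import torch
    from torch_geometric.utils import to_dense_batch

    # Two graphs in one batch: 3 nodes and 2 nodes, 4 features each.
    x = torch.arange(5 * 4, dtype=torch.float).view(5, 4)
    batch = torch.tensor([0, 0, 0, 1, 1])

    dense_x, valid_nodes = to_dense_batch(x, batch, max_num_nodes=6)
    print(dense_x.shape)               # torch.Size([2, 6, 4])
    print(valid_nodes.sum().item())    # 5 real nodes, the rest is padding
    print(dense_x[valid_nodes].shape)  # torch.Size([5, 4]) -> back to per-node rows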
""" - # print('0***', x.size(), data.y.size()) - # print('<<<', x.min(), x.max()) + x, valid_nodes = to_dense_batch(x, batch, max_num_nodes=self.max_node_num) mask = ~valid_nodes - # print('1***', x.size()) - # TODO remove prints - - # identify buffer nodes, and create a mask for them - # note masking will be done up to feature mask_dim of n_node_features - # masked_entries = torch.sum(x < -1e8, axis=-1) - # mask = masked_entries >= (self.n_node_features - self.mask_dim) - # print('mmmmmm', mask.size(), mask.size()) - # print('ssssss', (~mask).sum()) graph_node_feature, graph_attn_bias = self.compute_pos_embeddings(data, x) - output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask, batch=batch) + output = self.encoder(graph_node_feature, graph_attn_bias, mask=mask) output = self.decoder(output[valid_nodes]) - # return the negative of the buffer mask to select data for loss calculation return output @@ -314,29 +301,11 @@ def __init__(self, hidden_size, ffn_size, dropout_rate, num_heads): self.ffn = FeedForwardNetwork(hidden_size, ffn_size) self.ffn_dropout = nn.Dropout(dropout_rate) - def forward(self, x, attn_bias=None, mask=None, batch=1): #TODO remove batch if not needed + def forward(self, x, attn_bias=None, mask=None): """ It is assumed that the mask is 1 where values are to be ignored and then 0 where there are valid data """ - # print('enin****', x.size()) - # x, _ = to_dense_batch(x, batch) - # mask, _ = to_dense_batch(mask, batch) - - # print('enc***', x[~mask].min(), x[~mask].max()) - # print('enc***', x.size()) - - # print('-----') - # print(x.size()) - # print(mask.size(), attn_bias.size(), batch) - # print(mask.sum(axis=1)) - # print((x[1:2,:,:] < -1e7).sum(axis=(1,2))) - # print(x.min(), x.max()) - # print('vvvvv', x[~mask].size(), x[~mask].min(), x[~mask].max()) - # print((attn_bias[2:3,2:3,:,:]).sum(axis=2)) - # print((attn_bias[2:3,2:3,:,:]).sum(axis=3)) - # print(mask.sum()) - # print('>>>', x[~mask].min(), x[~mask].max(), '-', x.min(), x.max()) y = x y[~mask] = self.self_attention_norm(x[~mask]) @@ -349,6 +318,5 @@ def forward(self, x, attn_bias=None, mask=None, batch=1): #TODO remove batch i y = self.ffn(y) y = self.ffn_dropout(y) x = x + y - # x=x.flatten(0,1) return x From c988fbb42361dd68f862a662b603d8dd3c8be251 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 51/55] clean up Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/models/gnn_transformer.py | 5 +---- gridfm_graphkit/models/gps_transformer.py | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/gridfm_graphkit/models/gnn_transformer.py b/gridfm_graphkit/models/gnn_transformer.py index 627cd49..7747f59 100644 --- a/gridfm_graphkit/models/gnn_transformer.py +++ b/gridfm_graphkit/models/gnn_transformer.py @@ -94,7 +94,4 @@ def forward(self, x, pe, edge_index, edge_attr, batch): x = self.mlps(x) - masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 - - return x, ~mask + return x diff --git a/gridfm_graphkit/models/gps_transformer.py b/gridfm_graphkit/models/gps_transformer.py index 178570b..e99188a 100644 --- a/gridfm_graphkit/models/gps_transformer.py +++ b/gridfm_graphkit/models/gps_transformer.py @@ -138,6 +138,4 @@ def forward(self, x, pe, edge_index, edge_attr, batch, data=None): x = self.pre_decoder_norm(x) x = self.decoder(x) - masked_entries = torch.sum(x < -1e8, axis=-1) - mask = masked_entries >= 3 - return 
x, ~mask
+        return x

From f3c0f60d259a541f6a402fd5269c11dc1c650229 Mon Sep 17 00:00:00 2001
From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
Date: Fri, 31 Oct 2025 09:16:49 -0400
Subject: [PATCH 52/55] clean up

Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
---
 gridfm_graphkit/datasets/transforms.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/gridfm_graphkit/datasets/transforms.py b/gridfm_graphkit/datasets/transforms.py
index a24edcd..832087b 100644
--- a/gridfm_graphkit/datasets/transforms.py
+++ b/gridfm_graphkit/datasets/transforms.py
@@ -194,15 +194,6 @@ def pad_1d_unsqueeze(x, padlen):
         x = new_x
     return x.unsqueeze(0)

-def pad_2d_unsqueeze(x, padlen):
-    xlen, xdim = x.size()
-    if xlen < padlen:
-        new_x = x.new_zeros([padlen, xdim], dtype=x.dtype)
-        new_x[:,:] = -1e9
-        new_x[:xlen, :] = x
-        x = new_x
-    return x.unsqueeze(0)
-
 def pad_attn_bias_unsqueeze(x, padlen):
     xlen = x.size(0)
     if xlen < padlen:

From e8ec0437614c79a7791ecdd129c99bcb34a4046d Mon Sep 17 00:00:00 2001
From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
Date: Fri, 31 Oct 2025 09:16:49 -0400
Subject: [PATCH 53/55] update comments

Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com>
---
 gridfm_graphkit/models/graphormer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gridfm_graphkit/models/graphormer.py b/gridfm_graphkit/models/graphormer.py
index fb6288a..b36610d 100644
--- a/gridfm_graphkit/models/graphormer.py
+++ b/gridfm_graphkit/models/graphormer.py
@@ -32,7 +32,7 @@ class Graphormer(nn.Module):
         learn_mask (bool, optional): Whether to learn mask values as parameters. From ``args.data.learn_mask``. Defaults to False.
         edge_type (string, optional): Type of edge to consider multi_hop or not. From ``args.data.edge_type``. Defaults to multi_hop.
         multi_hop_max_dist (int, optional): Maximum number of hops to consider at edges. From ``args.data.multi_hop_max_dist``. Defaults to 20.
-
+        max_node_num (int, optional): Maximum number of nodes in the input graphs. From ``args.data.max_node_num``. Defaults to 24.
     """
     def __init__(self, args):
         super().__init__()
@@ -83,7 +83,6 @@ def __init__(self, args):
             nn.LayerNorm(self.hidden_dim),
             nn.Linear(self.hidden_dim, self.output_dim)
         )
-
         # for positional embeddings

         self.spatial_pos_encoder = nn.Embedding(
@@ -95,7 +94,6 @@ def __init__(self, args):
         if self.n_edge_features is not None:
             self.edge_encoder = nn.Embedding(
                 512 * self.n_edge_features + 1, self.num_heads, padding_idx=0)
-                # 1024, self.num_heads, padding_idx=0)
             if self.edge_type == 'multi_hop':
                 self.edge_dis_encoder = nn.Embedding(
                     128 * self.num_heads * self.num_heads, 1)
@@ -107,6 +105,7 @@ def compute_pos_embeddings(self, data, x):

         Args:
             data (Data): Pytorch geometric Data/Batch object
+            x (Tensor): The node feature tensor from data

         Returns:
             graph_node_feature (Tensor): data.x with positional encoding appended.
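
For reference, a minimal sketch of the padding-mask convention the reworked Graphormer forward pass relies on: torch_geometric.utils.to_dense_batch pads every graph in a batch to max_num_nodes (self.max_node_num in the model) and returns a boolean valid_nodes mask, whose negation marks the padded slots the encoder layers are told to ignore. The tensors below are toy values for illustration only; they are not taken from the patches.

    import torch
    from torch_geometric.utils import to_dense_batch

    # Two graphs in one batch: 3 nodes and 2 nodes, 4 features per node.
    x = torch.randn(5, 4)
    batch = torch.tensor([0, 0, 0, 1, 1])

    # Pad both graphs to the same length; max_num_nodes plays the role of
    # self.max_node_num in Graphormer.forward.
    x_dense, valid_nodes = to_dense_batch(x, batch, max_num_nodes=6)
    print(x_dense.shape)   # torch.Size([2, 6, 4])
    print(valid_nodes)     # True for real nodes, False for padding

    # The encoder layers use the opposite convention: 1 marks entries to
    # ignore, 0 marks valid data, hence mask = ~valid_nodes in forward().
    mask = ~valid_nodes
    print(mask.sum())      # 7 padded slots: 2 * 6 - 5

This mask replaces the earlier sentinel scheme removed by the clean-up patches, where padded rows were filled with -1e9 in pad_2d_unsqueeze and recovered downstream with x < -1e8 comparisons.
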
From 23f0119266cbd60b3c29729fb9ee198e7147144e Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 54/55] specify cython version in toml Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 10719f1..caafe70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ "pyyaml", "lightning", "seaborn", - "cython" + "cython==3.0.11" ] [project.optional-dependencies] From 43a5e3c8aa270fab94defa0de3a5e564e7239d14 Mon Sep 17 00:00:00 2001 From: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:16:49 -0400 Subject: [PATCH 55/55] changed default of args for dataset Signed-off-by: Thomas Tolhurst <99353435+ttolhurst@users.noreply.github.com> --- gridfm_graphkit/datasets/powergrid_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gridfm_graphkit/datasets/powergrid_dataset.py b/gridfm_graphkit/datasets/powergrid_dataset.py index 17b205c..309800a 100644 --- a/gridfm_graphkit/datasets/powergrid_dataset.py +++ b/gridfm_graphkit/datasets/powergrid_dataset.py @@ -44,7 +44,7 @@ def __init__( transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, pre_filter: Optional[Callable] = None, - args: Optional = None, + args: Optional = {}, ): self.norm_method = norm_method self.node_normalizer = node_normalizer