General Dataloader
This is the dataloader file for the PyTorch implementation.
import numpy as np
from abc import abstractmethod
from torch.utils.data import DataLoader
import torch
from torchvision import datasets, transforms
import networkx as nx
import typing
import scipy
import scipy.io as spio
import os
def loadmat(filename):
    '''
    this function should be called instead of direct spio.loadmat
    as it cures the problem of not properly recovering python dictionaries
    from mat files. It calls the function _check_keys to cure all entries
    which are still mat-objects
    '''
    def _check_keys(d):
        '''
        checks if entries in dictionary are mat-objects. If yes,
        _todict is called to change them to nested dictionaries
        '''
        for key in d:
            if isinstance(d[key], spio.matlab.mio5_params.mat_struct):
                d[key] = _todict(d[key])
        return d

    def _todict(matobj):
        '''
        A recursive function which constructs nested dictionaries from mat-objects
        '''
        d = {}
        for strg in matobj._fieldnames:
            elem = matobj.__dict__[strg]
            if isinstance(elem, spio.matlab.mio5_params.mat_struct):
                d[strg] = _todict(elem)
            elif isinstance(elem, np.ndarray):
                d[strg] = _tolist(elem)
            else:
                d[strg] = elem
        return d

    def _tolist(ndarray):
        '''
        A recursive function which constructs lists from cellarrays
        (which are loaded as numpy ndarrays), recursing into the elements
        if they contain mat-objects.
        '''
        elem_list = []
        for sub_elem in ndarray:
            if isinstance(sub_elem, spio.matlab.mio5_params.mat_struct):
                elem_list.append(_todict(sub_elem))
            elif isinstance(sub_elem, np.ndarray):
                elem_list.append(_tolist(sub_elem))
            else:
                elem_list.append(sub_elem)
        return elem_list

    data = spio.loadmat(filename, struct_as_record=False, squeeze_me=True)
    return _check_keys(data)
# train = loadmat('multi1')
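Expanding the commented call above into a minimal usage sketch (the 'multi1' file name and the 'dataSet' key are placeholders; get_subgraph below uses the same pattern on its own .mat files):

# mat = loadmat('multi1')   # nested MATLAB structs come back as plain dicts
# train = mat['dataSet']    # key depends on how the .mat file was saved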
# thanks Pedro H. Avelar
def nx_to_format(G, aggregation_type, sparse_matrix=True):
    e = len(G.edges)
    n = len(G.nodes)
    # edges = torch.LongTensor(list(G.edges))
    edg = sorted(list(G.edges))
    edges = torch.LongTensor(edg)
    # note: nx.to_numpy_matrix was removed in networkx 3.x; nx.to_numpy_array is its replacement there
    adj_matrix = np.asarray(nx.to_numpy_matrix(G))
    if aggregation_type == "sum":
        pass
    elif aggregation_type == "degreenorm":
        row_sum = np.sum(adj_matrix, axis=0, keepdims=True)
        adj_matrix = adj_matrix / row_sum
    elif aggregation_type == "symdegreenorm":
        raise NotImplementedError("Symmetric degree normalization not yet implemented")
    else:
        raise ValueError("Invalid neighbour aggregation type")
    if sparse_matrix:
        # iterate over the sorted edge list so that column i matches row i of `edges`
        agg_matrix_i = torch.LongTensor([[s for s, t in edg], list(range(e))])
        agg_matrix_v = torch.FloatTensor([adj_matrix[s, t] for s, t in edg])
        agg_matrix = torch.sparse.FloatTensor(agg_matrix_i, agg_matrix_v, torch.Size([n, e]))
    else:
        agg_matrix = torch.zeros(*[n, e])
        for i, (s, t) in enumerate(edg):
            agg_matrix[s, i] = adj_matrix[s, t]
    return edges, agg_matrix
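As a quick sanity check (an illustrative sketch, assuming networkx < 3.0 for nx.to_numpy_matrix): on a directed triangle, `edges` has one row per edge and the dense aggregation matrix has shape [n, e]:

g_demo = nx.DiGraph([(0, 1), (1, 2), (2, 0)])
demo_edges, demo_agg = nx_to_format(g_demo, "sum", sparse_matrix=False)
print(demo_edges.shape, demo_agg.shape)  # torch.Size([3, 2]) torch.Size([3, 3])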
class Dataset:
    def __init__(
            self,
            name,
            num_nodes,
            num_edges,
            label_dim,
            is_multiclass,
            num_classes,
            edges,
            agg_matrix,
            node_labels,
            targets,
            idx_train=None,
            idx_valid=None,
            idx_test=None,
            graph_node=None,
    ):
        self.name = name
        self.num_nodes = num_nodes
        self.num_edges = num_edges
        self.node_label_dim = label_dim
        self.num_classes = num_classes
        self.is_multiclass = is_multiclass
        self.edges = edges
        self.agg_matrix = agg_matrix
        self.node_labels = node_labels
        self.targets = targets
        self.idx_train = idx_train
        self.idx_valid = idx_valid
        self.idx_test = idx_test
        self.graph_node = graph_node  # graph-node incidence matrix, used by from_EN_to_GNN below

    def cuda(self):
        self.edges, self.agg_matrix, self.node_labels, self.targets, self.idx_train, self.idx_valid, self.idx_test = map(
            lambda x: x.cuda() if x is not None else None,
            [self.edges, self.agg_matrix, self.node_labels, self.targets, self.idx_train, self.idx_valid, self.idx_test]
        )
        return self

    def cpu(self):
        self.edges, self.agg_matrix, self.node_labels, self.targets, self.idx_train, self.idx_valid, self.idx_test = map(
            lambda x: x.cpu() if x is not None else None,
            [self.edges, self.agg_matrix, self.node_labels, self.targets, self.idx_train, self.idx_valid, self.idx_test]
        )
        return self

    def to(self, device):
        if "cuda" in device.type:
            torch.cuda.set_device(device)
            return self.cuda()
        else:
            return self.cpu()
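The to() method lets a Dataset be moved between devices like a tensor; a short sketch (using any of the loader functions defined below):

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# dset = get_twochains().to(device)  # moves every tensor member in one call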
def get_twochains(num_nodes_per_graph=50, pct_labels=.1, pct_valid=.5, aggregation_type="sum", sparse_matrix=True):
    G1 = nx.generators.classic.path_graph(num_nodes_per_graph)
    G2 = nx.generators.classic.path_graph(num_nodes_per_graph)
    G = nx.disjoint_union(G1, G2)
    G = G.to_directed()
    e = len(G.edges)
    n = len(G.nodes)
    edges, agg_matrix = nx_to_format(G, aggregation_type, sparse_matrix)
    is_multilabel = False
    n_classes = 2
    d_l = 1
    node_labels = torch.zeros(*[n, d_l])
    # node_labels = torch.eye(n)
    targets = torch.tensor(np.array(([0] * (n // 2)) + ([1] * (n // 2)), dtype=np.int64), dtype=torch.long)
    idx = np.random.permutation(np.arange(n))
    idx_trainval = idx[:int(n * pct_labels)]
    idx_train = torch.LongTensor(idx_trainval[:-int(len(idx_trainval) * pct_valid)])
    idx_valid = torch.LongTensor(
        idx_trainval[-int(len(idx_trainval) * pct_valid):])  # TODO: check whether a BoolTensor mask should be used here instead
    idx_test = torch.LongTensor(idx[int(n * pct_labels):])
    return Dataset(
        "twochains",
        n,
        e,
        d_l,
        is_multilabel,
        n_classes,
        edges,
        agg_matrix,
        node_labels,
        targets,
        idx_train,
        idx_valid,
        idx_test,
    )
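For example (a sketch, same networkx caveat as above): with the defaults the two chains give n = 100 nodes, 10% of which carry labels, half of those held out for validation:

dset = get_twochains()
print(dset.num_nodes, len(dset.idx_train), len(dset.idx_valid), len(dset.idx_test))  # 100 5 5 90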
############## SSE ################
def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=bool)
def read_sse_ids(percentage=None, dataset=None):
    def _internal(file):
        ids = []
        with open(os.path.join(dataset, file), 'r') as f:
            for line in f:
                ids.append(int(line.strip()))
        return ids

    if percentage:
        # each row of the file is the id of one node belonging to the split
        train_ids = _internal("train_idx-{}.txt".format(percentage))
        test_ids = _internal("test_idx-{}.txt".format(percentage))
        return train_ids, test_ids
def get_twochainsSSE(aggregation_type, percentage=0.9, dataset="data/n-chains-connect", node_has_feature=False,
                     train_file="train_idx-", test_file="test_idx-", sparse_matrix=True):
    print('Loading dataset: {}'.format(dataset))
    graph_info = "meta.txt"
    neigh = "adj_list.txt"
    labels_file = "label.txt"
    # loading targets
    targets = np.loadtxt(os.path.join(dataset, labels_file))
    targets = torch.tensor(np.argmax(targets, axis=1), dtype=torch.long)
    with open(os.path.join(dataset, graph_info), 'r') as f:
        info = f.readline().strip().split()  # e.g. "23 2": number of nodes in the graph, target of the graph
    if node_has_feature:
        n_nodes, l, n_feat = [int(w) for w in info]  # n_nodes == number of nodes, l == label (target) of the graph
    else:
        n_nodes, l = [int(w) for w in info]
    # load node features (all-zero features when the dataset provides none)
    if node_has_feature:
        features = np.loadtxt(os.path.join(dataset, "features.txt"))
    else:
        features = np.zeros((n_nodes, 1), dtype=np.float32)
    # load the adjacency list and build the graph
    with open(os.path.join(dataset, neigh), 'r') as f:
        g = nx.Graph()  # networkx graph
        for j in range(n_nodes):
            g.add_node(j)
            # composition of each row: number of neighbours, id_neigh_1, id_neigh_2, ...
            row = [int(w) for w in f.readline().strip().split()]
            n_neighbours = row[0]
            for k in range(1, n_neighbours + 1):
                g.add_edge(j, row[k])  # add an edge from the current node to each of its neighbours
    g = g.to_directed()  # duplicate every edge in both directions
    edges, agg_matrix = nx_to_format(g, aggregation_type, sparse_matrix)
    e = len(g.edges)
    n = len(g.nodes)
    d_l = 1
    is_multilabel = False
    n_classes = 2
    node_labels = torch.tensor(features, dtype=torch.float)
    # create the train/test masks
    train_ids, test_ids = read_sse_ids(percentage=percentage, dataset=dataset)
    # rebuild the test split as the complement of the train ids (overrides the ids read from file)
    test_ids_temp = range(0, 2000)
    test_ids = [i for i in test_ids_temp if i not in train_ids]
    idx_train = torch.LongTensor(train_ids)
    idx_test = torch.LongTensor(test_ids)
    idx_valid = torch.LongTensor(test_ids)
    return Dataset(
        "two_chainsSSE",
        n,
        e,
        d_l,
        is_multilabel,
        n_classes,
        edges,
        agg_matrix,
        node_labels,
        targets,
        idx_train,
        idx_valid,
        idx_test,
    )
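A usage sketch (the loader expects meta.txt, adj_list.txt, label.txt and the train_idx-/test_idx- files under data/n-chains-connect, so it is left commented out here):

# dset = get_twochainsSSE("sum", percentage=0.9)
# print(dset.num_nodes, dset.num_edges)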
def get_subgraph(set="sub_10_5_200", aggregation_type="sum", sparse_matrix=False):
    from scipy.sparse import coo_matrix
    types = ["train", "validation", "test"]
    set_name = set
    train = loadmat("./data/subcli/{}.mat".format(set_name))
    train = train["dataSet"]
    dset = {}
    for set_type in types:
        adj = coo_matrix(train['{}Set'.format(set_type)]['connMatrix'].T)
        edges = np.array([adj.row, adj.col]).T
        G = nx.DiGraph()
        G.add_nodes_from(range(0, np.max(edges) + 1))
        G.add_edges_from(edges)
        # G = nx.from_edgelist(edges)
        lab = np.asarray(train['{}Set'.format(set_type)]['nodeLabels']).T
        if len(lab.shape) < 2:
            lab = lab.reshape(lab.shape[0], 1)
        lab = torch.tensor(lab, dtype=torch.float)
        target = np.asarray(train['{}Set'.format(set_type)]['targets']).T
        targets = torch.tensor(np.clip(target, 0, 1), dtype=torch.long)  # convert -1 to 0
        edges, agg_matrix = nx_to_format(G, aggregation_type, sparse_matrix)
        e = len(G.edges)
        n = len(G.nodes)
        d_l = lab.shape[1]
        is_multilabel = False
        n_classes = 2
        node_labels = lab
        dset[set_type] = Dataset(
            "subgraph_{}".format(set_type),
            n,
            e,
            d_l,
            is_multilabel,
            n_classes,
            edges,
            agg_matrix,
            node_labels,
            targets)
    return dset
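get_subgraph returns one Dataset per split, keyed by split name; a sketch assuming ./data/subcli/sub_10_5_200.mat is present:

# dsets = get_subgraph("sub_10_5_200", aggregation_type="sum")
# train_set, valid_set, test_set = dsets["train"], dsets["validation"], dsets["test"]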
def get_karate(num_nodes_per_graph=None, aggregation_type="sum", sparse_matrix=True):
    G = nx.karate_club_graph()
    G = G.to_directed()
    e = len(G.edges)
    n = len(G.nodes)
    edges, agg_matrix = nx_to_format(G, aggregation_type, sparse_matrix=sparse_matrix)
    is_multilabel = False
    n_classes = 4
    targets = [0] * n
    # NB: [[]] * n_classes would alias a single list, so the inner lists are written out
    class_nodes = [[], [], [], []]
    with open("./data/karate/classes.txt") as f:
        for line in f:
            node, node_class = map(int, line.split(" "))
            targets[node] = node_class
            class_nodes[node_class].append(node)
    d_l = n
    # node_labels = torch.zeros(*[n, d_l])
    node_labels = torch.eye(n)
    targets = torch.tensor(targets, dtype=torch.long)
    idx_train = []
    idx_test = []
    for c in class_nodes:
        perm = np.random.permutation(c)
        idx_train += list(perm[:1])  # first index for training
        idx_test += list(perm[1:])  # all other indices for testing
    idx_valid = torch.LongTensor(idx_train)
    idx_train = torch.LongTensor(idx_train)
    idx_test = torch.LongTensor(idx_test)
    return Dataset(
        "karate",
        n,
        e,
        d_l,
        is_multilabel,
        n_classes,
        edges,
        agg_matrix,
        node_labels,
        targets,
        idx_train,
        idx_valid,
        idx_test,
    )
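Sketch: with one randomly chosen node per class for training, karate yields 4 training and 30 test nodes (the loader reads ./data/karate/classes.txt, so the data folder must be present):

# dset = get_karate()
# print(len(dset.idx_train), len(dset.idx_test))  # 4 30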
def collate(samples):
    import dgl
    # the input `samples` is a list of (graph, label) pairs
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph, torch.tensor(labels)
def get_dgl_minigc(aggregation_type="sum"):
    import dgl
    from dgl.data import MiniGCDataset
    tr_set = MiniGCDataset(80, 10, 20)
    test_set = MiniGCDataset(20, 10, 20)
    data_loader = DataLoader(tr_set, batch_size=80, shuffle=True,
                             collate_fn=collate)
    dataiter = iter(data_loader)
    images, labels = next(dataiter)  # one batch holding the whole training set
    G = images.to_networkx()
    e = len(G.edges)
    n = len(G.nodes)
    edges, agg_matrix = nx_to_format(G, aggregation_type)
    # NOTE: this loader is unfinished and does not yet build a Dataset
def get_dgl_cora(aggregation_type="sum", sparse_matrix=False):
    import dgl
    from dgl.data import CoraDataset
    tr_set = CoraDataset()
    G = tr_set.graph
    e = len(G.edges)
    n = len(G.nodes)
    d_l = tr_set.features.shape[1]
    is_multilabel = False
    n_classes = tr_set.num_labels
    node_labels = torch.tensor(tr_set.features)
    targets = torch.tensor(tr_set.labels)
    idx_train = torch.BoolTensor(tr_set.train_mask)  # the DGL splits come as masks => convert to boolean masks
    idx_valid = torch.BoolTensor(tr_set.val_mask)
    idx_test = torch.BoolTensor(tr_set.test_mask)
    edges, agg_matrix = nx_to_format(G, aggregation_type, sparse_matrix)
    return Dataset(
        "cora",
        n,
        e,
        d_l,
        is_multilabel,
        n_classes,
        edges,
        agg_matrix,
        node_labels,
        targets,
        idx_train,
        idx_valid,
        idx_test,
    )
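A sketch of how the Cora wrapper is consumed (DGL downloads the dataset on first use; the splits are boolean masks over the node dimension):

# dset = get_dgl_cora("sum")
# train_targets = dset.targets[dset.idx_train]  # boolean-mask selection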
def get_dgl_citation(aggregation_type="sum", dataset="pubmed"):
    import dgl
    from dgl.data import CitationGraphDataset
    tr_set = CitationGraphDataset(dataset)
    G = tr_set.graph
    e = len(G.edges)
    n = len(G.nodes)
    d_l = tr_set.features.shape[1]
    is_multilabel = False
    n_classes = tr_set.num_labels
    node_labels = torch.tensor(tr_set.features)
    targets = torch.tensor(tr_set.labels)
    idx_train = torch.BoolTensor(tr_set.train_mask)
    idx_valid = torch.BoolTensor(tr_set.val_mask)
    idx_test = torch.BoolTensor(tr_set.test_mask)
    edges, agg_matrix = nx_to_format(G, aggregation_type)
    return Dataset(
        dataset,
        n,
        e,
        d_l,
        is_multilabel,
        n_classes,
        edges,
        agg_matrix,
        node_labels,
        targets,
        idx_train,
        idx_valid,
        idx_test,
    )
def get_dgl_karate(aggregation_type="sum"):
    import dgl
    from dgl.data import KarateClub
    tr_set = KarateClub()
    G = tr_set.graph
    e = len(G.edges)
    n = len(G.nodes)
    d_l = tr_set.features.shape[1]
    is_multilabel = False
    n_classes = tr_set.num_labels
    node_labels = torch.tensor(tr_set.features)
    targets = torch.tensor(tr_set.labels)
    idx_train = torch.BoolTensor(tr_set.train_mask)
    idx_valid = torch.BoolTensor(tr_set.val_mask)
    idx_test = torch.BoolTensor(tr_set.test_mask)
    edges, agg_matrix = nx_to_format(G, aggregation_type)
    return Dataset(
        "karate",
        n,
        e,
        d_l,
        is_multilabel,
        n_classes,
        edges,
        agg_matrix,
        node_labels,
        targets,
        idx_train,
        idx_valid,
        idx_test,
    )
def from_EN_to_GNN(E, N, targets, aggregation_type, sparse_matrix=True):
    """
    :param E: E matrix - matrix of edges: [[id_p, id_c, graph_id], ...]
    :param N: N matrix - [node_features, graph_id (to which the node belongs)]
    :param targets: list of graph targets [tar_g_1, tar_g_2, ...]
    :return: a Dataset with the graph-node incidence matrix attached
    """
    N_full = N
    E_full = E
    N = N[:, :-1]  # drop the graph_id column
    e = E[:, :2]  # take only the first two columns => id_p, id_c
    # creating input for gnn => [id_p, id_c, label_p, label_c]
    # creating arcnode matrix, but transposed, e.g.:
    """
    1 1 0 0 0 0 0
    0 0 1 1 0 0 0
    0 0 0 0 1 1 1
    """
    # for the indices where to insert the ones, stack the id_p and the column id (a single 1 per column)
    G = nx.DiGraph()
    G.add_nodes_from(range(0, np.max(e) + 1))
    G.add_edges_from(e)
    edges, agg_matrix = nx_to_format(G, aggregation_type, sparse_matrix)
    # get the number of graphs from the graph_id column
    num_graphs = int(max(N_full[:, -1]) + 1)
    # get all graph_ids
    g_ids = N_full[:, -1]
    g_ids = g_ids.astype(np.int32)
    # create the graph-node matrix: rows of an identity matrix selected by graph id
    if sparse_matrix:
        unique, counts = np.unique(g_ids, return_counts=True)
        values_matrix = np.ones([len(g_ids)]).astype(np.float32)
        if aggregation_type == "degreenorm":
            values_matrix_normalized = values_matrix[g_ids] / counts[g_ids]
        else:
            values_matrix_normalized = values_matrix
        agg_matrix_i = torch.LongTensor([g_ids, list(range(len(g_ids)))])
        agg_matrix_v = torch.FloatTensor(values_matrix_normalized)
        graphnode = torch.sparse.FloatTensor(agg_matrix_i, agg_matrix_v, torch.Size([num_graphs, len(N)]))
    else:
        graphnode = torch.tensor(np.take(np.eye(num_graphs), g_ids, axis=0).T)
    e = E_full.shape[0]
    n = N_full.shape[0]
    d_l = N.shape[1]
    is_multilabel = False
    n_classes = int(np.max(targets)) + 1
    node_labels = torch.FloatTensor(N)
    targets = torch.tensor(targets, dtype=torch.long)
    return Dataset(
        "name",
        n,
        e,
        d_l,
        is_multilabel,
        n_classes,
        edges,
        agg_matrix,
        node_labels,
        targets,
        graph_node=graphnode,
    )
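To make the E/N layout concrete, here is a hand-built sketch with two triangle graphs (illustrative only: node features are one-hot and the last column of each matrix is the graph id):

E_demo = np.array([[0, 1, 0], [1, 2, 0], [2, 0, 0],
                   [3, 4, 1], [4, 5, 1], [5, 3, 1]], dtype=np.int64)
N_demo = np.concatenate([np.eye(6, dtype=np.float32),
                         np.array([[0], [0], [0], [1], [1], [1]], dtype=np.float32)], axis=1)
dset_demo = from_EN_to_GNN(E_demo, N_demo, np.array([0, 1]), "sum", sparse_matrix=False)
print(dset_demo.graph_node.shape)  # torch.Size([2, 6]): one row per graph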
def old_load_karate(path="data/karate/"):
    """Load karate club dataset"""
    print('Loading karate club dataset...')
    import random
    import scipy.sparse as sp
    edges = np.loadtxt("{}edges.txt".format(path), dtype=np.int32)  # 0-based indexing
    # edge_inv = np.flip(edges, axis=1)  # add arcs in the opposite direction as well
    # edges = np.concatenate((edges, edge_inv))
    edges = edges[np.lexsort((edges[:, 1], edges[:, 0]))]  # reorder the edge list by first, then second column
    features = sp.eye(np.max(edges + 1), dtype=float).tocsr()
    idx_labels = np.loadtxt("{}classes.txt".format(path), dtype=np.float32)
    idx_labels = idx_labels[idx_labels[:, 0].argsort()]
    labels = idx_labels[:, 1]
    # labels = np.eye(max(idx_labels[:, 1]) + 1, dtype=np.int32)[idx_labels[:, 1]]  # one-hot encoding of labels
    E = np.concatenate((edges, np.zeros((len(edges), 1), dtype=np.int32)), axis=1)
    N = np.concatenate((features.toarray(), np.zeros((features.shape[0], 1), dtype=np.int32)), axis=1)
    mask_train = np.zeros(shape=(34,), dtype=np.float32)
    # sample without replacement so the drawn nodes are distinct; only the first id per class enters the mask
    id_0, id_4, id_5, id_12 = random.sample(list(np.argwhere(labels == 0).flatten()), k=4)
    id_1, id_6, id_7, id_13 = random.sample(list(np.argwhere(labels == 1).flatten()), k=4)
    id_2, id_8, id_9, id_14 = random.sample(list(np.argwhere(labels == 2).flatten()), k=4)
    id_3, id_10, id_11, id_15 = random.sample(list(np.argwhere(labels == 3).flatten()), k=4)
    mask_train[id_0] = 1.  # class 0
    mask_train[id_1] = 1.  # class 1
    mask_train[id_2] = 1.  # class 2
    mask_train[id_3] = 1.  # class 3
    mask_test = 1. - mask_train
    return E, N, labels, torch.tensor(mask_train, dtype=torch.bool), torch.tensor(mask_test, dtype=torch.bool)
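Finally, an end-to-end sketch tying the two karate paths together (assumes data/karate/ with edges.txt and classes.txt on disk):

# E, N, labels, mask_train, mask_test = old_load_karate()
# dset = from_EN_to_GNN(E, N, labels.astype(np.int64), "sum")
# dset.idx_train, dset.idx_test = mask_train, mask_test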