Note
Click here to download the full example code
Utils
Common utilities for data loading and preparation
import os
import sys
from collections import namedtuple

import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.sparse as sp
from scipy.sparse import coo_matrix
# Lightweight COO-style container used by the loaders in this file:
# `indices` holds one (row, col) pair per non-zero entry, `values` the matching
# entries, and `dense_shape` the [rows, cols] shape of the equivalent dense matrix.
SparseMatrix = namedtuple("SparseMatrix", "indices values dense_shape")
def GetInput(mat, lab, batch=1, grafi=None):
    """Split an edge list into batches of graphs and build the per-batch inputs.

    :param mat: edge list, one ``[id_1, id_2]`` row per arc.
    :param lab: 2-D array of node labels; row ``k`` is the label of node ``k``.
    :param batch: number of graphs grouped into each batch.
    :param grafi: vector with the same cardinality as the nodes, giving the
        graph each node belongs to. When omitted, every node is assigned to
        graph 0.
    :return: four lists with one entry per batch:
        ``data_batch`` - array whose columns are id_1, id_2, the labels of
        id_1 and the labels of id_2 (the trailing graph column is dropped);
        ``arcnode_batch`` - ``SparseMatrix`` of shape [nodes, arcs] with a one
        at (id_1 of the arc, arc index);
        ``nodegraph_batch`` - dense [nodes, graphs] membership matrix;
        ``node_in`` - number of nodes in each graph of the batch.
    """
    if grafi is None:
        # No membership vector supplied: treat all nodes as a single graph.
        grafi = np.zeros(len(lab), dtype=int)
    grafi = np.asarray(grafi)

    # Index of the last batch (a single graph gives grafi.max() == 0 => 0).
    batch_number = grafi.max() // batch

    # Edge list, node labels and node -> graph membership as dataframes.
    dmat = pd.DataFrame(mat, columns=["id_1", "id_2"])
    dlab = pd.DataFrame(lab, columns=["lab" + str(i) for i in range(0, lab.shape[1])])
    dgr = pd.DataFrame(grafi, columns=["graph"])

    # One row per arc: id_1, id_2, labels of id_1, labels of id_2, graph of id_1.
    dresult = pd.merge(dmat, dlab, left_on="id_1", right_index=True, how='left')
    dresult = pd.merge(dresult, dlab, left_on="id_2", right_index=True, how='left')
    dresult = pd.merge(dresult, dgr, left_on="id_1", right_index=True, how='left')

    data_batch = []
    arcnode_batch = []
    nodegraph_batch = []
    node_in = []

    # For each batch, node and graph ids are shifted so that they start from 0.
    for i in range(0, batch_number + 1):
        grafo_index_min = i * batch
        grafo_index_max = grafo_index_min + batch

        in_batch = (dresult["graph"] >= grafo_index_min) & (dresult["graph"] < grafo_index_max)
        # Work on a copy so the re-indexing below never writes into dresult.
        adj = dresult.loc[in_batch].copy()

        min_id = adj[["id_1", "id_2"]].min(axis=0).min()
        adj["id_1"] = adj["id_1"] - min_id
        adj["id_2"] = adj["id_2"] - min_id
        min_gr = adj["graph"].min()
        adj["graph"] = adj["graph"] - min_gr

        # Everything except the trailing graph column.
        data_batch.append(adj.values[:, :-1])

        # Largest (re-indexed) node id and graph id in the current batch.
        max_id = int(adj[["id_1", "id_2"]].max(axis=0).max())
        max_gr = int(adj["graph"].max())

        # Arc endpoints (id_1, id_2), one row per arc.
        mt = adj[["id_1", "id_2"]].values

        # arcnode: for the j-th arc put a one at (id_1 of the arc, j), so that
        # multiplying by per-arc values sums the contributions of all the arcs
        # leaving each node.
        arcnode = SparseMatrix(indices=np.stack((mt[:, 0], np.arange(len(mt))), axis=1),
                               values=np.ones([len(mt)]),
                               dense_shape=[max_id + 1, len(mt)])
        arcnode_batch.append(arcnode)

        # nodegraph: each node takes the graph id of the first arc it appears
        # in. Flattening row by row keeps arcs in order, so the first index
        # returned by np.unique is that first arc. Ids with no incident arc
        # keep an all-zero row.
        nodegraph = np.zeros((max_id + 1, max_gr + 1))
        endpoints = mt.ravel()
        endpoint_graph = np.repeat(adj["graph"].values, 2)
        nodes, first_seen = np.unique(endpoints, return_index=True)
        nodegraph[nodes.astype(int), endpoint_graph[first_seen].astype(int)] = 1
        nodegraph_batch.append(nodegraph)

        # Number of nodes of each graph in the current batch.
        grbtc = dgr.loc[(dgr["graph"] >= grafo_index_min) & (dgr["graph"] < grafo_index_max)]
        node_in.append(grbtc.groupby(["graph"]).size().values)

    return data_batch, arcnode_batch, nodegraph_batch, node_in
def set_load_subgraph(data_path, set_type):
    """Load one split of the subgraph dataset from ``.mat`` files in *data_path*.

    Reads ``conmat<set_type>.mat``, ``nodelab<set_type>.mat`` and
    ``tar<set_type>.mat`` and feeds them to ``GetInput`` with every node
    assigned to graph 0.

    :param data_path: directory containing the ``.mat`` files.
    :param set_type: one of ``"train"``, ``"valid"``, ``"test"``.
    :return: ``inp, arcnode, nodegraph, nodein, labels, lab`` where the first
        four come from ``GetInput``, ``labels`` are the one-hot encoded targets
        and ``lab`` are the node labels.

    Any error (including an unknown *set_type*) is printed and the process
    exits with status 1.
    """
    types = ["train", "valid", "test"]
    try:
        if set_type not in types:
            raise NameError('Wrong set name!')

        # Connection matrix -> edge list [[row, col], ...] of its transpose.
        mat = sio.loadmat(os.path.join(data_path, 'conmat{}.mat'.format(set_type)))
        adj = coo_matrix(mat["conmat_{}set".format(set_type)].T)
        adj = np.array([adj.row, adj.col]).T

        # Node labels.
        mat = sio.loadmat(os.path.join(data_path, "nodelab{}.mat".format(set_type)))
        lab = np.asarray(mat["nodelab_{}set".format(set_type)]).T

        # Targets, flattened and one-hot encoded.
        mat = sio.loadmat(os.path.join(data_path, "tar{}.mat".format(set_type)))
        target = np.asarray(mat["target_{}set".format(set_type)]).T
        labels = pd.get_dummies(pd.Series(target.reshape(-1)))
        labels = labels.values

        # Last argument: graph to which each node belongs (all zeros => one graph).
        inp, arcnode, nodegraph, nodein = GetInput(adj, lab, 1, np.zeros(len(labels), dtype=int))
        return inp, arcnode, nodegraph, nodein, labels, lab
    except Exception as e:
        print("Caught exception: ", e)
        # sys.exit instead of the site-provided exit(), which is meant for the
        # interactive shell and is not guaranteed to exist.
        sys.exit(1)
def set_load_clique(data_path, set_type):
    """Load one split of ``cliquedataset.mat`` found in *data_path*.

    :param data_path: directory containing ``cliquedataset.mat``.
    :param set_type: one of ``"train"``, ``"validation"``, ``"test"``.
    :return: ``inp, arcnode, nodegraph, nodein, labels`` where the first four
        come from ``GetInput`` (all nodes assigned to graph 0) and ``labels``
        are the one-hot encoded targets.

    Errors raised after the file has been read (including an unknown
    *set_type*) are printed and the process exits with status 1.
    """
    import load as ld

    types = ["train", "validation", "test"]
    train = ld.loadmat(os.path.join(data_path, "cliquedataset.mat"))
    train = train["dataSet"]
    try:
        if set_type not in types:
            raise NameError('Wrong set name!')
        split = train['{}Set'.format(set_type)]

        # Connection matrix -> edge list [[row, col], ...] of its transpose.
        adj = coo_matrix(split['connMatrix'].T)
        adj = np.array([adj.row, adj.col]).T

        # Node labels.
        lab = np.asarray(split['nodeLabels']).T

        # Targets, one-hot encoded.
        target = np.asarray(split['targets']).T
        labels = pd.get_dummies(pd.Series(target))
        labels = labels.values

        # GetInput needs a 2-D label array: one row per node, one column.
        get_lab = lab.reshape(lab.shape[0], 1) if set_type == "train" else lab.reshape(len(labels), 1)
        inp, arcnode, nodegraph, nodein = GetInput(adj, get_lab, 1,
                                                   np.zeros(len(labels), dtype=int))
        return inp, arcnode, nodegraph, nodein, labels
    except Exception as e:
        print("Caught exception: ", e)
        # sys.exit instead of the site-provided exit(), which is meant for the
        # interactive shell and is not guaranteed to exist.
        sys.exit(1)
def set_load_mutag(set_type, train):
    """Build the GNN inputs for one split of an already loaded dataset.

    :param set_type: one of ``"train"``, ``"validation"``, ``"test"``.
    :param train: mapping with a ``"<set_type>Set"`` entry providing
        ``connMatrix``, ``nodeLabels``, ``maskMatrix`` and ``targets``.
    :return: ``inp, arcnode, nodegraph, nodein, labels`` where the first four
        come from ``GetInput`` (all graphs in a single batch) and ``labels``
        are the one-hot encoded targets picked at the masked positions.

    Any error (including an unknown *set_type*) is printed and the process
    exits with status 1.
    """
    types = ["train", "validation", "test"]
    try:
        if set_type not in types:
            raise NameError('Wrong set name!')
        split = train['{}Set'.format(set_type)]

        # Connection matrix -> edge list [[row, col], ...].
        adj = coo_matrix(split['connMatrix'])
        adj = np.array([adj.row, adj.col]).T

        # Node labels and mask of the positions that carry a target.
        lab = np.asarray(split['nodeLabels']).T
        mask = coo_matrix(split["maskMatrix"])
        gr = np.asarray(mask.col)

        # Keep only the targets at the masked columns, then one-hot encode.
        target = np.asarray(split['targets']).T
        target = target[gr]
        labels = pd.get_dummies(pd.Series(target))
        labels = labels.values

        # Graph index of every node: nodes in [gr[j], gr[j + 1]) belong to
        # graph j and the last graph runs up to the largest node id in adj.
        # Negative spans are clipped to zero, matching an empty range().
        bounds = np.append(gr, adj.max() + 1)
        counts = np.maximum(np.diff(bounds), 0)
        indicator = np.repeat(np.arange(len(gr)), counts)

        # batch == number of graphs => everything ends up in one batch.
        inp, arcnode, nodegraph, nodein = GetInput(adj, lab, indicator.max() + 1, indicator)
        return inp, arcnode, nodegraph, nodein, labels
    except Exception as e:
        print("Caught exception: ", e)
        # sys.exit instead of the site-provided exit(), which is meant for the
        # interactive shell and is not guaranteed to exist.
        sys.exit(1)
def set_load_general(data_path, set_type, set_name="sub_30_15"):
    """Load one split of ``<set_name>.mat`` found in *data_path*.

    :param data_path: directory containing the ``.mat`` file.
    :param set_type: one of ``"train"``, ``"validation"``, ``"test"``.
    :param set_name: file name (without extension) of the dataset.
    :return: ``inp, arcnode, nodegraph, nodein, labels, lab`` where the first
        four come from ``GetInput`` (all nodes assigned to graph 0), ``labels``
        are the one-hot encoded targets and ``lab`` are the node labels.

    Errors raised after the file has been read (including an unknown
    *set_type*) are printed and the process exits with status 1.
    """
    import load as ld

    types = ["train", "validation", "test"]
    train = ld.loadmat(os.path.join(data_path, "{}.mat".format(set_name)))
    train = train["dataSet"]
    try:
        if set_type not in types:
            raise NameError('Wrong set name!')
        split = train['{}Set'.format(set_type)]

        # Connection matrix -> edge list [[row, col], ...] of its transpose.
        adj = coo_matrix(split['connMatrix'].T)
        adj = np.array([adj.row, adj.col]).T

        # Node labels; one-dimensional labels become a single column.
        lab = np.asarray(split['nodeLabels']).T
        if lab.ndim < 2:
            lab = lab.reshape(lab.shape[0], 1)

        # Targets, one-hot encoded.
        target = np.asarray(split['targets']).T
        labels = pd.get_dummies(pd.Series(target))
        labels = labels.values

        inp, arcnode, nodegraph, nodein = GetInput(adj, lab, 1,
                                                   np.zeros(len(labels), dtype=int))
        return inp, arcnode, nodegraph, nodein, labels, lab
    except Exception as e:
        print("Caught exception: ", e)
        # sys.exit instead of the site-provided exit(), which is meant for the
        # interactive shell and is not guaranteed to exist.
        sys.exit(1)
def load_karate(path="data/karate-club/"):
    """Load karate club dataset.

    Reads ``edges.txt`` (1-based node id pairs) and ``mod-based-clusters.txt``
    (rows whose first column is used for ordering and whose second column is
    the class index) from *path*.

    :param path: directory holding the two files, with or without a trailing
        separator.
    :return: ``E, N, labels`` - ``E`` has one ``[id_1, id_2, 0]`` row per edge
        (0-based ids, sorted by first then second column), ``N`` is an identity
        feature matrix with an extra zero column appended, and ``labels`` is
        the one-hot encoding of the class indices.
    """
    print('Loading karate club dataset...')

    # ndmin=2 keeps the (rows, 2) layout even when a file has a single line.
    edges = np.loadtxt(os.path.join(path, "edges.txt"), dtype=np.int32, ndmin=2) - 1  # 0-based indexing
    edges = edges[np.lexsort((edges[:, 1], edges[:, 0]))]  # reorder list of edges also by second column

    # One-hot node features: one row/column per node id.
    features = sp.eye(np.max(edges + 1), dtype=np.float32).tocsr()

    idx_labels = np.loadtxt(os.path.join(path, "mod-based-clusters.txt"), dtype=np.int32, ndmin=2)
    idx_labels = idx_labels[idx_labels[:, 0].argsort()]
    labels = np.eye(idx_labels[:, 1].max() + 1, dtype=np.int32)[idx_labels[:, 1]]  # one-hot encoding of labels

    # Append a zero column to both matrices.
    E = np.concatenate((edges, np.zeros((len(edges), 1), dtype=np.int32)), axis=1)
    N = np.concatenate((features.toarray(), np.zeros((features.shape[0], 1), dtype=np.int32)), axis=1)
    return E, N, labels
def from_EN_to_GNN(E, N):
    """Convert the E/N matrices into the inputs expected by the GNN.

    :param E: # E matrix - matrix of edges : [[id_p, id_c, graph_id],...]
    :param N: # N matrix - [node_features, graph_id (to which the node belongs)]
    :return: ``inp, arcnode, graphnode`` -
        ``inp`` has one row per arc: ``[id_c, label_p, label_c]``;
        ``arcnode`` is a ``SparseMatrix`` of shape [nodes, arcs] with a one at
        (id_p of the arc, arc index);
        ``graphnode`` is a ``SparseMatrix`` of shape [graphs, nodes] with a one
        at (graph_id of the node, node index).
    """
    graph_ids = N[:, -1]
    node_feat = N[:, :-1]  # avoid graph_id
    endpoints = E[:, :2]  # take only first two columns => id_p, id_c

    # Features of both endpoints of every arc: (n_archs, 2, label_dim),
    # flattened to (n_archs, 2*label_dim) => [[label_p, label_c], ...]
    feat = np.reshape(np.take(node_feat, endpoints, axis=0), [len(E), -1])

    # Input for the gnn => [id_c, label_p, label_c]
    inp = np.concatenate((E[:, 1:2], feat), axis=1)

    # arcnode matrix (nodes x arcs): a single one per column, placed on the
    # row of the arc's id_p, e.g.
    #   1 1 0 0 0 0 0
    #   0 0 1 1 0 0 0
    #   0 0 0 0 1 1 1
    arcnode = SparseMatrix(indices=np.stack((E[:, 0], np.arange(len(E))), axis=1),
                           values=np.ones([len(E)]).astype(np.float32),
                           dense_shape=[len(node_feat), len(E)])

    # Number of graphs => largest graph_id + 1.
    num_graphs = int(graph_ids.max() + 1)
    g_ids = graph_ids.astype(np.int32)

    # graphnode matrix (graphs x nodes), built the same way as arcnode.
    graphnode = SparseMatrix(indices=np.stack((g_ids, np.arange(len(g_ids))), axis=1),
                             values=np.ones([len(g_ids)]).astype(np.float32),
                             dense_shape=[num_graphs, len(node_feat)])
    return inp, arcnode, graphnode
Total running time of the script: ( 0 minutes 0.000 seconds)