[SciPy-User] Conversion to graph adjacency list

Sat Dec 8 09:26:28 EST 2018

Thanks Robert for your hepfull response, now I better understand how csr conversion works.

I'm benchmarking some graph libraries. My edges-to-csr conversion was hand-written in Python, so very slow! Now, with your code, execution time is 7 times faster!

Here is the code, perhaps we can optimize more:

# =============================================
from time import clock
from sys import stderr, stdin

from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csr_matrix
import numpy as np

# ------------ Scipy Code ---------------------------

def wedges2adj(edges, n):

    G=[[]for _ in range(n)]
    for a, b, w in edges:
        G[a].append((b,w))
        G[b].append((a,w))
    return G

def wedges2scr(edges, n):
    G=wedges2adj(edges, n)
    indptr=[0] 
    cnt=0

    for line in G:
        cnt+=len(line)
        indptr.append(cnt)
    data=[]
    indices=[]
    for i in range(n):
        for (j,w) in G[i]:
            data.append(w)
            indices.append(j)

    return [data, indptr, indices]

def csr2wedges(Mcsr, shape):
    n, p=shape
    k=0
    edges=[]
    data, cumul, cols  = Mcsr.data,Mcsr.indptr, Mcsr.indices  
    for i in range(n):
        for j in range(cumul[i+1]-cumul[i]):
            edges.append((i, cols[k], data[k]))
            k+=1
    return edges

def kruskal_scipy(edges, n):
    data, indptr, indices=wedges2scr(edges, n)
    csr=csr_matrix((data, indices, indptr), shape=(n, n))     
    tree=minimum_spanning_tree(csr)
    edges=csr2wedges(tree, (n,n))
    return int(sum((round(w)) for (a,b,w) in edges))

# °°°°°°°°°°°°°°°°°°°°°°°

def wedges2scr_FAST(wedges, n):
    row_ind, col_ind, costs = np.transpose(wedges)
    return csr_matrix((costs, (row_ind, col_ind)),
    shape=(n,n))

def kruskal_scipy_FAST(wedges, n):    
    csr=wedges2scr_FAST(wedges, n)
    tree=minimum_spanning_tree(csr).tocoo()
    return int(round(sum(w for (i,j,w) in zip(tree.row, tree.col, tree.data))))

# ------------ Benchmark ---------------------------

def go(solver, L):
    global duration

    N=len(L)
    k=1
    solution=[]
    while k<N:
        edges=[]
        n=L[k]
        k+=1

        for a in range(n):
            d=L[k]
            k+=1
            for j in range(d):
                b=L[k]
                k+=1
                w=L[k]
                k+=1
                if a<b:
                    edges.append([a,b-1,w])
        if solver==kruskal_scipy:
            data, indptr, indices=wedges2scr(edges, n)

            start=clock()
            csr=csr_matrix((data, indices, indptr), shape=(n, n))             
            tree=minimum_spanning_tree(csr)
            edges=csr2wedges(tree, (n,n)) 
            sol=solver(edges, n)
            duration+=clock()-start              
            solution.append(sol)             
        else:

            start=clock()
            sol=solver(edges, n)
            duration+=clock()-start
            solution.append(sol)
    return solution

# data
L=list(int(z) for z in stdin.read().split() if z.isdigit())
output=[]
solvers=[kruskal_scipy, kruskal_scipy_FAST]
for solver in solvers:
    duration=0
    costs=go(solver, L)
    output.append(costs)
    print("%-20s : %.3f" %(solver.__name__, duration), file=stderr)

if all(output[i]== output[0] for i in range(len(solvers))):
    print("NO error detected", file=stderr)
else:
    print("ERROR detected", file=stderr)

# =============================================

Link to data file (100 graphs with at most 30000 vertices each):

https://drive.google.com/file/d/1BKxRAzJ9jeowbrFPyLZ23EomKn1eqohs/view?usp=sharing

The above code is faster than Networkit library code and little slower than Graph-Tool library. 

I have two observations:

1. Python code is only 2 times slower
2. Pure C-code is about 9 times faster

In each case, even SciPy cf. _min_spanning_tree.pyx, algorithm is the same: kruskal with Union-find.