In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import sparse
import scipy.sparse.linalg as spla
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
sns.set_context('notebook', font_scale=1.5)
There are many applications in which we deal with matrices that are mostly zeros. For example, a matrix representing a social network is very sparse: there are roughly 7 billion people, but most people are directly connected to only a few hundred or thousand others. Storing such a network as a sparse rather than dense matrix offers orders-of-magnitude reductions in memory requirements, and corresponding speed-ups in computation.
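For instance, the savings are easy to measure directly. A minimal sketch (the exact numbers depend on the density, the dtype, and the sparse format chosen):

ident_dense = np.eye(1000)                     # 1,000,000 float64 entries
ident_sparse = sparse.eye(1000, format='csr')  # stores only the 1,000 nonzeros
ident_dense.nbytes                             # 8000000 bytes
ident_sparse.data.nbytes + ident_sparse.indices.nbytes + ident_sparse.indptr.nbytes  # ~16 KB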
In [3]:
# A mostly-zero matrix: Poisson(0.2) zeros out most entries
A = np.random.poisson(0.2, (5, 15)) * np.random.randint(0, 10, (5, 15))
A
Out[3]:
In [4]:
# The (row, col, value) triplets of the nonzero entries: the COO representation
rows, cols = np.nonzero(A)
vals = A[rows, cols]
In [5]:
vals
Out[5]:
In [6]:
rows
Out[6]:
In [7]:
cols
Out[7]:
In [8]:
X1 = sparse.coo_matrix(A)
X1
Out[8]:
In [9]:
print(X1)
In [10]:
# Build the same matrix from the (values, (rows, cols)) triplets
X2 = sparse.coo_matrix((vals, (rows, cols)))
X2
Out[10]:
In [11]:
print(X2)
In [12]:
X2.todense()
Out[12]:
In [13]:
np.vstack([rows, cols])
Out[13]:
In [14]:
# CSR index pointer: where each row's entries start in `cols`/`vals`,
# with the total nonzero count appended (assumes every row has a nonzero)
indptr = np.r_[np.searchsorted(rows, np.unique(rows)), len(rows)]
indptr
Out[14]:
In [15]:
# CSR constructor from (values, column indices, row index pointer)
X3 = sparse.csr_matrix((vals, cols, indptr))
X3
Out[15]:
In [16]:
X3.todense()
Out[16]:
In [17]:
X4 = X2.tocsr()
In [18]:
X4
Out[18]:
In [19]:
# Duplicate coordinates: four entries at (0, 0) and four at (1, 1)
rows = np.r_[np.zeros(4), np.ones(4)].astype(int)
cols = np.repeat([0, 1], 4)
vals = np.arange(8)
In [20]:
rows
Out[20]:
In [21]:
cols
Out[21]:
In [22]:
vals
Out[22]:
In [23]:
# Values at duplicated (row, col) coordinates are summed
X5 = sparse.csr_matrix((vals, (rows, cols)))
In [24]:
print(X5)
In [25]:
# A confusion matrix is a 2-D histogram of (predicted, observed) pairs,
# so summing duplicate COO entries computes it for free
obs = np.random.randint(0, 2, 100)
pred = np.random.randint(0, 2, 100)
vals = np.ones(100).astype('int')
In [26]:
pred
Out[26]:
In [27]:
vals.shape, obs.shape, pred.shape
Out[27]:
In [28]:
X6 = sparse.coo_matrix((vals, (pred, obs)))
In [29]:
X6.todense()
Out[29]:
In [30]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
In [31]:
iris = datasets.load_iris()
In [32]:
knn = KNeighborsClassifier()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.5, random_state=42)
In [33]:
pred = knn.fit(X_train, y_train).predict(X_test)
In [34]:
# Confusion matrix for the iris predictions via the same sparse trick
X7 = sparse.coo_matrix((np.ones(len(pred)).astype('int'), (pred, y_test)))
pd.DataFrame(X7.todense(), index=iris.target_names, columns=iris.target_names)
Out[34]:
SciPy provides efficient routines for solving large sparse systems, just as it does for dense matrices. We will illustrate by calculating the PageRank of airports using data from the Bureau of Transportation Statistics.
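To see the solver interface on a toy system first (a small sketch; spla.spsolve takes a sparse matrix, preferably in CSC or CSR format, and a right-hand side, and returns a dense ndarray):

A_small = sparse.csc_matrix([[3.0, 0.0, 1.0],
                             [0.0, 2.0, 0.0],
                             [1.0, 0.0, 4.0]])
b_small = np.array([1.0, 2.0, 3.0])
x_small = spla.spsolve(A_small, b_small)  # solves A_small @ x_small = b_small
np.allclose(A_small @ x_small, b_small)   # True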
In [35]:
data = pd.read_csv('data/airports.csv', usecols=[0,1])
In [36]:
data.shape
Out[36]:
In [37]:
data.head()
Out[37]:
In [38]:
lookup = pd.read_csv('data/names.csv', index_col=0)
In [39]:
lookup.shape
Out[39]:
In [40]:
lookup.head()
Out[40]:
In [41]:
import networkx as nx
In [42]:
# from_pandas_dataframe was renamed from_pandas_edgelist in networkx 2.0
g = nx.from_pandas_edgelist(data, source='ORIGIN_AIRPORT_ID', target='DEST_AIRPORT_ID')
In [43]:
airports = np.array(list(g.nodes()))
# to_scipy_sparse_matrix was renamed to_scipy_sparse_array in networkx 3.0
adj_matrix = nx.to_scipy_sparse_array(g)
In [44]:
# Row-normalize the adjacency matrix, then transpose so that column j of M
# holds the transition probabilities out of airport j
out_degrees = np.ravel(adj_matrix.sum(axis=1))
diag_matrix = sparse.diags(1 / out_degrees).tocsr()
M = (diag_matrix @ adj_matrix).T
The PageRank algorithm assumes that every node can be reached from every other node. To guard against the case where a node has out-degree 0, we allow every node a small random chance of transitioning to any other node, controlled by a damping factor $d$. The PageRank vector $r$ is then the fixed point of

$$ r = dMr + \frac{1-d}{N}\mathbb{1} $$

Solving for $r$ gives

$$ r = (I - dM)^{-1}\frac{1-d}{N}\mathbb{1} $$

or equivalently, in the $Ax = b$ format,

$$ (I - dM)\,r = \frac{1-d}{N}\mathbb{1} $$
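Since $r$ is a fixed point, we could also approximate it by power iteration; the following is a sketch for intuition only (we use the direct sparse solve below):

d = 0.85
n = M.shape[0]
r_pi = np.ones(n) / n                  # start from the uniform distribution
for _ in range(100):                   # iterate the damped fixed-point map
    r_pi = d * (M @ r_pi) + (1 - d) / n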
In [45]:
n = len(airports)
d = 0.85
I = sparse.eye(n, format='csc')
A = I - d * M
b = (1-d) / n * np.ones(n) # so the sum of all page ranks is 1
In [46]:
A.todense()
Out[46]:
In [47]:
from scipy.sparse.linalg import spsolve
In [48]:
r = spsolve(A, b)
r.sum()
Out[48]:
In [49]:
idx = np.argsort(r)
In [50]:
top10 = idx[-10:][::-1]
bot10 = idx[:10]
In [51]:
df = lookup.loc[airports[top10]]
df['degree'] = out_degrees[top10]
df['pagerank'] = r[top10]
df
Out[51]:
In [52]:
df = lookup.loc[airports[bot10]]
df['degree'] = out_degrees[bot10]
df['pagerank'] = r[bot10]
df
Out[52]:
In [53]:
import warnings
In [54]:
labels = {airports[i]: lookup.loc[airports[i]].str.split(':').str[0].values[0]
          for i in np.r_[top10[:5], bot10[:5]]}
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # np.clip(..., 1, None) enforces a minimum node size of 1
    nx.draw(g, pos=nx.spring_layout(g), labels=labels,
            node_color='blue', font_color='red', alpha=0.5,
            node_size=np.clip(5000 * r, 1, None), width=0.1)