In [15]:
# First we do all the imports
%pylab inline
import numpy as np
import seaborn as sns
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
pylab.rcParams['figure.figsize'] = (20.0, 8.0)  # default figure size for this notebook


Populating the interactive namespace from numpy and matplotlib

Numerical data


In [13]:
# Generate a dataset by drawing samples from a normal distribution with zero mean and
# standard deviation 1: 5 traces of 20 samples each
np.random.seed(42)
data = np.random.normal(0, 1, (5, 20))
data.shape


Out[13]:
(5, 20)

In [16]:
# seaborn's tsplot overlays the individual traces and their mean
sns.tsplot(data, err_style='unit_traces')


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7528efc7d0>

In [17]:
# The same traces with plain matplotlib (transpose so that each trace is drawn as one line)
plt.plot(data.T)


Out[17]:
[<matplotlib.lines.Line2D at 0x7f7528cddf10>,
 <matplotlib.lines.Line2D at 0x7f7528cea150>,
 <matplotlib.lines.Line2D at 0x7f7528cea290>,
 <matplotlib.lines.Line2D at 0x7f7528cea3d0>,
 <matplotlib.lines.Line2D at 0x7f7528cea510>]

In [18]:
# Plot the mean across traces, with +/- 1 standard deviation as an envelope
data_mean = data.mean(axis=0)
plt.plot(data_mean)
plt.plot(data_mean - data.std(axis=0), 'g')
plt.plot(data_mean + data.std(axis=0), 'g')


Out[18]:
[<matplotlib.lines.Line2D at 0x7f7528d4fe10>]

In [19]:
# The result is a DescribeResult named tuple with per-trace statistics (axis=1)
description = stats.describe(data, axis=1)

In [20]:
description.minmax


Out[20]:
(array([-1.55221355, -1.90301966, -2.01796133, -2.49235022, -1.23445121]),
 array([ 2.94912227,  2.49341406,  1.21801783,  1.39426207,  2.2075641 ]))

In [21]:
np.sqrt(description.variance)  # per-trace standard deviation


Out[21]:
array([ 1.10279421,  0.85930616,  0.77560303,  1.00238478,  0.94477352])

In [22]:
stats.describe(data.flatten())  # pool all samples into a single description


Out[22]:
DescribeResult(nobs=100, minmax=(-2.4923502245972684, 2.9491222733326525), mean=0.17233067578109829, variance=0.91309832209343467, skewness=-0.14802590289285314, kurtosis=0.5504859214411524)

Something similar can be achieved with pandas


In [23]:
frame = pd.DataFrame(np.random.normal(0, 1, (1000, 5)), columns=['a', 'b', 'c', 'd', 'e'])
frame.describe()


Out[23]:
a b c d e
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.026575 0.008048 0.016839 0.034060 -0.018381
std 1.014459 0.997368 0.965286 1.000918 0.999798
min -3.758356 -3.875580 -2.873010 -3.414485 -4.290439
25% -0.707505 -0.650649 -0.650374 -0.651669 -0.735294
50% -0.010827 0.014685 0.020628 0.073832 0.033922
75% 0.644255 0.672597 0.662280 0.739632 0.625549
max 3.094362 3.773501 3.156186 3.186570 2.907562

Sometimes you may need to characterize one specific time series (rather than aggregating across a set of time series).
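
A minimal sketch of what that could look like, using the same scipy.stats.describe call as above (the series x below is illustrative and not one of the notebook's cells):

# Hedged sketch: numerically summarize a single series with scipy.stats.describe
import numpy as np
import scipy.stats as stats

t = np.linspace(0, 15, 100)
x = 3 * np.sin(2 * np.pi * 1.0 * t)      # an illustrative single time series
single = stats.describe(x)               # DescribeResult for this one series
print(single.mean, np.sqrt(single.variance), single.minmax)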


In [24]:
time       = np.linspace(0, 15, 100) # [s]
f_baseline = 1.0                     # [s^-1]
N          = 10

# List comprehensions work for creating arrays too :)
def generate_timeseries(N, t, f):
    '''Generate an (N x len(t)) array of sine waves with amplitude i and frequency i*f.
    Note that the i=0 series is identically zero.'''
    d = np.array([i * np.sin(2*np.pi*f*i*t) for i in range(N)])
    return d

data = generate_timeseries(N, time, f_baseline)

In [25]:
# Plot each series with its +/- 1 standard deviation as horizontal red lines
r, c = 2, 5
plt.figure(figsize=(20, 10))
for k in range(r*c):
    plt.subplot(r, c, k+1)
    plt.plot(data[k, :])
    plt.plot(np.tile(data[k, :].std(), len(time)), 'r', alpha=0.9)
    plt.plot(np.tile(-data[k, :].std(), len(time)), 'r', alpha=0.9)
    plt.ylim([-N, N])


Correlation and brain networks

In [26]:
sns.set(context="paper", font="monospace")

# Load the dataset of correlations between cortical brain networks
df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0)
corrmat = df.corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 9))

# Draw the heatmap using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)

# Use matplotlib directly to emphasize known networks
networks = corrmat.columns.get_level_values("network")
for i, network in enumerate(networks):
    if i and network != networks[i - 1]:
        ax.axhline(len(networks) - i, c="w")
        ax.axvline(i, c="w")
f.tight_layout()



In [27]:
# Correlation matrix across the 10 series; the all-zero i=0 series produces NaNs
cc = np.corrcoef(data)

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(cc)
f.tight_layout()



In [34]:
cc


Out[34]:
array([[             nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan],
       [             nan,   1.00000000e+00,   1.27395288e-15,
          9.63688538e-16,  -5.11375454e-16,  -1.36725648e-15,
          7.47624932e-18,   8.07434927e-16,   9.91350660e-16,
          3.54473901e-15],
       [             nan,   1.27395288e-15,   1.00000000e+00,
          2.73107388e-15,   2.21596030e-15,   9.40213115e-16,
          2.95610898e-15,   1.05094705e-16,   6.97534062e-16,
          1.62284452e-15],
       [             nan,   9.63688538e-16,   2.73107388e-15,
          1.00000000e+00,  -2.66154476e-15,   2.27876079e-15,
          4.26644628e-16,  -1.65246471e-15,  -1.09228003e-15,
          8.94491697e-16],
       [             nan,  -5.11375454e-16,   2.21596030e-15,
         -2.66154476e-15,   1.00000000e+00,   1.49285747e-15,
          1.42945887e-15,   1.14514779e-15,  -1.34572488e-16,
          1.73249618e-15],
       [             nan,  -1.36725648e-15,   9.40213115e-16,
          2.27876079e-15,   1.49285747e-15,   1.00000000e+00,
          3.72257406e-15,   5.79199988e-15,   1.65075585e-16,
          1.89159076e-15],
       [             nan,   7.47624932e-18,   2.95610898e-15,
          4.26644628e-16,   1.42945887e-15,   3.72257406e-15,
          1.00000000e+00,  -3.93720650e-15,   1.54758361e-15,
         -3.25765104e-15],
       [             nan,   8.07434927e-16,   1.05094705e-16,
         -1.65246471e-15,   1.14514779e-15,   5.79199988e-15,
         -3.93720650e-15,   1.00000000e+00,  -4.17815534e-16,
          9.26200488e-16],
       [             nan,   9.98079285e-16,   7.14355623e-16,
         -1.09377528e-15,  -1.26722426e-16,   1.77747828e-16,
          1.53711686e-15,  -4.17815534e-16,   1.00000000e+00,
         -4.91837522e-15],
       [             nan,   3.53676435e-15,   1.61686352e-15,
          8.98479030e-16,   1.72850884e-15,   1.88560976e-15,
         -3.25100548e-15,   9.19365060e-16,  -4.91837522e-15,
          1.00000000e+00]])

In [28]:
def generate_correlated_timeseries(N, t, f):
    '''Generate an (N x len(t)) array containing the same sine wave scaled by i,
    so all non-zero series are perfectly correlated (the i=0 series is again zero).'''
    d = np.array([i * np.sin(2*np.pi*f*t) for i in range(N)])
    return d

data = generate_correlated_timeseries(N, time, f_baseline)

In [29]:
r, c = 2, 5
plt.figure(figsize=(20, 10))
for k in range(r*c):
    plt.subplot(r,c, k+1)
    plt.plot(data[k, :])
    plt.plot(np.tile(data[k, :].std(), len(time)), 'r', alpha=0.9)
    plt.plot(np.tile(- data[k, :].std(), len(time)), 'r', alpha=0.9)
    plt.ylim([-N, N])



In [111]:
cc = np.corrcoef(data[1:, :])  # skip the all-zero first series to avoid NaNs
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(cc, vmax=1.0, vmin=0.9, square=True)
f.tight_layout()


Categorical data


In [2]:
sns.set()
# Load a predefined dataset into a pandas dataframe
df = sns.load_dataset("iris")

Looking at the pairwise relationships below, it seems that sepal and petal size tend to be related, that is, bigger flowers are bigger overall! In addition, there might be a systematic effect of species ...

In this dataset we have 4 numerical variables (the features) and one categorical variable, species, with 3 levels (the labels).
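
As a minimal sketch (assuming the df loaded above), the feature/label split can be checked directly:

# Hedged sketch: inspect the feature columns and the categorical label column
print(df.dtypes)                     # four numeric feature columns plus the 'species' column
print(df['species'].unique())        # the 3 species labels
print(df.groupby('species').size())  # 50 flowers per species in the classic iris data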


In [9]:
# Pairwise relationship plot
sns.pairplot(df, hue="species")


Out[9]:
<seaborn.axisgrid.PairGrid at 0x7fb97396dbd0>

What are we actually seeing here?
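
One way to start answering that (a sketch, assuming the df from above) is to compare feature means per species, which makes the systematic species effect visible:

# Hedged sketch: per-species feature means
print(df.groupby('species').mean())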


In [33]:
%load_ext version_information
%version_information numpy, matplotlib, scipy, seaborn, pandas


The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information
Out[33]:
Software     Version
Python       2.7.11 64bit [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
IPython      4.0.1
OS           Linux 4.1.13 5 default x86_64 with SuSE 42.1 x86_64
numpy        1.10.2
matplotlib   1.5.0
scipy        0.16.1
seaborn      0.6.0
pandas       0.17.1
Thu Dec 17 16:45:52 2015 AEDT