Tutorial Setup

Check your install


In [0]:
import numpy

In [1]:
import scipy

In [2]:
import matplotlib

In [3]:
import sklearn

In [4]:
import psutil

In [5]:
import pandas

In [6]:
import IPython.parallel
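
To check everything in one go, the cell below is a small sketch (not part of the original notebook) that loops over the module names imported above and prints the installed version of each, or flags the package as missing:


In [ ]:
import importlib

for name in ('numpy', 'scipy', 'matplotlib', 'sklearn',
             'psutil', 'pandas', 'IPython.parallel'):
    try:
        module = importlib.import_module(name)
        # some modules do not expose __version__, hence the default
        print(name, getattr(module, '__version__', 'no __version__ attribute'))
    except ImportError:
        print(name, 'is MISSING')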

Finding the location of an installed package and its version:


In [7]:
numpy.__path__


Out[7]:
['/Users/ogrisel/venvs/py34/lib/python3.4/site-packages/numpy']

In [8]:
numpy.__version__


Out[8]:
'1.9.2'

Check that you have the datasets


In [9]:
%run ../fetch_data.py


Using existing dataset folder:/Users/ogrisel/code/parallel_ml_tutorial/datasets
Checking availability of the 20 newsgroups dataset
Found archive: /Users/ogrisel/code/parallel_ml_tutorial/datasets/20news-bydate.tar.gz
Checking that the 20 newsgroups files exist...
=> Success!

In [10]:
import os
for fname in os.listdir('../datasets/'):
    print(fname)


20news-bydate-test
20news-bydate-train
20news-bydate.tar.gz
sentiment140
titanic_train.csv
trainingandtestdata.zip

A NumPy primer

NumPy array dtypes and shapes


In [11]:
import numpy as np

In [12]:
a = np.array([1, 2, 3])

In [13]:
a


Out[13]:
array([1, 2, 3])

In [14]:
b = np.array([[0, 2, 4], [1, 3, 5]])

In [15]:
b


Out[15]:
array([[0, 2, 4],
       [1, 3, 5]])

In [16]:
b.shape


Out[16]:
(2, 3)

In [17]:
b.dtype


Out[17]:
dtype('int64')

In [18]:
a.shape


Out[18]:
(3,)

In [19]:
a.dtype


Out[19]:
dtype('int64')

In [20]:
np.zeros(5)


Out[20]:
array([ 0.,  0.,  0.,  0.,  0.])

In [21]:
np.ones(shape=(3, 4), dtype=np.int32)


Out[21]:
array([[1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1]], dtype=int32)
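
As a quick aside (not one of the original cells), an array's dtype can also be converted after creation with astype, which returns a new array rather than modifying the original; the variable name x below is just for illustration:


In [ ]:
x = np.array([1, 2, 3], dtype=np.float64)
print(x.dtype)             # float64
print(x.astype(np.int32))  # [1 2 3]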

Common array operations


In [22]:
c = b * 0.5

In [23]:
c


Out[23]:
array([[ 0. ,  1. ,  2. ],
       [ 0.5,  1.5,  2.5]])

In [24]:
c.shape


Out[24]:
(2, 3)

In [25]:
c.dtype


Out[25]:
dtype('float64')

In [26]:
a


Out[26]:
array([1, 2, 3])

In [27]:
d = a + c

In [28]:
d


Out[28]:
array([[ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [29]:
d[0]


Out[29]:
array([ 1.,  3.,  5.])

In [30]:
d[0, 0]


Out[30]:
1.0

In [31]:
d[:, 0]


Out[31]:
array([ 1. ,  1.5])

In [32]:
d.sum()


Out[32]:
19.5

In [33]:
d.mean()


Out[33]:
3.25

In [34]:
d.sum(axis=0)


Out[34]:
array([  2.5,   6.5,  10.5])

In [35]:
d.mean(axis=1)


Out[35]:
array([ 3. ,  3.5])
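
Note that d = a + c above adds an array of shape (3,) to an array of shape (2, 3): NumPy broadcasts a against each row of c. The cell below is a small sketch (not in the original notebook) of the same rule with a (2, 1) column named col, which broadcasts across the columns instead:


In [ ]:
col = np.array([[10.], [20.]])
print(col.shape)  # (2, 1)
print(c + col)    # adds 10. to the first row of c and 20. to the second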

Reshaping and in-place updates


In [36]:
e = np.arange(12)

In [37]:
e


Out[37]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [38]:
f = e.reshape(3, 4)

In [39]:
f


Out[39]:
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [40]:
e


Out[40]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [41]:
e[5:] = 0

In [42]:
e


Out[42]:
array([0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0])

In [43]:
f


Out[43]:
array([[0, 1, 2, 3],
       [4, 0, 0, 0],
       [0, 0, 0, 0]])
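
The reason the update to e also shows up in f is that reshape returns a view on the same data whenever it can (as it does here) rather than a copy. The cell below is a small sketch (not one of the original cells) that makes this explicit with np.may_share_memory and shows that an explicit copy breaks the link:


In [ ]:
print(np.may_share_memory(e, f))  # True: f is a view on e's data
g = f.copy()                      # an independent copy
e[:] = 1                          # updates e and, through the view, f
print(g)                          # g keeps the old values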

Combining arrays


In [44]:
a


Out[44]:
array([1, 2, 3])

In [45]:
b


Out[45]:
array([[0, 2, 4],
       [1, 3, 5]])

In [46]:
d


Out[46]:
array([[ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [47]:
np.concatenate([a, a, a])


Out[47]:
array([1, 2, 3, 1, 2, 3, 1, 2, 3])

In [48]:
np.vstack([a, b, d])


Out[48]:
array([[ 1. ,  2. ,  3. ],
       [ 0. ,  2. ,  4. ],
       [ 1. ,  3. ,  5. ],
       [ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [49]:
np.hstack([b, d])


Out[49]:
array([[ 0. ,  2. ,  4. ,  1. ,  3. ,  5. ],
       [ 1. ,  3. ,  5. ,  1.5,  3.5,  5.5]])
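
For 2-D inputs such as b and d, vstack and hstack are equivalent to concatenating along axis 0 and axis 1 respectively; the cell below (not part of the original notebook) checks this:


In [ ]:
print(np.concatenate([b, d], axis=0))  # same result as np.vstack([b, d])
print(np.concatenate([b, d], axis=1))  # same result as np.hstack([b, d])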

A Matplotlib primer


In [50]:
%matplotlib inline

In [51]:
import matplotlib.pyplot as plt

In [52]:
x = np.linspace(0, 2, 10)

In [53]:
x


Out[53]:
array([ 0.        ,  0.22222222,  0.44444444,  0.66666667,  0.88888889,
        1.11111111,  1.33333333,  1.55555556,  1.77777778,  2.        ])

In [54]:
plt.plot(x, 'o-');  # with a single array, the values are plotted against their index



In [55]:
plt.plot(x, x, 'o-', label='linear')
plt.plot(x, x ** 2, 'x-', label='quadratic')

plt.legend(loc='best')
plt.title('Linear vs Quadratic progression')
plt.xlabel('Input')
plt.ylabel('Output');



In [56]:
samples = np.random.normal(loc=1.0, scale=0.5, size=1000)

In [57]:
samples.shape


Out[57]:
(1000,)

In [58]:
samples.dtype


Out[58]:
dtype('float64')

In [59]:
samples[:30]


Out[59]:
array([ 0.49154735, -0.20568649,  1.31377114,  2.00566731,  0.7841002 ,
        1.37162944,  0.35393214,  0.64864944,  0.61681463,  1.42298751,
        1.10847225,  1.90760358,  0.98976634,  1.32393257,  1.15057275,
        0.71417411,  1.27197596,  1.82152059,  1.12728921,  0.93116918,
        0.54941321,  1.18379504,  0.60626759,  1.16439308,  2.41235109,
        1.92930117,  0.30859505, -0.12063577,  2.29732949,  1.47543464])

In [60]:
plt.hist(samples, bins=50);



In [61]:
samples_1 = np.random.normal(loc=1, scale=.5, size=10000)
samples_2 = np.random.standard_t(df=10, size=10000)

In [62]:
bins = np.linspace(-3, 3, 50)
_ = plt.hist(samples_1, bins=bins, alpha=0.5, label='samples 1')
_ = plt.hist(samples_2, bins=bins, alpha=0.5, label='samples 2')
plt.legend(loc='upper left');



In [63]:
plt.scatter(samples_1, samples_2, alpha=0.1);
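
As a closing sketch (not part of the original notebook), the histogram comparison and the scatter plot can also be drawn side by side with plt.subplots, which returns a figure and one axes object per panel; ax0 and ax1 are just illustrative names:


In [ ]:
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
ax0.hist(samples_1, bins=bins, alpha=0.5, label='samples 1')
ax0.hist(samples_2, bins=bins, alpha=0.5, label='samples 2')
ax0.legend(loc='upper left')
ax1.scatter(samples_1, samples_2, alpha=0.1)
ax1.set_title('samples_1 vs samples_2');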



In [64]: