Chapter 2: Basic Operations

Vectors, Data, Matrices and Subsetting



In [1]:

    
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline



In [2]:

    
x = np.array([2,7,5])    # explicit vector creation
x









    Out[2]:





array([2, 7, 5])



In [3]:

    
y = np.arange(4, 13, 3)  # vector creation from a sequence (start, stop, step)
y









    Out[3]:





array([ 4,  7, 10])



In [4]:

    
x + y    # vectors can be added









    Out[4]:





array([ 6, 14, 15])



In [5]:

    
x / y    # divided









    Out[5]:





array([ 0.5,  1. ,  0.5])



In [6]:

    
x ** y    # exponentiated









    Out[6]:





array([     16,  823543, 9765625])



In [7]:

    
x[1]    # vector elements can be selected by position









    Out[7]:





7



In [8]:

    
x[1:3]  # multiple elements can be selected using slices, note we use 3 to get the index 2 element in slicing









    Out[8]:





array([7, 5])



In [9]:

    
x[-2]  # elements can be specified as offset from end









    Out[9]:





7



In [10]:

    
x[np.array([0,1])]  # elements can be specified as an array









    Out[10]:





array([2, 7])



In [11]:

    
Z = np.matrix(np.arange(1,13)).reshape((4, 3))
Z                 # note: R arranges the elements column-wise, reshape(nrows,ncols)









    Out[11]:





matrix([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]])



In [12]:

    
Z[2:4, 1:3]    # R is 1-based and includes ending index, Python is 0 based and does not.









    Out[12]:





matrix([[ 8,  9],
        [11, 12]])



In [13]:

    
Z[:, 1:3]    # column slice









    Out[13]:





matrix([[ 2,  3],
        [ 5,  6],
        [ 8,  9],
        [11, 12]])



In [14]:

    
Z.shape









    Out[14]:





(4, 3)



In [15]:

    
dir(Z) #view all functions and variables of the object









    Out[15]:





['A',
 'A1',
 'H',
 'I',
 'T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_wrap__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__delslice__',
 '__dict__',
 '__div__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getslice__',
 '__gt__',
 '__hash__',
 '__hex__',
 '__iadd__',
 '__iand__',
 '__idiv__',
 '__ifloordiv__',
 '__ilshift__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__oct__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rlshift__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__rpow__',
 '__rrshift__',
 '__rshift__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__setitem__',
 '__setslice__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__xor__',
 '_align',
 '_collapse',
 '_getitem',
 'all',
 'any',
 'argmax',
 'argmin',
 'argpartition',
 'argsort',
 'astype',
 'base',
 'byteswap',
 'choose',
 'clip',
 'compress',
 'conj',
 'conjugate',
 'copy',
 'ctypes',
 'cumprod',
 'cumsum',
 'data',
 'diagonal',
 'dot',
 'dtype',
 'dump',
 'dumps',
 'fill',
 'flags',
 'flat',
 'flatten',
 'getA',
 'getA1',
 'getH',
 'getI',
 'getT',
 'getfield',
 'imag',
 'item',
 'itemset',
 'itemsize',
 'max',
 'mean',
 'min',
 'nbytes',
 'ndim',
 'newbyteorder',
 'nonzero',
 'partition',
 'prod',
 'ptp',
 'put',
 'ravel',
 'real',
 'repeat',
 'reshape',
 'resize',
 'round',
 'searchsorted',
 'setfield',
 'setflags',
 'shape',
 'size',
 'sort',
 'squeeze',
 'std',
 'strides',
 'sum',
 'swapaxes',
 'take',
 'tobytes',
 'tofile',
 'tolist',
 'tostring',
 'trace',
 'transpose',
 'var',
 'view']

Generating Random Data, Graphics



In [16]:

    
x = np.random.uniform(0.0, 1.0, 50)
x









    Out[16]:





array([ 0.42369854,  0.42839659,  0.71399128,  0.89095727,  0.62831616,
        0.15604243,  0.03450789,  0.19857348,  0.82162552,  0.98037713,
        0.80739905,  0.59012158,  0.59515533,  0.69312869,  0.9680758 ,
        0.77593064,  0.86326066,  0.45264975,  0.35392443,  0.2865041 ,
        0.69942752,  0.95571815,  0.67025405,  0.96471432,  0.19223278,
        0.65874443,  0.14674968,  0.0244347 ,  0.66109786,  0.63937212,
        0.31824101,  0.12380106,  0.55152282,  0.65004601,  0.88418646,
        0.50943384,  0.14433865,  0.68443028,  0.21952859,  0.4777811 ,
        0.81016352,  0.01943896,  0.03336433,  0.98473835,  0.73446844,
        0.09294225,  0.7496063 ,  0.84556825,  0.88154502,  0.09045074])



In [17]:

    
y = np.random.normal(0.0, 1.0, 50)
y









    Out[17]:





array([ 1.70448876,  0.057993  , -0.12248449,  0.77515888, -0.05920301,
        0.92306028,  0.74042274, -0.01647755,  0.55442844,  0.23525554,
        1.04680052, -1.11913652,  0.66382281, -1.3489195 ,  0.76220764,
        0.02919578, -0.82111368,  0.36832419,  1.54455885,  0.15253834,
        1.79642844,  0.2379548 , -0.81717162, -0.91250789, -0.91916105,
       -1.5614881 , -1.93276631,  0.30286587,  0.30912252,  0.43490375,
       -0.03810153, -1.87290898,  0.52298002,  3.34468179,  1.59432497,
       -0.08903501, -0.51092418, -1.98920752, -0.54877045,  0.4204875 ,
        0.04426435, -0.02577603, -1.00311173,  0.80105511, -0.48216409,
       -2.65420899,  0.13021135,  0.32371668,  1.90412064,  1.76431656])



In [18]:

    
fig, ax = plt.subplots()
plt.scatter(x, y)









    Out[18]:





<matplotlib.collections.PathCollection at 0x7fa35982ebd0>



In [19]:

    
fig, ax = plt.subplots()
plt.xlabel("Random Uniform")
plt.ylabel("Random Normal")
plt.scatter(x, y, marker='o', color='red')  # plot customizations









    Out[19]:





<matplotlib.collections.PathCollection at 0x7fa35972ea10>



In [20]:

    
plt.subplot(121)    # parameter indicates 1 rows, 2 col, first figure
plt.scatter(x, y)
plt.subplot(122)
plt.hist(y)









    Out[20]:





(array([  1.,   4.,   5.,   5.,  16.,  11.,   2.,   5.,   0.,   1.]),
 array([-2.65420899, -2.05431991, -1.45443084, -0.85454176, -0.25465268,
         0.3452364 ,  0.94512548,  1.54501455,  2.14490363,  2.74479271,
         3.34468179]),
 <a list of 10 Patch objects>)

Reading in data



In [21]:

    
# data comes from ISL book site: http://www-bcf.usc.edu/~gareth/ISL/data.html
auto_df = pd.read_csv("../data/Auto.csv")
auto_df.columns     # column names









    Out[21]:





Index([u'mpg', u'cylinders', u'displacement', u'horsepower', u'weight',
       u'acceleration', u'year', u'origin', u'name'],
      dtype='object')



In [22]:

    
auto_df.shape    # number of rows, number of columns









    Out[22]:





(397, 9)



In [23]:

    
type(auto_df)









    Out[23]:





pandas.core.frame.DataFrame



In [24]:

    
auto_df.describe()  # equivalent of R's DataFrame.summary()









    Out[24]:






  
    
      
      mpg
      cylinders
      displacement
      weight
      acceleration
      year
      origin
    
  
  
    
      count
      397.000000
      397.000000
      397.000000
      397.000000
      397.000000
      397.000000
      397.000000
    
    
      mean
      23.515869
      5.458438
      193.532746
      2970.261965
      15.555668
      75.994962
      1.574307
    
    
      std
      7.825804
      1.701577
      104.379583
      847.904119
      2.749995
      3.690005
      0.802549
    
    
      min
      9.000000
      3.000000
      68.000000
      1613.000000
      8.000000
      70.000000
      1.000000
    
    
      25%
      17.500000
      4.000000
      104.000000
      2223.000000
      13.800000
      73.000000
      1.000000
    
    
      50%
      23.000000
      4.000000
      146.000000
      2800.000000
      15.500000
      76.000000
      1.000000
    
    
      75%
      29.000000
      8.000000
      262.000000
      3609.000000
      17.100000
      79.000000
      2.000000
    
    
      max
      46.600000
      8.000000
      455.000000
      5140.000000
      24.800000
      82.000000
      3.000000



In [25]:

    
auto_df.plot(x="cylinders", y="mpg", style='o')
plt.ylabel("MPG")









    Out[25]:





<matplotlib.text.Text at 0x7fa35949f110>



In [26]:

    
auto_df.boxplot(column="mpg", by="cylinders") # MPG distribution by number of cylinders









    Out[26]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fa359434a90>



In [27]:

    
# similar to R pairs, shows correlation scatter plots between columns and distribution for each 
# column along the diagonal.
# The R version that uses formulas does not seem to have a Python equivalent (and doesn't seem
# to be very useful for exploratory analysis IMO).
axes = pd.tools.plotting.scatter_matrix(auto_df, color="brown")
f = plt.gcf()
f.set_size_inches(10,8)



In [28]:

    
np.exp(-.5)/(1+np.exp(-.5))









    Out[28]:





0.37754066879814546



In [ ]:

	mpg	cylinders	displacement	weight	acceleration	year	origin
count	397.000000	397.000000	397.000000	397.000000	397.000000	397.000000	397.000000
mean	23.515869	5.458438	193.532746	2970.261965	15.555668	75.994962	1.574307
std	7.825804	1.701577	104.379583	847.904119	2.749995	3.690005	0.802549
min	9.000000	3.000000	68.000000	1613.000000	8.000000	70.000000	1.000000
25%	17.500000	4.000000	104.000000	2223.000000	13.800000	73.000000	1.000000
50%	23.000000	4.000000	146.000000	2800.000000	15.500000	76.000000	1.000000
75%	29.000000	8.000000	262.000000	3609.000000	17.100000	79.000000	2.000000
max	46.600000	8.000000	455.000000	5140.000000	24.800000	82.000000	3.000000