Chapter 2: Basic Operations

Vectors, Data, Matrices and Subsetting



In [1]:

    
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
x = np.array([2,7,5])    # explicit vector creation
x









    Out[2]:





array([2, 7, 5])



In [3]:

    
y = np.arange(4, 13, 3)  # vector creation from a sequence (start, stop, step)
y









    Out[3]:





array([ 4,  7, 10])



In [4]:

    
x + y    # vectors can be added









    Out[4]:





array([ 6, 14, 15])



In [5]:

    
x / y    # divided









    Out[5]:





array([ 0.5,  1. ,  0.5])



In [6]:

    
x ** y    # exponentiated









    Out[6]:





array([     16,  823543, 9765625])



In [7]:

    
x[1]    # vector elements can be selected by position









    Out[7]:





7



In [8]:

    
x[1:3]  # multiple elements can be selected using slices









    Out[8]:





array([7, 5])



In [9]:

    
x[-2]  # elements can be specified as offset from end









    Out[9]:





7



In [10]:

    
x[np.array([0,1])]  # elements can be specified as an array









    Out[10]:





array([2, 7])



In [11]:

    
Z = np.matrix(np.arange(1,13)).reshape((4, 3))
Z                 # note: R arranges the elements column-wise









    Out[11]:





matrix([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]])



In [12]:

    
Z[2:4, 1:3]    # R is 1-based and includes ending index, Python is 0 based and does not.









    Out[12]:





matrix([[ 8,  9],
        [11, 12]])



In [13]:

    
Z[:, 1:3]    # column slice









    Out[13]:





matrix([[ 2,  3],
        [ 5,  6],
        [ 8,  9],
        [11, 12]])



In [14]:

    
Z.shape









    Out[14]:





(4, 3)

Generating Random Data, Graphics



In [15]:

    
x = np.random.uniform(0.0, 1.0, 50)
x









    Out[15]:





array([ 0.65268288,  0.55438063,  0.12332376,  0.73656161,  0.172588  ,
        0.6952661 ,  0.36758793,  0.42034067,  0.54835434,  0.85493252,
        0.36666796,  0.14029571,  0.16910043,  0.18956811,  0.77016189,
        0.1914512 ,  0.48202498,  0.98936099,  0.43339457,  0.3690124 ,
        0.91976653,  0.42354589,  0.23428098,  0.24846295,  0.20409963,
        0.71024602,  0.57361799,  0.12510961,  0.70550836,  0.62120022,
        0.7089459 ,  0.67670746,  0.39836928,  0.17109468,  0.57367169,
        0.60584339,  0.72445482,  0.67991423,  0.94658844,  0.91068048,
        0.65489981,  0.05985463,  0.56797862,  0.67604148,  0.989085  ,
        0.79619792,  0.46406902,  0.91227664,  0.38043679,  0.04246386])



In [16]:

    
y = np.random.normal(0.0, 1.0, 50)
y









    Out[16]:





array([ -5.95648785e-01,   1.54445571e+00,   1.35312925e+00,
         1.88123098e+00,  -3.80353324e-01,  -9.67551972e-04,
         5.97422145e-01,   3.76114342e-02,  -1.33779968e+00,
        -5.95407697e-01,   1.27206236e+00,  -1.70114410e+00,
        -1.84147571e+00,   7.16611666e-01,   2.33006825e+00,
        -6.56709283e-01,   8.26951067e-01,   4.64854930e-01,
         8.79290232e-01,  -7.42861574e-02,   4.88797828e-01,
         1.50860241e+00,   1.07467313e+00,  -7.99843607e-01,
         6.81719603e-02,   5.06424430e-01,   2.15770333e-01,
         1.67843398e+00,  -5.01086886e-01,   1.51407152e-01,
        -3.30648417e-02,  -8.50683506e-01,  -1.56531920e+00,
        -6.43438018e-01,  -9.55514605e-01,   1.51696803e-01,
        -9.87417666e-01,  -4.44820480e-01,   1.21902188e-01,
        -5.20474443e-01,   8.31544982e-01,  -6.75527626e-01,
         1.00537660e+00,   4.87948062e-01,   3.43123002e-01,
         3.95408496e-01,   4.27046930e-01,   3.69945068e-01,
         1.36728064e+00,  -3.15495180e-01])



In [17]:

    
fig, ax = plt.subplots()
plt.scatter(x, y)









    Out[17]:





<matplotlib.collections.PathCollection at 0x420b410>



In [18]:

    
fig, ax = plt.subplots()
plt.xlabel("Random Uniform")
plt.ylabel("Random Normal")
plt.scatter(x, y, marker='o', color='red')  # plot customizations









    Out[18]:





<matplotlib.collections.PathCollection at 0x4241f50>



In [19]:

    
plt.subplot(121)    # parameter indicates 1 rows, 2 col, first figure
plt.scatter(x, y)
plt.subplot(122)
plt.hist(y)









    Out[19]:





(array([ 3.,  1.,  9.,  5.,  9.,  9.,  6.,  3.,  4.,  1.]),
 array([-1.84147571, -1.42432131, -1.00716692, -0.59001252, -0.17285812,
        0.24429627,  0.66145067,  1.07860506,  1.49575946,  1.91291385,
        2.33006825]),
 <a list of 10 Patch objects>)

Reading in data



In [20]:

    
# data comes from ISL book site: http://www-bcf.usc.edu/~gareth/ISL/data.html
auto_df = pd.read_csv("../data/Auto.csv")
auto_df.columns     # column names









    Out[20]:





Index([u'mpg', u'cylinders', u'displacement', u'horsepower', u'weight', u'acceleration', u'year', u'origin', u'name'], dtype='object')



In [21]:

    
auto_df.shape    # number of rows, number of columns









    Out[21]:





(397, 9)



In [22]:

    
type(auto_df)









    Out[22]:





pandas.core.frame.DataFrame



In [23]:

    
auto_df.describe()  # equivalent of R's DataFrame.summary()









    Out[23]:






  
    
      
      mpg
      cylinders
      displacement
      weight
      acceleration
      year
      origin
    
  
  
    
      count
       397.000000
       397.000000
       397.000000
        397.000000
       397.000000
       397.000000
       397.000000
    
    
      mean
        23.515869
         5.458438
       193.532746
       2970.261965
        15.555668
        75.994962
         1.574307
    
    
      std
         7.825804
         1.701577
       104.379583
        847.904119
         2.749995
         3.690005
         0.802549
    
    
      min
         9.000000
         3.000000
        68.000000
       1613.000000
         8.000000
        70.000000
         1.000000
    
    
      25%
        17.500000
         4.000000
       104.000000
       2223.000000
        13.800000
        73.000000
         1.000000
    
    
      50%
        23.000000
         4.000000
       146.000000
       2800.000000
        15.500000
        76.000000
         1.000000
    
    
      75%
        29.000000
         8.000000
       262.000000
       3609.000000
        17.100000
        79.000000
         2.000000
    
    
      max
        46.600000
         8.000000
       455.000000
       5140.000000
        24.800000
        82.000000
         3.000000
    
  

8 rows × 7 columns



In [24]:

    
plt.ylabel("MPG")
auto_df.plot(x="cylinders", y="mpg", style='o')









    Out[24]:





<matplotlib.axes.AxesSubplot at 0x46ef650>



In [25]:

    
auto_df.boxplot(column="mpg", by="cylinders") # MPG distribution by number of cylinders









    Out[25]:





<matplotlib.axes.AxesSubplot at 0x499f510>



In [26]:

    
# similar to R pairs, shows correlation scatter plots between columns and distribution for each 
# column along the diagonal.
# The R version that uses formulas does not seem to have a Python equivalent (and doesn't seem
# to be very useful for exploratory analysis IMO).
axes = pd.tools.plotting.scatter_matrix(auto_df, color="brown")

	mpg	cylinders	displacement	weight	acceleration	year	origin
count	397.000000	397.000000	397.000000	397.000000	397.000000	397.000000	397.000000
mean	23.515869	5.458438	193.532746	2970.261965	15.555668	75.994962	1.574307
std	7.825804	1.701577	104.379583	847.904119	2.749995	3.690005	0.802549
min	9.000000	3.000000	68.000000	1613.000000	8.000000	70.000000	1.000000
25%	17.500000	4.000000	104.000000	2223.000000	13.800000	73.000000	1.000000
50%	23.000000	4.000000	146.000000	2800.000000	15.500000	76.000000	1.000000
75%	29.000000	8.000000	262.000000	3609.000000	17.100000	79.000000	2.000000
max	46.600000	8.000000	455.000000	5140.000000	24.800000	82.000000	3.000000