In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Explicit vector creation: build a 1-D NumPy array from a Python list
# (the counterpart of R's c(2, 7, 5)).
x = np.array([2,7,5])
# A bare name as the last line of a cell displays its value.
x
Out[2]:
In [3]:
# Vector creation from a sequence. The stop value is exclusive, so this
# yields 4, 7, 10 (13 itself is never produced).
y = np.arange(start=4, stop=13, step=3)
y
Out[3]:
In [4]:
# Vectors of equal length can be added; the operation is element-wise.
x + y
Out[4]:
In [5]:
# Element-wise division. Because of `from __future__ import division` in the
# first cell, `/` is true division and the result is floating point even
# though both operands are integer arrays.
x / y
Out[5]:
In [6]:
# Element-wise exponentiation: each x[i] is raised to the power y[i].
x ** y
Out[6]:
In [7]:
# Vector elements can be selected by position. Indexing is 0-based,
# so x[1] is the second element.
x[1]
Out[7]:
In [8]:
# Multiple elements can be selected using slices. The end index is excluded,
# so 1:3 picks positions 1 and 2.
x[1:3]
Out[8]:
In [9]:
# Negative indices count back from the end: -1 is the last element,
# -2 the second-to-last.
x[-2]
Out[9]:
In [10]:
# Elements can also be picked with an array of integer positions;
# here positions 0 and 1 are selected.
x[np.array([0,1])]
Out[10]:
In [11]:
# Lay the integers 1..12 out as a 4x3 grid, then wrap the result as a matrix.
# NumPy fills the grid row by row, whereas R's matrix() arranges the
# elements column-wise.
# NOTE(review): NumPy's documentation discourages np.matrix in favour of plain
# 2-D arrays; the matrix type is kept here so existing cells see the same object.
Z = np.matrix(np.arange(1, 13).reshape(4, 3))
Z
Out[11]:
In [12]:
# Sub-matrix made of rows 2-3 and columns 1-2 (0-based, end index excluded).
# R is 1-based and includes the ending index, so the same positions would be
# written Z[3:4, 2:3] there.
Z[2:4, 1:3]
Out[12]:
In [13]:
# Column slice: a bare `:` keeps every row, 1:3 keeps columns 1 and 2.
Z[:, 1:3]
Out[13]:
In [14]:
# Dimensions of Z as a (rows, columns) tuple -- the counterpart of R's dim(Z).
Z.shape
Out[14]:
In [15]:
# Seed NumPy's global random generator so that the random samples drawn in
# this notebook (and the plots built from them) are identical on every
# Restart & Run All. The seed value itself is arbitrary.
np.random.seed(0)
# 50 draws from the uniform distribution on [0, 1) (R: runif(50)).
# Note that this rebinds x, replacing the small example vector defined earlier.
x = np.random.uniform(0.0, 1.0, 50)
x
Out[15]:
In [16]:
# 50 draws from the standard normal distribution: mean 0, standard
# deviation 1 (R: rnorm(50)). This rebinds y.
y = np.random.normal(loc=0.0, scale=1.0, size=50)
y
Out[16]:
In [17]:
# Basic scatter plot of the uniform sample (x) against the normal sample (y),
# drawn directly on the axes object returned by plt.subplots().
fig, ax = plt.subplots()
ax.scatter(x, y)
Out[17]:
In [18]:
# Same scatter plot with customizations: axis labels plus an explicit marker
# shape and color. Labels are attached to the axes object itself.
fig, ax = plt.subplots()
ax.set_xlabel("Random Uniform")
ax.set_ylabel("Random Normal")
ax.scatter(x, y, marker='o', color='red')
Out[18]:
In [19]:
# Two panels side by side in a grid of 1 row and 2 columns
# (subplot(1, 2, k) is the long form of subplot(12k)).
left_panel = plt.subplot(1, 2, 1)   # first panel: scatter plot
left_panel.scatter(x, y)
right_panel = plt.subplot(1, 2, 2)  # second panel: histogram of the normal sample
right_panel.hist(y)
Out[19]:
In [20]:
# Data comes from the ISL book site: http://www-bcf.usc.edu/~gareth/ISL/data.html
# The book's Auto.csv marks missing horsepower values with "?" (the book's R
# lab reads it with na.strings="?"). Parsing "?" as NaN keeps that column
# numeric; otherwise it is loaded as strings and silently left out of
# describe() and the scatter matrix further down.
# NOTE(review): confirm the local copy of the file uses the same marker.
auto_df = pd.read_csv("../data/Auto.csv", na_values="?")
auto_df.columns # column names
Out[20]:
In [21]:
# (number of rows, number of columns) -- the counterpart of R's dim().
auto_df.shape
Out[21]:
In [22]:
# Show the Python type of the object returned by pd.read_csv
# (a pandas DataFrame, the analogue of an R data.frame).
type(auto_df)
Out[22]:
In [23]:
# Summary statistics for the columns; roughly what R's summary(Auto) prints
# for a data frame.
auto_df.describe()
Out[23]:
In [24]:
# Plot mpg against the number of cylinders as individual points.
# DataFrame.plot opens its own figure when no `ax` is passed, so a
# plt.ylabel() call issued beforehand would label a separate, empty figure.
# Label the axes object that the plot call returns instead.
mpg_ax = auto_df.plot(x="cylinders", y="mpg", style='o')
mpg_ax.set_ylabel("MPG")
# Keep the axes as the cell's value, as in the original cell.
mpg_ax
Out[24]:
In [25]:
# MPG distribution by number of cylinders: one box per distinct value of the
# "cylinders" column (R: boxplot of mpg grouped by cylinders).
auto_df.boxplot(column="mpg", by="cylinders")
Out[25]:
In [26]:
# Similar to R's pairs(): draws pairwise scatter plots between the numeric
# columns, with the distribution of each column along the diagonal.
# The R variant that takes a formula does not seem to have a Python equivalent
# (and doesn't seem to be very useful for exploratory analysis IMO).
# scatter_matrix lives in pd.plotting; the old pd.tools.plotting location has
# been removed from pandas.
axes = pd.plotting.scatter_matrix(auto_df, color="brown")