A Crash Course: Introduction to Python for Data Science

1. Obtain

2. Scrub/Cleaning

3. Explore/Understand

4. Modeling

5. Reporting/Presenting

USING JUPYTER NOTEBOOK

https://goo.gl/Q9tU2l


In [ ]:
# MAGIC 

%lsmagic
#OR
#lsmagic

In [ ]:
import sys 

sys.path

In [ ]:
import sys
import pprint
pprint.pprint(sys.path)

In [ ]:
help(pprint)
"Data is stored in many machine-readable formats." "The 3 most common are: " "CSV - Comma Separated Values" "JSON - JavaScript Object Notation" "XML - Extensible Markup Language."

In [ ]:
ls -laph

In [ ]:
cd data

In [ ]:
pwd

In [ ]:
ls -laph

In [ ]:
pwd

In [ ]:
cd ../

In [ ]:
pwd

In [ ]:
ls -laph

In [ ]:
pwd

In [ ]:
# an assignment
the_world_is_flat = True

In [ ]:
if the_world_is_flat:
    print("1. Be careful not to fall off!")
    
print("2. Be careful not to fall off ARGGGHHHHHHHHH!")

In [ ]:
# NUMBERS

In [ ]:
height = 201
width = 5
area = height * width

In [ ]:
area

In [ ]:
area * 7

In [ ]:
area = area + area

In [ ]:
area

In [ ]:
_

In [ ]:
# understanding errors
# a padawan asks, "how?" 
# replies yoda, "read, you must..."
m

In [ ]:
m = "the force is now with you!"

In [ ]:
m

In [ ]:
tax = 12.5 / 100
price = 100.50
price * tax

In [ ]:
# What is the purpose of _ , the underscore?
# It holds the last printed expression.  
# however, don't use it!
price + _

In [ ]:
# see here!
_

In [ ]:
round(_, 2)

STRINGS

strings are immutable


In [ ]:
'spam eggs'  # single quotes

In [ ]:
'doesn\'t'  # use \' to escape the single quote...

In [ ]:
"doesn't"  # ...or use double quotes instead

In [ ]:
'"Yes," he said.'

In [ ]:
"\"Yes,\" he said."

In [ ]:
'"Isn\'t," she said.'

In [ ]:
print('"Isn\'t," she said.')

In [ ]:
help(print)

In [ ]:
s = 'First line.\nSecond line\nHAHAHAHAHAHAHA\tJOKER!.'  # \n means newline
s

In [ ]:
print(s)

In [ ]:
# For Windows the filepath is defined as seen below...
# using raw strings by adding r before the quote
print('C:\some\naMe\tnamE')  # here \n means newline!

In [ ]:
print(r'C:\some\naMe\namE')  # note the r before the quote

In [ ]:
print(r'C:/some/name')

In [ ]:
# using triple-quotes, either """...""" or '''...''', to span multiple lines

print("""\
Usage: thingy [OPTIONS]
     -h                        Display this usage message
     -H hostname               Hostname to connect to
     -B banana
""")

In [ ]:
# 3 times 'gumMMM', followed by 'dropPPPP' and 'StarWars'
3 * 'gumMMM' + 'dropPPPP' + 'StarWars'

In [ ]:
# two or more string literals placed side by side (here grouped
# inside parentheses) are concatenated automatically

text = ('Put several strings within parentheses '
        'to have them joined together. '
        'WOO!!!!!!')
text

In [ ]:
# strings are sequences (like lists) and thus have indices
word = 'Python'
word[0]

In [ ]:
word[5]

In [ ]:
word[7]

In [ ]:
# indices can also be negative, counting from the right
word[-1]

In [ ]:
word[-6]

In [ ]:
# slicing 
# this is the interval [0, 2)
word[0:2]

In [ ]:
#this is the interval [2, 5)
word[2:5]

In [ ]:
word[:2] + word[2:]

In [ ]:
word[2:] + word[:2]

BUILT-IN FUNCTIONS


In [ ]:
s = 'supercalifra gilisticexpialidoc ious'
len(s)

In [ ]:
help(len)

LISTS

lists are mutable


In [ ]:
squares = [1, 4, 9, 16, 25, 36]
squares

In [ ]:
squares[0] = 345

In [ ]:
squares

In [ ]:
squares[-1]

In [ ]:
squares[-3:]

In [ ]:
squares[:-2]

In [ ]:
squares[:]

In [ ]:
# concatenation
squares + [36, 49, 64, 81, 100]

In [ ]:
cubes = [1, 8, 27, 65, 125] # something's wrong here

In [ ]:
cubes

In [ ]:
4**3

In [ ]:
cubes[3] = 64
cubes

In [ ]:
# cubes.<TAB>

In [ ]:
cubes[:] = []
cubes

Baby Steps: We're Getting Closer to Programming


In [ ]:
# CONTROL FLOW
# if... else statement

x = int(input("Please enter an integer: "))

if x == 5:
    print ('x is equal to 5.')
else:
    print ('x is not equal to 5.')

In [ ]:
# if... elif... else statement


# Read an integer from the user and report how it compares to 5.
# int() raises ValueError if the typed text is not a valid integer.
x = int(input("Please enter an integer: "))

if x == 5:
    print('x is equal to 5.')
elif x > 5:
    print('x is greater than 5')
else:
    # only reached when x < 5
    print('x is less than 5.')

In [ ]:
# while loop statement

# Fibonacci series:
# the sum of two elements defines the next
a, b, c = 0, 1, int(input("Please enter an integer: "))  # this is a multiple assignment
print(a)
while (b < c):
    print(b)
    a, b = b, a + b

In [ ]:
help(print)

In [ ]:
a, b, c = 0, 1, int(input("Please enter an integer: "))  # this is a multiple assignment

while (b < c):
    print(b, end=', ')
    a, b = b, a + b

In [ ]:
# Measure some strings:
words = ['cat', 'window', 'defenestrate', 'aaafkwnfkn']
for word in words:
    print(word, len(word))

In [ ]:
# [1:len(words)]

words[2:4]

In [ ]:
for word in words[:]:  # Loop over a slice copy of the entire list.
    if len(word) > 5:
        words.insert(0, word)
words

In [ ]:
help(range)

In [ ]:
for i in range(5):
    print(i)

In [ ]:
# To iterate over the indices of a sequence, you can combine range() and len() as follows:
a = ['Mary', 'had', 'a', 'little', 'lamb']
for i in range(len(a)):
    print(i, a[i])

# Or
list(enumerate(a))
Composing functions: x --> f(x) reads "x maps to f(x)". Analogously, a --> enumerate(a) and enumerate(a) --> list(enumerate(a)).

In [ ]:
help(enumerate)

In [ ]:
type(2)

In [ ]:
type(enumerate(a))

In [ ]:
enumerate(a)

In [ ]:
type(list(enumerate(a)))

In [ ]:
# BREAK and CONTINUE STATEMENT
# NOTE: in this cell the `else` is indented to pair with the `if`, not with
# the inner `for`. As written, "is a prime number" is printed for every y
# that does not divide x (until a factor is found), so the output is wrong.
# A later cell repeats the loop with the `else` attached to the `for`.
for x in range(2, 10):
    for y in range(2, x):
        # % is the modulus or remainder operator
        if x % y == 0:
            # // is the floor division operator
            print(x, 'equals', y, '*', x//y)
            break
        else:
            # intended meaning: loop fell through without finding a factor
            # actual behavior here: runs whenever this particular y is not a factor
            print(x, 'is a prime number')

In [ ]:
# Let's talk about the importance of indentation

In [ ]:
# for ... else: the `else` clause is aligned with the inner `for`, so it runs
# only when that loop finishes without executing `break`.
for x in range(2, 10):
    for y in range(2, x):
        if x % y == 0:
            # y divides x, so x is composite; stop searching for factors
            print(x, 'equals', y, '*', x//y)
            break
    else:
        # loop fell through without finding a factor
        print(x, 'is a prime number')

In [ ]:
for num in range(2, 10):
    if num % 2 == 0:
        print("Found an even number", num)
        #continue
        break
    print("Found a number", num)

DEFINING YOUR OWN FUNCTIONS


In [ ]:
# Remember:
a, b, c = 0, 1, int(input("enter a number: "))  # this is a multiple assignment
while (b < c):
    print(b, end=', ')
    a, b = b, a + b

In [ ]:
# used as a placeholder while developing code

def fib(n):
    # `pass` is a do-nothing statement that keeps the empty body syntactically valid
    pass   # Remember to implement this!
    #return None  (a function with no return statement returns None anyway)

In [ ]:
# def is a keyword
# def meaning definition

# fib is not a function but a procedure since it doesn’t return a value

n = int(input("enter a number: "))

def fib(n):
    # placeholder body: calling fib(n) does nothing and returns None
    pass

In [ ]:
# def is a keyword
# def meaning definition

# fib is not a function but a procedure since it doesn’t return a value

n = int(input("enter a number: "))


def fib(n): # write Fibonacci series up to n
    """Print the Fibonacci numbers smaller than n on a single line.

    The docstring has to be the first statement of the body; placed after
    another statement it is only a discarded string expression, and
    help(fib) / fib.__doc__ would show nothing.

    n: exclusive upper bound; every term b with b < n is printed,
       starting from 1, each followed by ', '.
    Returns None (the values are printed, not returned).
    """
    a, b = 0, 1  # this is a multiple assignment
    while b < n:
        print(b, end=', ')
        a, b = b, a + b  # right-hand side is evaluated before either name is rebound
    print()  # end the output line with a newline

fib(n)

In [ ]:
fib

In [ ]:
# Aliasing a function
f = fib
f(3000)

In [ ]:
fib(0)

In [ ]:
print(fib(0))

In [ ]:
# It is simple to write a function that returns a list of the
# numbers of the Fibonacci series, instead of printing it:

n = int(input("enter a number: "))

def fib2(n):  # return Fibonacci series up to n
    """Collect the Fibonacci numbers below n into a list and return it.

    The series starts at 0; every term strictly smaller than n is included,
    so fib2(0) gives an empty list.
    """
    series = []
    current, upcoming = 0, 1
    while current < n:
        series += [current]  # grow the list by one term
        current, upcoming = upcoming, current + upcoming
    return series

fib2(n)

In [ ]:
# LAMBDA EXPRESSIONS
# lambda keyword

# borrowed from functional programming, which is a manifestation of lambda calculus

# a lambda function is an anonymous function that can be defined wherever it is
# needed.  it is a normal function with some syntactic sugar.

# lambda arguments: expression, giving a function object 
# one argument: f(x) = x + 1
funcX = lambda x: x + 1
# two arguments: f(x, y) = x + y
funcXY = lambda x, y: x + y
# three arguments: f(x, y, z) = x * y + z^3
funcXYZ = lambda x, y, z: x * y + z ** 3

In [ ]:
print(funcX(5))
print(funcXY(2,3))
print(funcXYZ(2,3,4))

In [ ]:
n = int(input("enter a number: "))

def make_incrementor(n):
    """Return a one-argument function that adds n to whatever it is given.

    The returned lambda is a closure: it keeps a reference to this call's n.
    """
    return lambda x: x + n
    

f = make_incrementor(n)

In [ ]:
f(12)

In [ ]:
f(1)

In [ ]:
2**3

In [ ]:
pow(2,3)

Data Science Packages

Importing packages


In [ ]:
# numpy, Numerical Python, is a package for scientific computing
import numpy as np

In [ ]:
# pandas, Python Data Analysis library
import pandas as pd

In [ ]:
# 2D plotting library
import matplotlib as mpl

In [ ]:
import matplotlib.pyplot as plt

In [ ]:
help

In [ ]:
help(pd)

In [ ]:
help(np)

In [ ]:
help(plt)

In [ ]:
# magic functions in jupyter
# note: this is not python
# these are special jupyter notebook functions
# that act as settings for the notebook
%who

In [ ]:
# %pylab inline
%matplotlib inline

x = np.arange(0, 10, 0.1);
y = np.sin(x)
plt.plot(x, y)

In [ ]:
# creating a DataFrame by passing a numpy array
dates = pd.date_range('20160101', periods=12)
#dates = pd.date_range('20160101', periods=12, freq='M')
#dates = pd.date_range('20160101', periods=12, freq='3M')
#dates = pd.date_range('20040101', periods=144, freq='M')
dates

In [ ]:
help(pd.date_range)

In [ ]:
df = pd.DataFrame(np.random.randn(12,4), index=dates, columns=list('ABCD'))
#df = pd.DataFrame(np.random.randn(144,4), index=dates, columns=list('ABCD'))
df

In [ ]:
df.dtypes

In [ ]:
#df.<TAB>

In [ ]:
df.info()

In [ ]:
# Exploring the data
df.head()

In [ ]:
df.head(1)

In [ ]:
df.tail(3)

In [ ]:
# decomposing the underlying numpy data

In [ ]:
df.index

In [ ]:
df.columns

In [ ]:
df.values

In [ ]:
# getting some summary statistics
df.describe()

In [ ]:
#transposing data
df.T

In [ ]:
# sorting by an axis
df.sort_index(axis=1, ascending=False)

In [ ]:
help(df.sort_index)

In [ ]:
df.sort_values(by='B')

In [ ]:
# selecting (chopping up) data

In [ ]:
# by column
df['A']

In [ ]:
# by row
df[0:3]

In [ ]:
# by row also
df['2016-01-05':'2016-01-07']

In [ ]:
# by label
df.loc[dates[0]]

In [ ]:
# on a multi-axis by label
df.loc[:, ['A', 'B']]

In [ ]:
df.loc['20160102':'20160104',['A','B']]

In [ ]:
# Reduction in the dimensions of the returned object

df.loc['20160102',['A','B']]

In [ ]:
# For getting a scalar value
df.loc[dates[0],'A']

In [ ]:
# For getting fast access to a scalar (equiv to the prior method)
df.at[dates[0],'A']

In [ ]:
help(df.iloc)

In [ ]:
# by position
df.iloc[3]

In [ ]:
df.iloc[3:5,0:2]

In [ ]:
df.iloc[[1,2,4],[0,2]]

In [ ]:
df.iloc[1:3,:]

In [ ]:
df.iloc[:,1:3]

In [ ]:
df.iloc[1,1]

In [ ]:
df.iat[1,1]

In [ ]:
# Boolean Indexing

In [ ]:
df[df.A > 0]

In [ ]:
df[df > 0]

In [ ]:
# Descriptive Statistics

In [ ]:
help(df.mean)

In [ ]:
df.mean()

In [ ]:
df.mean(1)

In [ ]:
# Applying functions to data

In [ ]:
help(np.cumsum)

In [ ]:
df.apply(np.cumsum)

In [ ]:
df.apply(lambda x: x.max() - x.min())

In [ ]:
# Reading and Writing Data

In [ ]:
# this is not python
# this is a command-line (shell) command

ls -laph data/

In [ ]:
# export the dataframe we created to a csv file
df.to_csv('data/data01.csv')

In [ ]:
# import the csv to a pandas dataframe
pd.read_csv('data/data01.csv')

In [ ]:
# %pylab inline
%matplotlib inline

plt.figure()
# plt.figure(figsize=(20, 10)) 
df.plot()
plt.legend(loc='best');

In [ ]:
help(plt.figure)

In [ ]:
# The figure size is a keyword argument: figsize=(width, height) in inches.
# The original `figsize(15,10)` called a name that is not defined here
# (the `%pylab inline` line is commented out), which raises NameError.
# DataFrame.plot() opens its own figure, so the size is passed to it directly
# rather than to a separate plt.figure() call.
df.plot(figsize=(15, 10))
plt.legend(loc='best');

In [ ]:
aapl = pd.read_csv('data/AAPL.csv')
aapl.info()

CANDY! OH, SWEET CANDY!


In [109]:
%%latex

$$c = \sqrt{a^2 + b^2}$$

\begin{equation} 
c = \sqrt{a^2 + b^2}
\end{equation}


$$c = \sqrt{a^2 + b^2}$$ \begin{equation} c = \sqrt{a^2 + b^2} \end{equation}

Adding Images to the Notebook


In [110]:
from IPython.display import Image
i = Image(filename='./ml_map.png')
i


Out[110]:

In [ ]:


In [ ]: