Python 101

Importing Packages



In [ ]:

    
from numpy import *
from pandas import *

#using import * brings ALL the packages classes and functions into the name space,
#for large packages you can bring in only some parts by doing from [name] import [class/object]
#to avoid name space conflicts you can also rename what you import

#import pandas and rename it
import pandas as pd
#import the Series and dataframe classes
from pandas import Series, DataFrame

#common naming conventions. numpy is np, pandas pd, and matplotlib is plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Declaring Variables



In [ ]:

    
#no data types need to be associated with variables
#all variables are "Objects" in python. 

num = 5
alpha = 'hello world'
mixed = str(num) + ' ' + alpha

print num
print alpha
print mixed

Lists

Creating



In [ ]:

    
#this is a list
a=[0,1,2,3,4,5,6,7,8,9]
a

Slice and Dice



In [ ]:

    
#use [x] to access the item at location x in the list
#all lists start at 0
print 'first item', a[0]

#you can also index from back by using -1 for last, -2 for "second from last" etc
print 'last item', a[-1]

#you can "slice" a list using : and ::
print 'first three items', a[:3]
print 'last three items', a[-3:]
print 'start at the 4th item', a[3:]
print 'the odd items', a[::2]



In [ ]:

    
#all lists have a length, use len(list_name) to get the number of items in it
#mathematical functions can also, in general, be applied to lists if they contain numbers
print 'length of list', len(a)
print 'largets number in list', max(a)
print 'smallest number', min(a)
print 'average', mean(a)

#we can find the index of the max and min using argmax() and argmin()
print 'the largest number in the list is', max(a), 'and is found at index:', argmax(a)

Add Items



In [ ]:

    
#items can be added to a list by using list_name.append(item)

#add 3 to the list
a.append(3)
#add 4
a.append(4)
a



In [ ]:

    
#finally, we can de dupe a list by using the unique function
#we can also sort a list using sort(list_name)

print 'sorted list', sort(a)
print 'select distinct values', unique(a)

Remove Items



In [ ]:

    
#items can be removed from the list using list_name.remove(item)

#remove 3 from the list
a.remove(3)
#remove 4
a.remove(4)
a

Itterators



In [ ]:

    
#we can itterate over n items using a for loop

#a shortcut for making the list [0,...,n-1] is the function range(n)

#print the numbers 0 -4
for i in range(5):
    print i



In [ ]:

    
#print the numbers 0 -4
for i in range(0,5):
    print i



In [ ]:

    
#print the a list
for i in a:
    print i



In [ ]:

    
#itterating over something and appending is a common way of building lists

#create array
output=[]

#build the list holding the first 4 squares by using a for loop
for i in range(5):
    output.append(i**2) #**2 operator means squared
output



In [ ]:

    
#this works but is slow, a faster way to do this is to use list comprehension

output2=[i**2 for i in range(5)]
output2



In [ ]:

    
#we can also put conditions in the list comprehension

#build the first 10 squares for all the even numbers
output3=[i**2 for i in range(10) if i%2==0] # % is means modulus (remainder)
output3



In [ ]:

    
#the zip command lines up two lists together

L1=[1,2,3]
L2=['x','y','z']

#the output is a list of tuples
print zip(L1,L2)



In [ ]:

    
#if they are of different size, it gets chopped off

L1=[1,2,3,4]
L2=['x','y','z']

#the output is a list of tuples
print zip(L1,L2)



In [ ]:

    
#it is very common to itterate over lists using zip

for list1,list2 in zip(L1,L2):
    print list1,list2



In [ ]:

    
#this can also be done with list comprehension
print [(x,y) for x,y in zip(L1,L2)]

#we can also make more complex lists
output=[(x,y,str(x)+y) for x,y in zip(L1,L2)]

#itterate over our output for a nicer looking print statement
for z in output:
    print z

#we can also do this differently
for a1,a2,a3 in output:
    print a1,a2,a3

IF ELSE Statements



In [ ]:

    
if 1==1:
    print 'one equals one'



In [ ]:

    
if 1<0:
    print 'one is less than zero'
else:
    print '1 does not equal to 0'



In [ ]:

    
if 1<>1: #you can also use the != operator
    print 'one does not equal to one '
elif 1==0:
    print '1 is equal to zero'
else:
    print '1 does not equal to 0'



In [ ]:

    
if (1==1 and 0<1):
    print 'and operator'
    
if (1==1 or 0==1):
    print 'or operator'

WHILE Statements



In [ ]:

    
#counter
i = 0

#loop while i < 5
while i < 5:
    print i
    
    #increment counter
    i = i + 1

Functions



In [ ]:

    
#define functions

def SimpleAdd(Number):
    return Number

def SimpleAdd2(N1, N2):
    return N1 + N2

def SimpleAdd3(N1=2, N2=2):
    return N1 + N2

#return a 10
print SimpleAdd(10)

#return 2 + 5
print SimpleAdd2(2,5)

#return 1 + 3
print SimpleAdd3(1,3)

#use default parameters
print SimpleAdd3()

Excel



In [ ]:

    
from xlwt import *

Add an image



In [ ]:

    
# Create a workbook
wb = Workbook()

# Add a sheet/tab
ws0 = wb.add_sheet('Picture_Test')

# Add picture at location (2,1)
# Note: Only accepts bmp files
# i.e. ws0.insert_bitmap('C:\Users\username\Desktop/test.bmp', 2, 1)
ws0.insert_bitmap('DataFiles\\testpic.bmp', 2, 1)

# Write excel file
# Note: This will overwrite any other files with the same name
wb.save('AddImage.xls')

Convert image to BMP



In [ ]:

    
#convert an image file to a .bmp image file using PIL
from PIL import Image

# Convert to bmp
def ConvertImg(file_in,file_out):
    img = Image.open(file_in)

    #print len(img.split())  # test
    if len(img.split()) == 4:
        # prevent IOError: cannot write mode RGBA as BMP
        r, g, b, a = img.split()
        img = Image.merge("RGB", (r, g, b))
        img.save(file_out)
    else:
        img.save(file_out)

ConvertImg('DataFiles/pngInput.png', 'DataFiles/bmpOutput.bmp')

Delete Files



In [ ]:

    
import os

# Delete file
os.remove('DataFiles/bmpOutput.bmp')

Dates



In [ ]:

    
from datetime import datetime, date, time

# Get today's date
CurrentTime = datetime.now()
CurrentTime = CurrentTime.strftime("%Y-%m-%d_%H-%M-%S")
CurrentTime

IO

Dataframe to SQL

Import sqlalchemy



In [ ]:

    
from sqlalchemy import *



In [ ]:

    
# DB Parameters
ServerName = "devdb4\sql4"
Database = "BizIntel"

# To create a temp table just add a "#" to the table name
# To create a global table just add a "##" to the table name
TableName = "#TableCheckTest"

# pyobdc must be installed
engine = create_engine('mssql+pyodbc://' + ServerName + '/' + Database)
conn = engine.connect()



In [ ]:

    
# Required for querying tables
metadata = MetaData(conn)

## Create table
tbl = Table(TableName, metadata,
    Column('DateAdded', DateTime),
    Column('Revenue', Integer)
)

# This actually creates a table in the sql database
# checkfirst=True >> create if table does not exist
tbl.create(checkfirst=True)



In [ ]:

    
# Create data to insert into table
# Create a dataframe with dates as your index
data = [1,2,3,4,5,6,7,8,9,10]
idx = date_range('1/1/2012', periods=10, freq='MS')
df = DataFrame(data, index=idx, columns=['Revenue'])

# Remove the index if you want to include it in the insert
df = df.reset_index()
#print df



In [ ]:

    
# Iterate through each of the columns and insert into table
for x in df.iterrows():
    #print list(x[1])
    sql = tbl.insert(list(x[1]))
    conn.execute(sql)

# select all form table
sql = tbl.select()
result = conn.execute(sql)
for row in result:
    print 'Write to SQL', row

SQL to Dataframe



In [ ]:

    
# Select all
sql = tbl.select()

# run sql code
result = conn.execute(sql)



In [ ]:

    
# Insert to a dataframe
df2 = DataFrame(data=list(result), columns=result.keys())

# Convert data types 
df2.DateAdded = df2.DateAdded.astype('datetime64')
df2.Revenue = df2.Revenue.astype('int')
print ' '
print 'Data Types'
print df2.dtypes

# Set index to dataframe
df2 = df2.set_index('DateAdded')
print ' '
print 'Read from SQL', df2
#print df2.head().to_string()

# Close connection
conn.close()

Clipboard to Dataframe



In [ ]:

    
# Copy paste data into a dataframe
# Make sure you first copy data before running the code

# This example assumes you are copying data with the first column composed of dates
# index_col=0, means the date column is in the first column
# parse_dates=True, makes sure dates are converted to datetime datatypes
df = read_clipboard(index_col=0,parse_dates=True)
df

# This has worked for me in SQL, Excel, and LibreOffice

Dataframe to Excel



In [ ]:

    
# Export dataframe to Excel
# you can export to both .xls and .xlsx
df.to_excel('dfToExcel.xls','Sheet Name')

Dataframe to Excel (alternate)



In [ ]:

    
# Import packages
from xlwt import *
from pandas import *
import numpy as np



In [ ]:

    
# Create a dataframe with dates as your index
data = [1,2,3,4,5,6,7,8,9,10]
idx = date_range('1/1/2012', periods=10, freq='MS')
df = DataFrame(data, index=idx, columns=['Revenue'])
df



In [ ]:

    
# Create a workbook
wb = Workbook()

# Add a sheet/tab
ws0 = wb.add_sheet('DataFrame_Test')



In [ ]:

    
# Write dataframe

# formatting values in excel
date_xf = easyxf(num_format_str='DD/MM/YYYY') # sets date format in Excel
num_xf = easyxf(num_format_str='#0.000000') # sets date format in Excel

# Iterate through each of the columns and insert into sheet
for i, (date, row) in enumerate(df.T.iteritems()):
    #print i, date, row[0],type(row[0]).__name__

    # Skip first line
    if i > 0:
        if type(date).__name__ == 'Timestamp':
            ws0.write(i,0,date,date_xf)
        elif type(date).__name__ == 'str':
            ws0.write(i,0,date)
        else:
            ws0.write(i,0,date.astype(np.float),num_xf)
        if type(row[0]).__name__ == 'Timestamp':
            ws0.write(i,1,row[0].astype(np.float),date_xf)
        elif type(row[0]).__name__ == 'str':
            ws0.write(i,1,row[0].astype(np.float))
        else:
            ws0.write(i,1,row[0].astype(np.float),num_xf)



In [ ]:

    
# Remove the index if you want to include it in the insert
df = df.reset_index()

# Rename columns
df.columns = ['DateAdded', 'Revenue']

# Add column Headers
for i, cols in enumerate(df.columns):
    #print i, cols
    ws0.write(0,i,cols)



In [ ]:

    
# Write excel file
# Note: This will overwrite any other files with the same name
wb.save('DFtoExcel.xls')

Matplotlib



In [ ]:

    
#import statement (automatic in pylab)
import matplotlib.pyplot as plt



In [ ]:

    
#make some data
x=linspace(1,50,100) # linspace(start,end,num_points) 
y=np.sin(x)

#a line graph:
plot(x,y)



In [ ]:

    
#a scatter plot
scatter(x,y)



In [ ]:

    
#another way to make a scatter plot
plot(x,y,'x')



In [ ]:

    
#by default all the plots go into the same figure. we can make a new figure by calling figure()
#we can also get a refrence to the figure

#WITHOUT calling figure()

#a line graph:
plot(x,y)

#a scatter plot
scatter(x,y)



In [ ]:

    
#calling figure()

#a line graph:
plot(x,y)

fig=figure()

#a scatter plot
scatter(x,y)



In [ ]:

    
#plot the data, show data points as x's, connect them with a line, make it red and kinda see through
#name the data
plt.plot(x,y, 'x-', color='red', alpha=.5, label='the data')

#add a title
plt.title('The Title')

#name the axis
xlabel('x axis label')
ylabel('y axis label')

#the legend
plt.legend(loc='best') #loc = 'best' tries to make the legend not overlap the data

#turn a grid on
plt.grid()

#save the figure as an image
#this will create a ".png" file in the file location you run the code in 
plt.savefig('test_fig.png')



In [ ]:

    
#subplots and multiple lines

t=linspace(1,100,100)
x1=np.sin(20./(2*pi) * t)
x2=np.sin(40./(2*pi)*t)
x3=x1+x2

figure()
subplot(2,1,1)
plot(t,x1,label='x1')
plot(t,x2,label='x2')
plot(t,x3,label='x3')
legend(loc='best')

subplot(2,1,2)
#plot a histogram
#we save the histogram data but it is not needed to do so for plotting purposes
x1hist=hist(x1,normed='true', bins=25, alpha=.25,label='x1')
legend(loc='best')



In [ ]:

    
#more histograms
x1=np.random.randn(1000)
x2=2.0*np.random.randn(1000)+2

x1h=hist(x1,bins=50,alpha=.25)
x2h=hist(x2,bins=50,alpha=.25)

BaseMap



In [ ]:

    
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt



In [ ]:

    
#setup the projection
m = Basemap(projection='merc',llcrnrlat=20,urcrnrlat=50,\
                llcrnrlon=-130,urcrnrlon=-60,lat_ts=20,resolution='i')



In [ ]:

    
#create some points to plot
#provided is a csv file with the lat/long of US states
import pandas as pd
from pandas import DataFrame
data=pd.read_csv('DataFiles\\usa_lat_long.csv') #it has some duplicates
data.head()



In [ ]:

    
#I just want the lat/long in a list
points=data[['longitude','latitude']].values



In [ ]:

    
#transform the points into map coordinates
transform_points=[m(lng, lat) for lng, lat in points]



In [ ]:

    
#do all the drawing:
fig=figure(figsize=(10,10)) #make a larger than default image
fig.add_subplot(1,1,1) #not strictly required

m.drawmapboundary(fill_color='white')
m.fillcontinents(color='white',lake_color='white')
m.drawcoastlines(color='black', linewidth=.3)
m.drawcountries(color='black', linewidth=.3)
m.drawstates(color='black', linewidth=.3)

#plot the points on the  map. These are just regular calls to matplotlib with x,y data
#you could also do this in one shot by using plot(xlist, ylist...)
#or using scatter(). 
for x,y in transform_points:
    plot(x,y,'o',color='red',ms=10*rand()) #plot them at random sizes

#we can plot some labels
text(transform_points[7][0],transform_points[7][1],'California',fontsize=15)
text(transform_points[12][0],transform_points[12][1],'Florida',fontsize=15)

#draw some great circles
lng1,lat1 =points[12]
lng2,lat2 =points[7]
m.drawgreatcircle(lng1,lat1,lng2,lat2,linewidth=3,color='blue',alpha=.5)

Scipy Solver



In [ ]:

    
# Objective Function
#Max C = 2x + 3y

# Constraints
#2x+y<=15
#x+3y<=20
#x>=0
#y>=0

# Solution
#(5,5)



In [ ]:

    
from scipy.optimize import minimize



In [ ]:

    
# Since the objective is to maximize C, we
# multiple the objective function by -1

# x[0] equals x
# x[1] equals y

## Objective Function
fun = lambda x: -1*(2*x[0] + 3*x[1])



In [ ]:

    
# Equality constraint are defined as follows:
# This is not in the correct format: x = -2
# Equation has to always be equal to zero: x + 2 = 0
# i.e. {'type': 'eq', 'fun': lambda x: x[0] + 2}

#-------------------------------------------------------------

# Inequality contraints are defined as follows:
# This is not in the correct format: 2x+y<=15
# Equation has to always be greater than or equal to zero: 0 <= 15 + -2x + -y 

## Contraints
cons = ({'type': 'ineq', 'fun': lambda x:  15 + -2*x[0] + -1*x[1]}, #15-2x-y>=0
         {'type': 'ineq', 'fun': lambda x: 20 + -1*x[0] + -3*x[1]}, #20-x-3y
         {'type': 'ineq', 'fun': lambda x: x[0]}, #x>=0
         {'type': 'ineq', 'fun': lambda x: x[1]}) #y>=0



In [ ]:

    
# STILL NOT SURE WHAT THIS MEANS

## Bounds
bnds = ((None, None), (None, None))



In [ ]:

    
## Solver
#   = minimize(Objective Function, Initial Guess, Solver to use, Bounds, Constraints)
res = minimize(fun, (0, 0), method='SLSQP', bounds=bnds, constraints=cons)
res

# solution is in res.x

SQLAlchemy



In [ ]:

    
from sqlalchemy import *

Connecting



In [ ]:

    
# Parameters
ServerName = "Type Server Name Here"
Database = "Type database name here"
TableName = "Type table name here"

# Note: pyobdc must be installed
engine = create_engine('mssql+pyodbc://' + ServerName + '/' + Database)
conn = engine.connect()

SELECT Statements



In [ ]:

    
# Required for querying tables
metadata = MetaData(conn)

# Table to query
tbl = Table(TableName, metadata, autoload=True, schema="dbo")

# select all where 
sql = select(tbl.c.Column1 == 'Type value to match')

# select specific columns
sql = select([tbl.c.Column1, tbl.c.Column2, tbl.c.Column3])

# select top N
sql = select([tbl.c.Column1, tbl.c.Column2, tbl.c.Column3],limit=10)

# select specific column and a where clause
sql = select([tbl.c.Column1], tbl.c.Column1 == 'Type value to match')

# and, or, not, in
sql = select([tbl], and_(tbl.c.Column1 < 4, tbl.c.Column1 != 1))
sql = select([tbl], or_(tbl.c.Column1 < 4, tbl.c.Column1 != 1))
sql = select([tbl], not_(tbl.c.Column1 > 4))
sql = select([tbl], tbl.c.Column1.in_( (1,4) ))

# like, between
sql = select([tbl], tbl.c.Column1.startswith('M'))
sql = select([tbl], tbl.c.Column1.like('%a%'))
sql = select([tbl], tbl.c.Column1.endswith('n'))
sql = select([tbl], tbl.c.Column1.between(30,39),limit=10)

result = conn.execute(sql)
for row in result:
    print row

# Close connection
conn.close()

IPython NoteBook



In [ ]:

    
# Import related packages
from IPython.display import YouTubeVideo,HTML,Image



In [ ]:

    
# Embed Videos
YouTubeVideo('BHEr1dBc0yM');



In [ ]:

    
# Embed WebPages
HTML('<iframe src=http://m.bing.com/ width=700 height=350></iframe>');



In [ ]:

    
# Embed Images
Image(filename='DataFiles\infiniteenergy.jpg');



In [ ]: