In [1]:
import numpy as np
import pandas as pd
import math
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import os
%matplotlib inline
In [2]:
# get pandas version
print('Pandas version ' + pd.__version__)
In [6]:
# record-setting temperatures in Ohio cities on Jan 7 2014
city = ['Akron-Canton', 'Mansfield', 'Cleveland', 'Toledo', 'Youngstown', 'Columbus', 'New Philadelphia', 'Zanesville']
temperature = [-11, -12, -11, -14, -12, -7, -9, -8]
In [7]:
# create a zipped object (a list of tuples)
record_temps = list(zip(city, temperature))
print(record_temps)
In [8]:
# create a pandas DataFrame
df = DataFrame(data = record_temps, columns=['City', 'Temperature'])
df
Out[8]:
In [9]:
# save the frame out to csv files with different index/header settings
df.to_csv('tempJan72014-1.csv',index=False,header=False)
df.to_csv('tempJan72014-2.csv',index=True,header=False)
df.to_csv('tempJan72014-3.csv',index=False,header=True)
df.to_csv('tempJan72014-4.csv',index=True,header=True)
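# optional sketch: peek at the raw text of the four files written above to see how
# the index/header combinations differ (assumes the to_csv calls above have run)
for fname in ['tempJan72014-1.csv', 'tempJan72014-2.csv',
              'tempJan72014-3.csv', 'tempJan72014-4.csv']:
    with open(fname) as f:
        print(fname)
        print(f.read())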
In [7]:
# read the saved files and see what the different "writes" gave us
df1 = pd.read_csv('tempJan72014-1.csv')
df2 = pd.read_csv('tempJan72014-2.csv')
df3 = pd.read_csv('tempJan72014-3.csv')
df4 = pd.read_csv('tempJan72014-4.csv')
In [8]:
df1
Out[8]:
In [9]:
df2
Out[9]:
In [10]:
df3  # index=False, header=True seems the cleanest way to write out a csv file
Out[10]:
In [11]:
df4
Out[11]:
In [12]:
# read without treating the first row as a header
df1 = pd.read_csv('tempJan72014-1.csv', header=None)
df1
Out[12]:
In [13]:
# read and supply the column names
df1 = pd.read_csv('tempJan72014-1.csv', names=['City', 'Temperature'])
df1
Out[13]:
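# optional sketch: for the files saved with index=True, index_col=0 tells read_csv
# to treat the first column as the index again instead of an unnamed data column
df4 = pd.read_csv('tempJan72014-4.csv', index_col=0)
df4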
In [14]:
# compute the min temperature
df['Temperature'].min()
Out[14]:
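# optional sketch: idxmin gives the row label of the minimum, so we can also report
# which city set the coldest record
coldest = df['Temperature'].idxmin()
df.loc[coldest, 'City'], df.loc[coldest, 'Temperature']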
In [10]:
# another way to define a DataFrame
df5 = pd.DataFrame({'City': ['Akron-Canton', 'Mansfield', 'Cleveland', 'Toledo', 'Youngstown', 'Columbus', 'New Philadelphia', 'Zanesville'],
                    'Temperature': [-11, -12, -11, -14, -12, -7, -9, -8]})
df5
Out[10]:
In [11]:
df['Temperature'].plot()
Out[11]:
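# optional sketch: a bar chart keyed by city name may be easier to read than the
# default line plot over the integer index
df.set_index('City')['Temperature'].plot(kind='bar')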
In [12]:
# another way to create a dataframe
df = DataFrame({'int_col' : [1,2,6,8,-1], 'float_col' : [0.1, 0.2,0.2,10.1,None], 'str_col' : ['a','b',None,'c','a']})
df
Out[12]:
In [13]:
# index through the .ix method
df.ix[:,['float_col','int_col']]
Out[13]:
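# note: .ix is deprecated (and removed in newer pandas); the equivalent label-based
# selection with .loc would be
df.loc[:, ['float_col', 'int_col']]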
In [21]:
# index through the columns
df[['float_col','int_col']]
Out[21]:
In [22]:
# index based on a condition
df[df['float_col'] > 0.15]
Out[22]:
In [14]:
# index based on a condition
df[df['float_col'] == 0.1]
Out[14]:
In [15]:
# another conditional indexing
df[(df['float_col'] > 0.1) & (df['int_col']>2)]
Out[15]:
In [16]:
# another conditional indexing
df[(df['float_col'] > 0.1) | (df['int_col']>2)]
Out[16]:
In [21]:
# another conditional indexing
df[~(df['float_col'] > 0.1)]
Out[21]:
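# optional sketch: the same compound filters can be written with query(), which some
# find easier to read (assumes a pandas version that provides DataFrame.query)
df.query('float_col > 0.1 and int_col > 2')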
In [18]:
# change the name of a column and save the result to another dataframe
df2 = df.rename(columns={'int_col' : 'some_other_name'})
df2
Out[18]:
In [22]:
df
Out[22]:
In [23]:
# rename it back and replace the existing dataframe
df2.rename(columns={'some_other_name' : 'int_col'}, inplace = True)
df2
Out[23]:
In [24]:
# drop rows with missing values
df2.dropna()
Out[24]:
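# optional sketch: dropna can be restricted to particular columns, so a row is only
# dropped when those columns are missing
df2.dropna(subset=['float_col'])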
In [25]:
# deep copy
df3 = df.copy()
df3
Out[25]:
In [28]:
# compute the mean of the floats
mean = df3['float_col'].mean()
mean
Out[28]:
In [31]:
# then replace the missing values by the mean
X=df3['float_col'].fillna(mean)
In [32]:
X
Out[32]:
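# optional sketch: to keep the filled values, assign them back to the column of the
# copy instead of to a separate variable
df3['float_col'] = df3['float_col'].fillna(mean)
df3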
In [33]:
# using map to change the values for a column
df['str_col'].dropna().map(lambda x : 'map_' + x)
Out[33]:
In [34]:
# use the apply method to operate on the columns
df.ix[:,['int_col','float_col']].apply(np.sqrt)
Out[34]:
In [35]:
# use applymap to define a function that operates on the whole dataframe
def some_fn(x):
    if type(x) is str:
        return 'applymap_' + x
    elif x:
        return 100 * x
    else:
        return
df.applymap(some_fn)
Out[35]:
In [44]:
# define a new DF
df = pd.DataFrame(data={"A":[1,2], "B":[1.2,1.3]})
df
Out[44]:
In [45]:
df['A']
Out[45]:
In [46]:
# perform a mathematical operation
df['C'] = df["A"]+df["B"]
df
Out[46]:
In [47]:
# perform another one
df["D"] = df["A"]*3
df
Out[47]:
In [39]:
# and another
df["E"] = np.sqrt(df["A"])
df
Out[39]:
In [48]:
df = pd.DataFrame(data={"A":[1,2], "B":[1.2,1.3], "Z":["a","b"]})
df
Out[48]:
In [41]:
# perform a vectorized operation on strings
df["F"] = df.Z.str.upper()
df
Out[41]:
In [42]:
# redefine the mixed-type dataframe and make a deep copy
df = DataFrame({'int_col' : [1,2,6,8,-1], 'float_col' : [0.1, 0.2,0.2,10.1,None], 'str_col' : ['a','b',None,'c','a']})
df4 = df.copy()
def two_three_strings(x):
    return x*2, x*3
df4['twice'], df4['thrice'] = zip(*df4['int_col'].map(two_three_strings))
df4
Out[42]:
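# optional sketch: for simple arithmetic like this, vectorized column operations
# give the same result without a Python-level map
df4['twice'] = df4['int_col'] * 2
df4['thrice'] = df4['int_col'] * 3
df4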
In [43]:
# Deep copy
df5 = df.copy()
def sum_two_cols(series):
    return series['int_col'] + series['float_col']
df5['sum_col'] = df5.apply(sum_two_cols, axis=1)
df5
Out[43]:
In [44]:
def int_float_squares(series):
    return pd.Series({'int_sq' : series['int_col']**2, 'flt_sq' : series['float_col']**2})
df.apply(int_float_squares, axis=1)
Out[44]:
In [45]:
df.describe()  # provides quick summary statistics for the numeric columns
Out[45]:
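# optional sketch: include='all' extends the summary to the non-numeric columns too
# (assumes a pandas version where describe supports the include argument)
df.describe(include='all')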
In [46]:
# compute the covariance
df.cov()
Out[46]:
In [47]:
# compute the correlation
df.corr()
Out[47]:
In [48]:
# define another DF
other = DataFrame({'str_col' : ['a','b'], 'some_val' : [3, 2]})
other
Out[48]:
In [49]:
# merge the two using the intersection of keys from both frames (SQL: inner join)
pd.merge(df,other,on='str_col',how='inner')
Out[49]:
In [50]:
# merge the two using the union of keys from both frames (SQL: full outer join)
pd.merge(df,other,on='str_col',how='outer')
Out[50]:
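# optional sketch: how='left' keeps every row of the left frame and fills unmatched
# keys from the right frame with NaN (SQL: left outer join)
pd.merge(df, other, on='str_col', how='left')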
In [51]:
plot_df = DataFrame(np.random.randn(1000,2),columns=['x','y'])
In [52]:
plot_df['y'] = plot_df['y'].map(lambda x : x + 1)
In [53]:
plot_df.plot()
Out[53]:
In [54]:
plot_df.hist()
Out[54]:
In [55]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat'])
In [56]:
s
Out[56]:
In [57]:
s.str.lower()
Out[57]:
In [58]:
s.str.upper()
Out[58]:
In [59]:
s.str.len()
Out[59]:
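# optional sketch: a couple more vectorized string methods; missing values simply
# propagate as NaN rather than raising
s.str.contains('a')
s.str.replace('a', 'X')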