Pandas basic

uses dataframe to put data in easy to use format
SQL like - relational
fast read right from storage



In [12]:

    
import pandas as pd
import numpy as np

creating dataframe

from dict
from numpy array
from csv
from fwf



In [13]:

    
# DataFrame from dict
students = pd.DataFrame( {
    'phone': ['123-1234', '321-4321', '321-4321'],
    'age': [13, 12, 13],
    'grade': [78, 77, 92]}, 
    index = ['alice', 'bob', 'eve'] )
students



In [14]:

    
students.index









    Out[14]:





Index(['alice', 'bob', 'eve'], dtype='object')



In [15]:

    
# Dataframe from numpy array
df = pd.DataFrame(np.random.randn(6,4), 
                  index=['index1', 'index2', 'index3', 'index4','index5','index6'], 
                  columns = list('ABCD'))
df

DataFrame from csv file

load csv --> df = pd.read_csv('./PWD/file.csv', delimiter="\t")
save csv --> df.to_csv('./PWD/newFile.csv ')

fwf with time series

data = pd.read_fwf('location/file.txt', header=None, index_col=0, parse_dates=[[0,1]], infer_datetime_format=True)



In [16]:

    
#first few lines
df.head()



In [17]:

    
df.head(3)



In [18]:

    
#last few lines
df.tail()



In [19]:

    
df.tail(2)

Understanding data



In [20]:

    
# type of data
type(df)









    Out[20]:





pandas.core.frame.DataFrame



In [21]:

    
# number of rows and columns
df.shape









    Out[21]:





(6, 4)



In [22]:

    
# information about data
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, index1 to index6
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0+ bytes



In [23]:

    
df.describe()

Series



In [24]:

    
# series --> 1D set of data with an index
s = pd.Series([1, 3, 4, np.nan, 4, 2])
s









    Out[24]:





0    1.0
1    3.0
2    4.0
3    NaN
4    4.0
5    2.0
dtype: float64



In [25]:

    
# adding index later
s.index = ['a','b','c','d','e', 'f']
s









    Out[25]:





a    1.0
b    3.0
c    4.0
d    NaN
e    4.0
f    2.0
dtype: float64



In [26]:

    
# Missing data
s.isnull()









    Out[26]:





a    False
b    False
c    False
d     True
e    False
f    False
dtype: bool

Plotting



In [27]:

    
# plotting 
import matplotlib.pyplot as plt
%matplotlib inline



In [28]:

    
s.plot()









    Out[28]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3377435a58>



In [29]:

    
df.plot()









    Out[29]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3375395518>

selecting data



In [30]:

    
# see portion of data in dataframe or series
s[s.index > 'c']









    Out[30]:





d    NaN
e    4.0
f    2.0
dtype: float64



In [31]:

    
s[s.isnull() == False]









    Out[31]:





a    1.0
b    3.0
c    4.0
e    4.0
f    2.0
dtype: float64



In [32]:

    
df.head()



In [33]:

    
# select column
df.A









    Out[33]:





index1    0.730595
index2    0.163752
index3   -1.203011
index4    0.644283
index5   -0.136466
index6   -1.627352
Name: A, dtype: float64



In [34]:

    
df['B']









    Out[34]:





index1    1.187878
index2   -1.658283
index3    1.454400
index4   -0.875351
index5    0.879580
index6   -0.430911
Name: B, dtype: float64



In [35]:

    
# column types
df.dtypes









    Out[35]:





A    float64
B    float64
C    float64
D    float64
dtype: object



In [36]:

    
# column names
df.columns









    Out[36]:





Index(['A', 'B', 'C', 'D'], dtype='object')



In [37]:

    
# multiple columns
df[ ['A','B']]



In [38]:

    
# number of rows
df.shape[0]









    Out[38]:





6



In [39]:

    
# select label / name
df.loc['index2']









    Out[39]:





A    0.163752
B   -1.658283
C   -0.660962
D    1.165278
Name: index2, dtype: float64



In [40]:

    
# label and column intersection
df.loc['index3', ['A', 'C']]









    Out[40]:





A   -1.203011
C    0.611386
Name: index3, dtype: float64



In [41]:

    
# select by label location/ index
df.iloc[1,3]









    Out[41]:





1.1652775947912888



In [42]:

    
# get rows by name or number
df.ix[0]









    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  






    Out[42]:





A    0.730595
B    1.187878
C   -0.057026
D    0.497039
Name: index1, dtype: float64



In [43]:

    
df.ix[ ['index2', 'index1' ]]



In [44]:

    
# df.ix[row, column]
df.ix[ [0,3], ['A', 'D']]



In [45]:

    
df.iloc[3, :]









    Out[45]:





A    0.644283
B   -0.875351
C    1.336898
D    0.632972
Name: index4, dtype: float64



In [46]:

    
df.iloc[:, 2]









    Out[46]:





index1   -0.057026
index2   -0.660962
index3    0.611386
index4    1.336898
index5    0.894363
index6    0.371515
Name: C, dtype: float64



In [47]:

    
# add column to dataframe
df['Z'] = [1,3,4,5,6,8]
df



In [48]:

    
df[df['A'] == df['A'].max()]

function



In [49]:

    
#function
df.Z.apply(lambda Z: Z + 0.1)









    Out[49]:





index1    1.1
index2    3.1
index3    4.1
index4    5.1
index5    6.1
index6    8.1
Name: Z, dtype: float64



In [50]:

    
#built-in
df.A.mean()









    Out[50]:





-0.23803309717511253



In [51]:

    
df.A.max()









    Out[51]:





0.7305949004218479



In [52]:

    
df.A.min()









    Out[52]:





-1.6273516826385388



In [53]:

    
df.count()









    Out[53]:





A    6
B    6
C    6
D    6
Z    6
dtype: int64



In [54]:

    
#correlation between columns
df.corr()



In [55]:

    
# cummulative max
df.cummax()

aggregation



In [56]:

    
students



In [57]:

    
# aggregation: mean grade grouped by age
students.groupby('age')['grade'].mean()









    Out[57]:





age
12    77
13    85
Name: grade, dtype: int64



In [58]:

    
#bins
bin = np.linspace(70, 100, 4)
bin









    Out[58]:





array([ 70.,  80.,  90., 100.])



In [59]:

    
# grades grouped by 70s 80s and 90s and age of students
students.groupby( np.digitize(students.grade, bin)).age.mean()









    Out[59]:





1    12.5
3    13.0
Name: age, dtype: float64

more indexing



In [60]:

    
num = list(range(20))



In [61]:

    
num









    Out[61]:





[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]



In [63]:

    
# num[ start: end: jump]
num[ : : 2]









    Out[63]:





[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]



In [66]:

    
num[ 15: 1 : -3]









    Out[66]:





[15, 12, 9, 6, 3]



In [ ]:

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	-0.238033	0.092886	0.416029	0.197064
std	0.974665	1.260956	0.707197	0.937517
min	-1.627352	-1.658283	-0.660962	-1.582012
25%	-0.936375	-0.764241	0.050109	0.200404
50%	0.013643	0.224334	0.491450	0.399946
75%	0.524150	1.110804	0.823618	0.598989
max	0.730595	1.454400	1.336898	1.165278

	A	B	C	D	Z
A	1.000000	-0.163294	-0.045417	0.782776	-0.657204
B	-0.163294	1.000000	0.256123	-0.138042	-0.197453
C	-0.045417	0.256123	1.000000	-0.198932	0.488496
D	0.782776	-0.138042	-0.198932	1.000000	-0.732204
Z	-0.657204	-0.197453	0.488496	-0.732204	1.000000

	A	B	C	D
index1	0.730595	1.187878	-0.057026	0.497039
index2	0.163752	-1.658283	-0.660962	1.165278
index3	-1.203011	1.454400	0.611386	0.166255
index4	0.644283	-0.875351	1.336898	0.632972
index5	-0.136466	0.879580	0.894363	0.302853
index6	-1.627352	-0.430911	0.371515	-1.582012