notebook.community

Edit and run



In [ ]:

    
## Pandas basic
  # uses dataframe to put data in easy to use format
  # SQL like - relational
  # fast read right from storage



In [3]:

    
import pandas as pd
import numpy as np



In [63]:

    
# DataFrame from dict
students = pd.DataFrame( {
    'phone': ['123-1234', '321-4321', '321-4321'],
    'age': [13, 12, 13],
    'grade': [78, 77, 92]}, 
    index = ['alice', 'bob', 'eve'] )
students



In [5]:

    
students.index









    Out[5]:





Index(['alice', 'bob', 'eve'], dtype='object')



In [6]:

    
# Dataframe from numpy array
df = pd.DataFrame(np.random.randn(6,4), 
                  index=['index1', 'index2', 'index3', 'index4','index5','index6'], 
                  columns = list('ABCD'))
df



In [ ]:

    
# DataFrame from csv file
load csv --> df = pd.read_csv('./PWD/file.csv', delimiter="\t")
save csv --> df.to_csv('./PWD/newFile.csv  ')



In [7]:

    
#first few lines
df.head()



In [9]:

    
df.head(3)



In [10]:

    
#last few lines
df.tail()



In [11]:

    
df.tail(2)



In [12]:

    
# type of data
type(df)









    Out[12]:





pandas.core.frame.DataFrame



In [14]:

    
# number of rows and columns
df.shape









    Out[14]:





(6, 4)



In [15]:

    
# information about data
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, index1 to index6
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0+ bytes



In [17]:

    
df.describe()



In [24]:

    
# series --> 1D set of data with an index
s = pd.Series([1, 3, 4, np.nan, 4, 2])
s









    Out[24]:





0    1.0
1    3.0
2    4.0
3    NaN
4    4.0
5    2.0
dtype: float64



In [26]:

    
# adding index later
s.index = ['a','b','c','d','e', 'f']
s









    Out[26]:





a    1.0
b    3.0
c    4.0
d    NaN
e    4.0
f    2.0
dtype: float64



In [30]:

    
# Missing data
s.isnull()









    Out[30]:





a    False
b    False
c    False
d     True
e    False
f    False
dtype: bool



In [29]:

    
# plotting 
import matplotlib.pyplot as plt
%matplotlib inline



In [32]:

    
s.plot()









    Out[32]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fa8d44c4a90>



In [33]:

    
df.plot()









    Out[33]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fa8d447f400>



In [34]:

    
# see portion of data in dataframe or series
s[s.index > 'c']









    Out[34]:





d    NaN
e    4.0
f    2.0
dtype: float64



In [36]:

    
s[s.isnull() == False]









    Out[36]:





a    1.0
b    3.0
c    4.0
e    4.0
f    2.0
dtype: float64



In [37]:

    
df.head()



In [41]:

    
# select column
df.A









    Out[41]:





index1    1.041647
index2   -0.613589
index3   -0.523167
index4    0.711498
index5    2.005816
index6    1.127781
Name: A, dtype: float64



In [40]:

    
df['B']









    Out[40]:





index1   -0.281104
index2    0.232681
index3    0.249878
index4   -0.111896
index5   -0.111702
index6    0.606418
Name: B, dtype: float64



In [20]:

    
# column types
df.dtypes









    Out[20]:





A    float64
B    float64
C    float64
D    float64
dtype: object



In [21]:

    
# column names
df.columns









    Out[21]:





Index(['A', 'B', 'C', 'D'], dtype='object')



In [22]:

    
# multiple columns
df[ ['A','B']]



In [23]:

    
# number of rows
df.shape[0]









    Out[23]:





6



In [42]:

    
# select label / name
df.loc['index2']









    Out[42]:





A   -0.613589
B    0.232681
C   -0.130158
D   -1.382819
Name: index2, dtype: float64



In [43]:

    
# label and column intersection
df.loc['index3', ['A', 'C']]









    Out[43]:





A   -0.523167
C    0.422815
Name: index3, dtype: float64



In [45]:

    
# select by label location/ index
df.iloc[1,3]









    Out[45]:





-1.3828185812005171



In [27]:

    
# get rows by name or number
df.ix[0]









    Out[27]:





A   -0.156835
B   -0.361788
C    0.230273
D    1.107232
Name: index1, dtype: float64



In [42]:

    
df.ix[ ['index2', 'index1' ]]



In [43]:

    
# df.ix[row, column]
df.ix[ [0,3], ['A', 'D']]



In [44]:

    
df.iloc[3, :]









    Out[44]:





A   -0.206060
B   -0.638952
C    0.335568
D   -2.091381
Name: index4, dtype: float64



In [45]:

    
df.iloc[:, 2]









    Out[45]:





index1    0.230273
index2    1.188058
index3    0.238245
index4    0.335568
index5   -0.113527
index6    0.216336
Name: C, dtype: float64



In [46]:

    
# add column to dataframe
df['Z'] = [1,3,4,5,6,8]
df



In [47]:

    
df[df['A'] == df['A'].max()]



In [51]:

    
#function
df.Z.apply(lambda Z: Z + 0.1)









    Out[51]:





index1    1.1
index2    3.1
index3    4.1
index4    5.1
index5    6.1
index6    8.1
Name: Z, dtype: float64



In [55]:

    
#built-in
df.A.mean()









    Out[55]:





0.6249978542633915



In [56]:

    
df.A.max()









    Out[56]:





2.00581642006934



In [57]:

    
df.A.min()









    Out[57]:





-0.6135885851876196



In [52]:

    
df.count()









    Out[52]:





A    6
B    6
C    6
D    6
Z    6
dtype: int64



In [54]:

    
#correlation between columns
df.corr()



In [55]:

    
# cummulative max
df.cummax()



In [67]:

    
students



In [68]:

    
# aggregation: mean grade grouped by age
students.groupby('age')['grade'].mean()









    Out[68]:





age
12    77
13    85
Name: grade, dtype: int64



In [76]:

    
#bins
bin = np.linspace(70, 100, 4)
bin









    Out[76]:





array([ 70.,  80.,  90., 100.])



In [77]:

    
# grades grouped by 70s 80s and 90s and age of students
students.groupby( np.digitize(students.grade, bin)).age.mean()









    Out[77]:





1    12.5
3    13.0
Name: age, dtype: float64

	A	B	C	D
index5	-0.963493	-0.273156	-0.113527	-0.155731
index6	0.382200	0.072323	0.216336	0.055544

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	-0.092351	-0.217336	0.349159	-0.143260
std	0.887802	0.483560	0.438655	1.222480
min	-0.982796	-0.694776	-0.113527	-2.091381
25%	-0.774135	-0.569661	0.219821	-0.693874
50%	-0.181447	-0.317472	0.234259	-0.050094
75%	0.247441	-0.014047	0.311237	0.837411
max	1.372878	0.592330	1.188058	1.107232

	A	B	C	D	Z
A	1.000000	0.864102	-0.287728	0.500292	0.104440
B	0.864102	1.000000	-0.452648	0.694440	0.261574
C	-0.287728	-0.452648	1.000000	-0.333016	-0.366497
D	0.500292	0.694440	-0.333016	1.000000	-0.282841
Z	0.104440	0.261574	-0.366497	-0.282841	1.000000

	A	B	C	D
index1	-0.156835	-0.361788	0.230273	1.107232
index2	-0.982796	-0.694776	1.188058	-0.873255
index3	1.372878	0.592330	0.238245	1.098033
index4	-0.206060	-0.638952	0.335568	-2.091381
index5	-0.963493	-0.273156	-0.113527	-0.155731
index6	0.382200	0.072323	0.216336	0.055544

	A	B	C	D
index1	1.041647	-0.281104	-2.357189	-0.497115
index2	-0.613589	0.232681	-0.130158	-1.382819
index3	-0.523167	0.249878	0.422815	0.628948
index4	0.711498	-0.111896	-0.359935	0.050962
index5	2.005816	-0.111702	0.495628	-0.243297