In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.DataFrame({'Country': ['Russia','Colombia','Chile','Equador','Nigeria'],
                    'Rank':[121,40,100,130,11]})
data


Out[2]:
Country Rank
0 Russia 121
1 Colombia 40
2 Chile 100
3 Equador 130
4 Nigeria 11

In [3]:
data.describe()
#describe() method computes summary statistics of integer / double variables


Out[3]:
Rank
count 5.000000
mean 80.400000
std 52.300096
min 11.000000
25% 40.000000
50% 100.000000
75% 121.000000
max 130.000000

In [4]:
data.info()
#get the complete information about the data set, we can use info() function


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Country    5 non-null object
Rank       5 non-null int64
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes

In [5]:
data = pd.DataFrame({'group':['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],'ounces':[4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data


Out[5]:
group ounces
0 a 4.0
1 a 3.0
2 a 12.0
3 b 6.0
4 b 7.5
5 b 8.0
6 c 3.0
7 c 5.0
8 c 6.0

In [6]:
#sort the data frame by ounces - inplace = True will make changes to the data
data.sort_values(by=['ounces'],ascending=True,inplace=False)


Out[6]:
group ounces
1 a 3.0
6 c 3.0
0 a 4.0
7 c 5.0
3 b 6.0
8 c 6.0
4 b 7.5
5 b 8.0
2 a 12.0

In [7]:
#sort the data by not just one column but multiple columns.
data.sort_values(by=['group','ounces'],ascending=[True,False],inplace=False)


Out[7]:
group ounces
2 a 12.0
0 a 4.0
1 a 3.0
5 b 8.0
4 b 7.5
3 b 6.0
8 c 6.0
7 c 5.0
6 c 3.0

In [8]:
#remove duplicate rows.
data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
data


Out[8]:
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
4 two 3
5 two 4
6 two 4

In [9]:
#sort values
data.sort_values(by='k2')


Out[9]:
k1 k2
2 one 1
1 one 2
0 one 3
3 two 3
4 two 3
5 two 4
6 two 4

In [10]:
#remove duplicates
data.drop_duplicates()


Out[10]:
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
5 two 4

In [11]:
#remove duplicates based on a particular column
data.drop_duplicates(subset='k1')


Out[11]:
k1 k2
0 one 3
3 two 3

In [12]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data


Out[12]:
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0

In [13]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

def meat_2_animal(series):
    if series['food'] == 'bacon':
        return 'pig'
    elif series['food'] == 'pulled pork':
        return 'pig'
    elif series['food'] == 'pastrami':
        return 'cow'
    elif series['food'] == 'corned beef':
        return 'cow'
    elif series['food'] == 'honey ham':
        return 'pig'
    else:
        return 'salmon'


#create a new variable
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data


Out[13]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon

In [14]:
#another way of doing it is: convert the food values to the lower case and apply the function
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data


Out[14]:
food ounces animal animal2
0 bacon 4.0 pig pig
1 pulled pork 3.0 pig pig
2 bacon 12.0 pig pig
3 pastrami 6.0 cow cow
4 corned beef 7.5 cow cow
5 bacon 8.0 pig pig
6 pastrami 3.0 cow cow
7 honey ham 5.0 pig pig
8 nova lox 6.0 salmon salmon

In [15]:
data.assign(new_variable = data['ounces']*10)
#Another way to create a new variable is by using the assign function


Out[15]:
food ounces animal animal2 new_variable
0 bacon 4.0 pig pig 40.0
1 pulled pork 3.0 pig pig 30.0
2 bacon 12.0 pig pig 120.0
3 pastrami 6.0 cow cow 60.0
4 corned beef 7.5 cow cow 75.0
5 bacon 8.0 pig pig 80.0
6 pastrami 3.0 cow cow 30.0
7 honey ham 5.0 pig pig 50.0
8 nova lox 6.0 salmon salmon 60.0

In [16]:
#remove columns
data.drop('animal2',axis='columns',inplace=True)
data


Out[16]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 pastrami 6.0 cow
4 corned beef 7.5 cow
5 bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon

In [17]:
#A quick method for imputing missing values is by filling the missing value with any random number
#also to replace outliers
#Series function from pandas are used to create arrays
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data


Out[17]:
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [18]:
data.replace(-999, np.nan,inplace=True)
data


Out[18]:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [19]:
#We can also replace multiple values at once.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace([-999,-1000],np.nan,inplace=True)
data


Out[19]:
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [20]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
data


Out[20]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11

In [21]:
#Using rename function
data.rename(index = {'Ohio':'SanF'}, columns={'one':'one_p','two':'two_p'},inplace=True)
data


Out[21]:
one_p two_p three four
SanF 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11

In [22]:
#You can also use string functions
data.rename(index = str.upper, columns=str.title,inplace=True)
data


Out[22]:
One_P Two_P Three Four
SANF 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11

In [23]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
#categorize(bin) continuous variables

In [24]:
#Understand the output - '(' means the value is included in the bin, '[' means the value is excluded
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats


Out[24]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [25]:
#To include the right bin value, we can do:
pd.cut(ages,bins,right=False)


Out[25]:
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, object): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [26]:
#pandas library intrinsically assigns an encoding to categorical variables.
cats.labels


/Users/shams/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: 'labels' is deprecated. Use 'codes' instead
  from ipykernel import kernelapp as app
Out[26]:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [27]:
#Let's check how many observations fall under each bin
pd.value_counts(cats)


Out[27]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [28]:
bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
new_cats = pd.cut(ages, bins,labels=bin_names)

pd.value_counts(new_cats)


Out[28]:
Youth         5
MiddleAge     3
YoungAdult    3
Senior        1
dtype: int64

In [29]:
pd.value_counts(new_cats).cumsum()
#cumulative sum


Out[29]:
Youth          5
MiddleAge      8
YoungAdult    11
Senior        12
dtype: int64

In [30]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df


Out[30]:
data1 data2 key1 key2
0 -0.477818 -0.652115 a one
1 0.761973 -1.057828 a two
2 0.495193 -1.623535 b one
3 0.450927 -1.283722 b two
4 -1.001882 -0.469757 a one

In [31]:
#calculate the mean of data1 column by key1
grouped = df['data1'].groupby(df['key1'])
grouped.mean()


Out[31]:
key1
a   -0.239242
b    0.473060
Name: data1, dtype: float64

In [32]:
#slice data
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df


Out[32]:
A B C D
2013-01-01 -0.026436 -0.544089 0.193653 -1.762786
2013-01-02 -0.076448 -0.854257 -0.836730 0.012145
2013-01-03 0.362770 0.240102 0.842131 -0.216190
2013-01-04 -1.194762 0.227675 0.989805 -0.360080
2013-01-05 -0.513112 -0.655481 1.411907 -0.242742
2013-01-06 -0.107620 0.980511 0.155310 -1.356867

In [33]:
#get first n rows from the data frame
df[:3]


Out[33]:
A B C D
2013-01-01 -0.026436 -0.544089 0.193653 -1.762786
2013-01-02 -0.076448 -0.854257 -0.836730 0.012145
2013-01-03 0.362770 0.240102 0.842131 -0.216190

In [34]:
#slice based on date range
df['20130101':'20130104']


Out[34]:
A B C D
2013-01-01 -0.026436 -0.544089 0.193653 -1.762786
2013-01-02 -0.076448 -0.854257 -0.836730 0.012145
2013-01-03 0.362770 0.240102 0.842131 -0.216190
2013-01-04 -1.194762 0.227675 0.989805 -0.360080

In [35]:
#slicing based on column names
df.loc[:,['A','B']]


Out[35]:
A B
2013-01-01 -0.026436 -0.544089
2013-01-02 -0.076448 -0.854257
2013-01-03 0.362770 0.240102
2013-01-04 -1.194762 0.227675
2013-01-05 -0.513112 -0.655481
2013-01-06 -0.107620 0.980511

In [36]:
#slicing based on both row index labels and column names
df.loc['20130102':'20130103',['A','B']]


Out[36]:
A B
2013-01-02 -0.076448 -0.854257
2013-01-03 0.362770 0.240102

In [37]:
#slicing based on index of columns
df.iloc[3] #returns 4th row (index is 3rd)


Out[37]:
A   -1.194762
B    0.227675
C    0.989805
D   -0.360080
Name: 2013-01-04 00:00:00, dtype: float64

In [38]:
#returns a specific range of rows
df.iloc[2:4, 0:2]


Out[38]:
A B
2013-01-03 0.362770 0.240102
2013-01-04 -1.194762 0.227675

In [39]:
#returns specific rows and columns using lists containing columns or row indexes
df.iloc[[1,5],[0,2]]


Out[39]:
A C
2013-01-02 -0.076448 -0.83673
2013-01-06 -0.107620 0.15531

In [40]:
#Boolean indexing based on column values
df[df.A > 0]


Out[40]:
A B C D
2013-01-03 0.36277 0.240102 0.842131 -0.21619

In [41]:
#we can copy the data set
df2 = df.copy()
df2['E']=['one', 'one','two','three','four','three']
df2


Out[41]:
A B C D E
2013-01-01 -0.026436 -0.544089 0.193653 -1.762786 one
2013-01-02 -0.076448 -0.854257 -0.836730 0.012145 one
2013-01-03 0.362770 0.240102 0.842131 -0.216190 two
2013-01-04 -1.194762 0.227675 0.989805 -0.360080 three
2013-01-05 -0.513112 -0.655481 1.411907 -0.242742 four
2013-01-06 -0.107620 0.980511 0.155310 -1.356867 three

In [42]:
#select rows based on column values
df2[df2['E'].isin(['two','four'])]


Out[42]:
A B C D E
2013-01-03 0.362770 0.240102 0.842131 -0.216190 two
2013-01-05 -0.513112 -0.655481 1.411907 -0.242742 four

In [43]:
#select all rows except those with two and four
df2[~df2['E'].isin(['two','four'])]


Out[43]:
A B C D E
2013-01-01 -0.026436 -0.544089 0.193653 -1.762786 one
2013-01-02 -0.076448 -0.854257 -0.836730 0.012145 one
2013-01-04 -1.194762 0.227675 0.989805 -0.360080 three
2013-01-06 -0.107620 0.980511 0.155310 -1.356867 three

In [44]:
#list all columns where A is greater than C
df.query('A > C')


Out[44]:
A B C D
2013-01-02 -0.076448 -0.854257 -0.83673 0.012145

In [45]:
#using OR condition
df.query('A < B | C > A')


Out[45]:
A B C D
2013-01-01 -0.026436 -0.544089 0.193653 -1.762786
2013-01-03 0.362770 0.240102 0.842131 -0.216190
2013-01-04 -1.194762 0.227675 0.989805 -0.360080
2013-01-05 -0.513112 -0.655481 1.411907 -0.242742
2013-01-06 -0.107620 0.980511 0.155310 -1.356867

In [46]:
#create a data frame
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data


Out[46]:
group ounces
0 a 4.0
1 a 3.0
2 a 12.0
3 b 6.0
4 b 7.5
5 b 8.0
6 c 3.0
7 c 5.0
8 c 6.0

In [47]:
data.pivot_table(values='ounces',index='group',aggfunc=np.mean)


Out[47]:
group
a    6.333333
b    7.166667
c    4.666667
Name: ounces, dtype: float64

In [48]:
#calculate count by each group
data.pivot_table(values='ounces',index='group',aggfunc='count')


Out[48]:
group
a    3
b    3
c    3
Name: ounces, dtype: int64