In [1]:
import pandas as pd
import numpy as np
In [2]:
data = pd.DataFrame({'Country': ['Russia','Colombia','Chile','Equador','Nigeria'],
'Rank':[121,40,100,130,11]})
data
Out[2]:
In [3]:
data.describe()
#describe() method computes summary statistics of integer / double variables
Out[3]:
In [4]:
data.info()
#get the complete information about the data set, we can use info() function
In [5]:
data = pd.DataFrame({'group':['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],'ounces':[4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[5]:
In [6]:
#sort the data frame by ounces - inplace = True will make changes to the data
data.sort_values(by=['ounces'],ascending=True,inplace=False)
Out[6]:
In [7]:
#sort the data by not just one column but multiple columns.
data.sort_values(by=['group','ounces'],ascending=[True,False],inplace=False)
Out[7]:
In [8]:
#remove duplicate rows.
data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
data
Out[8]:
In [9]:
#sort values
data.sort_values(by='k2')
Out[9]:
In [10]:
#remove duplicates
data.drop_duplicates()
Out[10]:
In [11]:
#remove duplicates based on a particular column
data.drop_duplicates(subset='k1')
Out[11]:
In [12]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[12]:
In [13]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
def meat_2_animal(series):
if series['food'] == 'bacon':
return 'pig'
elif series['food'] == 'pulled pork':
return 'pig'
elif series['food'] == 'pastrami':
return 'cow'
elif series['food'] == 'corned beef':
return 'cow'
elif series['food'] == 'honey ham':
return 'pig'
else:
return 'salmon'
#create a new variable
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
Out[13]:
In [14]:
#another way of doing it is: convert the food values to the lower case and apply the function
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data
Out[14]:
In [15]:
data.assign(new_variable = data['ounces']*10)
#Another way to create a new variable is by using the assign function
Out[15]:
In [16]:
#remove columns
data.drop('animal2',axis='columns',inplace=True)
data
Out[16]:
In [17]:
#A quick method for imputing missing values is by filling the missing value with any random number
#also to replace outliers
#Series function from pandas are used to create arrays
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
Out[17]:
In [18]:
data.replace(-999, np.nan,inplace=True)
data
Out[18]:
In [19]:
#We can also replace multiple values at once.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace([-999,-1000],np.nan,inplace=True)
data
Out[19]:
In [20]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
data
Out[20]:
In [21]:
#Using rename function
data.rename(index = {'Ohio':'SanF'}, columns={'one':'one_p','two':'two_p'},inplace=True)
data
Out[21]:
In [22]:
#You can also use string functions
data.rename(index = str.upper, columns=str.title,inplace=True)
data
Out[22]:
In [23]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
#categorize(bin) continuous variables
In [24]:
#Understand the output - '(' means the value is included in the bin, '[' means the value is excluded
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
Out[24]:
In [25]:
#To include the right bin value, we can do:
pd.cut(ages,bins,right=False)
Out[25]:
In [26]:
#pandas library intrinsically assigns an encoding to categorical variables.
cats.labels
Out[26]:
In [27]:
#Let's check how many observations fall under each bin
pd.value_counts(cats)
Out[27]:
In [28]:
bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
new_cats = pd.cut(ages, bins,labels=bin_names)
pd.value_counts(new_cats)
Out[28]:
In [29]:
pd.value_counts(new_cats).cumsum()
#cumulative sum
Out[29]:
In [30]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5)})
df
Out[30]:
In [31]:
#calculate the mean of data1 column by key1
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
Out[31]:
In [32]:
#slice data
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
Out[32]:
In [33]:
#get first n rows from the data frame
df[:3]
Out[33]:
In [34]:
#slice based on date range
df['20130101':'20130104']
Out[34]:
In [35]:
#slicing based on column names
df.loc[:,['A','B']]
Out[35]:
In [36]:
#slicing based on both row index labels and column names
df.loc['20130102':'20130103',['A','B']]
Out[36]:
In [37]:
#slicing based on index of columns
df.iloc[3] #returns 4th row (index is 3rd)
Out[37]:
In [38]:
#returns a specific range of rows
df.iloc[2:4, 0:2]
Out[38]:
In [39]:
#returns specific rows and columns using lists containing columns or row indexes
df.iloc[[1,5],[0,2]]
Out[39]:
In [40]:
#Boolean indexing based on column values
df[df.A > 0]
Out[40]:
In [41]:
#we can copy the data set
df2 = df.copy()
df2['E']=['one', 'one','two','three','four','three']
df2
Out[41]:
In [42]:
#select rows based on column values
df2[df2['E'].isin(['two','four'])]
Out[42]:
In [43]:
#select all rows except those with two and four
df2[~df2['E'].isin(['two','four'])]
Out[43]:
In [44]:
#list all columns where A is greater than C
df.query('A > C')
Out[44]:
In [45]:
#using OR condition
df.query('A < B | C > A')
Out[45]:
In [46]:
#create a data frame
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[46]:
In [47]:
data.pivot_table(values='ounces',index='group',aggfunc=np.mean)
Out[47]:
In [48]:
#calculate count by each group
data.pivot_table(values='ounces',index='group',aggfunc='count')
Out[48]: