notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np



In [2]:

    
data = pd.DataFrame({'Country': ['Russia','Colombia','Chile','Equador','Nigeria'],
                    'Rank':[121,40,100,130,11]})
data



In [3]:

    
data.describe()
#describe() method computes summary statistics of integer / double variables



In [4]:

    
data.info()
#get the complete information about the data set, we can use info() function









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Country    5 non-null object
Rank       5 non-null int64
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes



In [5]:

    
data = pd.DataFrame({'group':['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],'ounces':[4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data



In [6]:

    
#sort the data frame by ounces - inplace = True will make changes to the data
data.sort_values(by=['ounces'],ascending=True,inplace=False)



In [7]:

    
#sort the data by not just one column but multiple columns.
data.sort_values(by=['group','ounces'],ascending=[True,False],inplace=False)



In [8]:

    
#remove duplicate rows.
data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
data



In [9]:

    
#sort values
data.sort_values(by='k2')



In [10]:

    
#remove duplicates
data.drop_duplicates()



In [11]:

    
#remove duplicates based on a particular column
data.drop_duplicates(subset='k1')



In [12]:

    
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data









    Out[12]:






  
    
      
      food
      ounces
    
  
  
    
      0
      bacon
      4.0
    
    
      1
      pulled pork
      3.0
    
    
      2
      bacon
      12.0
    
    
      3
      Pastrami
      6.0
    
    
      4
      corned beef
      7.5
    
    
      5
      Bacon
      8.0
    
    
      6
      pastrami
      3.0
    
    
      7
      honey ham
      5.0
    
    
      8
      nova lox
      6.0



In [13]:

    
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

def meat_2_animal(series):
    if series['food'] == 'bacon':
        return 'pig'
    elif series['food'] == 'pulled pork':
        return 'pig'
    elif series['food'] == 'pastrami':
        return 'cow'
    elif series['food'] == 'corned beef':
        return 'cow'
    elif series['food'] == 'honey ham':
        return 'pig'
    else:
        return 'salmon'


#create a new variable
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data









    Out[13]:






  
    
      
      food
      ounces
      animal
    
  
  
    
      0
      bacon
      4.0
      pig
    
    
      1
      pulled pork
      3.0
      pig
    
    
      2
      bacon
      12.0
      pig
    
    
      3
      Pastrami
      6.0
      cow
    
    
      4
      corned beef
      7.5
      cow
    
    
      5
      Bacon
      8.0
      pig
    
    
      6
      pastrami
      3.0
      cow
    
    
      7
      honey ham
      5.0
      pig
    
    
      8
      nova lox
      6.0
      salmon



In [14]:

    
#another way of doing it is: convert the food values to the lower case and apply the function
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data









    Out[14]:






  
    
      
      food
      ounces
      animal
      animal2
    
  
  
    
      0
      bacon
      4.0
      pig
      pig
    
    
      1
      pulled pork
      3.0
      pig
      pig
    
    
      2
      bacon
      12.0
      pig
      pig
    
    
      3
      pastrami
      6.0
      cow
      cow
    
    
      4
      corned beef
      7.5
      cow
      cow
    
    
      5
      bacon
      8.0
      pig
      pig
    
    
      6
      pastrami
      3.0
      cow
      cow
    
    
      7
      honey ham
      5.0
      pig
      pig
    
    
      8
      nova lox
      6.0
      salmon
      salmon



In [15]:

    
data.assign(new_variable = data['ounces']*10)
#Another way to create a new variable is by using the assign function









    Out[15]:






  
    
      
      food
      ounces
      animal
      animal2
      new_variable
    
  
  
    
      0
      bacon
      4.0
      pig
      pig
      40.0
    
    
      1
      pulled pork
      3.0
      pig
      pig
      30.0
    
    
      2
      bacon
      12.0
      pig
      pig
      120.0
    
    
      3
      pastrami
      6.0
      cow
      cow
      60.0
    
    
      4
      corned beef
      7.5
      cow
      cow
      75.0
    
    
      5
      bacon
      8.0
      pig
      pig
      80.0
    
    
      6
      pastrami
      3.0
      cow
      cow
      30.0
    
    
      7
      honey ham
      5.0
      pig
      pig
      50.0
    
    
      8
      nova lox
      6.0
      salmon
      salmon
      60.0



In [16]:

    
#remove columns
data.drop('animal2',axis='columns',inplace=True)
data









    Out[16]:






  
    
      
      food
      ounces
      animal
    
  
  
    
      0
      bacon
      4.0
      pig
    
    
      1
      pulled pork
      3.0
      pig
    
    
      2
      bacon
      12.0
      pig
    
    
      3
      pastrami
      6.0
      cow
    
    
      4
      corned beef
      7.5
      cow
    
    
      5
      bacon
      8.0
      pig
    
    
      6
      pastrami
      3.0
      cow
    
    
      7
      honey ham
      5.0
      pig
    
    
      8
      nova lox
      6.0
      salmon



In [17]:

    
#A quick method for imputing missing values is by filling the missing value with any random number
#also to replace outliers
#Series function from pandas are used to create arrays
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data









    Out[17]:





0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64



In [18]:

    
data.replace(-999, np.nan,inplace=True)
data









    Out[18]:





0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64



In [19]:

    
#We can also replace multiple values at once.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace([-999,-1000],np.nan,inplace=True)
data









    Out[19]:





0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64



In [20]:

    
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
data



In [21]:

    
#Using rename function
data.rename(index = {'Ohio':'SanF'}, columns={'one':'one_p','two':'two_p'},inplace=True)
data



In [22]:

    
#You can also use string functions
data.rename(index = str.upper, columns=str.title,inplace=True)
data



In [23]:

    
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
#categorize(bin) continuous variables



In [24]:

    
#Understand the output - '(' means the value is included in the bin, '[' means the value is excluded
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats









    Out[24]:





[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]



In [25]:

    
#To include the right bin value, we can do:
pd.cut(ages,bins,right=False)









    Out[25]:





[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, object): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]



In [26]:

    
#pandas library intrinsically assigns an encoding to categorical variables.
cats.labels









    



/Users/shams/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: 'labels' is deprecated. Use 'codes' instead
  from ipykernel import kernelapp as app






    Out[26]:





array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)



In [27]:

    
#Let's check how many observations fall under each bin
pd.value_counts(cats)









    Out[27]:





(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64



In [28]:

    
bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
new_cats = pd.cut(ages, bins,labels=bin_names)

pd.value_counts(new_cats)









    Out[28]:





Youth         5
MiddleAge     3
YoungAdult    3
Senior        1
dtype: int64



In [29]:

    
pd.value_counts(new_cats).cumsum()
#cumulative sum









    Out[29]:





Youth          5
MiddleAge      8
YoungAdult    11
Senior        12
dtype: int64



In [30]:

    
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df



In [31]:

    
#calculate the mean of data1 column by key1
grouped = df['data1'].groupby(df['key1'])
grouped.mean()









    Out[31]:





key1
a   -0.239242
b    0.473060
Name: data1, dtype: float64



In [32]:

    
#slice data
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df



In [33]:

    
#get first n rows from the data frame
df[:3]



In [34]:

    
#slice based on date range
df['20130101':'20130104']



In [35]:

    
#slicing based on column names
df.loc[:,['A','B']]



In [36]:

    
#slicing based on both row index labels and column names
df.loc['20130102':'20130103',['A','B']]



In [37]:

    
#slicing based on index of columns
df.iloc[3] #returns 4th row (index is 3rd)









    Out[37]:





A   -1.194762
B    0.227675
C    0.989805
D   -0.360080
Name: 2013-01-04 00:00:00, dtype: float64



In [38]:

    
#returns a specific range of rows
df.iloc[2:4, 0:2]



In [39]:

    
#returns specific rows and columns using lists containing columns or row indexes
df.iloc[[1,5],[0,2]]



In [40]:

    
#Boolean indexing based on column values
df[df.A > 0]



In [41]:

    
#we can copy the data set
df2 = df.copy()
df2['E']=['one', 'one','two','three','four','three']
df2



In [42]:

    
#select rows based on column values
df2[df2['E'].isin(['two','four'])]



In [43]:

    
#select all rows except those with two and four
df2[~df2['E'].isin(['two','four'])]



In [44]:

    
#list all columns where A is greater than C
df.query('A > C')



In [45]:

    
#using OR condition
df.query('A < B | C > A')



In [46]:

    
#create a data frame
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data



In [47]:

    
data.pivot_table(values='ounces',index='group',aggfunc=np.mean)









    Out[47]:





group
a    6.333333
b    7.166667
c    4.666667
Name: ounces, dtype: float64



In [48]:

    
#calculate count by each group
data.pivot_table(values='ounces',index='group',aggfunc='count')









    Out[48]:





group
a    3
b    3
c    3
Name: ounces, dtype: int64

	Rank
count	5.000000
mean	80.400000
std	52.300096
min	11.000000
25%	40.000000
50%	100.000000
75%	121.000000
max	130.000000

	One_P	Two_P	Three	Four
SANF	0	1	2	3
COLORADO	4	5	6	7
NEW YORK	8	9	10	11

	data1	data2	key1	key2
0	-0.477818	-0.652115	a	one
1	0.761973	-1.057828	a	two
2	0.495193	-1.623535	b	one
3	0.450927	-1.283722	b	two
4	-1.001882	-0.469757	a	one

	A	B	C	D
2013-01-01	-0.026436	-0.544089	0.193653	-1.762786
2013-01-02	-0.076448	-0.854257	-0.836730	0.012145
2013-01-03	0.362770	0.240102	0.842131	-0.216190
2013-01-04	-1.194762	0.227675	0.989805	-0.360080
2013-01-05	-0.513112	-0.655481	1.411907	-0.242742
2013-01-06	-0.107620	0.980511	0.155310	-1.356867

	A	B	C	D
2013-01-01	-0.026436	-0.544089	0.193653	-1.762786
2013-01-02	-0.076448	-0.854257	-0.836730	0.012145
2013-01-03	0.362770	0.240102	0.842131	-0.216190

	food	ounces
0	bacon	4.0
1	pulled pork	3.0
2	bacon	12.0
3	Pastrami	6.0
4	corned beef	7.5
5	Bacon	8.0
6	pastrami	3.0
7	honey ham	5.0
8	nova lox	6.0

	food	ounces	animal	animal2	new_variable
0	bacon	4.0	pig	pig	40.0
1	pulled pork	3.0	pig	pig	30.0
2	bacon	12.0	pig	pig	120.0
3	pastrami	6.0	cow	cow	60.0
4	corned beef	7.5	cow	cow	75.0
5	bacon	8.0	pig	pig	80.0
6	pastrami	3.0	cow	cow	30.0
7	honey ham	5.0	pig	pig	50.0
8	nova lox	6.0	salmon	salmon	60.0