In [1]:
# Show the version of the Python interpreter running this notebook.
from platform import python_version
python_version()


Out[1]:
'3.6.7'

In [27]:
import sys

# Path of the Python interpreter binary running this notebook.
sys.executable


Out[27]:
'/home/felipe/venv36/bin/python3.6'

In [2]:
import pandas as pd
import numpy as np

# Show the installed pandas and numpy versions.
pd.__version__, np.__version__


Out[2]:
('0.24.2', '1.16.3')

In [3]:
# Small example frame; `columns=` pins the column order explicitly.
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
    },
    columns=['name', 'age'],
)
df


Out[3]:
name age
0 alice 25
1 bob 26
2 charlie 27

rename columns


In [4]:
# rename() returns a new frame; `df` itself keeps its original labels.
single_rename = {'name': 'person_name'}
df.rename(columns=single_rename)


Out[4]:
person_name age
0 alice 25
1 bob 26
2 charlie 27

In [5]:
# Several columns can be renamed at once by listing more old -> new pairs.
multiple_renames = {
    'name': 'person_name',
    'age': 'age_in_years',
}
df.rename(columns=multiple_renames)


Out[5]:
person_name age_in_years
0 alice 25
1 bob 26
2 charlie 27

apply function to column names


In [6]:
# Display the current frame before its column names are transformed.
df


Out[6]:
name age
0 alice 25
1 bob 26
2 charlie 27

In [7]:
# Work on a copy so the original frame keeps its lowercase column names.
df2 = df.copy()
df2.columns = list(map(str.upper, df2.columns))
df2


Out[7]:
NAME AGE
0 alice 25
1 bob 26
2 charlie 27

apply function to column values


In [8]:
# Upper-case every value of the 'name' column, writing into a copy of df.
df2 = df.copy()
uppercased_names = df['name'].map(lambda value: value.upper())
df2['name'] = uppercased_names
df2


Out[8]:
name age
0 ALICE 25
1 BOB 26
2 CHARLIE 27

create derived column


In [9]:
# Derive a new column by mapping a function over each age value.
df2 = df.copy()
doubled_ages = df['age'].map(lambda years: years * 2)
df2['age_times_two'] = doubled_ages
df2


Out[9]:
name age age_times_two
0 alice 25 50
1 bob 26 52
2 charlie 27 54

In [10]:
# Same derived column, computed with whole-column arithmetic instead of map().
df2 = df.copy()
df2['age_times_two'] = df['age'].mul(2)
df2


Out[10]:
name age age_times_two
0 alice 25 50
1 bob 26 52
2 charlie 27 54

In [11]:
# Keep the original 'name' column and add an upper-cased variant beside it.
df2 = df.copy()
names_in_caps = df['name'].map(lambda value: value.upper())
df2['name_uppercase'] = names_in_caps
df2


Out[11]:
name age name_uppercase
0 alice 25 ALICE
1 bob 26 BOB
2 charlie 27 CHARLIE

number of NaN in column


In [12]:
import pandas as pd

# Both np.nan and None are counted as missing by isnull().
df = pd.DataFrame({
    'name': ['alice', 'bob', 'charlie'],
    'age': [25, 26, np.nan],
    'state': ['ak', np.nan, None],
})

for column_name in ('name', 'age', 'state'):
    missing_count = df[column_name].isnull().sum()
    print(missing_count)


0
1
2

column names


In [13]:
import pandas as pd

df = pd.DataFrame(
    data={
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
        'state': ['ak', 'ny', 'dc'],
    }
)

# .values exposes the column labels as a NumPy array.
column_labels = df.columns.values
print(column_labels)


['name' 'age' 'state']

number of columns


In [14]:
import pandas as pd

df = pd.DataFrame(
    data={
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
        'state': ['ak', 'ny', 'dc'],
    }
)

# shape is (rows, columns), so index 1 is the number of columns.
print(df.shape[1])


3

change column order


In [15]:
import pandas as pd

# Three-column example frame reused by the cells that follow.
df = pd.DataFrame(
    data={
        'name': ['alice', 'bob', 'charlie'],
        'age': [25, 26, 27],
        'state': ['ak', 'ny', 'dc'],
    }
)
df


Out[15]:
name age state
0 alice 25 ak
1 bob 26 ny
2 charlie 27 dc

In [16]:
# Selecting with a list of labels returns the columns in the listed order.
desired_order = ['name', 'age', 'state']
df2 = df.loc[:, desired_order]
df2


Out[16]:
name age state
0 alice 25 ak
1 bob 26 ny
2 charlie 27 dc

dropping columns


In [17]:
# drop() returns a new frame, so `df` is untouched without an explicit copy.
df2 = df.drop(columns=['age'])
df2


Out[17]:
name state
0 alice ak
1 bob ny
2 charlie dc

In [18]:
# Several columns can be dropped in one call by listing all their labels.
columns_to_remove = ['age', 'name']
df2 = df.drop(columns=columns_to_remove)
df2


Out[18]:
state
0 ak
1 ny
2 dc

append new column


In [19]:
states = pd.Series(data=['dc', 'ca', 'ny'])

# assign() returns a new frame with the column set; a label that already
# exists in `df` has its values replaced rather than duplicated.
df2 = df.assign(state=states)

df2


Out[19]:
name age state
0 alice 25 dc
1 bob 26 ca
2 charlie 27 ny

check if column in dataframe


In [20]:
df2 = df.copy()

candidate_names = ['name', 'gender', 'age']

# Report only the candidates that are actual column labels of df2.
existing_columns = df2.columns.values
for name in candidate_names:
    if name not in existing_columns:
        continue
    print('"{}" is a column name'.format(name))


"name" is a column name
"age" is a column name

insert column at specific index


In [21]:
df2 = df.copy()

col = pd.Series(data=['female', 'male', 'male'])

# Position 1 places the new column immediately after the first column.
df2.insert(loc=1, column='gender', value=col)
df2


Out[21]:
name gender age state
0 alice female 25 ak
1 bob male 26 ny
2 charlie male 27 dc

astype


In [22]:
import numpy as np

df2 = df.copy()

# Print the dtype before any conversion, then after each successive cast.
print(df2['age'].dtype)

for target_type in (str, np.uint8):
    df2['age'] = df2['age'].astype(target_type)
    print(df2['age'].dtype)


int64
object
uint8

to datetime


In [23]:
# Dates are stored as plain strings in month/day/year order.
df3 = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie'],
        'date_of_birth': ['10/25/2005', '10/29/2002', '01/01/2001'],
    },
    columns=['name', 'date_of_birth'],
)
df3


Out[23]:
name date_of_birth
0 alice 10/25/2005
1 bob 10/29/2002
2 charlie 01/01/2001

In [24]:
# No explicit format is passed here, so pandas infers it from the strings.
raw_birth_dates = df3['date_of_birth']
df3['date_of_birth'] = pd.to_datetime(raw_birth_dates)
df3


Out[24]:
name date_of_birth
0 alice 2005-10-25
1 bob 2002-10-29
2 charlie 2001-01-01

In [25]:
# Same layout, but the date strings are now in day/month/year order.
df3 = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie'],
        'date_of_birth': ['27/05/2001', '16/02/1999', '25/09/1998'],
    },
    columns=['name', 'date_of_birth'],
)
df3


Out[25]:
name date_of_birth
0 alice 27/05/2001
1 bob 16/02/1999
2 charlie 25/09/1998

In [26]:
# Rebuild the day-first frame so this cell can be re-run on its own.
df3 = pd.DataFrame({
    'name': ['alice', 'bob', 'charlie'],
    'date_of_birth': ['27/05/2001', '16/02/1999', '25/09/1998']
})[['name', 'date_of_birth']]

# An explicit day/month/year format removes any ambiguity when parsing.
df3['date_of_birth'] = pd.to_datetime(df3['date_of_birth'], format='%d/%m/%Y')
df3


Out[26]:
name date_of_birth
0 alice 2001-05-27
1 bob 1999-02-16
2 charlie 1998-09-25

map example


In [48]:
# Four-row frame used for the map() example.
df4 = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie', 'david'],
        'age': [25, 26, 27, 22],
    },
    columns=['name', 'age'],
)
df4


Out[48]:
name age
0 alice 25
1 bob 26
2 charlie 27
3 david 22

In [49]:
# Series.map calls the function once for each element of the column.
names_in_caps = df4['name'].map(lambda value: value.upper())
df4['name_uppercase'] = names_in_caps
df4


Out[49]:
name age name_uppercase
0 alice 25 ALICE
1 bob 26 BOB
2 charlie 27 CHARLIE
3 david 22 DAVID

apply example

Series.apply


In [51]:
# Four-row frame used for the Series.apply example.
df5 = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie', 'david'],
        'age': [25, 26, 27, 22],
    },
    columns=['name', 'age'],
)
df5


Out[51]:
name age
0 alice 25
1 bob 26
2 charlie 27
3 david 22

In [52]:
# Series.apply with a scalar function works element by element, like map().
names_in_caps = df5['name'].apply(lambda value: value.upper())
df5['name_uppercase'] = names_in_caps
df5


Out[52]:
name age name_uppercase
0 alice 25 ALICE
1 bob 26 BOB
2 charlie 27 CHARLIE
3 david 22 DAVID

In [64]:
# Four-row frame used for the DataFrame.apply example.
df6 = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie', 'david'],
        'age': [25, 26, 27, 22],
    },
    columns=['name', 'age'],
)
df6


Out[64]:
name age
0 alice 25
1 bob 26
2 charlie 27
3 david 22

In [66]:
# The double brackets select a one-column DataFrame, so apply() hands the
# function the whole 'age' column at once rather than single values.
age_frame = df6[['age']]
df6['age_times_2'] = age_frame.apply(lambda column: np.multiply(column, 2))
df6


Out[66]:
name age age_times_2
0 alice 25 50
1 bob 26 52
2 charlie 27 54
3 david 22 44

apply vs map


In [68]:
# Three-row frame used to compare apply() and map().
df7 = pd.DataFrame({
    'name': ['alice','bob','charlie'],
    'age': [25,26,27],
    'number_of_children':[1,3,4]
})[['name','age','number_of_children']]
# Display the frame that was just built (the cell previously showed `df4`).
df7


Out[68]:
name age number_of_children
0 alice 25 1
1 bob 26 3
2 charlie 27 4

In [41]:
# returns a new dataframe with a single column
# (operates on df7, the three-row frame this section is about, not df4)
df7[['age']].apply(lambda arr: arr*2)


Out[41]:
age
0 50
1 52
2 54

In [39]:
# returns a new series
# (operates on df7, the three-row frame this section is about, not df4)
df7['age'].map(lambda element: element *2)


Out[39]:
0    50
1    52
2    54
Name: age, dtype: int64

In [ ]:


In [71]:
# Draw from a seeded generator so the sample is reproducible across kernel
# restarts. RandomState is used because it exists in older numpy releases.
SEED = 42
N_SAMPLES = 10000000

rng = np.random.RandomState(SEED)
df_numeric = pd.DataFrame({
    'x': rng.normal(loc=0.0, scale=1.0, size=N_SAMPLES),
})
df_numeric.head()


Out[71]:
x
0 2.126701
1 0.428444
2 -0.084118
3 0.324723
4 1.699269

In [78]:
%%time

def multiply_by_two(arr):
    """Return ``arr`` doubled element-wise via ``np.multiply``."""
    doubled = np.multiply(arr, 2)
    return doubled
    
# The one-column frame is passed to apply(), which calls the helper on the
# whole column at once.
x_frame = df_numeric[['x']]
df_numeric['2x_apply'] = x_frame.apply(multiply_by_two)


CPU times: user 80 ms, sys: 120 ms, total: 200 ms
Wall time: 198 ms

In [79]:
%%time
def multiply_by_two_map(x):
    """Return ``x`` multiplied by two."""
    result = x * 2
    return result
    
# Use the scalar helper defined in this cell (the array helper
# `multiply_by_two` was passed here by mistake); Series.map calls it once
# per element.
df_numeric['2x_map'] = df_numeric['x'].map(multiply_by_two_map)


CPU times: user 13.8 s, sys: 284 ms, total: 14.1 s
Wall time: 14.1 s

In [80]:
# Show the first rows to check that both derived columns hold the same values.
df_numeric.head()


Out[80]:
x 2x_apply 2x_map
0 2.126701 4.253402 4.253402
1 0.428444 0.856887 0.856887
2 -0.084118 -0.168235 -0.168235
3 0.324723 0.649445 0.649445
4 1.699269 3.398538 3.398538