In [45]:
from __future__ import division  # make `/` true division on Python 2
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
# Seed NumPy's global random generator so the random examples are repeatable.
np.random.seed(12345)
# Default size for every matplotlib figure created in this notebook.
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas
import pandas as pd
# Print NumPy arrays with 4 decimal places; summarise arrays above 500 elements.
np.set_printoptions(precision=4, threshold=500)
# Display at most 100 rows of a pandas object before truncating the output.
pd.options.display.max_rows = 100
In [46]:
# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline
In [47]:
# Employee table: one row per employee plus the ID of their department.
# The last employee carries the placeholder string 'N/A' instead of a numeric ID.
employee_data = DataFrame(dict(
    employee_name=['Rafferty', 'Jones', 'Steinberg', 'Robinson', 'Smith', 'Jasper'],
    DepartmentID=[31, 33, 33, 34, 34, 'N/A'],
))
# Department lookup table keyed by DepartmentID; ID 35 has no employee above.
department_data = DataFrame(dict(
    DepartmentID=[31, 33, 34, 35],
    DepartmentName=['Sales', 'Engineering', 'Admin', 'Marketing'],
))
employee_data
Out[47]:
In [48]:
# Display the department lookup table.
department_data
Out[48]:
In [49]:
# Inner join (the default): keep only rows whose DepartmentID appears in both tables.
employee_data.merge(department_data, on='DepartmentID')
Out[49]:
In [50]:
# Same employee table, but the key column is named differently from the lookup table's key.
df3 = DataFrame(dict(
    employee_name=['Rafferty', 'Jones', 'Steinberg', 'Robinson', 'Smith', 'Jasper'],
    department_id=[31, 33, 33, 34, 34, 'N/A'],
))
df4 = DataFrame(dict(
    DepartmentID=[31, 33, 34, 35],
    DepartmentName=['Sales', 'Engineering', 'Admin', 'Marketing'],
))
# Join on differently named keys, then keep a single copy of the key column.
(
    df3
    .merge(df4, left_on='department_id', right_on='DepartmentID')
    [['employee_name', 'DepartmentID', 'DepartmentName']]
)
Out[50]:
In [51]:
# Left join: keep every employee row, matched or not.
employee_data.merge(department_data, on='DepartmentID', how='left')
Out[51]:
In [52]:
# Right join: keep every department row, matched or not.
employee_data.merge(department_data, on='DepartmentID', how='right')
Out[52]:
In [53]:
# Full outer join: keep the rows of both tables, matched or not.
employee_data.merge(department_data, on='DepartmentID', how='outer')
Out[53]:
In [54]:
# Small frame containing repeated (k1, k2) pairs for the de-duplication examples.
data = DataFrame(dict(
    k1=['one'] * 3 + ['two'] * 4,
    k2=[1, 1, 2, 3, 3, 4, 4],
))
data
Out[54]:
In [55]:
# Flag each row that repeats an earlier row (the first occurrence is not flagged).
data.duplicated(keep='first')
Out[55]:
In [56]:
# Return a new frame that keeps only the first occurrence of each distinct row.
data.drop_duplicates(keep='first')
Out[56]:
In [57]:
# Add a column v1 holding 0..6 (modifies `data` in place).
data['v1'] = range(7)
# De-duplicate on k1 only: the first row of each k1 value is kept.
data.drop_duplicates(subset=['k1'])
Out[57]:
In [58]:
# Display `data` again; the drop_duplicates calls so far were not in-place,
# so they returned new frames rather than modifying `data`.
data
Out[58]:
In [59]:
# De-duplicate on the (k1, k2) pair, keeping the last occurrence of each pair.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')
Out[59]:
In [60]:
# 6x3 frame with missing values for the dropna/fillna examples.
# Uses the canonical lowercase `np.nan`; the upper-case `np.NAN` alias is not
# available in newer NumPy releases.
data = DataFrame([[1.0, 6.5, 8],
                  [3, np.nan, np.nan],
                  [np.nan, np.nan, np.nan],
                  [np.nan, 6.5, 9],
                  [3, -1, 8],
                  [3, 6.5, 8]])
data
Out[60]:
In [61]:
# Keep only the rows that contain no missing value at all.
clean_data = data.dropna(axis=0, how='any')
clean_data
Out[61]:
In [62]:
def count_missing(x):
    """Return how many entries of ``x`` are missing.

    Parameters
    ----------
    x : object exposing ``isnull()`` (e.g. a pandas Series)
        The values to inspect.

    Returns
    -------
    The number of entries for which ``x.isnull()`` is True.
    """
    # Vectorised count of the boolean mask instead of a Python-level sum().
    return x.isnull().sum()
# Number of rows: the denominator of the missing-value ratio.
total = data.shape[0]
# Missing-value count for every column.
missing_stat = data.apply(count_missing, axis=0)
# Fraction of missing values per column, computed with a single vectorised
# division; float() keeps this a true division on Python 2 as well.
missing_stat / float(total)
Out[62]:
In [63]:
# Return a copy in which every missing value is replaced by the string '?'.
data.fillna('?')
Out[63]:
In [64]:
# Return a copy with a different replacement per column (dict key = column label).
data.fillna({0: 'other', 1: 2, 2: 5})
Out[64]:
In [65]:
# Return a copy in which each column's missing values are replaced by that column's mean.
data.fillna(data.mean())
Out[65]:
In [66]:
# NOTE(review): a cell only displays its last expression, so the median-filled
# frame built on the next line is computed and then discarded; what the cell
# shows is the per-column medians.
data.fillna(data.median())
data.median()
Out[66]:
In [67]:
# DataFrame.mode() returns a frame of modal values; row 0 holds one mode per column.
mode = data.mode()
# Passing that row as a Series fills each column with its own mode, which is
# equivalent to spelling the per-column replacements out as a dict.
data.fillna(mode.iloc[0])
Out[67]:
In [68]:
# Re-seed so the synthetic sample is identical on every run of this cell.
np.random.seed(12345)
# 1000 rows x 4 columns of standard-normal draws.
data = DataFrame(np.random.standard_normal(size=(1000, 4)))
data.describe()
Out[68]:
In [69]:
# Values in column 3 whose magnitude exceeds 3.
col = data[3]
col[col.abs() > 3]
Out[69]:
In [70]:
# Boolean row mask: True for rows where at least one column has |value| > 3.
# The axis is passed by keyword; current pandas no longer accepts it positionally.
outliers = (np.abs(data) > 3).any(axis=1)
data[outliers]
Out[70]:
In [71]:
# Rows in which every column satisfies |value| <= 3 (the complement of the
# outlier rows). The axis is passed by keyword so the call does not depend on
# positional-argument support in the reduction.
data[(np.abs(data) <= 3).all(axis=1)]
Out[71]:
In [72]:
# Cap values outside [-3, 3] at -3 or 3, preserving their sign (modifies `data` in place).
data[data.abs() > 3] = np.sign(data) * 3
data.describe()
Out[72]:
In [73]:
# 5x4 frame holding 0..19, used for the permutation / sampling examples.
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
# Function-call form of print works on both Python 2 and Python 3.
print(len(df))
# Random ordering of the row positions 0..len(df)-1.
sampler = np.random.permutation(len(df))
sampler
Out[73]:
In [74]:
# Display the frame in its original row order.
df
Out[74]:
In [75]:
# Reorder the rows by position according to the permutation.
df.take(sampler, axis=0)
Out[75]:
In [76]:
# Shuffle the row positions and keep the first three: a 3-row sample without replacement.
df.take(np.random.permutation(len(df))[:3], axis=0)
Out[76]:
In [77]:
# Values to draw from.
bag = np.array([5, 7, -1, 6, 4])
# Ten random positions into `bag`; positions may repeat (sampling with replacement).
sampler = np.random.randint(len(bag), size=10)
In [78]:
# Display the sampled positions.
sampler
Out[78]:
In [79]:
# Look up the value stored at each sampled position.
draws = bag[sampler]
draws
Out[79]:
In [80]:
import random
# Four distinct positions into `bag` (sampling without replacement).
# range() is used instead of the Python 2-only xrange() so the cell runs on
# both Python 2 and Python 3.
# NOTE(review): this uses the stdlib RNG, which np.random.seed does not seed,
# so the draws differ between runs.
sampler = random.sample(range(len(bag)), 4)
draws = bag.take(sampler)
draws
Out[80]: