In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
# Load the UCI "Adult" census extract.
# The raw file has no header row, so header=None + names= are required;
# without them pandas silently consumes the first data record as column names.
# Fields are separated by ", "; skipinitialspace strips the blank after each
# comma so the fast C parser can be used and "?" is recognised as missing.
df = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "age", "type_employer", "fnlwgt", "education",
        "education_num", "marital", "occupation", "relationship", "race", "sex",
        "capital_gain", "capital_loss", "hr_per_week", "country", "income",
    ],
    sep=",",
    skipinitialspace=True,
    na_values=["?"],
)

# Drop the sampling-weight column and every row that has a missing value.
# Re-assignment (instead of inplace=True) keeps this cell safe to re-run.
df = df.drop(columns="fnlwgt").dropna(how="any")

# De-duplicate, keeping the first occurrence. The explicit copy gives later
# cells an independent frame to modify in place.
df1 = df.drop_duplicates(keep="first").copy()

# Sanity check: no missing values may survive the cleaning above.
assert not df1.isnull().values.any()

# Displayed cell output: number of rows per employer type.
df1.type_employer.value_counts()
Out[1]:
In [2]:
# Collapse the three government employer labels into a single 'Gov' label.
# The replacement is applied to every column of df1, modifying it in place
# (presumably only type_employer contains these values -- verify).
df1.replace(
    to_replace=['Local-gov', 'State-gov', 'Federal-gov'],
    value=['Gov'] * 3,
    inplace=True,
)
In [3]:
# Merge both self-employment labels into a single 'self-emp' label.
# The replacement is applied to every column of df1, modifying it in place.
df1.replace(
    to_replace=['Self-emp-not-inc', 'Self-emp-inc'],
    value=['self-emp'] * 2,
    inplace=True,
)
In [4]:
# Group fine-grained job labels into broader buckets. The two lists are
# positionally paired: label i of to_replace becomes label i of value.
# Labels not listed here are left untouched. Applied to every column, in place.
df1.replace(
    to_replace=[
        'Prof-specialty',
        'Adm-clerical',
        'Other-service', 'Priv-house-serv',
        'Craft-repair', 'Machine-op-inspct', 'Transport-moving',
        'Handlers-cleaners', 'Farming-fishing',
    ],
    value=['Professional', 'Admin'] + ['Service'] * 2 + ['Blue-Collar'] * 5,
    inplace=True,
)
In [5]:
# Remove both capital columns from df1 in place; a KeyError is raised if
# either column is already gone (e.g. when this cell is run a second time).
del df1['capital_gain'], df1['capital_loss']
In [6]:
import random
from numba import jit
import matplotlib.pyplot as plt
%matplotlib inline
# Apply the 'bmh' style sheet first, then override the default figure size
# (width, height in inches) so the size is not reset by the style.
plt.style.use('bmh')
plt.rc('figure', figsize=(10, 7))
In [7]:
import random
def sample(df1, n):
    """Draw up to ``n`` rows from ``df1`` with single-pass reservoir sampling.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to sample from. Its index may be of any type and may have gaps.
    n : int
        Reservoir size, i.e. the maximum number of rows to return.

    Returns
    -------
    list of pandas.Series
        ``min(n, len(df1))`` rows as yielded by ``DataFrame.iterrows``.
        An empty list is returned when ``n <= 0`` or the frame is empty.
    """
    reservoirs = []
    # Count rows by position with enumerate(). The index label yielded by
    # iterrows() is not a row counter: on a frame whose index has gaps (e.g.
    # after dropna/drop_duplicates) it left the reservoir under-filled and
    # made the replacement below raise IndexError.
    for t, (_, row) in enumerate(df1.iterrows()):
        if t < n:
            # Fill phase: the first n rows always enter the reservoir.
            reservoirs.append(row)
        else:
            # Row number t (0-based) replaces a random slot with
            # probability n / (t + 1); randint is inclusive on both ends.
            m = random.randint(0, t)
            if m < n:
                reservoirs[m] = row
    return reservoirs
In [8]:
# Seed the standard-library RNG immediately before sampling so that
# re-running this cell (or the whole notebook) reproduces the same sample.
random.seed(42)

# Assemble the 10 sampled rows into a DataFrame and display it.
df2 = pd.DataFrame(sample(df1, 10))
df2
Out[8]:
In [ ]: