In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",sep=',\s', na_values=["?"])
df.columns = ("age", "type_employer", "fnlwgt", "education", 
                "education_num","marital", "occupation", "relationship", "race","sex",
                "capital_gain", "capital_loss", "hr_per_week","country", "income")
del df['fnlwgt']

df.dropna(how='any',inplace=True)
df1= df.drop_duplicates(keep="first")
df1.isnull().sum()
df1.type_employer.value_counts()


C:\anaconda\lib\site-packages\ipykernel\__main__.py:5: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
Out[1]:
Private             19214
Self-emp-not-inc     2431
Local-gov            2014
State-gov            1252
Self-emp-inc         1049
Federal-gov           929
Without-pay            14
Name: type_employer, dtype: int64

In [2]:
df1.replace(['Local-gov','State-gov','Federal-gov'],['Gov','Gov','Gov'],inplace=True)


C:\anaconda\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [3]:
df1.replace(['Self-emp-not-inc','Self-emp-inc'],['self-emp','self-emp'],inplace=True)


C:\anaconda\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [4]:
df1.replace(['Prof-specialty','Adm-clerical','Other-service','Priv-house-serv','Craft-repair','Machine-op-inspct','Transport-moving','Handlers-cleaners','Farming-fishing'],
            ['Professional','Admin','Service','Service','Blue-Collar','Blue-Collar','Blue-Collar','Blue-Collar','Blue-Collar'],inplace=True)


C:\anaconda\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [5]:
del df1['capital_gain']
del df1['capital_loss']

In [6]:
import random
from numba import jit
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')
plt.rcParams['figure.figsize'] = (10, 7)

In [7]:
import random

def sample(df1, n):
    reservoirs = []
    for t, row in df1.iterrows():
        if t < n:
            reservoirs.append(row)
        else:
            m = random.randint(0,t)
            if m < n:
                reservoirs[m] = row
                t=t+1
    return reservoirs

In [8]:
df2=pd.DataFrame(sample(df1,10))
df2


Out[8]:
age type_employer education education_num marital occupation relationship race sex hr_per_week country income
5699 49 Private 7th-8th 4 Married-civ-spouse Blue-Collar Husband White Male 48 United-States <=50K
21093 23 Private 12th 8 Never-married Blue-Collar Own-child White Female 40 United-States <=50K
18111 46 Gov Some-college 10 Married-civ-spouse Exec-managerial Husband White Male 48 United-States >50K
281 34 Private Assoc-acdm 12 Divorced Sales Unmarried Black Female 45 United-States <=50K
13470 22 Private Some-college 10 Never-married Service Not-in-family White Female 50 United-States <=50K
24050 23 Private HS-grad 9 Never-married Sales Own-child White Male 30 United-States <=50K
15434 47 Private Bachelors 13 Married-civ-spouse Sales Husband White Male 60 United-States >50K
19458 55 Private HS-grad 9 Married-civ-spouse Blue-Collar Husband White Male 40 United-States >50K
10432 27 Private HS-grad 9 Never-married Service Not-in-family White Female 52 United-States <=50K
13908 37 self-emp HS-grad 9 Divorced Blue-Collar Not-in-family White Male 50 United-States <=50K

In [ ]: