notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib as plt

df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",sep=',\s', na_values=["?"])
df.columns = ("age", "type_employer", "fnlwgt", "education", 
                "education_num","marital", "occupation", "relationship", "race","sex",
                "capital_gain", "capital_loss", "hr_per_week","country", "income")
del df['fnlwgt']

df.dropna(how='any',inplace=True)
df1= df.drop_duplicates(keep="first")
df1.isnull().sum()
df1.type_employer.value_counts()









    



C:\anaconda\lib\site-packages\ipykernel\__main__.py:5: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.






    Out[1]:





Private             19214
Self-emp-not-inc     2431
Local-gov            2014
State-gov            1252
Self-emp-inc         1049
Federal-gov           929
Without-pay            14
Name: type_employer, dtype: int64



In [2]:

    
df1.replace(['Local-gov','State-gov','Federal-gov'],['Gov','Gov','Gov'],inplace=True)









    



C:\anaconda\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [3]:

    
df1.replace(['Self-emp-not-inc','Self-emp-inc'],['self-emp','self-emp'],inplace=True)









    



C:\anaconda\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [4]:

    
df1.replace(['Prof-specialty','Adm-clerical','Other-service','Priv-house-serv','Craft-repair','Machine-op-inspct','Transport-moving','Handlers-cleaners','Farming-fishing'],
            ['Professional','Admin','Service','Service','Blue-Collar','Blue-Collar','Blue-Collar','Blue-Collar','Blue-Collar'],inplace=True)









    



C:\anaconda\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app



In [5]:

    
del df1['capital_gain']
del df1['capital_loss']



In [6]:

    
import random
from numba import jit
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')
plt.rcParams['figure.figsize'] = (10, 7)



In [7]:

    
import random

def sample(df1, n):
    reservoirs = []
    for t, row in df1.iterrows():
        if t < n:
            reservoirs.append(row)
        else:
            m = random.randint(0,t)
            if m < n:
                reservoirs[m] = row
                t=t+1
    return reservoirs



In [8]:

    
df2=pd.DataFrame(sample(df1,10))
df2









    Out[8]:






  
    
      
      age
      type_employer
      education
      education_num
      marital
      occupation
      relationship
      race
      sex
      hr_per_week
      country
      income
    
  
  
    
      5699
      49
      Private
      7th-8th
      4
      Married-civ-spouse
      Blue-Collar
      Husband
      White
      Male
      48
      United-States
      <=50K
    
    
      21093
      23
      Private
      12th
      8
      Never-married
      Blue-Collar
      Own-child
      White
      Female
      40
      United-States
      <=50K
    
    
      18111
      46
      Gov
      Some-college
      10
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      48
      United-States
      >50K
    
    
      281
      34
      Private
      Assoc-acdm
      12
      Divorced
      Sales
      Unmarried
      Black
      Female
      45
      United-States
      <=50K
    
    
      13470
      22
      Private
      Some-college
      10
      Never-married
      Service
      Not-in-family
      White
      Female
      50
      United-States
      <=50K
    
    
      24050
      23
      Private
      HS-grad
      9
      Never-married
      Sales
      Own-child
      White
      Male
      30
      United-States
      <=50K
    
    
      15434
      47
      Private
      Bachelors
      13
      Married-civ-spouse
      Sales
      Husband
      White
      Male
      60
      United-States
      >50K
    
    
      19458
      55
      Private
      HS-grad
      9
      Married-civ-spouse
      Blue-Collar
      Husband
      White
      Male
      40
      United-States
      >50K
    
    
      10432
      27
      Private
      HS-grad
      9
      Never-married
      Service
      Not-in-family
      White
      Female
      52
      United-States
      <=50K
    
    
      13908
      37
      self-emp
      HS-grad
      9
      Divorced
      Blue-Collar
      Not-in-family
      White
      Male
      50
      United-States
      <=50K



In [ ]:

	age	type_employer	education	education_num	marital	occupation	relationship	race	sex	hr_per_week	country	income
5699	49	Private	7th-8th	4	Married-civ-spouse	Blue-Collar	Husband	White	Male	48	United-States	<=50K
21093	23	Private	12th	8	Never-married	Blue-Collar	Own-child	White	Female	40	United-States	<=50K
18111	46	Gov	Some-college	10	Married-civ-spouse	Exec-managerial	Husband	White	Male	48	United-States	>50K
281	34	Private	Assoc-acdm	12	Divorced	Sales	Unmarried	Black	Female	45	United-States	<=50K
13470	22	Private	Some-college	10	Never-married	Service	Not-in-family	White	Female	50	United-States	<=50K
24050	23	Private	HS-grad	9	Never-married	Sales	Own-child	White	Male	30	United-States	<=50K
15434	47	Private	Bachelors	13	Married-civ-spouse	Sales	Husband	White	Male	60	United-States	>50K
19458	55	Private	HS-grad	9	Married-civ-spouse	Blue-Collar	Husband	White	Male	40	United-States	>50K
10432	27	Private	HS-grad	9	Never-married	Service	Not-in-family	White	Female	52	United-States	<=50K
13908	37	self-emp	HS-grad	9	Divorced	Blue-Collar	Not-in-family	White	Male	50	United-States	<=50K