In [1]:
import pandas as pd
import numpy as np

In [7]:
# TODO:
# Load up the dataset, setting correct header labels.
df = pd.read_csv('Datasets/census.data', 
                 header = None, 
                 names=['education', 'age', 'capital-gain', 'race', 'capital-loss', 'hours-per-week', 'sex', 'classification'])
df


Out[7]:
education age capital-gain race capital-loss hours-per-week sex classification
0 Bachelors 39 2174 White 0 40 Male <=50K
1 Bachelors 50 ? White 0 13 Male <=50K
2 HS-grad 38 ? White 0 40 Male <=50K
3 11th 53 ? Black 0 40 Male <=50K
4 Bachelors 28 0 Black 0 40 Female <=50K
5 Masters 37 0 White 0 40 Female <=50K
6 9th 49 0 Black 0 16 Female <=50K
7 HS-grad 52 0 White 0 45 Male >50K
8 Masters 31 14084 White 0 50 Female >50K
9 Bachelors 42 5178 White 0 40 Male >50K
10 Some-college 37 0 Black 0 80 Male >50K
11 Bachelors 30 0 Asian-Pac-Islander 0 40 Male >50K
12 Bachelors 23 0 White 0 30 Female <=50K
13 7th-8th 34 0 Amer-Indian-Eskimo 0 45 Male <=50K
14 HS-grad 25 0 White 0 35 Male <=50K
15 HS-grad 32 0 White 0 40 Male <=50K
16 11th 38 0 White 0 50 Male <=50K
17 Masters 43 0 White 0 45 Female >50K
18 Doctorate 40 0 White 0 60 Male >50K
19 HS-grad 54 0 Black 0 20 Female <=50K
20 9th 35 0 Black 0 40 Male <=50K
21 11th 43 0 White 2042 40 Male <=50K
22 HS-grad 59 0 White 0 40 Female <=50K
23 Bachelors 56 0 White 0 40 Male >50K
24 HS-grad 19 0 White 0 40 Male <=50K
25 Some-college 54 0 Asian-Pac-Islander 0 60 Male >50K
26 HS-grad 39 0 White 0 80 Male <=50K
27 HS-grad 49 0 White 0 40 Male <=50K
28 Some-college 20 0 Black 0 44 Male <=50K
29 Bachelors 45 0 White 1408 40 Male <=50K
... ... ... ... ... ... ... ... ...
29506 HS-grad 30 0 White 0 55 Male <=50K
29507 10th 32 0 White 0 40 Male <=50K
29508 Some-college 22 0 White 0 40 Male <=50K
29509 HS-grad 31 0 White 0 40 Female <=50K
29510 HS-grad 29 0 White 0 35 Female <=50K
29511 Bachelors 35 0 White 0 55 Female >50K
29512 Bachelors 30 0 Asian-Pac-Islander 0 99 Female <=50K
29513 Doctorate 34 0 White 0 60 Male >50K
29514 Bachelors 54 0 Asian-Pac-Islander 0 50 Male >50K
29515 Some-college 37 0 White 0 39 Female <=50K
29516 12th 22 0 Black 0 35 Male <=50K
29517 Bachelors 34 0 White 0 55 Female >50K
29518 HS-grad 30 0 Black 0 46 Male <=50K
29519 Bachelors 38 15020 Black 0 45 Female >50K
29520 Doctorate 71 0 White 0 10 Male >50K
29521 HS-grad 45 0 White 0 40 Female <=50K
29522 HS-grad 41 0 Black 0 32 Female <=50K
29523 HS-grad 72 0 White 0 25 Male <=50K
29524 Masters 31 0 Other 0 30 Female <=50K
29525 HS-grad 43 0 White 0 40 Male <=50K
29526 Some-college 43 0 White 0 40 Female <=50K
29527 Some-college 43 0 White 0 50 Male <=50K
29528 10th 32 0 Amer-Indian-Eskimo 0 40 Male <=50K
29529 Masters 32 0 Asian-Pac-Islander 0 11 Male <=50K
29530 Masters 53 0 White 0 40 Male >50K
29531 Some-college 22 0 White 0 40 Male <=50K
29532 HS-grad 40 0 White 0 40 Male >50K
29533 HS-grad 58 0 White 0 40 Female <=50K
29534 HS-grad 22 0 White 0 20 Male <=50K
29535 HS-grad 52 15024 White 0 40 Female >50K

29536 rows × 8 columns


In [9]:
df.shape


Out[9]:
(29536, 8)

In [8]:
# TODO:
# Use basic pandas commands to look through the dataset... get a
# feel for it before proceeding! Do the data-types of each column
# reflect the values you see when you look through the data using
# a text editor / spread sheet program? If you see 'object' where
# you expect to see 'int32' / 'float64', that is a good indicator
# that there is probably a string or missing value in a column.
# use `your_data_frame['your_column'].unique()` to see the unique
# values of each column and identify the rogue values. If these
# should be represented as nans, you can convert them using
# na_values when loading the dataframe.
df.dtypes


Out[8]:
education         object
age                int64
capital-gain      object
race              object
capital-loss       int64
hours-per-week     int64
sex               object
classification    object
dtype: object

In [ ]:
# TODO:
# Look through your data and identify any potential categorical
# features. Ensure you properly encode any ordinal and nominal
# types using the methods discussed in the chapter.
#
# Be careful! Some features can be represented as either categorical
# or continuous (numerical). If you ever get confused, think to yourself
# what makes more sense generally---to represent such features with a
# continuous numeric type... or a series of categories?

In [ ]:
#
# TODO:
# Print out your dataframe

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: