In [1]:
import pandas as pd
import numpy as np
In [7]:
# TODO:
# Load up the dataset, setting correct header labels.
#
# header=None tells pandas not to take column labels from the file;
# the labels are supplied explicitly through `names` instead.
df = pd.read_csv(
    'Datasets/census.data',
    names=[
        'education',
        'age',
        'capital-gain',
        'race',
        'capital-loss',
        'hours-per-week',
        'sex',
        'classification',
    ],
    header=None,
)
df
Out[7]:
In [9]:
# Report the frame's dimensions as a (rows, columns) tuple.
df.shape
Out[9]:
In [8]:
# TODO:
# Use basic pandas commands to look through the dataset and get a
# feel for it before proceeding. Do the data types of each column
# reflect the values you see when you look through the data using
# a text editor / spreadsheet program? If you see 'object' where
# you expect to see 'int32' / 'float64', that is a good indicator
# that there is probably a string or missing value in a column.
# Use `your_data_frame['your_column'].unique()` to see the unique
# values of each column and identify the rogue values. If these
# should be represented as NaNs, you can convert them using
# na_values when loading the dataframe.
#
# Show the dtype pandas assigned to each column.
df.dtypes
Out[8]:
In [ ]:
# TODO:
# Look through your data and identify any potential categorical
# features. Ensure you properly encode any ordinal and nominal
# types using the methods discussed in the chapter.
#
# Be careful! Some features can be represented as either categorical
# or continuous (numerical). If you are ever unsure, ask yourself which
# makes more sense in general: representing the feature with a
# continuous numeric type, or as a series of categories?
In [ ]:
#
# TODO:
# Print out your dataframe
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: