Key point: taking a slice of a DataFrame may return a copy of that slice, so changes to the slice's elements might not propagate back into the original DataFrame.
The code below does not run as-is; it is just an example for now.
In [ ]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
# original array
orig_df
# column 'Age' has some NaN values
# A gaussian is a simple first approximation of the age distribution,
# though real age data is usually not normally distributed.
# Build a vector of random ages centered on the column mean, with the column std as width.
age_mean = orig_df["Age"].mean()
age_std = orig_df["Age"].std()
# number of rows
n = orig_df.shape[0]
# draw standard-normal samples (centered on 0, variance 1.0) and rescale them
# in one step: centered on the mean, width equal to the std
rands = np.random.randn(n) * age_std + age_mean
#--------------------------------
### OR
## use a truncated normal distribution so none of the sampled values fall outside
## the input data's observed range
import scipy.stats as stats
lower, upper = orig_df['Age'].min(), orig_df['Age'].max()
mu, sigma = orig_df["Age"].mean(), orig_df["Age"].std()
# number of rows
n = orig_df.shape[0]
# fixed: was 'traorig_dfin_df' (undefined name); also Python 3 print() calls
print('max: ', orig_df['Age'].max())
print('min: ', orig_df['Age'].min())
# vector of random values using the truncated normal distribution.
# truncnorm's a/b cutoffs are expressed in standard deviations from loc,
# hence the (bound - mu) / sigma rescaling.
X = stats.truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)
rands = X.rvs(n)
# boolean mask of the rows whose 'Age' is a valid (finite, non-NaN) number
# (fixed comment: the mask marks the GOOD rows, not the NaN rows)
idx = np.isfinite(orig_df['Age'])
# keep the original good values: overwrite the random draws at the NON-NaN positions
# with the real ages. Use .loc instead of chained indexing (orig_df[idx]['Age']),
# which returns a copy and is exactly the pitfall the note at the top warns about.
rands[idx.values] = orig_df.loc[idx, 'Age'].values
## At this point rands is the cleaned column of data we wanted, so push it into the original df
orig_df['Age'] = rands
print('After this gaussian replacement, the number of NaNs are: ', orig_df['Age'].isnull().sum())
In [2]:
## John recommends trying to learn how to merge/join.
## But make their indexes different, so it isn't just a trivial exercise.
In [3]:
# NOTE:
# how to delete a column properly
#item_purchase_log_df_clean = item_purchase_log_df_clean.drop("item_id_nm",1)
## where 1 is the axis number (0 for rows, 1 for columns)
In [ ]:
# Convert categorical columns to ordinal
from sklearn.preprocessing import LabelEncoder
# Fit an encoder that maps each distinct 'Title' category to an integer code
title_encoder = LabelEncoder()
# Encode the categorical values as ordinals for model fitting:
orig_df.Title = title_encoder.fit_transform(orig_df.Title)
# Round-trip the codes back to the original categorical labels:
#orig_df.Title = title_encoder.inverse_transform(orig_df.Title)
In [ ]:
### OR you could add new columns with ordinal true/false values
# one indicator column per 'Title' category, named 'Title_<category>'
dummies = pd.get_dummies(orig_df['Title'], prefix='Title')
# append the indicator columns alongside the existing ones
orig_df = pd.concat([orig_df, dummies], axis=1)