read_csv has many important tuning parameters that can make reading and storing the data more efficient.
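For example (a minimal sketch; the usecols, dtype, and nrows choices below are illustrative assumptions, not what this notebook actually uses), you can read only the columns you need, with explicit dtypes and a row limit:
import pandas as pd
# hypothetical illustration of read_csv tuning parameters
df_small = pd.read_csv("adult.csv",
                       usecols=["age", "education", "target"],  # assumed subset of columns
                       dtype={"age": "int32", "education": "category", "target": "category"},
                       nrows=10000)  # read only the first 10k rows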
In [2]:
import pandas as pd
df = pd.read_csv("adult.csv").drop('Unnamed: 0',axis=1)
In [3]:
len(df)
Out[3]:
In [4]:
df.dtypes
Out[4]:
In [5]:
df.target.value_counts()
Out[5]:
In [6]:
df.target.value_counts()/len(df)
Out[6]:
In [7]:
df.head()
Out[7]:
In [8]:
df.describe()
Out[8]:
In [9]:
numerical_columns = df.describe().columns
numerical_columns
Out[9]:
In [10]:
%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("darkgrid")
plt.figure(figsize=(30,5))
for i, col in enumerate(numerical_columns):
    plt.subplot(1, len(numerical_columns), i+1)
    df[[col]].boxplot(fontsize=20)
In [11]:
def describe_categorical_values(df, non_interesting_columns=[], num_categories=5):
    values_df = pd.DataFrame()
    for i, column in enumerate(df.columns):
        if column in non_interesting_columns:
            continue
        top_values0 = ["{}: {}%".format(x, int(round(100*y/len(df))))
                       for x, y in zip(df[column].value_counts(dropna=False).head(num_categories).index,
                                       df[column].value_counts(dropna=False).head(num_categories).values)]
        if len(top_values0) < num_categories:
            top_values = [None]*num_categories
            top_values[:len(top_values0)] = top_values0
        else:
            top_values = top_values0
        values_df[column] = top_values
    return values_df.transpose()

describe_categorical_values(df, non_interesting_columns=numerical_columns, num_categories=6)
Out[11]:
In [12]:
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
for col in numerical_columns:
    plt.figure(figsize=(30,3))
    sns.kdeplot(df.loc[df.target=='<=50K', col], label='<=50K')
    sns.kdeplot(df.loc[df.target=='>50K', col], label='>50K')
    plt.legend(fontsize=22)
    plt.title(col, fontsize=22);
KDE plots don't always work so well, especially for integer-valued features - let's try a bubble plot!
In [13]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
def get_point(x, digits=2):
    if not isinstance(x, pd.Interval):
        a, b = x.split(',')
        a = float(a.strip("("))
        b = float(b.strip("]"))
    else:
        a, b = x.left, x.right
    c = (a+b)/2
    return np.round(c, digits)
def bubble_plot(df, x, y, z_boolean=None, ordered_x_values=None, ordered_y_values=None, bins_x=10,
                bins_y=10, fontsize=16, figsize=(10,5), maximal_bubble_size=3500,
                normalization_by_all=False, log=False):
    """
    :param df: dataframe
    :param x: name of the first numerical/categorical field (string) (for the x-axis)
    :param y: name of the second numerical/categorical field (string) (for the y-axis)
    :param z_boolean: name of a categorical field with two categories / boolean field (for coloring)
    :param ordered_x_values: the values we would like to map from the x categorical variable,
                             in the order we would like to present them
    :param ordered_y_values: the values we would like to map from the y categorical variable,
                             in the order we would like to present them
    :param bins_x: the bins for the x values if x is numeric
    :param bins_y: the bins for the y values if y is numeric
    :param normalization_by_all: True - show the joint distribution p(x,y); False - show the conditional distribution p(y|x)
    :param maximal_bubble_size: if the bubbles are too big or too small, this is the parameter to change!
    :param log: whether to apply log to the count (influences the size of the bubbles)
    :return: nice bubble plot, bubble size is proportional to the frequency of the bucket :)
    """
    plt.figure(figsize=figsize)
    x_is_numeric = df[x].dtype in (float, int) and ordered_x_values is None
    y_is_numeric = df[y].dtype in (float, int) and ordered_y_values is None
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                             pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]], axis=1)
    count_table = count_table.groupby(x)[y].value_counts().unstack().fillna(0)
    ordered_x_values = count_table.index.values if ordered_x_values is None else ordered_x_values
    ordered_y_values = count_table.columns if ordered_y_values is None else ordered_y_values
    if z_boolean is not None:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_with_z(
            df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric,
            ordered_x_values, ordered_y_values, maximal_bubble_size,
            normalization_by_all=normalization_by_all)
    else:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_without_z(
            df, x, y, z_boolean, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric,
            ordered_x_values, ordered_y_values,
            normalization_by_all=normalization_by_all, log=log, maximal_bubble_size=maximal_bubble_size)
    plt.xticks(xticks, xticklabels, fontsize=fontsize)
    plt.yticks(yticks, yticklabels, fontsize=fontsize)
    plt.xlabel(x, fontsize=fontsize)
    plt.ylabel(y, fontsize=fontsize)
    if z_boolean is None:
        plt.title("{} vs {}".format(y, x), fontsize=fontsize+4)
    else:
        plt.title("{} vs {} and {} (in colors)".format(y, x, z_boolean), fontsize=fontsize+4)
def plot_without_z(df, x, y, z, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric,
                   ordered_x_values, ordered_y_values, normalization_by_all=False, log=False,
                   maximal_bubble_size=4000):
    if normalization_by_all:
        count_table /= count_table.sum().sum()
    else:
        count_table = count_table.transpose()
        for col in count_table.columns:
            count_table[col] /= count_table[col].sum()
        count_table = count_table.transpose()
    if log:
        count_table = np.log(count_table)
        maximal_bubble_size /= 2
    size_factor = maximal_bubble_size/count_table.max().max()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    x_values_dict = {x: i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx: get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x: i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(count_table.shape[0]) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(count_table.shape[1]) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['value'], cmap='cool')
    return count_table_long, xticks, yticks, xticklabels, yticklabels
def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values,
                ordered_y_values, maximal_bubble_size=4000, normalization_by_all=False):
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                             pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y], df[z_boolean]], axis=1)
    count_table = count_table.groupby([x, z_boolean])[y].value_counts().unstack().fillna(0)
    count_table = count_table.unstack().unstack()
    count_table_long = count_table.reset_index().rename(columns={0: 'value'})  # pd.melt(count_table.reset_index(), id_vars=x)
    z_boolean_values = count_table_long[z_boolean].unique()
    ratio = pd.DataFrame({'ratio': count_table_long.set_index([x, y, z_boolean]).unstack()['value'][z_boolean_values[1]] / (
        count_table_long.set_index([x, y, z_boolean]).unstack()['value'].sum(axis=1))})
    count_table_long = count_table_long.set_index([x, y])[['value']].merge(ratio, left_index=True, right_index=True).reset_index()
    size_factor = maximal_bubble_size/count_table_long['value'].max()
    x_values_dict = {x: i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx: get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x: i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(len(ordered_x_values)) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(len(ordered_y_values)) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['ratio'], alpha=0.5, cmap='cool')
    return count_table_long, xticks, yticks, xticklabels, yticklabels
In [14]:
%matplotlib inline
bubble_plot(df,'age','target', normalization_by_all=False)
bubble_plot(df,'hours-per-week','target', normalization_by_all=False)
bubble_plot(df,'education-num','target', normalization_by_all=False)
In [15]:
bubble_plot(df, x='age', y='hours-per-week', z_boolean='target', maximal_bubble_size=8000)
How do we deal with missing values?
(1) Missing at random - imputation: fill in the most common value, or estimate the missing value with a regression model.
(2) Missing not at random - treat the missing value as a separate category / indicator variable.
Both options are sketched below.
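A minimal sketch of both strategies, assuming the 'workclass' column and the ' ?' missing-value marker used in this notebook:
# (1) missing at random: impute with the most frequent value (the next cell does this for every categorical column)
most_common = df['workclass'].value_counts().index[0]
df['workclass_imputed'] = df['workclass'].replace(' ?', most_common)
# (2) missing not at random: keep "missing" as its own signal via an explicit indicator column
df['workclass_missing'] = (df['workclass'] == ' ?').astype(int)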
In [16]:
df = pd.read_csv("adult.csv").drop('Unnamed: 0',axis=1)
categorical_columns = [x for x in df.columns if x not in numerical_columns and x!='target']
for col in categorical_columns:
    df.loc[df[col]==' ?', col] = df[col].value_counts().index.values[0]
In [17]:
def remove_infrequent_values(df, categorical_columns, frequency_threshold=20):
    infrequent_cols_to_remove = []
    for cat_col in categorical_columns:
        col_count = df[cat_col].value_counts()
        infrequent_values = col_count[col_count<frequency_threshold].index.values
        if not col_count[col_count<frequency_threshold].empty:
            print("removing columns: \n{} \n".format(col_count[col_count<frequency_threshold]))
            infrequent_cols_to_remove += ["{}_{}".format(cat_col, x) for x in infrequent_values]
    return df

df = remove_infrequent_values(df, categorical_columns)
In [18]:
df = pd.read_csv("adult.csv").drop('Unnamed: 0',axis=1)
y = df.target==">50K"
features = [x for x in df.columns if x!='target']
X = pd.get_dummies(df[features])
In [19]:
X.head()
Out[19]:
In [20]:
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(y, 5, random_state=12345, shuffle=True)
train_index, test_index = list(kf)[0]
train, test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y[train_index], y[test_index]
In [21]:
df.target.value_counts()/len(df)
Out[21]:
In [22]:
pd.Series(y_train).value_counts()/len(y_train)
Out[22]:
In [23]:
pd.Series(y_test).value_counts()/len(y_test)
Out[23]:
In [24]:
def get_most_correlated_variables(corr, num_pairs=10):
    correlation_melted = pd.melt(corr.reset_index().rename(columns={"index": "var_1"}), id_vars=("var_1"), var_name='var_2')
    correlation_melted = correlation_melted[correlation_melted.var_1 != correlation_melted.var_2]
    correlation_melted['var_couple'] = correlation_melted[['var_1','var_2']].apply(lambda x: tuple(sorted([x[0], x[1]])), axis=1)
    correlation_melted = correlation_melted.drop_duplicates(subset='var_couple').drop(['var_couple'], axis=1)
    correlation_melted['abs_value'] = correlation_melted['value'].abs().round(3)
    return correlation_melted.sort_values(by='abs_value').tail(num_pairs).drop('abs_value', axis=1).reset_index(drop=True)

def plot_correlation_matrix(X, features2):
    corr = X[features2].corr()
    # return the most correlated variables
    most_correlated_variables = get_most_correlated_variables(corr, num_pairs=10)
    max_correlation = 1.25*most_correlated_variables['value'].abs().max()
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    ax.set_yticklabels(features2, fontsize=18)
    ax.set_xticklabels(features2, rotation='vertical', fontsize=18)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=max_correlation, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .8})
    return most_correlated_variables

if 'target' not in X:
    X['target'] = y
features2 = ['target'] + X.columns[:20].tolist()
plot_correlation_matrix(X, features2)
Out[24]:
In [25]:
features2 =['target']+[x for x in X.columns if x.startswith('education') ]
plot_correlation_matrix(X, features2)
Out[25]:
In [26]:
features2 =['target']+[x for x in X.columns if x.startswith('relationship_') or x.startswith('marital')]
plot_correlation_matrix(X, features2)
Out[26]:
In [27]:
if 'target' in X:
    X = X.drop('target', axis=1)
corr = X.corr()
most_correlated_variables = get_most_correlated_variables(corr, num_pairs=10)
In [28]:
most_correlated_variables
Out[28]:
In [29]:
most_correlated_variables.tail(5).var_2.values
Out[29]:
In [30]:
if 'target' not in X:
    X['target'] = y
corr = X.corr()
corr_with_target = corr[['target']].drop('target', axis=0)
corr_with_target['abs_value'] = corr_with_target.abs()
corr_with_target.sort_values(by='abs_value').tail(10).drop('abs_value',axis=1)
Out[30]:
In [31]:
best_features_by_correlation = corr_with_target[corr_with_target.abs_value>0.05].index.values
In [32]:
len(best_features_by_correlation)
Out[32]:
In [33]:
from sklearn.decomposition import PCA
pca = PCA(n_components=train.shape[1])
train_pca = pca.fit_transform(train)
test_pca = pca.transform(test)
In [34]:
plt.figure(figsize=(8,5))
plt.scatter(train_pca[(y_train==0).values,0], train_pca[(y_train==0).values,1], color='r', label='<=50K')
plt.scatter(train_pca[(y_train==1).values,0], train_pca[(y_train==1).values,1], color='b', label='>50K')
plt.legend()
Out[34]:
In [35]:
plt.figure(figsize=(30,5))
pd.DataFrame(pca.explained_variance_ratio_).plot.bar(log=False, figsize=(35,7), fontsize=16)
plt.xlabel('n_components',fontsize=16)
plt.ylabel('explained_variance_', fontsize=16);
In [36]:
plt.figure(figsize=(30,5))
pd.DataFrame(pca.explained_variance_ratio_).plot.bar(log=True, figsize=(35,7), fontsize=16)
plt.xlabel('n_components',fontsize=16)
plt.ylabel('explained_variance_', fontsize=16);
plt.ylim([0,10]);
In [37]:
%matplotlib inline
import seaborn as sns
from sklearn.metrics import roc_auc_score, classification_report, roc_curve, confusion_matrix
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
def find_best_threshold(thresholds, fpr, tpr):
    """
    Find the best threshold from the ROC curve, by taking the threshold of the point
    which is closest to (fpr=0, tpr=1).
    """
    fpr_tpr = pd.DataFrame({'thresholds': thresholds, 'fpr': fpr, 'tpr': tpr})
    fpr_tpr['dist'] = (fpr_tpr['fpr'])**2 + (fpr_tpr['tpr']-1)**2
    return fpr_tpr.loc[fpr_tpr.dist.idxmin(), 'thresholds']

def get_model_results(model, train, test, y_train, y_test):
    # test-set ROC curve and the threshold closest to the ideal (0,1) corner
    probabilities = model.predict_proba(test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, probabilities)
    threshold = find_best_threshold(thresholds, fpr, tpr)
    predictions = probabilities > threshold
    plt.figure()
    plt.plot(fpr, tpr, label='test')
    roc_auc = roc_auc_score(y_test, probabilities)
    # train-set ROC curve for comparison (a large gap indicates overfitting)
    probabilities = model.predict_proba(train)[:,1]
    fpr, tpr, thresholds = roc_curve(y_train, probabilities)
    plt.plot(fpr, tpr, label='train')
    plt.plot([0, 1], [0, 1], 'r--', label='random guess')
    plt.title("area under the ROC curve = {}".format(roc_auc), fontsize=18)
    print(classification_report(y_test, predictions))
    plt.legend()

model = make_pipeline(LogisticRegression())
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [38]:
model = make_pipeline(MinMaxScaler(),LogisticRegression())
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [39]:
feature_importance = pd.Series(index=train.columns , data= model.named_steps['logisticregression'].coef_[0])
feature_importance = feature_importance.sort_values()
plt.figure(figsize=(14,5))
feature_importance.tail(10).plot.barh(fontsize=16,color='b' )
plt.figure(figsize=(14,5))
feature_importance.head(10).plot.barh(fontsize=16, figsize=(14,7), color='r')
Out[39]:
In [40]:
model = make_pipeline(PCA(n_components=30,whiten=False),LogisticRegression())
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [41]:
model = make_pipeline(PolynomialFeatures(),MinMaxScaler(), LogisticRegression())
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [42]:
poly = PolynomialFeatures(2)
train_poly_features = poly.fit_transform(train[numerical_columns])
test_poly_features = poly.transform(test[numerical_columns])
train_with_poly_features = pd.concat([train.reset_index(drop=True),pd.DataFrame(train_poly_features)],axis=1)
test_with_poly_features = pd.concat([test.reset_index(drop=True),pd.DataFrame(test_poly_features)],axis=1)
In [43]:
model = make_pipeline(MinMaxScaler(),LogisticRegression())
model.fit(train_with_poly_features, y_train)
get_model_results(model, train_with_poly_features, test_with_poly_features, y_train, y_test)
In [74]:
feature_importance=pd.DataFrame(index=train_with_poly_features.columns,
data=model.named_steps['logisticregression'].coef_[0])
feature_importance['abs_value']=feature_importance.abs()
feature_importance.sort_values(by='abs_value')[0].tail(15).plot.barh()
Out[74]:
In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
params = {#'logisticregression__penalty': ['l2','l1'], 'logisticregression__C': [1, 10, 100, 1000],
'logisticregression__penalty': ['l1'], 'logisticregression__C': [1,],
# 'solver': ['newton-cg'],
# 'logisticregression__fit_intercept': [False, True]
}
gscv = GridSearchCV(make_pipeline(MinMaxScaler(), LogisticRegression()), params, scoring='roc_auc', cv=3)
gscv.fit(train, y_train)
get_model_results(gscv, train, test, y_train, y_test)
print(gscv.best_params_)
In [77]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [78]:
model = DecisionTreeClassifier(min_samples_split=10)
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [79]:
model = DecisionTreeClassifier(min_samples_split=50)
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [80]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=3, n_estimators=100) # max_depth=4
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [81]:
from sklearn import tree
from os import system
tree_dot_path = "models/tree1.dot"
tree_png_path = "models/tree1.png"
dotfile = open(tree_dot_path, 'w')
tree.export_graphviz(model.estimators_[10], out_file = dotfile,
feature_names = train.columns,
filled=True)
dotfile.close()
system("dot -Tpng {tree_dot_path} -o {tree_png_path}".format(
tree_dot_path=tree_dot_path, tree_png_path=tree_png_path));
In [82]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [83]:
import xgboost as xgb
feature_importance = pd.Series(index=train.columns , data= model.feature_importances_)
feature_importance = feature_importance.sort_values()
feature_importance.tail(20).plot.barh(fontsize=16, figsize=(14,7))
Out[83]:
In [84]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(train_with_poly_features, y_train)
get_model_results(model, train_with_poly_features, test_with_poly_features, y_train, y_test)
In [85]:
best_features = feature_importance.tail(15).index.values
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(train[best_features].rename(columns={x:x.replace(" ","") for x in best_features}), y_train)
get_model_results(model, train[best_features].rename(columns={x:x.replace(" ","") for x in best_features}), test[best_features].rename(columns={x:x.replace(" ","") for x in best_features}), y_train, y_test)
In [88]:
import matplotlib
from xgboost import plot_tree
from matplotlib.pylab import rcParams
ax = plot_tree(model,num_trees=10, rankdir='LR')
fig = ax.figure
fig.set_size_inches(20, 20)
In [89]:
%%time
from sklearn.manifold import TSNE
tsne = TSNE() #n_iter=200)
# train_transformed = tsne.fit_transform(train[best_features])
# pd.DataFrame(train_transformed).to_pickle("train_transformed.pkl")
train_transformed = pd.read_pickle("train_transformed.pkl").values
plt.scatter(train_transformed[(y_train==0).values,0], train_transformed[(y_train==0).values,1], color='r', label='class 1')
plt.scatter(train_transformed[(y_train==1).values,0], train_transformed[(y_train==1).values,1], color='b', label='class 2')
plt.legend()
In [90]:
def encode_column_by_frequency(train, test, col, log=True):
    country_by_target = train.groupby(col)['target'].value_counts().unstack().fillna(0)
    if log:
        country_probability = np.log((country_by_target['>50K']+1e-5)/(country_by_target.sum(axis=1)+1e-5))
    else:
        country_probability = (country_by_target['>50K']+1e-5)/(country_by_target.sum(axis=1)+1e-5)
    for val in test[col].unique():
        if val in country_probability.index.values:
            continue
        country_probability[val] = country_probability.median()
    train[col] = train[col].map(country_probability)
    test[col] = test[col].map(country_probability)
    return train, test

def encode_numeric_column_by_frequency(train, test, col, log=True, plot_flag=False, window_size=3):
    country_by_target = train.groupby(col)['target'].value_counts().unstack().fillna(0)
    if log:
        country_probability = np.log((country_by_target['>50K']+1e-5)/(country_by_target.sum(axis=1)+1e-5))
    else:
        country_probability = (country_by_target['>50K']+1e-5)/(country_by_target.sum(axis=1)+1e-5)
    values = np.unique(train[col].unique().tolist()+test[col].unique().tolist())
    resolution = min(1, np.min(np.diff(values)))
    start = np.min(country_probability.index.unique().tolist())
    end = np.max(country_probability.index.unique().tolist())
    country_probability = pd.merge(pd.DataFrame(country_probability),
                                   pd.DataFrame(index=np.arange(start, end+resolution, resolution)),
                                   left_index=True, right_index=True, how='outer').interpolate(method='linear')
    if plot_flag:
        plt.figure()
        country_probability[0].plot(marker='o')
    country_probability = ((country_probability[0].cumsum() - country_probability[0].shift(window_size).cumsum())/window_size).shift(-1).fillna(0)
    if plot_flag:
        country_probability.plot()
        plt.title(col)
    train[col] = train[col].map(country_probability)
    test[col] = test[col].map(country_probability)
    return train, test
In [91]:
def get_data_with_encoded_features(plot_flag=False):
    df = pd.read_csv("adult.csv").drop('Unnamed: 0', axis=1)  #.drop('fnlwgt', axis=1)
    categorical_columns = [x for x in df.columns if x not in numerical_columns and x != 'target']
    columns_to_encode_with_frequencies = categorical_columns
    for col in categorical_columns:
        df.loc[df[col]==' ?', col] = df[col].value_counts().index.values[0]
    features = [x for x in df if x != 'target']
    X = df[features]
    y = df.target==">50K"
    kf = StratifiedKFold(y, 5, random_state=12345, shuffle=True)
    train_index, test_index = list(kf)[0]
    train, test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for col in categorical_columns:
        train, test = encode_column_by_frequency(train, test, col, log=False)
    for col in numerical_columns:
        if col=='fnlwgt': continue
        train, test = encode_numeric_column_by_frequency(train, test, col, log=False, plot_flag=plot_flag)
    train = train.drop('target', axis=1)
    test = test.drop('target', axis=1)
    return train, test, y_train, y_test
In [92]:
train, test, y_train, y_test = get_data_with_encoded_features(plot_flag=False)
model = make_pipeline(MinMaxScaler(),LogisticRegression())
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test )
In [93]:
model = make_pipeline(PolynomialFeatures(), MinMaxScaler(),LogisticRegression())
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [94]:
model = XGBClassifier()
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
In [96]:
model = make_pipeline(PolynomialFeatures(), XGBClassifier())
model.fit(train, y_train)
get_model_results(model, train, test, y_train, y_test)
For additional questions - you're welcome to stay in touch - meir.shir86@gmail.com