In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
In [2]:
%matplotlib inline
In [3]:
# Load the churn dataset; the relative path assumes the notebook runs from the
# project root. First rows displayed as a schema sanity check.
df = pd.read_csv('data/churn.csv')
df.head()
Out[3]:
In [4]:
# Summary statistics for the numeric columns.
df.describe()
Out[4]:
In [ ]:
In [5]:
# Per-column variance. NOTE(review): on pandas >= 2.0 DataFrame.var raises for
# non-numeric columns — may need numeric_only=True depending on the schema.
df.var()
Out[5]:
In [ ]:
In [ ]:
In [6]:
# Swarm plot of customer age split by churn outcome.
fig = plt.figure(figsize=(10, 6))
ax = sns.swarmplot(data=df, x='age', y='Target', hue='Target', dodge=True)
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
Out[6]:
In [7]:
# Age range (min, max) among churned customers.
churned_ages = df.loc[df.Target == 'Yes', 'age']
churned_ages.min(), churned_ages.max()
Out[7]:
In [8]:
# Mean age among churned customers.
df.loc[(df.Target=='Yes')].age.mean()
Out[8]:
In [9]:
# Box plot of moving frequency by churn outcome.
fig = plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=df, x='freqOfMoving', y='Target', hue='Target', dodge=True)
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
Out[9]:
In [10]:
# Frequency of each freqOfMoving value across the whole sample.
# FIX: dropped dodge=True (a no-op when no hue= is given) and the plt.legend()
# call — countplot without hue creates no labelled artists, so legend() only
# emitted a "No handles with labels found" warning and an empty legend box.
plt.figure(figsize=(10,6))
sns.countplot(x='freqOfMoving',data=df)
Out[10]:
In [11]:
# Swarm plot of average daily usage split by churn outcome.
fig = plt.figure(figsize=(10, 6))
ax = sns.swarmplot(data=df, x='avgDailyUsage', y='Target', hue='Target', dodge=True)
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
Out[11]:
In [12]:
# Count of lateFeePayments values, one facet per churn outcome.
# FIX: sns.factorplot was renamed catplot in seaborn 0.9 and removed in 0.13;
# catplot also builds its own figure, so the preceding plt.figure() only left
# an empty stray figure behind. catplot draws the hue legend itself, so the
# extra plt.legend() (which targeted that stray figure) is dropped as well.
# dodge is a no-op for kind="count".
sns.catplot(x='lateFeePayments',col='Target',hue='Target',data=df,kind="count")
Out[12]:
In [13]:
# Swarm plot of average complaints raised split by churn outcome.
fig = plt.figure(figsize=(10, 6))
ax = sns.swarmplot(data=df, x='AvgComplaintsRaised', y='Target', hue='Target', dodge=True)
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
Out[13]:
In [14]:
# Count of AvgComplaintsResolved values, one facet per churn outcome.
# FIX: sns.factorplot was renamed catplot in seaborn 0.9 and removed in 0.13;
# catplot also builds its own figure, so the preceding plt.figure() only left
# an empty stray figure behind. catplot draws the hue legend itself, so the
# extra plt.legend() is dropped as well. dodge is a no-op for kind="count".
sns.catplot(x='AvgComplaintsResolved',col='Target',hue='Target',data=df,kind="count")
Out[14]:
In [15]:
# Count of AvgComplaintsRaised values, one facet per churn outcome.
# FIX: sns.factorplot was renamed catplot in seaborn 0.9 and removed in 0.13;
# catplot also builds its own figure, so the preceding plt.figure() only left
# an empty stray figure behind. catplot draws the hue legend itself, so the
# extra plt.legend() is dropped as well. dodge is a no-op for kind="count".
sns.catplot(x='AvgComplaintsRaised',col='Target',hue='Target',data=df,kind="count")
Out[15]:
In [16]:
# Heat map of pairwise correlations across the numeric columns.
correlations = df.corr()
sns.heatmap(correlations)
Out[16]:
In [17]:
# Number of columns before encoding.
len(df.columns)
Out[17]:
In [18]:
## One-hot encode the categorical feature columns.
## BUG FIX: the original ran get_dummies over the FULL frame, so the label was
## expanded into Target_No/Target_Yes dummy columns that were later fed to the
## models as features (perfect target leakage). Encode the features only, then
## map the label explicitly. The bare try/except that silently swallowed any
## error here is removed — a failure should be visible.
df_encoded = pd.get_dummies(data=df.drop(columns=['Target']))
# Same mapping as before: Yes -> 0, No -> 1; 'Target' stays the last column,
# which downstream cells rely on when slicing features with iloc[:, :-1].
df_encoded['Target'] = df['Target'].map({'Yes': 0, 'No': 1})
In [19]:
# Column names after one-hot encoding.
df_encoded.columns
Out[19]:
In [20]:
# Cap implausible high values: anything above the per-column threshold is
# replaced with the mean of the in-range (< threshold) values.
# FIX: the original repeated the same three-line pattern per column with a
# positional `ic` index list; a threshold map plus a loop is equivalent and
# removes the copy-paste. The large block of commented-out dead code is gone.
# Values exactly equal to the threshold are left untouched, as before.
caps = {'age': 60, 'freqOfMoving': 8, 'avgDailyUsage': 20}
for col, cap in caps.items():
    in_range_mean = df_encoded.loc[df_encoded[col] < cap, col].mean()
    df_encoded[col] = df_encoded[col].mask(df_encoded[col] > cap, in_range_mean)
In [21]:
# 67/33 train/test split; features are every column except the last, which
# assumes 'Target' is the final column of df_encoded. random_state pins the split.
train,test,train_label,test_label = train_test_split(df_encoded.iloc[:,:-1].values, df_encoded['Target'].values, test_size=0.33, random_state=42)
In [22]:
# Quick look at the training labels.
train_label
Out[22]:
In [27]:
## Decision-tree baseline, fit on the training split (fixed seed for
## reproducibility). fit() returns the estimator, so the two steps chain.
clf = DecisionTreeClassifier(random_state=0).fit(train, train_label)
In [ ]:
In [28]:
# Decision-tree predictions on the held-out set.
pred = clf.predict(test)
In [29]:
# Mean accuracy of the decision tree on the held-out set.
clf.score(test,test_label)
Out[29]:
In [233]:
# F1 score of the decision tree. NOTE(review): with the earlier Yes->0 / No->1
# encoding, the default pos_label=1 scores the 'No' (non-churn) class — confirm
# that is the class of interest.
f1_score(test_label,pred)
Out[233]:
In [234]:
## Random forest (shallow trees, fixed seed); held-out accuracy printed.
forest_config = dict(n_estimators=100, max_depth=2, random_state=0)
clf_rf = RandomForestClassifier(**forest_config).fit(train, train_label)
print(clf_rf.score(test, test_label))
In [235]:
# Indices of held-out samples with label 1.
np.where(test_label==1)
Out[235]:
In [204]:
# Inspect the training matrix. NOTE(review): dumps the full array into the
# notebook output; prefer a slice like train[:5] when sharing.
train
Out[204]:
In [205]:
# Per-feature importances from the fitted random forest.
clf_rf.feature_importances_
Out[205]:
In [206]:
import operator  # used when ranking features below; ideally lives in the top import cell
## Map each feature column to its random-forest importance.
## FIX: the original zipped ALL of df_encoded.columns — including the trailing
## 'Target' label — against the importances of the feature columns. zip
## truncates silently, which only lines up while 'Target' stays the last
## column. Pair against the feature columns explicitly, and build the dict
## directly instead of a manual loop.
features_importance = dict(zip(df_encoded.columns[:-1], clf_rf.feature_importances_))
features_importance
Out[206]:
In [ ]:
## Select worthy features
from itertools import islice

def take(n, iterable):
    """Return the first ``n`` items of ``iterable`` as a list."""
    first_n = islice(iterable, n)
    return [*first_n]
# BUG FIX: the original line had an unbalanced parenthesis (SyntaxError),
# referenced the not-yet-defined name `imp_features` instead of
# `features_importance`, and sorted ascending — which keeps the LEAST
# important features. Sort descending to keep the top 10.
imp_features = take(10, sorted(features_importance.items(), key=operator.itemgetter(1), reverse=True))
imp_features
In [32]:
# Random-forest held-out accuracy (same fitted model as above).
clf_rf.score(test,test_label)
Out[32]:
In [31]:
# Encoded column names, for cross-referencing the importances.
df_encoded.columns
Out[31]:
In [33]:
## Gaussian naive Bayes baseline; held-out accuracy is the cell output.
clf_nb = GaussianNB().fit(train, train_label)
clf_nb.score(test, test_label)
Out[33]:
In [39]:
## Remove outlier rows (any column with |z-score| >= 3).
## BUG FIX: restoring the raw string labels and then z-scoring the WHOLE frame
## passes object-dtype data to stats.zscore, which fails on modern scipy.
## Score only the numeric columns; the row filter is otherwise unchanged.
## NOTE(review): this also overwrites the numeric Yes/No encoding of 'Target'
## with strings again — confirm downstream cells expect string labels.
df_encoded['Target'] = df['Target']
df_without_ol = df_encoded.copy()
numeric_cols = df_without_ol.select_dtypes(include=np.number)
df_without_ol = df_without_ol[(np.abs(stats.zscore(numeric_cols)) < 3).all(axis=1)]
df_without_ol.shape,df_encoded.shape
Out[39]:
In [44]:
# Column count after outlier removal (rows, not columns, were dropped).
len(df_without_ol.columns)
Out[44]:
In [46]:
# Re-split after outlier removal. NOTE(review): the feature slice is hardcoded
# as iloc[:, 0:39], which silently breaks if the column count changes —
# presumably it is meant to exclude the trailing 'Target' column (iloc[:, :-1]).
train,test,train_label,test_label = train_test_split(df_without_ol.iloc[:,0:39].values, df_without_ol['Target'].values, test_size=0.33, random_state=42)
In [47]:
## Decision tree refit on the outlier-free split; held-out accuracy is the output.
clf = DecisionTreeClassifier(random_state=0).fit(train, train_label)
clf.score(test, test_label)
Out[47]:
In [48]:
## Random forest refit on the outlier-free split; held-out accuracy printed.
forest_config = dict(n_estimators=100, max_depth=2, random_state=0)
clf_rf = RandomForestClassifier(**forest_config).fit(train, train_label)
print(clf_rf.score(test, test_label))
In [49]:
## Gaussian naive Bayes refit on the outlier-free split; accuracy is the output.
clf_nb = GaussianNB().fit(train, train_label)
clf_nb.score(test, test_label)
Out[49]:
In [50]:
# Per-feature importances from the refit random forest.
clf_rf.feature_importances_
Out[50]:
In [51]:
# Original (pre-encoding) column names, for reference.
df.columns
Out[51]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: