In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB

In [2]:
%matplotlib inline

Exploratory Data Analysis


In [3]:
df = pd.read_csv('data/churn.csv')
df.head()


Out[3]:
age education familySize typeOfHousing houseOwnership householdAnnualIncome ageOfBuilding freqOfMoving ageOfCustomerAccountInDays avgDailyUsage lateFeePayments AvgComplaintsRaised AvgComplaintsResolved modeOfPayment typeOfPlanI typeOfPlanII Target
0 37 medium Large 3BHK-Apt Owned Medium 10-20yrs 3 2-5yrs 0.766845 1 2 1 DD Fixed Fixed Yes
1 44 High Small 2BHK-Apt Owned Low >30yrs 1 >5yrs 1.339218 3 7 6 Others Variable Variable Yes
2 44 medium Medium 1BHK-Apt Rented Medium 10-20yrs 4 >5yrs 9.591932 2 8 1 Others Fixed Fixed No
3 36 High Medium 1BHK-Apt Owned High 20-30yrs 5 2-5yrs 3.479652 1 7 1 Others Fixed Fixed No
4 35 low Medium 1BHK-Apt Owned Medium <10yrs 2 2-5yrs 3.576235 2 1 0 DD Variable Variable No

In [4]:
df.describe()


Out[4]:
age freqOfMoving avgDailyUsage lateFeePayments AvgComplaintsRaised AvgComplaintsResolved
count 5000.000000 5000.000000 5000.000000 5000.000000 5000.00000 5000.000000
mean 39.470000 3.583400 4.679484 1.422400 5.01520 2.087600
std 7.943634 2.233213 3.394101 1.314353 3.15369 2.223357
min 17.000000 1.000000 0.059012 0.000000 0.00000 0.000000
25% 34.000000 2.000000 2.220354 0.000000 2.00000 0.000000
50% 39.000000 3.000000 3.857608 1.000000 5.00000 1.000000
75% 45.000000 5.000000 6.291110 2.000000 8.00000 3.000000
max 81.000000 10.000000 30.021618 4.000000 10.00000 9.000000

In [ ]:


In [5]:
df.var()


Out[5]:
age                      63.101320
freqOfMoving              4.987242
avgDailyUsage            11.519922
lateFeePayments           1.727524
AvgComplaintsRaised       9.945758
AvgComplaintsResolved     4.943315
dtype: float64

In [ ]:


In [ ]:


In [6]:
plt.figure(figsize=(10,6))
sns.swarmplot(x='age',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)


Out[6]:
<matplotlib.legend.Legend at 0x7fd4c16c9470>

In [7]:
df.loc[(df.Target=='Yes')].age.min(),df.loc[(df.Target=='Yes')].age.max()


Out[7]:
(17, 72)

In [8]:
df.loc[(df.Target=='Yes')].age.mean()


Out[8]:
39.61969111969112

In [9]:
plt.figure(figsize=(10,6))
sns.boxplot(x='freqOfMoving',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)


Out[9]:
<matplotlib.legend.Legend at 0x7fd4bc27d0f0>

In [10]:
plt.figure(figsize=(10,6))
sns.countplot(x='freqOfMoving',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)


No handles with labels found to put in legend.
Out[10]:
<matplotlib.legend.Legend at 0x7fd4bc24d080>

In [11]:
plt.figure(figsize=(10,6))
sns.swarmplot(x='avgDailyUsage',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)


Out[11]:
<matplotlib.legend.Legend at 0x7fd4bc1e8f60>

In [12]:
plt.figure(figsize=(10,6))
sns.factorplot(x='lateFeePayments',col='Target',hue='Target',data=df,dodge=True,kind="count")
plt.legend(bbox_to_anchor=(1, 1), loc=2)


Out[12]:
<matplotlib.legend.Legend at 0x7fd4bc1549b0>
<Figure size 720x432 with 0 Axes>

In [13]:
plt.figure(figsize=(10,6))
sns.swarmplot(x='AvgComplaintsRaised',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)


Out[13]:
<matplotlib.legend.Legend at 0x7fd4bc1a7da0>

In [14]:
plt.figure(figsize=(10,6))
sns.factorplot(x='AvgComplaintsResolved',col='Target',hue='Target',data=df,dodge=True,kind="count")
plt.legend(bbox_to_anchor=(1, 1), loc=2)


Out[14]:
<matplotlib.legend.Legend at 0x7fd4b7fcf278>
<Figure size 720x432 with 0 Axes>

In [15]:
plt.figure(figsize=(10,6))
sns.factorplot(x='AvgComplaintsRaised',col='Target',hue='Target',data=df,dodge=True,kind="count")
plt.legend(bbox_to_anchor=(1, 1), loc=2)


Out[15]:
<matplotlib.legend.Legend at 0x7fd4b7faeb70>
<Figure size 720x432 with 0 Axes>

In [16]:
cor = df.corr()
sns.heatmap(cor)


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd4b7d74ba8>

In [17]:
len(df.columns)


Out[17]:
17

In [18]:
## One hot encoding the categorical variables.
df_encoded = pd.get_dummies(data=df)
try:
    df_encoded.loc[df['Target'] == 'Yes', 'Target'] = 0
    df_encoded.loc[df['Target'] == 'No', 'Target'] = 1
except:
    pass

In [19]:
df_encoded.columns


Out[19]:
Index(['age', 'freqOfMoving', 'avgDailyUsage', 'lateFeePayments',
       'AvgComplaintsRaised', 'AvgComplaintsResolved', 'education_High',
       'education_low', 'education_medium', 'familySize_Large',
       'familySize_Medium', 'familySize_Small', 'typeOfHousing_1BHK-Apt',
       'typeOfHousing_2BHK-Apt', 'typeOfHousing_3BHK-Apt',
       'typeOfHousing_5BHK-House', 'typeOfHousing_Mansion',
       'houseOwnership_Owned', 'houseOwnership_Rented',
       'householdAnnualIncome_High', 'householdAnnualIncome_Low',
       'householdAnnualIncome_Medium', 'ageOfBuilding_10-20yrs',
       'ageOfBuilding_20-30yrs', 'ageOfBuilding_<10yrs',
       'ageOfBuilding_>30yrs', 'ageOfCustomerAccountInDays_1-2yrs',
       'ageOfCustomerAccountInDays_2-5yrs', 'ageOfCustomerAccountInDays_<1yr',
       'ageOfCustomerAccountInDays_>10yrs', 'ageOfCustomerAccountInDays_>5yrs',
       'modeOfPayment_DD', 'modeOfPayment_Others', 'modeOfPayment_PPM',
       'modeOfPayment_SD', 'typeOfPlanI_Fixed', 'typeOfPlanI_Variable',
       'typeOfPlanII_Fixed', 'typeOfPlanII_Variable', 'Target_No',
       'Target_Yes', 'Target'],
      dtype='object')

In [20]:
ic = ['age','freqOfMoving','avgDailyUsage',
                  'lateFeePayments','AvgComplaintsRaised','AvgComplaintsResolved'
                 ]

mean = df_encoded.loc[(df_encoded[ic[0]] < 60),ic[0]].mean()
df_encoded[ic[0]] = df_encoded[ic[0]].mask(df_encoded[ic[0]] > 60,mean)

mean = df_encoded.loc[(df_encoded[ic[1]] < 8),ic[1]].mean()
df_encoded[ic[1]] = df_encoded[ic[1]].mask(df_encoded[ic[1]] > 8,mean)

mean = df_encoded.loc[(df_encoded[ic[2]] < 20),ic[2]].mean()
df_encoded[ic[2]] = df_encoded[ic[2]].mask(df_encoded[ic[2]] > 20,mean)

#df.loc[(df[ic[1]] > 7)] = df.loc[(df[ic[1]] <= 7)].mean()
#df.loc[(df[ic[2]] > 18)] = df.loc[(df[ic[2]] <= 18)].mean()
#df.loc[(df[ic[0]] > 50)] = df.loc[(df[ic[0]] < 50)].mean()
#df.loc[(df[ic[0]] > 50)] = df.loc[(df[ic[0]] < 50)].mean()
#df.loc[(df[ic[0]] > 50)] = df.loc[(df[ic[0]] < 50)].mean()

In [21]:
train,test,train_label,test_label = train_test_split(df_encoded.iloc[:,:-1].values, df_encoded['Target'].values, test_size=0.33, random_state=42)

In [22]:
train_label


Out[22]:
array([0., 1., 1., ..., 1., 1., 1.])

In [27]:
##Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(train,train_label)

In [ ]:


In [28]:
pred = clf.predict(test)

In [29]:
clf.score(test,test_label)


Out[29]:
1.0

In [233]:
f1_score(test_label,pred)


Out[233]:
1.0

In [234]:
##Random Forest
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                              random_state=0)
clf_rf = clf_rf.fit(train,train_label)
print(clf_rf.score(test,test_label))


1.0

In [235]:
np.where(test_label==1)


Out[235]:
(array([   0,    1,    2, ..., 1644, 1645, 1646]),)

In [204]:
train


Out[204]:
array([[50.        ,  2.98421414,  3.66868092, ...,  0.        ,
         0.        ,  1.        ],
       [47.        ,  1.        ,  3.66588082, ...,  0.        ,
         1.        ,  0.        ],
       [38.        ,  2.98421414,  1.63093267, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [40.        ,  5.        ,  7.23437268, ...,  1.        ,
         1.        ,  0.        ],
       [46.        ,  2.        , 10.84116965, ...,  1.        ,
         1.        ,  0.        ],
       [57.        ,  2.        ,  4.10219715, ...,  1.        ,
         1.        ,  0.        ]])

In [205]:
clf_rf.feature_importances_


Out[205]:
array([5.30130261e-02, 1.07748475e-02, 7.60581500e-02, 2.83062084e-02,
       2.47603265e-02, 4.60592370e-02, 1.12010982e-02, 4.77218673e-03,
       1.32421837e-05, 3.22052084e-03, 1.87980666e-03, 3.33152214e-03,
       3.54234928e-03, 2.19578759e-03, 3.95327577e-03, 4.44863819e-03,
       4.02980487e-04, 9.64025622e-03, 3.49802492e-03, 4.83102884e-03,
       0.00000000e+00, 7.52926097e-04, 2.86801992e-05, 0.00000000e+00,
       8.76194557e-03, 1.25040461e-02, 1.16447746e-02, 7.50770312e-03,
       3.15435298e-02, 0.00000000e+00, 1.01026214e-03, 6.37543846e-03,
       2.42078904e-03, 1.39797406e-05, 0.00000000e+00, 9.37999886e-03,
       6.54056114e-03, 3.41165577e-03, 6.04679561e-03, 2.86866077e-01,
       3.09288323e-01])

In [206]:
import operator

features_importance = dict()
for key,val in zip(df_encoded.columns,clf_rf.feature_importances_):
    features_importance[key] = val

features_importance


Out[206]:
{'age': 0.05301302607434247,
 'freqOfMoving': 0.010774847518680153,
 'avgDailyUsage': 0.07605814998429931,
 'lateFeePayments': 0.0283062084180509,
 'AvgComplaintsRaised': 0.024760326475224793,
 'AvgComplaintsResolved': 0.046059236988317494,
 'education_High': 0.011201098169394791,
 'education_low': 0.0047721867307148405,
 'education_medium': 1.3242183697348612e-05,
 'familySize_Large': 0.0032205208390761564,
 'familySize_Medium': 0.0018798066635057375,
 'familySize_Small': 0.003331522141859674,
 'typeOfHousing_1BHK-Apt': 0.00354234928344241,
 'typeOfHousing_2BHK-Apt': 0.002195787590264528,
 'typeOfHousing_3BHK-Apt': 0.003953275774472823,
 'typeOfHousing_5BHK-House': 0.004448638193550461,
 'typeOfHousing_Mansion': 0.0004029804869143282,
 'houseOwnership_Owned': 0.009640256218168329,
 'houseOwnership_Rented': 0.0034980249205378965,
 'householdAnnualIncome_High': 0.004831028839618063,
 'householdAnnualIncome_Low': 0.0,
 'householdAnnualIncome_Medium': 0.0007529260968558358,
 'ageOfBuilding_10-20yrs': 2.8680199232491903e-05,
 'ageOfBuilding_20-30yrs': 0.0,
 'ageOfBuilding_<10yrs': 0.008761945573737441,
 'ageOfBuilding_>30yrs': 0.012504046072960124,
 'ageOfCustomerAccountInDays_1-2yrs': 0.011644774585946961,
 'ageOfCustomerAccountInDays_2-5yrs': 0.007507703120868394,
 'ageOfCustomerAccountInDays_<1yr': 0.031543529774350675,
 'ageOfCustomerAccountInDays_>10yrs': 0.0,
 'ageOfCustomerAccountInDays_>5yrs': 0.0010102621421468627,
 'modeOfPayment_DD': 0.006375438464083601,
 'modeOfPayment_Others': 0.0024207890418982392,
 'modeOfPayment_PPM': 1.397974055240929e-05,
 'modeOfPayment_SD': 0.0,
 'typeOfPlanI_Fixed': 0.009379998855156023,
 'typeOfPlanI_Variable': 0.006540561138840899,
 'typeOfPlanII_Fixed': 0.003411655769619071,
 'typeOfPlanII_Variable': 0.006046795612964422,
 'Target_No': 0.2868660769057172,
 'Target_Yes': 0.30928832341093687}

In [ ]:
##select worthy features
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
    
imp_features = take(10,sorted(imp_features.items(),key=operator.itemgetter(1))

In [32]:
clf_rf.score(test,test_label)


Out[32]:
0.7909090909090909

In [31]:
df_encoded.columns


Out[31]:
Index(['age', 'freqOfMoving', 'avgDailyUsage', 'lateFeePayments',
       'AvgComplaintsRaised', 'AvgComplaintsResolved', 'education_High',
       'education_low', 'education_medium', 'familySize_Large',
       'familySize_Medium', 'familySize_Small', 'typeOfHousing_1BHK-Apt',
       'typeOfHousing_2BHK-Apt', 'typeOfHousing_3BHK-Apt',
       'typeOfHousing_5BHK-House', 'typeOfHousing_Mansion',
       'houseOwnership_Owned', 'houseOwnership_Rented',
       'householdAnnualIncome_High', 'householdAnnualIncome_Low',
       'householdAnnualIncome_Medium', 'ageOfBuilding_10-20yrs',
       'ageOfBuilding_20-30yrs', 'ageOfBuilding_<10yrs',
       'ageOfBuilding_>30yrs', 'ageOfCustomerAccountInDays_1-2yrs',
       'ageOfCustomerAccountInDays_2-5yrs', 'ageOfCustomerAccountInDays_<1yr',
       'ageOfCustomerAccountInDays_>10yrs', 'ageOfCustomerAccountInDays_>5yrs',
       'modeOfPayment_DD', 'modeOfPayment_Others', 'modeOfPayment_PPM',
       'modeOfPayment_SD', 'typeOfPlanI_Fixed', 'typeOfPlanI_Variable',
       'typeOfPlanII_Fixed', 'typeOfPlanII_Variable'],
      dtype='object')

In [33]:
##Naive Bayes

clf_nb = GaussianNB()
clf_nb.fit(train,train_label)
clf_nb.score(test,test_label)


Out[33]:
0.7903030303030303

In [39]:
## Remove Outliers
df_encoded['Target'] = df['Target']
df_without_ol = df_encoded.copy()
df_without_ol = df_without_ol[(np.abs(stats.zscore(df_without_ol)) < 3).all(axis=1)]
df_without_ol.shape,df_encoded.shape


Out[39]:
((4424, 40), (5000, 40))

In [44]:
len(df_without_ol.columns)


Out[44]:
40

In [46]:
train,test,train_label,test_label = train_test_split(df_without_ol.iloc[:,0:39].values, df_without_ol['Target'].values, test_size=0.33, random_state=42)

In [47]:
##Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(train,train_label)
clf.score(test,test_label)


Out[47]:
0.6438356164383562

In [48]:
##Random Forest
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                              random_state=0)
clf_rf = clf_rf.fit(train,train_label)
print(clf_rf.score(test,test_label))


0.7883561643835616

In [49]:
##Naive Bayes

clf_nb = GaussianNB()
clf_nb.fit(train,train_label)
clf_nb.score(test,test_label)


Out[49]:
0.7719178082191781

In [50]:
clf_rf.feature_importances_


Out[50]:
array([0.08988646, 0.06531749, 0.2115208 , 0.01232879, 0.05935326,
       0.10696962, 0.01787085, 0.0029564 , 0.0120856 , 0.0185885 ,
       0.00822154, 0.04581109, 0.02680419, 0.00390294, 0.0012091 ,
       0.03112349, 0.00737305, 0.002809  , 0.01456979, 0.00396808,
       0.0192716 , 0.00179743, 0.00372291, 0.00617589, 0.00942843,
       0.01256241, 0.01568041, 0.00213726, 0.00939205, 0.        ,
       0.01672526, 0.        , 0.00539225, 0.00583584, 0.01542894,
       0.04231056, 0.01217723, 0.02699576, 0.05229574])

In [51]:
df.columns


Out[51]:
Index(['age', 'education', 'familySize', 'typeOfHousing', 'houseOwnership',
       'householdAnnualIncome', 'ageOfBuilding', 'freqOfMoving',
       'ageOfCustomerAccountInDays', 'avgDailyUsage', 'lateFeePayments',
       'AvgComplaintsRaised', 'AvgComplaintsResolved', 'modeOfPayment',
       'typeOfPlanI', 'typeOfPlanII', 'Target'],
      dtype='object')

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: