In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB



In [2]:

    
%matplotlib inline

Exploratory Data Analysis



In [3]:

    
df = pd.read_csv('data/churn.csv')
df.head()









    Out[3]:







  
    
      
      age
      education
      familySize
      typeOfHousing
      houseOwnership
      householdAnnualIncome
      ageOfBuilding
      freqOfMoving
      ageOfCustomerAccountInDays
      avgDailyUsage
      lateFeePayments
      AvgComplaintsRaised
      AvgComplaintsResolved
      modeOfPayment
      typeOfPlanI
      typeOfPlanII
      Target
    
  
  
    
      0
      37
      medium
      Large
      3BHK-Apt
      Owned
      Medium
      10-20yrs
      3
      2-5yrs
      0.766845
      1
      2
      1
      DD
      Fixed
      Fixed
      Yes
    
    
      1
      44
      High
      Small
      2BHK-Apt
      Owned
      Low
      >30yrs
      1
      >5yrs
      1.339218
      3
      7
      6
      Others
      Variable
      Variable
      Yes
    
    
      2
      44
      medium
      Medium
      1BHK-Apt
      Rented
      Medium
      10-20yrs
      4
      >5yrs
      9.591932
      2
      8
      1
      Others
      Fixed
      Fixed
      No
    
    
      3
      36
      High
      Medium
      1BHK-Apt
      Owned
      High
      20-30yrs
      5
      2-5yrs
      3.479652
      1
      7
      1
      Others
      Fixed
      Fixed
      No
    
    
      4
      35
      low
      Medium
      1BHK-Apt
      Owned
      Medium
      <10yrs
      2
      2-5yrs
      3.576235
      2
      1
      0
      DD
      Variable
      Variable
      No



In [4]:

    
df.describe()









    Out[4]:







  
    
      
      age
      freqOfMoving
      avgDailyUsage
      lateFeePayments
      AvgComplaintsRaised
      AvgComplaintsResolved
    
  
  
    
      count
      5000.000000
      5000.000000
      5000.000000
      5000.000000
      5000.00000
      5000.000000
    
    
      mean
      39.470000
      3.583400
      4.679484
      1.422400
      5.01520
      2.087600
    
    
      std
      7.943634
      2.233213
      3.394101
      1.314353
      3.15369
      2.223357
    
    
      min
      17.000000
      1.000000
      0.059012
      0.000000
      0.00000
      0.000000
    
    
      25%
      34.000000
      2.000000
      2.220354
      0.000000
      2.00000
      0.000000
    
    
      50%
      39.000000
      3.000000
      3.857608
      1.000000
      5.00000
      1.000000
    
    
      75%
      45.000000
      5.000000
      6.291110
      2.000000
      8.00000
      3.000000
    
    
      max
      81.000000
      10.000000
      30.021618
      4.000000
      10.00000
      9.000000



In [ ]:



In [5]:

    
df.var()









    Out[5]:





age                      63.101320
freqOfMoving              4.987242
avgDailyUsage            11.519922
lateFeePayments           1.727524
AvgComplaintsRaised       9.945758
AvgComplaintsResolved     4.943315
dtype: float64



In [ ]:



In [ ]:



In [6]:

    
plt.figure(figsize=(10,6))
sns.swarmplot(x='age',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    Out[6]:





<matplotlib.legend.Legend at 0x7fd4c16c9470>



In [7]:

    
df.loc[(df.Target=='Yes')].age.min(),df.loc[(df.Target=='Yes')].age.max()









    Out[7]:





(17, 72)



In [8]:

    
df.loc[(df.Target=='Yes')].age.mean()









    Out[8]:





39.61969111969112



In [9]:

    
plt.figure(figsize=(10,6))
sns.boxplot(x='freqOfMoving',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    Out[9]:





<matplotlib.legend.Legend at 0x7fd4bc27d0f0>



In [10]:

    
plt.figure(figsize=(10,6))
sns.countplot(x='freqOfMoving',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    



No handles with labels found to put in legend.






    Out[10]:





<matplotlib.legend.Legend at 0x7fd4bc24d080>



In [11]:

    
plt.figure(figsize=(10,6))
sns.swarmplot(x='avgDailyUsage',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    Out[11]:





<matplotlib.legend.Legend at 0x7fd4bc1e8f60>



In [12]:

    
plt.figure(figsize=(10,6))
sns.factorplot(x='lateFeePayments',col='Target',hue='Target',data=df,dodge=True,kind="count")
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    Out[12]:





<matplotlib.legend.Legend at 0x7fd4bc1549b0>






    





<Figure size 720x432 with 0 Axes>



In [13]:

    
plt.figure(figsize=(10,6))
sns.swarmplot(x='AvgComplaintsRaised',y='Target',hue='Target',data=df,dodge=True)
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    Out[13]:





<matplotlib.legend.Legend at 0x7fd4bc1a7da0>



In [14]:

    
plt.figure(figsize=(10,6))
sns.factorplot(x='AvgComplaintsResolved',col='Target',hue='Target',data=df,dodge=True,kind="count")
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    Out[14]:





<matplotlib.legend.Legend at 0x7fd4b7fcf278>






    





<Figure size 720x432 with 0 Axes>



In [15]:

    
plt.figure(figsize=(10,6))
sns.factorplot(x='AvgComplaintsRaised',col='Target',hue='Target',data=df,dodge=True,kind="count")
plt.legend(bbox_to_anchor=(1, 1), loc=2)









    Out[15]:





<matplotlib.legend.Legend at 0x7fd4b7faeb70>






    





<Figure size 720x432 with 0 Axes>



In [16]:

    
cor = df.corr()
sns.heatmap(cor)









    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd4b7d74ba8>



In [17]:

    
len(df.columns)









    Out[17]:





17



In [18]:

    
## One hot encoding the categorical variables.
df_encoded = pd.get_dummies(data=df)
try:
    df_encoded.loc[df['Target'] == 'Yes', 'Target'] = 0
    df_encoded.loc[df['Target'] == 'No', 'Target'] = 1
except:
    pass



In [19]:

    
df_encoded.columns









    Out[19]:





Index(['age', 'freqOfMoving', 'avgDailyUsage', 'lateFeePayments',
       'AvgComplaintsRaised', 'AvgComplaintsResolved', 'education_High',
       'education_low', 'education_medium', 'familySize_Large',
       'familySize_Medium', 'familySize_Small', 'typeOfHousing_1BHK-Apt',
       'typeOfHousing_2BHK-Apt', 'typeOfHousing_3BHK-Apt',
       'typeOfHousing_5BHK-House', 'typeOfHousing_Mansion',
       'houseOwnership_Owned', 'houseOwnership_Rented',
       'householdAnnualIncome_High', 'householdAnnualIncome_Low',
       'householdAnnualIncome_Medium', 'ageOfBuilding_10-20yrs',
       'ageOfBuilding_20-30yrs', 'ageOfBuilding_<10yrs',
       'ageOfBuilding_>30yrs', 'ageOfCustomerAccountInDays_1-2yrs',
       'ageOfCustomerAccountInDays_2-5yrs', 'ageOfCustomerAccountInDays_<1yr',
       'ageOfCustomerAccountInDays_>10yrs', 'ageOfCustomerAccountInDays_>5yrs',
       'modeOfPayment_DD', 'modeOfPayment_Others', 'modeOfPayment_PPM',
       'modeOfPayment_SD', 'typeOfPlanI_Fixed', 'typeOfPlanI_Variable',
       'typeOfPlanII_Fixed', 'typeOfPlanII_Variable', 'Target_No',
       'Target_Yes', 'Target'],
      dtype='object')



In [20]:

    
ic = ['age','freqOfMoving','avgDailyUsage',
                  'lateFeePayments','AvgComplaintsRaised','AvgComplaintsResolved'
                 ]

mean = df_encoded.loc[(df_encoded[ic[0]] < 60),ic[0]].mean()
df_encoded[ic[0]] = df_encoded[ic[0]].mask(df_encoded[ic[0]] > 60,mean)

mean = df_encoded.loc[(df_encoded[ic[1]] < 8),ic[1]].mean()
df_encoded[ic[1]] = df_encoded[ic[1]].mask(df_encoded[ic[1]] > 8,mean)

mean = df_encoded.loc[(df_encoded[ic[2]] < 20),ic[2]].mean()
df_encoded[ic[2]] = df_encoded[ic[2]].mask(df_encoded[ic[2]] > 20,mean)

#df.loc[(df[ic[1]] > 7)] = df.loc[(df[ic[1]] <= 7)].mean()
#df.loc[(df[ic[2]] > 18)] = df.loc[(df[ic[2]] <= 18)].mean()
#df.loc[(df[ic[0]] > 50)] = df.loc[(df[ic[0]] < 50)].mean()
#df.loc[(df[ic[0]] > 50)] = df.loc[(df[ic[0]] < 50)].mean()
#df.loc[(df[ic[0]] > 50)] = df.loc[(df[ic[0]] < 50)].mean()



In [21]:

    
train,test,train_label,test_label = train_test_split(df_encoded.iloc[:,:-1].values, df_encoded['Target'].values, test_size=0.33, random_state=42)



In [22]:

    
train_label









    Out[22]:





array([0., 1., 1., ..., 1., 1., 1.])



In [27]:

    
##Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(train,train_label)



In [ ]:



In [28]:

    
pred = clf.predict(test)



In [29]:

    
clf.score(test,test_label)









    Out[29]:





1.0



In [233]:

    
f1_score(test_label,pred)









    Out[233]:





1.0



In [234]:

    
##Random Forest
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                              random_state=0)
clf_rf = clf_rf.fit(train,train_label)
print(clf_rf.score(test,test_label))

1.0



In [235]:

    
np.where(test_label==1)









    Out[235]:





(array([   0,    1,    2, ..., 1644, 1645, 1646]),)



In [204]:

    
train









    Out[204]:





array([[50.        ,  2.98421414,  3.66868092, ...,  0.        ,
         0.        ,  1.        ],
       [47.        ,  1.        ,  3.66588082, ...,  0.        ,
         1.        ,  0.        ],
       [38.        ,  2.98421414,  1.63093267, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [40.        ,  5.        ,  7.23437268, ...,  1.        ,
         1.        ,  0.        ],
       [46.        ,  2.        , 10.84116965, ...,  1.        ,
         1.        ,  0.        ],
       [57.        ,  2.        ,  4.10219715, ...,  1.        ,
         1.        ,  0.        ]])



In [205]:

    
clf_rf.feature_importances_









    Out[205]:





array([5.30130261e-02, 1.07748475e-02, 7.60581500e-02, 2.83062084e-02,
       2.47603265e-02, 4.60592370e-02, 1.12010982e-02, 4.77218673e-03,
       1.32421837e-05, 3.22052084e-03, 1.87980666e-03, 3.33152214e-03,
       3.54234928e-03, 2.19578759e-03, 3.95327577e-03, 4.44863819e-03,
       4.02980487e-04, 9.64025622e-03, 3.49802492e-03, 4.83102884e-03,
       0.00000000e+00, 7.52926097e-04, 2.86801992e-05, 0.00000000e+00,
       8.76194557e-03, 1.25040461e-02, 1.16447746e-02, 7.50770312e-03,
       3.15435298e-02, 0.00000000e+00, 1.01026214e-03, 6.37543846e-03,
       2.42078904e-03, 1.39797406e-05, 0.00000000e+00, 9.37999886e-03,
       6.54056114e-03, 3.41165577e-03, 6.04679561e-03, 2.86866077e-01,
       3.09288323e-01])



In [206]:

    
import operator

features_importance = dict()
for key,val in zip(df_encoded.columns,clf_rf.feature_importances_):
    features_importance[key] = val

features_importance









    Out[206]:





{'age': 0.05301302607434247,
 'freqOfMoving': 0.010774847518680153,
 'avgDailyUsage': 0.07605814998429931,
 'lateFeePayments': 0.0283062084180509,
 'AvgComplaintsRaised': 0.024760326475224793,
 'AvgComplaintsResolved': 0.046059236988317494,
 'education_High': 0.011201098169394791,
 'education_low': 0.0047721867307148405,
 'education_medium': 1.3242183697348612e-05,
 'familySize_Large': 0.0032205208390761564,
 'familySize_Medium': 0.0018798066635057375,
 'familySize_Small': 0.003331522141859674,
 'typeOfHousing_1BHK-Apt': 0.00354234928344241,
 'typeOfHousing_2BHK-Apt': 0.002195787590264528,
 'typeOfHousing_3BHK-Apt': 0.003953275774472823,
 'typeOfHousing_5BHK-House': 0.004448638193550461,
 'typeOfHousing_Mansion': 0.0004029804869143282,
 'houseOwnership_Owned': 0.009640256218168329,
 'houseOwnership_Rented': 0.0034980249205378965,
 'householdAnnualIncome_High': 0.004831028839618063,
 'householdAnnualIncome_Low': 0.0,
 'householdAnnualIncome_Medium': 0.0007529260968558358,
 'ageOfBuilding_10-20yrs': 2.8680199232491903e-05,
 'ageOfBuilding_20-30yrs': 0.0,
 'ageOfBuilding_<10yrs': 0.008761945573737441,
 'ageOfBuilding_>30yrs': 0.012504046072960124,
 'ageOfCustomerAccountInDays_1-2yrs': 0.011644774585946961,
 'ageOfCustomerAccountInDays_2-5yrs': 0.007507703120868394,
 'ageOfCustomerAccountInDays_<1yr': 0.031543529774350675,
 'ageOfCustomerAccountInDays_>10yrs': 0.0,
 'ageOfCustomerAccountInDays_>5yrs': 0.0010102621421468627,
 'modeOfPayment_DD': 0.006375438464083601,
 'modeOfPayment_Others': 0.0024207890418982392,
 'modeOfPayment_PPM': 1.397974055240929e-05,
 'modeOfPayment_SD': 0.0,
 'typeOfPlanI_Fixed': 0.009379998855156023,
 'typeOfPlanI_Variable': 0.006540561138840899,
 'typeOfPlanII_Fixed': 0.003411655769619071,
 'typeOfPlanII_Variable': 0.006046795612964422,
 'Target_No': 0.2868660769057172,
 'Target_Yes': 0.30928832341093687}



In [ ]:

    
##select worthy features
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
    
imp_features = take(10,sorted(imp_features.items(),key=operator.itemgetter(1))



In [32]:

    
clf_rf.score(test,test_label)









    Out[32]:





0.7909090909090909



In [31]:

    
df_encoded.columns









    Out[31]:





Index(['age', 'freqOfMoving', 'avgDailyUsage', 'lateFeePayments',
       'AvgComplaintsRaised', 'AvgComplaintsResolved', 'education_High',
       'education_low', 'education_medium', 'familySize_Large',
       'familySize_Medium', 'familySize_Small', 'typeOfHousing_1BHK-Apt',
       'typeOfHousing_2BHK-Apt', 'typeOfHousing_3BHK-Apt',
       'typeOfHousing_5BHK-House', 'typeOfHousing_Mansion',
       'houseOwnership_Owned', 'houseOwnership_Rented',
       'householdAnnualIncome_High', 'householdAnnualIncome_Low',
       'householdAnnualIncome_Medium', 'ageOfBuilding_10-20yrs',
       'ageOfBuilding_20-30yrs', 'ageOfBuilding_<10yrs',
       'ageOfBuilding_>30yrs', 'ageOfCustomerAccountInDays_1-2yrs',
       'ageOfCustomerAccountInDays_2-5yrs', 'ageOfCustomerAccountInDays_<1yr',
       'ageOfCustomerAccountInDays_>10yrs', 'ageOfCustomerAccountInDays_>5yrs',
       'modeOfPayment_DD', 'modeOfPayment_Others', 'modeOfPayment_PPM',
       'modeOfPayment_SD', 'typeOfPlanI_Fixed', 'typeOfPlanI_Variable',
       'typeOfPlanII_Fixed', 'typeOfPlanII_Variable'],
      dtype='object')



In [33]:

    
##Naive Bayes

clf_nb = GaussianNB()
clf_nb.fit(train,train_label)
clf_nb.score(test,test_label)









    Out[33]:





0.7903030303030303



In [39]:

    
## Remove Outliers
df_encoded['Target'] = df['Target']
df_without_ol = df_encoded.copy()
df_without_ol = df_without_ol[(np.abs(stats.zscore(df_without_ol)) < 3).all(axis=1)]
df_without_ol.shape,df_encoded.shape









    Out[39]:





((4424, 40), (5000, 40))



In [44]:

    
len(df_without_ol.columns)









    Out[44]:





40



In [46]:

    
train,test,train_label,test_label = train_test_split(df_without_ol.iloc[:,0:39].values, df_without_ol['Target'].values, test_size=0.33, random_state=42)



In [47]:

    
##Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(train,train_label)
clf.score(test,test_label)









    Out[47]:





0.6438356164383562



In [48]:

    
##Random Forest
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                              random_state=0)
clf_rf = clf_rf.fit(train,train_label)
print(clf_rf.score(test,test_label))









    



0.7883561643835616



In [49]:

    
##Naive Bayes

clf_nb = GaussianNB()
clf_nb.fit(train,train_label)
clf_nb.score(test,test_label)









    Out[49]:





0.7719178082191781



In [50]:

    
clf_rf.feature_importances_









    Out[50]:





array([0.08988646, 0.06531749, 0.2115208 , 0.01232879, 0.05935326,
       0.10696962, 0.01787085, 0.0029564 , 0.0120856 , 0.0185885 ,
       0.00822154, 0.04581109, 0.02680419, 0.00390294, 0.0012091 ,
       0.03112349, 0.00737305, 0.002809  , 0.01456979, 0.00396808,
       0.0192716 , 0.00179743, 0.00372291, 0.00617589, 0.00942843,
       0.01256241, 0.01568041, 0.00213726, 0.00939205, 0.        ,
       0.01672526, 0.        , 0.00539225, 0.00583584, 0.01542894,
       0.04231056, 0.01217723, 0.02699576, 0.05229574])



In [51]:

    
df.columns









    Out[51]:





Index(['age', 'education', 'familySize', 'typeOfHousing', 'houseOwnership',
       'householdAnnualIncome', 'ageOfBuilding', 'freqOfMoving',
       'ageOfCustomerAccountInDays', 'avgDailyUsage', 'lateFeePayments',
       'AvgComplaintsRaised', 'AvgComplaintsResolved', 'modeOfPayment',
       'typeOfPlanI', 'typeOfPlanII', 'Target'],
      dtype='object')



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	age	education	familySize	typeOfHousing	houseOwnership	householdAnnualIncome	ageOfBuilding	freqOfMoving	ageOfCustomerAccountInDays	avgDailyUsage	lateFeePayments	AvgComplaintsRaised	AvgComplaintsResolved	modeOfPayment	typeOfPlanI	typeOfPlanII	Target
0	37	medium	Large	3BHK-Apt	Owned	Medium	10-20yrs	3	2-5yrs	0.766845	1	2	1	DD	Fixed	Fixed	Yes
1	44	High	Small	2BHK-Apt	Owned	Low	>30yrs	1	>5yrs	1.339218	3	7	6	Others	Variable	Variable	Yes
2	44	medium	Medium	1BHK-Apt	Rented	Medium	10-20yrs	4	>5yrs	9.591932	2	8	1	Others	Fixed	Fixed	No
3	36	High	Medium	1BHK-Apt	Owned	High	20-30yrs	5	2-5yrs	3.479652	1	7	1	Others	Fixed	Fixed	No
4	35	low	Medium	1BHK-Apt	Owned	Medium	<10yrs	2	2-5yrs	3.576235	2	1	0	DD	Variable	Variable	No

	age	freqOfMoving	avgDailyUsage	lateFeePayments	AvgComplaintsRaised	AvgComplaintsResolved
count	5000.000000	5000.000000	5000.000000	5000.000000	5000.00000	5000.000000
mean	39.470000	3.583400	4.679484	1.422400	5.01520	2.087600
std	7.943634	2.233213	3.394101	1.314353	3.15369	2.223357
min	17.000000	1.000000	0.059012	0.000000	0.00000	0.000000
25%	34.000000	2.000000	2.220354	0.000000	2.00000	0.000000
50%	39.000000	3.000000	3.857608	1.000000	5.00000	1.000000
75%	45.000000	5.000000	6.291110	2.000000	8.00000	3.000000
max	81.000000	10.000000	30.021618	4.000000	10.00000	9.000000