pandas-lecture-01-telecome-churn



In [46]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('../data/telecom_churn.csv')

In [7]:
data.head()


Out[7]:
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
0 KS 128 415 No Yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False
1 OH 107 415 No Yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False
2 NJ 137 415 No No 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False
3 OH 84 408 Yes No 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False
4 OK 75 415 Yes No 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False

In [10]:
data.shape


Out[10]:
(3333, 20)

In [14]:
print(data.columns)


Index([u'State', u'Account length', u'Area code', u'International plan',
       u'Voice mail plan', u'Number vmail messages', u'Total day minutes',
       u'Total day calls', u'Total day charge', u'Total eve minutes',
       u'Total eve calls', u'Total eve charge', u'Total night minutes',
       u'Total night calls', u'Total night charge', u'Total intl minutes',
       u'Total intl calls', u'Total intl charge', u'Customer service calls',
       u'Churn'],
      dtype='object')

In [16]:
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
State                     3333 non-null object
Account length            3333 non-null int64
Area code                 3333 non-null int64
International plan        3333 non-null object
Voice mail plan           3333 non-null object
Number vmail messages     3333 non-null int64
Total day minutes         3333 non-null float64
Total day calls           3333 non-null int64
Total day charge          3333 non-null float64
Total eve minutes         3333 non-null float64
Total eve calls           3333 non-null int64
Total eve charge          3333 non-null float64
Total night minutes       3333 non-null float64
Total night calls         3333 non-null int64
Total night charge        3333 non-null float64
Total intl minutes        3333 non-null float64
Total intl calls          3333 non-null int64
Total intl charge         3333 non-null float64
Customer service calls    3333 non-null int64
Churn                     3333 non-null bool
dtypes: bool(1), float64(8), int64(8), object(3)
memory usage: 498.1+ KB
None

In [23]:
data['Churn'] = data['Churn'].astype('int64')

In [24]:
print(data.describe())


       Account length    Area code  Number vmail messages  Total day minutes  \
count     3333.000000  3333.000000            3333.000000        3333.000000   
mean       101.064806   437.182418               8.099010         179.775098   
std         39.822106    42.371290              13.688365          54.467389   
min          1.000000   408.000000               0.000000           0.000000   
25%         74.000000   408.000000               0.000000         143.700000   
50%        101.000000   415.000000               0.000000         179.400000   
75%        127.000000   510.000000              20.000000         216.400000   
max        243.000000   510.000000              51.000000         350.800000   

       Total day calls  Total day charge  Total eve minutes  Total eve calls  \
count      3333.000000       3333.000000        3333.000000      3333.000000   
mean        100.435644         30.562307         200.980348       100.114311   
std          20.069084          9.259435          50.713844        19.922625   
min           0.000000          0.000000           0.000000         0.000000   
25%          87.000000         24.430000         166.600000        87.000000   
50%         101.000000         30.500000         201.400000       100.000000   
75%         114.000000         36.790000         235.300000       114.000000   
max         165.000000         59.640000         363.700000       170.000000   

       Total eve charge  Total night minutes  Total night calls  \
count       3333.000000          3333.000000        3333.000000   
mean          17.083540           200.872037         100.107711   
std            4.310668            50.573847          19.568609   
min            0.000000            23.200000          33.000000   
25%           14.160000           167.000000          87.000000   
50%           17.120000           201.200000         100.000000   
75%           20.000000           235.300000         113.000000   
max           30.910000           395.000000         175.000000   

       Total night charge  Total intl minutes  Total intl calls  \
count         3333.000000         3333.000000       3333.000000   
mean             9.039325           10.237294          4.479448   
std              2.275873            2.791840          2.461214   
min              1.040000            0.000000          0.000000   
25%              7.520000            8.500000          3.000000   
50%              9.050000           10.300000          4.000000   
75%             10.590000           12.100000          6.000000   
max             17.770000           20.000000         20.000000   

       Total intl charge  Customer service calls        Churn  
count        3333.000000             3333.000000  3333.000000  
mean            2.764581                1.562856     0.144914  
std             0.753773                1.315491     0.352067  
min             0.000000                0.000000     0.000000  
25%             2.300000                1.000000     0.000000  
50%             2.780000                1.000000     0.000000  
75%             3.270000                2.000000     0.000000  
max             5.400000                9.000000     1.000000  

In [22]:
print(data.describe(include=['bool', 'object']))


       State International plan Voice mail plan
count   3333               3333            3333
unique    51                  2               2
top       WV                 No              No
freq     106               3010            2411

In [33]:
data['Churn'].value_counts()


Out[33]:
0    2850
1     483
Name: Churn, dtype: int64

In [35]:
data['Area code'].value_counts(normalize = True)


Out[35]:
415    0.496550
510    0.252025
408    0.251425
Name: Area code, dtype: float64

In [37]:
data.sort_values(by="Total day charge", ascending=False).head()


Out[37]:
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes Total eve calls Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn
365 CO 154 415 No No 0 350.8 75 59.64 216.5 94 18.40 253.9 100 11.43 10.1 9 2.73 1 1
985 NY 64 415 Yes No 0 346.8 55 58.96 249.5 79 21.21 275.4 102 12.39 13.3 9 3.59 1 1
2594 OH 115 510 Yes No 0 345.3 81 58.70 203.4 106 17.29 217.5 107 9.79 11.8 8 3.19 1 1
156 OH 83 415 No No 0 337.4 120 57.36 227.4 116 19.33 153.9 114 6.93 15.8 7 4.27 0 1
605 MO 112 415 No No 0 335.5 77 57.04 212.5 109 18.06 265.0 132 11.93 12.7 8 3.43 2 1

In [38]:
data['Churn'].mean()


Out[38]:
0.14491449144914492

In [40]:
data[data['Churn'] == 1].mean(), \
data[data['Churn'] == 0].mean()


Out[40]:
(Account length            102.664596
 Area code                 437.817805
 Number vmail messages       5.115942
 Total day minutes         206.914079
 Total day calls           101.335404
 Total day charge           35.175921
 Total eve minutes         212.410145
 Total eve calls           100.561077
 Total eve charge           18.054969
 Total night minutes       205.231677
 Total night calls         100.399586
 Total night charge          9.235528
 Total intl minutes         10.700000
 Total intl calls            4.163561
 Total intl charge           2.889545
 Customer service calls      2.229814
 Churn                       1.000000
 dtype: float64, Account length            100.793684
 Area code                 437.074737
 Number vmail messages       8.604561
 Total day minutes         175.175754
 Total day calls           100.283158
 Total day charge           29.780421
 Total eve minutes         199.043298
 Total eve calls           100.038596
 Total eve charge           16.918909
 Total night minutes       200.133193
 Total night calls         100.058246
 Total night charge          9.006074
 Total intl minutes         10.158877
 Total intl calls            4.532982
 Total intl charge           2.743404
 Customer service calls      1.449825
 Churn                       0.000000
 dtype: float64)

In [42]:
data[data['Churn'] == 1]['Total day minutes'].mean()


Out[42]:
206.91407867494823

In [44]:
data[(data['Churn'] == 0) & (data['International plan'] == 'No')]['Total intl minutes'].max()


Out[44]:
18.9

In [47]:
data.apply(np.max)


Out[47]:
State                        WY
Account length              243
Area code                   510
International plan          Yes
Voice mail plan             Yes
Number vmail messages        51
Total day minutes         350.8
Total day calls             165
Total day charge          59.64
Total eve minutes         363.7
Total eve calls             170
Total eve charge          30.91
Total night minutes         395
Total night calls           175
Total night charge        17.77
Total intl minutes           20
Total intl calls             20
Total intl charge           5.4
Customer service calls        9
Churn                         1
dtype: object

In [50]:
columns = ["Total day minutes", 'Total night minutes', 'Total intl minutes']
data.groupby(['Churn'])[columns].describe()


Out[50]:
Total day minutes Total night minutes Total intl minutes
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
Churn
0 2850.0 175.175754 50.181655 0.0 142.825 177.2 210.30 315.6 2850.0 200.133193 ... 234.90 395.0 2850.0 10.158877 2.784489 0.0 8.4 10.2 12.0 18.9
1 483.0 206.914079 68.997792 0.0 153.250 217.6 265.95 350.8 483.0 205.231677 ... 239.85 354.9 483.0 10.700000 2.793190 2.0 8.8 10.6 12.8 20.0

2 rows × 24 columns


In [52]:
data.groupby(['Churn'])[columns].agg([np.mean, np.std, np.min, np.max])


Out[52]:
Total day minutes Total night minutes Total intl minutes
mean std amin amax mean std amin amax mean std amin amax
Churn
0 175.175754 50.181655 0.0 315.6 200.133193 51.105032 23.2 395.0 10.158877 2.784489 0.0 18.9
1 206.914079 68.997792 0.0 350.8 205.231677 47.132825 47.4 354.9 10.700000 2.793190 2.0 20.0

In [53]:
pd.crosstab(data['Churn'], data['International plan'])


Out[53]:
International plan No Yes
Churn
0 2664 186
1 346 137

In [55]:
pd.crosstab(data['Churn'], data['Voice mail plan'], normalize=True)


Out[55]:
Voice mail plan No Yes
Churn
0 0.602460 0.252625
1 0.120912 0.024002

In [59]:
data.pivot_table(['Total day calls', 'Total day calls', 'Total night calls'], ['Area code'], aggfunc='sum'), \
data.pivot_table(['Total day calls', 'Total day calls', 'Total night calls'], ['Area code'], aggfunc='mean')


Out[59]:
(           Total day calls  Total day calls  Total night calls
 Area code                                                     
 408                  84216            84216              82995
 415                 166454           166454             166159
 510                  84082            84082              84505,
            Total day calls  Total day calls  Total night calls
 Area code                                                     
 408             100.496420       100.496420          99.039379
 415             100.576435       100.576435         100.398187
 510             100.097619       100.097619         100.601190)

In [61]:
data['Total calls'] = data['Total day calls'] + data['Total night calls'] + data['Total eve calls'] + data['Total intl calls']
data.head()


Out[61]:
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes ... Total eve charge Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn Total calls
0 KS 128 415 No Yes 25 265.1 110 45.07 197.4 ... 16.78 244.7 91 11.01 10.0 3 2.70 1 0 303
1 OH 107 415 No Yes 26 161.6 123 27.47 195.5 ... 16.62 254.4 103 11.45 13.7 3 3.70 1 0 332
2 NJ 137 415 No No 0 243.4 114 41.38 121.2 ... 10.30 162.6 104 7.32 12.2 5 3.29 0 0 333
3 OH 84 408 Yes No 0 299.4 71 50.90 61.9 ... 5.26 196.9 89 8.86 6.6 7 1.78 2 0 255
4 OK 75 415 Yes No 0 166.7 113 28.34 148.3 ... 12.61 186.9 121 8.41 10.1 3 2.73 3 0 359

5 rows × 21 columns


In [62]:
data['Total charge'] = data['Total day charge'] + data['Total night charge'] + data['Total eve charge'] + data['Total intl charge']
data.head()


Out[62]:
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes ... Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn Total calls Total charge
0 KS 128 415 No Yes 25 265.1 110 45.07 197.4 ... 244.7 91 11.01 10.0 3 2.70 1 0 303 75.56
1 OH 107 415 No Yes 26 161.6 123 27.47 195.5 ... 254.4 103 11.45 13.7 3 3.70 1 0 332 59.24
2 NJ 137 415 No No 0 243.4 114 41.38 121.2 ... 162.6 104 7.32 12.2 5 3.29 0 0 333 62.29
3 OH 84 408 Yes No 0 299.4 71 50.90 61.9 ... 196.9 89 8.86 6.6 7 1.78 2 0 255 66.80
4 OK 75 415 Yes No 0 166.7 113 28.34 148.3 ... 186.9 121 8.41 10.1 3 2.73 3 0 359 52.09

5 rows × 22 columns


In [68]:
data['Column for delete'] = 1
data.head()
data = data.drop(['Column for delete'], axis=1)
data.head()


Out[68]:
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes ... Total night minutes Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn Total calls Total charge
0 KS 128 415 No Yes 25 265.1 110 45.07 197.4 ... 244.7 91 11.01 10.0 3 2.70 1 0 303 75.56
1 OH 107 415 No Yes 26 161.6 123 27.47 195.5 ... 254.4 103 11.45 13.7 3 3.70 1 0 332 59.24
2 NJ 137 415 No No 0 243.4 114 41.38 121.2 ... 162.6 104 7.32 12.2 5 3.29 0 0 333 62.29
3 OH 84 408 Yes No 0 299.4 71 50.90 61.9 ... 196.9 89 8.86 6.6 7 1.78 2 0 255 66.80
4 OK 75 415 Yes No 0 166.7 113 28.34 148.3 ... 186.9 121 8.41 10.1 3 2.73 3 0 359 52.09

5 rows × 22 columns


In [75]:
pd.crosstab(data['Churn'], data['International plan'], margins=True, normalize=True) * 100


Out[75]:
International plan No Yes All
Churn
0 79.927993 5.580558 85.508551
1 10.381038 4.110411 14.491449
All 90.309031 9.690969 100.000000

In [74]:
pd.crosstab(data['Churn'], data['Customer service calls'], margins=True, normalize=True) * 100


Out[74]:
Customer service calls 0 1 2 3 4 5 6 7 8 9 All
Churn
0 18.151815 31.773177 20.162016 11.551155 2.700270 0.780078 0.240024 0.120012 0.030003 0.000000 85.508551
1 2.760276 3.660366 2.610261 1.320132 2.280228 1.200120 0.420042 0.150015 0.030003 0.060006 14.491449
All 20.912091 35.433543 22.772277 12.871287 4.980498 1.980198 0.660066 0.270027 0.060006 0.060006 100.000000

In [81]:
pd.crosstab(data[data['Churn'] == 0]['Churn'], data['Customer service calls'], margins=True, normalize=True) * 100


Out[81]:
Customer service calls 0 1 2 3 4 5 6 7 8 All
Churn
0 21.22807 37.157895 23.578947 13.508772 3.157895 0.912281 0.280702 0.140351 0.035088 100.0
All 21.22807 37.157895 23.578947 13.508772 3.157895 0.912281 0.280702 0.140351 0.035088 100.0

In [80]:
pd.crosstab(data[data['Churn'] == 1]['Churn'], data['Customer service calls'], margins=True, normalize=True) * 100


Out[80]:
Customer service calls 0 1 2 3 4 5 6 7 8 9 All
Churn
1 19.047619 25.258799 18.012422 9.109731 15.73499 8.281573 2.898551 1.035197 0.207039 0.414079 100.0
All 19.047619 25.258799 18.012422 9.109731 15.73499 8.281573 2.898551 1.035197 0.207039 0.414079 100.0

In [95]:
data['Many_service_calls'] = (data['Customer service calls'] > 3).astype('int')
data.head()


Out[95]:
State Account length Area code International plan Voice mail plan Number vmail messages Total day minutes Total day calls Total day charge Total eve minutes ... Total night calls Total night charge Total intl minutes Total intl calls Total intl charge Customer service calls Churn Total calls Total charge Many_service_calls
0 KS 128 415 No Yes 25 265.1 110 45.07 197.4 ... 91 11.01 10.0 3 2.70 1 0 303 75.56 0
1 OH 107 415 No Yes 26 161.6 123 27.47 195.5 ... 103 11.45 13.7 3 3.70 1 0 332 59.24 0
2 NJ 137 415 No No 0 243.4 114 41.38 121.2 ... 104 7.32 12.2 5 3.29 0 0 333 62.29 0
3 OH 84 408 Yes No 0 299.4 71 50.90 61.9 ... 89 8.86 6.6 7 1.78 2 0 255 66.80 0
4 OK 75 415 Yes No 0 166.7 113 28.34 148.3 ... 121 8.41 10.1 3 2.73 3 0 359 52.09 0

5 rows × 23 columns


In [96]:
pd.crosstab(data['Many_service_calls'], data['Churn'], margins=True)


Out[96]:
Churn 0 1 All
Many_service_calls
0 2721 345 3066
1 129 138 267
All 2850 483 3333

In [99]:
pd.crosstab(data['Many_service_calls'] & data['International plan'] , data['Churn'])


Out[99]:
Churn 0 1
row_0
False 2721 345
True 129 138