In [76]:
import pandas as pd
import numpy as np
%matplotlib inline
from bokeh.io import output_notebook
from bokeh.charts import Histogram, Bar, show
import os
output_notebook()
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})


Loading BokehJS ...

In [77]:
df = pd.read_excel('./data/CHE Feb 2017.xls', 
              skiprows=range(0,12),
              skip_footer=1, 
              )

In [78]:
df = df.drop([0])

In [79]:
df = df.drop(['Unnamed: 1', 'Unnamed: 2', 'Int', 'Tdm', 'Cost', 'Taxes',
           'Eq. Cost', 'Total Cost'], axis=1)

In [80]:
df.Total.describe()


Out[80]:
count     4534.000000
mean        88.087120
std        340.872229
min          1.000000
25%         10.000000
50%         35.000000
75%         88.000000
max      19002.000000
Name: Total, dtype: float64

In [81]:
df


Out[81]:
Number In Out Total Duration Avg. Dur. Long. Dur.
1 20100.0 1.0 2.0 3.0 00:01:52 00:00:37 00:00:56
2 27701.0 4.0 3.0 7.0 00:04:20 00:00:37 00:01:04
3 27702.0 75.0 38.0 113.0 05:25:50 00:02:53 00:30:16
4 27703.0 9.0 19.0 28.0 00:45:00 00:01:36 00:10:08
5 27704.0 12.0 4.0 16.0 01:20:14 00:05:00 00:21:32
6 27705.0 16.0 2.0 18.0 00:19:48 00:01:06 00:04:38
7 27706.0 30.0 6.0 36.0 01:54:20 00:03:10 00:19:10
8 27708.0 11.0 3.0 14.0 00:20:58 00:01:29 00:04:22
9 27709.0 20.0 10.0 30.0 00:29:12 00:00:58 00:11:28
10 27712.0 0.0 18.0 18.0 00:49:22 00:02:44 00:09:34
11 27713.0 0.0 6.0 6.0 01:20:02 00:13:20 00:27:28
12 27714.0 54.0 34.0 88.0 02:58:46 00:02:01 00:35:48
13 27717.0 1.0 2.0 3.0 00:03:48 00:01:16 00:02:10
14 27720.0 23.0 11.0 34.0 02:23:06 00:04:12 00:37:22
15 27725.0 21.0 8.0 29.0 02:23:16 00:04:56 00:31:28
16 27728.0 10.0 17.0 27.0 03:59:14 00:08:51 00:47:30
17 27731.0 1.0 0.0 1.0 00:00:04 00:00:04 00:00:04
18 27732.0 5.0 18.0 23.0 04:55:54 00:12:51 00:56:36
19 27733.0 1.0 0.0 1.0 00:00:06 00:00:06 00:00:06
20 27734.0 8.0 8.0 16.0 00:16:28 00:01:01 00:08:56
21 27735.0 17.0 9.0 26.0 02:06:58 00:04:53 00:59:44
22 27737.0 55.0 23.0 78.0 01:57:44 00:01:30 00:10:18
23 27738.0 46.0 28.0 74.0 01:40:06 00:01:21 00:16:20
24 27739.0 75.0 36.0 111.0 06:38:00 00:03:35 01:04:58
25 27740.0 3.0 0.0 3.0 00:01:50 00:00:36 00:01:02
26 27741.0 0.0 2.0 2.0 00:01:24 00:00:42 00:01:18
27 27742.0 8.0 7.0 15.0 04:02:26 00:16:09 00:45:06
28 27744.0 11.0 0.0 11.0 00:05:18 00:00:28 00:01:02
29 27745.0 1.0 0.0 1.0 00:00:06 00:00:06 00:00:06
30 27747.0 5.0 0.0 5.0 01:50:40 00:22:08 01:00:30
... ... ... ... ... ... ... ...
4505 3124350.0 507.0 166.0 673.0 14:02:32 00:01:15 00:22:00
4506 3124351.0 499.0 164.0 663.0 14:35:46 00:01:19 00:20:20
4507 3124352.0 28.0 0.0 28.0 00:05:20 00:00:11 00:00:26
4508 3124353.0 28.0 0.0 28.0 00:04:42 00:00:10 00:00:30
4509 3124354.0 33.0 0.0 33.0 00:03:48 00:00:06 00:00:20
4510 3124355.0 25.0 0.0 25.0 00:02:38 00:00:06 00:00:22
4511 3124356.0 27.0 0.0 27.0 00:03:14 00:00:07 00:00:16
4512 3124357.0 16.0 0.0 16.0 00:01:18 00:00:04 00:00:12
4513 3124358.0 24.0 0.0 24.0 00:03:06 00:00:07 00:00:18
4514 3124359.0 23.0 0.0 23.0 00:02:22 00:00:06 00:00:14
4515 3124370.0 21.0 0.0 21.0 00:02:20 00:00:06 00:00:20
4516 3124371.0 20.0 0.0 20.0 00:02:36 00:00:07 00:00:20
4517 3124372.0 27.0 0.0 27.0 00:03:36 00:00:08 00:00:26
4518 3124373.0 22.0 0.0 22.0 00:02:30 00:00:06 00:00:20
4519 3124374.0 29.0 0.0 29.0 00:03:38 00:00:07 00:00:20
4520 3124375.0 20.0 0.0 20.0 00:02:16 00:00:06 00:00:26
4521 3166532.0 1.0 0.0 1.0 00:56:20 00:56:20 00:56:20
4522 3166534.0 1.0 0.0 1.0 00:37:42 00:37:42 00:37:42
4523 3166535.0 1.0 0.0 1.0 00:39:12 00:39:12 00:39:12
4524 3166543.0 1.0 0.0 1.0 00:00:02 00:00:02 00:00:02
4525 3166544.0 3.0 0.0 3.0 00:03:24 00:01:08 00:03:16
4526 3166546.0 1.0 0.0 1.0 00:36:32 00:36:32 00:36:32
4527 3166548.0 2.0 0.0 2.0 00:39:46 00:19:53 00:39:44
4528 3166552.0 2.0 0.0 2.0 00:42:50 00:21:25 00:42:48
4529 3166554.0 1.0 0.0 1.0 00:39:32 00:39:32 00:39:32
4530 3166556.0 1.0 0.0 1.0 00:00:08 00:00:08 00:00:08
4531 3166598.0 1.0 0.0 1.0 00:54:22 00:54:22 00:54:22
4532 3167802.0 0.0 2.0 2.0 00:00:12 00:00:06 00:00:06
4533 3167878.0 1.0 0.0 1.0 00:02:08 00:02:08 00:02:08
4534 3168654.0 0.0 1.0 1.0 00:00:02 00:00:02 00:00:02

4534 rows × 7 columns


In [82]:
p = Histogram(df[df['Total'] < 500]['Total'], title='Histogram of number of total calls')

In [83]:
show(p)



In [ ]:


In [84]:
df.sort_values('Total', ascending=False).to_csv('num_calls.csv', header=True)

Load all the data


In [85]:
def is_five_digit_num(row):
    """
    Is row num = 5 digits
    """
    try: 
        num = int(row.Number)
    except ValueError:
        return False
    if num > 9999 and num <= 99999:
        return True
    else:
        return False

In [86]:
def load_data(filename):
    """
    Takes a filename, returns df 
    """
    df = None
    if filename == 'CHE March 2017.xls':
        df = pd.read_excel('./data/CHE March 2017.xls',skiprows=range(0,12),skip_footer=12)
        df['Number'] = df['Unnamed: 0']
        df.drop(['Name'], axis=1, inplace=True)
        df.drop(['Unnamed: 1', 'Unnamed: 2', 'Int', 'Tdm', 'Cost', 'Taxes',
           'Eq. Cost', 'Total Cost', 'Unnamed: 15', 'Unnamed: 0'], axis=1, inplace=True)
    else:
        df = pd.read_excel('./data/' + filename, 
                  skiprows=range(0,12),
                  skip_footer=1)
        df.drop([0], inplace=True)
        df.drop(['Unnamed: 1', 'Unnamed: 2', 'Int', 'Tdm', 'Cost', 'Taxes',
               'Eq. Cost', 'Total Cost'], axis=1, inplace=True)
    # Drop nan rows
    df.dropna(inplace=True)
    # drop rows where calls > 1000
    df = df[df['Total'] < 1000]
    df['Include'] = df.apply(is_five_digit_num, axis=1)
    df = df[df['Include'] == True]
    del df['Include']
    return df

In [87]:
files = os.listdir('data')
data = [load_data(file) for file in files]

In [88]:
grouped = pd.concat(data).groupby('Number')

In [128]:
annual_totals = grouped.sum()
print(len(annual_totals))


5700

CityFone Data


In [129]:
cityfone_df = pd.read_excel('./CityFone Extract 20170518 - Whole City - From Ken - UnitNbr Formatted as Text.xlsx', converters={'WRK_TEL_NBR':str})
print(len(cityfone_df))


30253

In [130]:
cityfone_df['Number'] = cityfone_df.WRK_TEL_NBR.apply(lambda x: str(x)[-7:])
cityfone_df['Number'].replace('       ', np.nan, inplace=True)
cityfone_df['Number'].replace('nan', np.nan, inplace=True)
cityfone_df.dropna(subset=['Number'], inplace=True)
print(cityfone_df.columns)
cityfone_df.rename(columns=lambda c: c.replace(" ", ""), inplace=True)
print(cityfone_df.columns)


Index(['WRK_TEL_NBR', 'WRK_TEL_NBR2', 'EMP_LST_NM', 'EMP_FRST_NM',
       'EXT_DEPT_ID', 'DEPT_NAME', 'STR_ADDR_TXT', 'LOC_UNIT_LBL_TXT',
       'LOC_UNIT_NBR', 'CELLR_TEL_NBR', 'cellr_co_cd', 'cellr_txt_cpbl_ind',
       'ALTN_WRK_TEL_NBR', 'Number'],
      dtype='object')
Index(['WRK_TEL_NBR', 'WRK_TEL_NBR2', 'EMP_LST_NM', 'EMP_FRST_NM',
       'EXT_DEPT_ID', 'DEPT_NAME', 'STR_ADDR_TXT', 'LOC_UNIT_LBL_TXT',
       'LOC_UNIT_NBR', 'CELLR_TEL_NBR', 'cellr_co_cd', 'cellr_txt_cpbl_ind',
       'ALTN_WRK_TEL_NBR', 'Number'],
      dtype='object')

Catagorization


In [131]:
def catagorize(row): 
    """
    Return High, Medium, Low catagory 
    """
    if row['Total'] >= len(data) * 201:
        return 'High'
    elif row['Total'] >= len(data) * 41:
        return 'Medium'
    else:
        return 'Low'

In [132]:
annual_totals['catagory'] = annual_totals.apply(catagorize, axis=1)

In [133]:
ax = annual_totals.catagory.value_counts().plot(kind='bar', title='All CHE PBX Lines, April-March Dataset')
ax.set_xlabel('Number of Calls/Month Low <=40, Med 41-200, High >200')
ax.set_ylabel('Number of Phone Lines')
fig = ax.get_figure()
fig.savefig('all_che.png')


/Users/hunterowens/anaconda3/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [134]:
p1 = Bar(annual_totals.catagory.value_counts(), title='System Number of Users, by Type',
                                                palette=['red', 'green', 'blue'],
                                                )

Merge all the data


In [135]:
def make_seven_digit(number):
    """
    makes a seven digit number from the PBX data. 
    """
    first_digit = str(number)[0]
    if first_digit == '3':
        retval = '47' + str(number)
    elif first_digit == '2':
        retval = '92' + str(number)
    elif first_digit == '8':
        if str(number)[:3] == '895':
            retval = '92' + str(number)
        else:
            retval = '97' + str(number)
    return float(retval)

In [136]:
annual_totals.index = annual_totals.index.map(make_seven_digit)

In [137]:
def number_of_cityfone_matches(row):
    """
    Find the number of Cityfone matches
    """
    number_totals = row.name
    cityfone_df['Number'] = pd.to_numeric(cityfone_df.Number)
    df = cityfone_df[cityfone_df['Number'] == number_totals]
    return len(df)

In [138]:
#annual_totals['number_of_cityfone_matches'] = annual_totals.apply(number_of_cityfone_matches, axis =1)
#annual_totals.apply(number_of_cityfone_matches, axis =1)

In [139]:
#annual_totals.to_csv('pbx_data_with_cityfone_matches.csv')

Cityfone Perf Matching


In [140]:
def cityfone_matcher(row, *args):
    """
    Match to CityFone when perfect match  
    """
    number_totals = row.name
    cityfone_df['Number'] = pd.to_numeric(cityfone_df["Number"])
    df = cityfone_df[cityfone_df['Number'] == number_totals]
    try:
        val = df[args[0]].iloc[0]
    except IndexError: 
        return np.NaN
    return val

In [141]:
annual_totals['Last_Name'] = annual_totals.apply(cityfone_matcher, axis=1, args=('EMP_LST_NM',))
annual_totals['First_Name'] = annual_totals.apply(cityfone_matcher, axis=1, args=('EMP_FRST_NM',))
annual_totals['Dept'] = annual_totals.apply(cityfone_matcher, axis=1, args=('DEPT_NAME',))
annual_totals['address'] = annual_totals.apply(cityfone_matcher, axis=1, args=('STR_ADDR_TXT',))
annual_totals['LOC_UNIT_NBR'] = annual_totals.apply(cityfone_matcher, axis=1, args=('LOC_UNIT_NBR',))
annual_totals['LOC_UNIT_LBL_TXT'] = annual_totals.apply(cityfone_matcher, axis=1, args=('LOC_UNIT_LBL_TXT',))
annual_totals['CELLR_TEL_NBR'] = annual_totals.apply(cityfone_matcher, axis=1, args=('CELLR_TEL_NBR',))
annual_totals['ALTN_WRK_TEL_NBR'] = annual_totals.apply(cityfone_matcher, axis=1, args=('ALTN_WRK_TEL_NBR',))
annual_totals['cellr_co_cd'] = annual_totals.apply(cityfone_matcher, axis=1, args=('cellr_co_cd',))
annual_totals['cellr_txt_cpbl_ind'] = annual_totals.apply(cityfone_matcher, axis=1, args=('cellr_txt_cpbl_ind',))

In [142]:
ita = annual_totals[annual_totals['Dept'] == 'Information Technology Agency']
print(len(ita))


287

In [143]:
ax = ita.catagory.value_counts().plot(kind='bar', title='ITA CHE PBX Lines, April-March Dataset')
ax.set_xlabel('Number of Calls/Month Low <=40, Med 41-200, High >200')
ax.set_ylabel('Number of Phone Lines')
fig = ax.get_figure()
fig.savefig('all_ita.png')


/Users/hunterowens/anaconda3/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [144]:
annual_totals.to_csv('annual_totals_with_location.csv')

In [145]:
ita.catagory.value_counts()


Out[145]:
Low       171
Medium    106
High       10
Name: catagory, dtype: int64

Match with Averages, rather than Sum


In [146]:
annual_avg = grouped.mean()

In [147]:
annual_avg.index = annual_avg.index.map(make_seven_digit)

In [148]:
annual_avg.rename(columns={'In': 'In_avg', 'Out': 'Out_avg', 'Total':'Total_avg'}, inplace=True)

In [149]:
joined_again = annual_avg.join(annual_totals)

In [150]:
joined_again.columns


Out[150]:
Index(['In_avg', 'Out_avg', 'Total_avg', 'In', 'Out', 'Total', 'catagory',
       'Last_Name', 'First_Name', 'Dept', 'address', 'LOC_UNIT_NBR',
       'LOC_UNIT_LBL_TXT', 'CELLR_TEL_NBR', 'ALTN_WRK_TEL_NBR', 'cellr_co_cd',
       'cellr_txt_cpbl_ind'],
      dtype='object')

In [151]:
joined_again['phone_number'] = joined_again.index.values

Ordering

* Department
* Phone #
* First Name
* Last Name
* Address
* Loc Unit Lbl Txt
* Loc Unit Nbr
* Category
* Avg Monthly Calls In
* Avg Monthly Calls Out
* Avg Monthly Calls Total

In [152]:
## Remove 0s from datset
joined_again = joined_again.replace(to_replace=0,value='')
joined_again = joined_again.replace(to_replace='0',value='')
joined_again.ix[9227734]


Out[152]:
In_avg                                      3.72727
Out_avg                                     9.27273
Total_avg                                        13
In                                               41
Out                                             102
Total                                           143
catagory                                        Low
Last_Name                                SHAHBAZIAN
First_Name                                  GAIDZAG
Dept                  Information Technology Agency
address                    200 N MAIN ST 13TH FLOOR
LOC_UNIT_NBR                             13-39     
LOC_UNIT_LBL_TXT               Cubicle             
CELLR_TEL_NBR                                   NaN
ALTN_WRK_TEL_NBR                                NaN
cellr_co_cd                                     NaN
cellr_txt_cpbl_ind                              NaN
phone_number                            9.22773e+06
Name: 9227734.0, dtype: object

In [156]:
joined_again.to_csv("ordered_avg_draft.csv", sep=',', encoding='utf-8', header=True, index=False,
          columns=["Dept","phone_number","First_Name","Last_Name","address", "LOC_UNIT_LBL_TXT", "LOC_UNIT_NBR",
                  "catagory","In_avg","Out_avg","Total_avg", "CELLR_TEL_NBR", "cellr_co_cd",
                  "cellr_txt_cpbl_ind", "ALTN_WRK_TEL_NBR"])

In [154]:
joined_again.loc[9227732]


Out[154]:
In_avg                                         2.75
Out_avg                                          11
Total_avg                                     13.75
In                                               22
Out                                              88
Total                                           110
catagory                                        Low
Last_Name                                     OWENS
First_Name                                   HUNTER
Dept                  Information Technology Agency
address                    200 N MAIN ST 14TH FLOOR
LOC_UNIT_NBR                             14-265    
LOC_UNIT_LBL_TXT               Cubicle             
CELLR_TEL_NBR                                   NaN
ALTN_WRK_TEL_NBR                                NaN
cellr_co_cd                                     NaN
cellr_txt_cpbl_ind                              NaN
phone_number                            9.22773e+06
Name: 9227732.0, dtype: object

In [157]:
len(ita)


Out[157]:
287

In [ ]: