In [76]:
import pandas as pd
import numpy as np
%matplotlib inline
from bokeh.io import output_notebook
from bokeh.charts import Histogram, Bar, show
import os
output_notebook()
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
In [77]:
df = pd.read_excel('./data/CHE Feb 2017.xls',
skiprows=range(0,12),
skip_footer=1,
)
In [78]:
df = df.drop([0])
In [79]:
df = df.drop(['Unnamed: 1', 'Unnamed: 2', 'Int', 'Tdm', 'Cost', 'Taxes',
'Eq. Cost', 'Total Cost'], axis=1)
In [80]:
df.Total.describe()
Out[80]:
In [81]:
df
Out[81]:
In [82]:
p = Histogram(df[df['Total'] < 500]['Total'], title='Histogram of number of total calls')
In [83]:
show(p)
In [ ]:
In [84]:
df.sort_values('Total', ascending=False).to_csv('num_calls.csv', header=True)
In [85]:
def is_five_digit_num(row):
"""
Is row num = 5 digits
"""
try:
num = int(row.Number)
except ValueError:
return False
if num > 9999 and num <= 99999:
return True
else:
return False
In [86]:
def load_data(filename):
"""
Takes a filename, returns df
"""
df = None
if filename == 'CHE March 2017.xls':
df = pd.read_excel('./data/CHE March 2017.xls',skiprows=range(0,12),skip_footer=12)
df['Number'] = df['Unnamed: 0']
df.drop(['Name'], axis=1, inplace=True)
df.drop(['Unnamed: 1', 'Unnamed: 2', 'Int', 'Tdm', 'Cost', 'Taxes',
'Eq. Cost', 'Total Cost', 'Unnamed: 15', 'Unnamed: 0'], axis=1, inplace=True)
else:
df = pd.read_excel('./data/' + filename,
skiprows=range(0,12),
skip_footer=1)
df.drop([0], inplace=True)
df.drop(['Unnamed: 1', 'Unnamed: 2', 'Int', 'Tdm', 'Cost', 'Taxes',
'Eq. Cost', 'Total Cost'], axis=1, inplace=True)
# Drop nan rows
df.dropna(inplace=True)
# drop rows where calls > 1000
df = df[df['Total'] < 1000]
df['Include'] = df.apply(is_five_digit_num, axis=1)
df = df[df['Include'] == True]
del df['Include']
return df
In [87]:
files = os.listdir('data')
data = [load_data(file) for file in files]
In [88]:
grouped = pd.concat(data).groupby('Number')
In [128]:
annual_totals = grouped.sum()
print(len(annual_totals))
In [129]:
cityfone_df = pd.read_excel('./CityFone Extract 20170518 - Whole City - From Ken - UnitNbr Formatted as Text.xlsx', converters={'WRK_TEL_NBR':str})
print(len(cityfone_df))
In [130]:
cityfone_df['Number'] = cityfone_df.WRK_TEL_NBR.apply(lambda x: str(x)[-7:])
cityfone_df['Number'].replace(' ', np.nan, inplace=True)
cityfone_df['Number'].replace('nan', np.nan, inplace=True)
cityfone_df.dropna(subset=['Number'], inplace=True)
print(cityfone_df.columns)
cityfone_df.rename(columns=lambda c: c.replace(" ", ""), inplace=True)
print(cityfone_df.columns)
In [131]:
def catagorize(row):
"""
Return High, Medium, Low catagory
"""
if row['Total'] >= len(data) * 201:
return 'High'
elif row['Total'] >= len(data) * 41:
return 'Medium'
else:
return 'Low'
In [132]:
annual_totals['catagory'] = annual_totals.apply(catagorize, axis=1)
In [133]:
ax = annual_totals.catagory.value_counts().plot(kind='bar', title='All CHE PBX Lines, April-March Dataset')
ax.set_xlabel('Number of Calls/Month Low <=40, Med 41-200, High >200')
ax.set_ylabel('Number of Phone Lines')
fig = ax.get_figure()
fig.savefig('all_che.png')
In [134]:
p1 = Bar(annual_totals.catagory.value_counts(), title='System Number of Users, by Type',
palette=['red', 'green', 'blue'],
)
In [135]:
def make_seven_digit(number):
"""
makes a seven digit number from the PBX data.
"""
first_digit = str(number)[0]
if first_digit == '3':
retval = '47' + str(number)
elif first_digit == '2':
retval = '92' + str(number)
elif first_digit == '8':
if str(number)[:3] == '895':
retval = '92' + str(number)
else:
retval = '97' + str(number)
return float(retval)
In [136]:
annual_totals.index = annual_totals.index.map(make_seven_digit)
In [137]:
def number_of_cityfone_matches(row):
"""
Find the number of Cityfone matches
"""
number_totals = row.name
cityfone_df['Number'] = pd.to_numeric(cityfone_df.Number)
df = cityfone_df[cityfone_df['Number'] == number_totals]
return len(df)
In [138]:
#annual_totals['number_of_cityfone_matches'] = annual_totals.apply(number_of_cityfone_matches, axis =1)
#annual_totals.apply(number_of_cityfone_matches, axis =1)
In [139]:
#annual_totals.to_csv('pbx_data_with_cityfone_matches.csv')
In [140]:
def cityfone_matcher(row, *args):
"""
Match to CityFone when perfect match
"""
number_totals = row.name
cityfone_df['Number'] = pd.to_numeric(cityfone_df["Number"])
df = cityfone_df[cityfone_df['Number'] == number_totals]
try:
val = df[args[0]].iloc[0]
except IndexError:
return np.NaN
return val
In [141]:
annual_totals['Last_Name'] = annual_totals.apply(cityfone_matcher, axis=1, args=('EMP_LST_NM',))
annual_totals['First_Name'] = annual_totals.apply(cityfone_matcher, axis=1, args=('EMP_FRST_NM',))
annual_totals['Dept'] = annual_totals.apply(cityfone_matcher, axis=1, args=('DEPT_NAME',))
annual_totals['address'] = annual_totals.apply(cityfone_matcher, axis=1, args=('STR_ADDR_TXT',))
annual_totals['LOC_UNIT_NBR'] = annual_totals.apply(cityfone_matcher, axis=1, args=('LOC_UNIT_NBR',))
annual_totals['LOC_UNIT_LBL_TXT'] = annual_totals.apply(cityfone_matcher, axis=1, args=('LOC_UNIT_LBL_TXT',))
annual_totals['CELLR_TEL_NBR'] = annual_totals.apply(cityfone_matcher, axis=1, args=('CELLR_TEL_NBR',))
annual_totals['ALTN_WRK_TEL_NBR'] = annual_totals.apply(cityfone_matcher, axis=1, args=('ALTN_WRK_TEL_NBR',))
annual_totals['cellr_co_cd'] = annual_totals.apply(cityfone_matcher, axis=1, args=('cellr_co_cd',))
annual_totals['cellr_txt_cpbl_ind'] = annual_totals.apply(cityfone_matcher, axis=1, args=('cellr_txt_cpbl_ind',))
In [142]:
ita = annual_totals[annual_totals['Dept'] == 'Information Technology Agency']
print(len(ita))
In [143]:
ax = ita.catagory.value_counts().plot(kind='bar', title='ITA CHE PBX Lines, April-March Dataset')
ax.set_xlabel('Number of Calls/Month Low <=40, Med 41-200, High >200')
ax.set_ylabel('Number of Phone Lines')
fig = ax.get_figure()
fig.savefig('all_ita.png')
In [144]:
annual_totals.to_csv('annual_totals_with_location.csv')
In [145]:
ita.catagory.value_counts()
Out[145]:
In [146]:
annual_avg = grouped.mean()
In [147]:
annual_avg.index = annual_avg.index.map(make_seven_digit)
In [148]:
annual_avg.rename(columns={'In': 'In_avg', 'Out': 'Out_avg', 'Total':'Total_avg'}, inplace=True)
In [149]:
joined_again = annual_avg.join(annual_totals)
In [150]:
joined_again.columns
Out[150]:
In [151]:
joined_again['phone_number'] = joined_again.index.values
Ordering
* Department
* Phone #
* First Name
* Last Name
* Address
* Loc Unit Lbl Txt
* Loc Unit Nbr
* Category
* Avg Monthly Calls In
* Avg Monthly Calls Out
* Avg Monthly Calls Total
In [152]:
## Remove 0s from datset
joined_again = joined_again.replace(to_replace=0,value='')
joined_again = joined_again.replace(to_replace='0',value='')
joined_again.ix[9227734]
Out[152]:
In [156]:
joined_again.to_csv("ordered_avg_draft.csv", sep=',', encoding='utf-8', header=True, index=False,
columns=["Dept","phone_number","First_Name","Last_Name","address", "LOC_UNIT_LBL_TXT", "LOC_UNIT_NBR",
"catagory","In_avg","Out_avg","Total_avg", "CELLR_TEL_NBR", "cellr_co_cd",
"cellr_txt_cpbl_ind", "ALTN_WRK_TEL_NBR"])
In [154]:
joined_again.loc[9227732]
Out[154]:
In [157]:
len(ita)
Out[157]:
In [ ]: