In [1]:
# Wifi data amalgamation
In [2]:
import pandas as pd
import glob
import os
In [15]:
# Data flow consists of emails. Each email has 1 zipped folder, containing multiple csv files. Each csv files has one sheet.
# Filenames have same title beginning, with different numbers on end.
data = pd.read_csv("Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv", skiprows = 7, encoding = 'macroman', parse_dates = ['Event Time'])
In [16]:
data.head(5)
Out[16]:
In [29]:
#have to do it one-by-one if the column does not import correctly as datetime
datetimes=[]
for i in data.index:
try:
datetimes.append(pd.to_datetime(data.loc[i]['Event Time']))
except:
print(i)
#data['datetimes']=datetimes
In [30]:
#something fishy going on here
data.loc[275:282]
Out[30]:
In [34]:
#we just ignore, but need to maintain column length
#second, we also create a mask of good rows
datetimes=[]
datetimes_mask=[]
for i in data.index:
try:
datetimes.append(pd.to_datetime(data.loc[i]['Event Time']))
datetimes_mask.append(i)
except:
datetimes.append(i)
data['datetimes']=datetimes
datam=data.loc[datetimes_mask]
In [36]:
#still inherist object type, so reassing
datam.dtypes
Out[36]:
In [39]:
datam['datetimes']=pd.DatetimeIndex(datam['datetimes'])
In [40]:
#now AOK
datam.dtypes
Out[40]:
In [6]:
#no need for this now
#data['Event Time'] = data['Event Time'].str.replace(':',' ')
#data2 = pd.DataFrame([ x.split() for x in data['Event Time'].tolist()])
#data2.head(5)
Out[6]:
In [41]:
datam.columns
Out[41]:
In [47]:
# Preparation for multi hierarchy
data2=datam[['Total Count', 'Associated Client Count',
'Authenticated Client Count', 'datetimes']]
dates=pd.DatetimeIndex(datam['datetimes'])
data2['Year']=dates.year
data2['Month']=dates.month
data2['Day']=dates.day
data2['Hour']=dates.hour
data2['Minute']=dates.minute
data2['Second']=dates.second
In [49]:
data2 = data2.set_index(['Year','Month','Day','Hour','Minute','Second'])
In [50]:
data2
Out[50]: