In [1]:
# Wifi data amalgamation

In [2]:
import pandas as pd
import glob
import os

In [14]:
# Data flow consists of emails. Each email has 1 zipped folder, containing multiple csv files. Each csv files has one sheet.
# Filenames have same title beginning, with different numbers on end. 

data = pd.read_csv("Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
                   skiprows = 7, encoding = 'macroman', 
                   date_col = ['Event Time'])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-14-3c44da352641> in <module>()
      4 data = pd.read_csv("Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
      5                    skiprows = 7, encoding = 'macroman',
----> 6                    date_col = ['Event Time'])

TypeError: parser_f() got an unexpected keyword argument 'date_col'

In [11]:
import dateutil
pd.read_csv(
    "Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
    skiprows = 7, encoding = 'macroman', 
    parse_dates=['Event Time'],
    date_parser=dateutil.parser.parse,
)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
   2901                 result = tools.to_datetime(
-> 2902                     date_parser(*date_cols), errors='ignore')
   2903                 if isinstance(result, datetime.datetime):

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(timestr, parserinfo, **kwargs)
   1167     else:
-> 1168         return DEFAULTPARSER.parse(timestr, **kwargs)
   1169 

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    555 
--> 556         res, skipped_tokens = self._parse(timestr, **kwargs)
    557 

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in _parse(self, timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens)
    674         res = self._result()
--> 675         l = _timelex.split(timestr)         # Splits the timestr into tokens
    676 

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in split(cls, s)
    191     def split(cls, s):
--> 192         return list(cls(s))
    193 

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in __init__(self, instream)
     60             raise TypeError('Parser must be a string or character stream, not '
---> 61                             '{itype}'.format(itype=instream.__class__.__name__))
     62 

TypeError: Parser must be a string or character stream, not ndarray

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
   2910                                             parser=date_parser,
-> 2911                                             dayfirst=dayfirst),
   2912                         errors='ignore')

pandas\_libs\src\inference.pyx in pandas._libs.lib.try_parse_dates (pandas\_libs\lib.c:61369)()

pandas\_libs\src\inference.pyx in pandas._libs.lib.try_parse_dates (pandas\_libs\lib.c:61267)()

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(timestr, parserinfo, **kwargs)
   1167     else:
-> 1168         return DEFAULTPARSER.parse(timestr, **kwargs)
   1169 

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    558         if res is None:
--> 559             raise ValueError("Unknown string format")
    560 

ValueError: Unknown string format

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-11-927f1f8ea957> in <module>()
      4     skiprows = 7, encoding = 'macroman',
      5     parse_dates=['Event Time'],
----> 6     date_parser=dateutil.parser.parse,
      7 )

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    409 
    410     try:
--> 411         data = parser.read(nrows)
    412     finally:
    413         parser.close()

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
    980                 raise ValueError('skipfooter not supported for iteration')
    981 
--> 982         ret = self._engine.read(nrows)
    983 
    984         if self.options.get('as_recarray'):

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
   1794             data = dict((k, v) for k, (i, v) in zip(names, data))
   1795 
-> 1796             names, data = self._do_date_conversions(names, data)
   1797             index, names = self._make_index(data, alldata, names)
   1798 

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in _do_date_conversions(self, names, data)
   1554             data, names = _process_date_conversion(
   1555                 data, self._date_conv, self.parse_dates, self.index_col,
-> 1556                 self.index_names, names, keep_date_col=self.keep_date_col)
   1557 
   1558         return names, data

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in _process_date_conversion(data_dict, converter, parse_spec, index_col, index_names, columns, keep_date_col)
   2945                 if _isindex(colspec):
   2946                     continue
-> 2947                 data_dict[colspec] = converter(data_dict[colspec])
   2948             else:
   2949                 new_name, col, old_names = _try_convert_dates(

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
   2912                         errors='ignore')
   2913                 except Exception:
-> 2914                     return generic_parser(date_parser, *date_cols)
   2915 
   2916     return converter

C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\date_converters.py in generic_parser(parse_func, *cols)
     36     for i in range(N):
     37         args = [c[i] for c in cols]
---> 38         results[i] = parse_func(*args)
     39 
     40     return results

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(timestr, parserinfo, **kwargs)
   1166         return parser(parserinfo).parse(timestr, **kwargs)
   1167     else:
-> 1168         return DEFAULTPARSER.parse(timestr, **kwargs)
   1169 
   1170 

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    554             effective_dt = default
    555 
--> 556         res, skipped_tokens = self._parse(timestr, **kwargs)
    557 
    558         if res is None:

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in _parse(self, timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens)
    673 
    674         res = self._result()
--> 675         l = _timelex.split(timestr)         # Splits the timestr into tokens
    676 
    677         # keep up with the last token skipped so we can recombine

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in split(cls, s)
    190     @classmethod
    191     def split(cls, s):
--> 192         return list(cls(s))
    193 
    194     @classmethod

C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in __init__(self, instream)
     59         if getattr(instream, 'read', None) is None:
     60             raise TypeError('Parser must be a string or character stream, not '
---> 61                             '{itype}'.format(itype=instream.__class__.__name__))
     62 
     63         self.instream = instream

TypeError: Parser must be a string or character stream, not float

In [29]:
#have to do it one-by-one if the column does not import correctly as datetime
datetimes=[]
for i in data.index:
    try:
        datetimes.append(pd.to_datetime(data.loc[i]['Event Time']))
    except:
        print(i)
#data['datetimes']=datetimes


279

In [30]:
#something fishy going on here
data.loc[275:282]


Out[30]:
Total Count Event Time Associated Client Count Authenticated Client Count
275 total Mon Oct 23 23:50:45 UTC 2017 8717 8590
276 total Mon Oct 23 23:55:56 UTC 2017 8706 8563
277 total Tue Oct 24 00:00:52 UTC 2017 8637 8500
278 Client Count nan NaN NaN
279 Key Event Time Associated Client Count Authenticated Client Count
280 Lancaster University > Pendle > A Floor Mon Oct 23 00:08:35 UTC 2017 73 71
281 Lancaster University > Pendle > A Floor Mon Oct 23 00:13:55 UTC 2017 72 72
282 Lancaster University > Pendle > A Floor Mon Oct 23 00:19:02 UTC 2017 75 72

In [34]:
#we just ignore, but need to maintain column length
#second, we also create a mask of good rows
datetimes=[]
datetimes_mask=[]
for i in data.index:
    try:
        datetimes.append(pd.to_datetime(data.loc[i]['Event Time']))
        datetimes_mask.append(i)
    except:
        datetimes.append(i)
data['datetimes']=datetimes
datam=data.loc[datetimes_mask]

In [36]:
#still inherist object type, so reassing
datam.dtypes


Out[36]:
Total Count                   object
Event Time                    object
Associated Client Count       object
Authenticated Client Count    object
datetimes                     object
dtype: object

In [39]:
datam['datetimes']=pd.DatetimeIndex(datam['datetimes'])

In [40]:
#now AOK
datam.dtypes


Out[40]:
Total Count                           object
Event Time                            object
Associated Client Count               object
Authenticated Client Count            object
datetimes                     datetime64[ns]
dtype: object

In [6]:
#no need for this now
#data['Event Time'] = data['Event Time'].str.replace(':',' ')
#data2 = pd.DataFrame([ x.split() for x in data['Event Time'].tolist()])
#data2.head(5)


Out[6]:
0 1 2 3 4 5 6 7
0 Mon Oct 23 00 08 35 UTC 2017
1 Mon Oct 23 00 13 55 UTC 2017
2 Mon Oct 23 00 19 02 UTC 2017
3 Mon Oct 23 00 24 10 UTC 2017
4 Mon Oct 23 00 29 18 UTC 2017

In [41]:
datam.columns


Out[41]:
Index(['Total Count', 'Event Time', 'Associated Client Count',
       'Authenticated Client Count', 'datetimes'],
      dtype='object')

In [47]:
# Preparation for multi hierarchy
data2=datam[['Total Count', 'Associated Client Count',
       'Authenticated Client Count', 'datetimes']]
dates=pd.DatetimeIndex(datam['datetimes'])
data2['Year']=dates.year
data2['Month']=dates.month
data2['Day']=dates.day
data2['Hour']=dates.hour
data2['Minute']=dates.minute
data2['Second']=dates.second

In [49]:
data2 = data2.set_index(['Year','Month','Day','Hour','Minute','Second'])

In [50]:
data2


Out[50]:
Total Count Associated Client Count Authenticated Client Count datetimes
Year Month Day Hour Minute Second
2017.0 10.0 23.0 0.0 8.0 35.0 total 8603 8487 2017-10-23 00:08:35
13.0 55.0 total 8515 8402 2017-10-23 00:13:55
19.0 2.0 total 8475 8360 2017-10-23 00:19:02
24.0 10.0 total 8424 8307 2017-10-23 00:24:10
29.0 18.0 total 8352 8239 2017-10-23 00:29:18
34.0 37.0 total 8283 8171 2017-10-23 00:34:37
39.0 30.0 total 8171 8071 2017-10-23 00:39:30
44.0 42.0 total 8140 8031 2017-10-23 00:44:42
49.0 49.0 total 8073 7959 2017-10-23 00:49:49
54.0 53.0 total 8003 7907 2017-10-23 00:54:53
1.0 0.0 12.0 total 7982 7880 2017-10-23 01:00:12
5.0 13.0 total 7979 7864 2017-10-23 01:05:13
10.0 22.0 total 7874 7770 2017-10-23 01:10:22
15.0 39.0 total 7868 7751 2017-10-23 01:15:39
20.0 37.0 total 7791 7667 2017-10-23 01:20:37
25.0 46.0 total 7738 7622 2017-10-23 01:25:46
31.0 7.0 total 7702 7589 2017-10-23 01:31:07
35.0 52.0 total 7672 7559 2017-10-23 01:35:52
41.0 1.0 total 7645 7537 2017-10-23 01:41:01
46.0 12.0 total 7590 7482 2017-10-23 01:46:12
51.0 6.0 total 7571 7459 2017-10-23 01:51:06
56.0 12.0 total 7543 7423 2017-10-23 01:56:12
2.0 1.0 18.0 total 7501 7403 2017-10-23 02:01:18
6.0 16.0 total 7488 7366 2017-10-23 02:06:16
11.0 22.0 total 7455 7341 2017-10-23 02:11:22
16.0 23.0 total 7398 7295 2017-10-23 02:16:23
21.0 28.0 total 7395 7280 2017-10-23 02:21:28
26.0 25.0 total 7341 7219 2017-10-23 02:26:25
31.0 30.0 total 7259 7149 2017-10-23 02:31:30
36.0 31.0 total 7255 7113 2017-10-23 02:36:31
... ... ... ... ... ... ...
20.0 29.0 8.0 Lancaster University > SW hse 24-26 > A 38 37 2017-10-23 20:29:08
34.0 1.0 Lancaster University > SW hse 24-26 > A 42 39 2017-10-23 20:34:01
39.0 50.0 Lancaster University > SW hse 24-26 > A 40 37 2017-10-23 20:39:50
44.0 41.0 Lancaster University > SW hse 24-26 > A 40 40 2017-10-23 20:44:41
49.0 53.0 Lancaster University > SW hse 24-26 > A 37 37 2017-10-23 20:49:53
55.0 3.0 Lancaster University > SW hse 24-26 > A 39 39 2017-10-23 20:55:03
21.0 0.0 12.0 Lancaster University > SW hse 24-26 > A 37 36 2017-10-23 21:00:12
5.0 11.0 Lancaster University > SW hse 24-26 > A 36 35 2017-10-23 21:05:11
10.0 50.0 Lancaster University > SW hse 24-26 > A 35 33 2017-10-23 21:10:50
15.0 46.0 Lancaster University > SW hse 24-26 > A 37 34 2017-10-23 21:15:46
21.0 4.0 Lancaster University > SW hse 24-26 > A 33 32 2017-10-23 21:21:04
26.0 14.0 Lancaster University > SW hse 24-26 > A 34 33 2017-10-23 21:26:14
31.0 28.0 Lancaster University > SW hse 24-26 > A 38 37 2017-10-23 21:31:28
36.0 40.0 Lancaster University > SW hse 24-26 > A 39 39 2017-10-23 21:36:40
41.0 39.0 Lancaster University > SW hse 24-26 > A 40 38 2017-10-23 21:41:39
46.0 42.0 Lancaster University > SW hse 24-26 > A 38 38 2017-10-23 21:46:42
51.0 59.0 Lancaster University > SW hse 24-26 > A 42 39 2017-10-23 21:51:59
57.0 16.0 Lancaster University > SW hse 24-26 > A 39 39 2017-10-23 21:57:16
22.0 2.0 44.0 Lancaster University > SW hse 24-26 > A 38 37 2017-10-23 22:02:44
7.0 31.0 Lancaster University > SW hse 24-26 > A 42 41 2017-10-23 22:07:31
13.0 15.0 Lancaster University > SW hse 24-26 > A 43 43 2017-10-23 22:13:15
18.0 31.0 Lancaster University > SW hse 24-26 > A 44 42 2017-10-23 22:18:31
23.0 9.0 Lancaster University > SW hse 24-26 > A 45 45 2017-10-23 22:23:09
28.0 48.0 Lancaster University > SW hse 24-26 > A 44 43 2017-10-23 22:28:48
33.0 31.0 Lancaster University > SW hse 24-26 > A 44 43 2017-10-23 22:33:31
38.0 39.0 Lancaster University > SW hse 24-26 > A 43 42 2017-10-23 22:38:39
43.0 55.0 Lancaster University > SW hse 24-26 > A 44 44 2017-10-23 22:43:55
48.0 48.0 Lancaster University > SW hse 24-26 > A 48 48 2017-10-23 22:48:48
54.0 2.0 Lancaster University > SW hse 24-26 > A 42 42 2017-10-23 22:54:02
59.0 35.0 Lancaster University > SW hse 24-26 > A 43 42 2017-10-23 22:59:35

15001 rows × 4 columns