In [1]:
# Wifi data amalgamation
In [2]:
import pandas as pd
import glob
import os
In [14]:
# The data flow consists of emails. Each email carries one zipped folder containing multiple CSV files; each CSV file has a single sheet.
# Filenames share the same prefix, with different numbers appended at the end.
data = pd.read_csv("Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
skiprows = 7, encoding = 'macroman',
date_col = ['Event Time'])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-14-3c44da352641> in <module>()
4 data = pd.read_csv("Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
5 skiprows = 7, encoding = 'macroman',
----> 6 date_col = ['Event Time'])
TypeError: parser_f() got an unexpected keyword argument 'date_col'
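read_csv has no date_col argument; the keyword that requests date parsing is parse_dates. A minimal corrected call would look like the sketch below (same file and skiprows as above) — though, as the next cell shows, the column contents still get in the way.
# Sketch: pandas parses dates at read time via parse_dates, not date_col.
data = pd.read_csv(
    "Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
    skiprows=7, encoding='macroman',
    parse_dates=['Event Time'])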
In [11]:
import dateutil
pd.read_csv(
"Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
skiprows = 7, encoding = 'macroman',
parse_dates=['Event Time'],
date_parser=dateutil.parser.parse,
)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
2901 result = tools.to_datetime(
-> 2902 date_parser(*date_cols), errors='ignore')
2903 if isinstance(result, datetime.datetime):
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(timestr, parserinfo, **kwargs)
1167 else:
-> 1168 return DEFAULTPARSER.parse(timestr, **kwargs)
1169
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
555
--> 556 res, skipped_tokens = self._parse(timestr, **kwargs)
557
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in _parse(self, timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens)
674 res = self._result()
--> 675 l = _timelex.split(timestr) # Splits the timestr into tokens
676
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in split(cls, s)
191 def split(cls, s):
--> 192 return list(cls(s))
193
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in __init__(self, instream)
60 raise TypeError('Parser must be a string or character stream, not '
---> 61 '{itype}'.format(itype=instream.__class__.__name__))
62
TypeError: Parser must be a string or character stream, not ndarray
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
2910 parser=date_parser,
-> 2911 dayfirst=dayfirst),
2912 errors='ignore')
pandas\_libs\src\inference.pyx in pandas._libs.lib.try_parse_dates (pandas\_libs\lib.c:61369)()
pandas\_libs\src\inference.pyx in pandas._libs.lib.try_parse_dates (pandas\_libs\lib.c:61267)()
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(timestr, parserinfo, **kwargs)
1167 else:
-> 1168 return DEFAULTPARSER.parse(timestr, **kwargs)
1169
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
558 if res is None:
--> 559 raise ValueError("Unknown string format")
560
ValueError: Unknown string format
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-11-927f1f8ea957> in <module>()
4 skiprows = 7, encoding = 'macroman',
5 parse_dates=['Event Time'],
----> 6 date_parser=dateutil.parser.parse,
7 )
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
409
410 try:
--> 411 data = parser.read(nrows)
412 finally:
413 parser.close()
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
980 raise ValueError('skipfooter not supported for iteration')
981
--> 982 ret = self._engine.read(nrows)
983
984 if self.options.get('as_recarray'):
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1794 data = dict((k, v) for k, (i, v) in zip(names, data))
1795
-> 1796 names, data = self._do_date_conversions(names, data)
1797 index, names = self._make_index(data, alldata, names)
1798
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in _do_date_conversions(self, names, data)
1554 data, names = _process_date_conversion(
1555 data, self._date_conv, self.parse_dates, self.index_col,
-> 1556 self.index_names, names, keep_date_col=self.keep_date_col)
1557
1558 return names, data
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in _process_date_conversion(data_dict, converter, parse_spec, index_col, index_names, columns, keep_date_col)
2945 if _isindex(colspec):
2946 continue
-> 2947 data_dict[colspec] = converter(data_dict[colspec])
2948 else:
2949 new_name, col, old_names = _try_convert_dates(
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
2912 errors='ignore')
2913 except Exception:
-> 2914 return generic_parser(date_parser, *date_cols)
2915
2916 return converter
C:\Anaconda2\envs\python3\lib\site-packages\pandas\io\date_converters.py in generic_parser(parse_func, *cols)
36 for i in range(N):
37 args = [c[i] for c in cols]
---> 38 results[i] = parse_func(*args)
39
40 return results
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(timestr, parserinfo, **kwargs)
1166 return parser(parserinfo).parse(timestr, **kwargs)
1167 else:
-> 1168 return DEFAULTPARSER.parse(timestr, **kwargs)
1169
1170
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
554 effective_dt = default
555
--> 556 res, skipped_tokens = self._parse(timestr, **kwargs)
557
558 if res is None:
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in _parse(self, timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens)
673
674 res = self._result()
--> 675 l = _timelex.split(timestr) # Splits the timestr into tokens
676
677 # keep up with the last token skipped so we can recombine
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in split(cls, s)
190 @classmethod
191 def split(cls, s):
--> 192 return list(cls(s))
193
194 @classmethod
C:\Anaconda2\envs\python3\lib\site-packages\dateutil\parser.py in __init__(self, instream)
59 if getattr(instream, 'read', None) is None:
60 raise TypeError('Parser must be a string or character stream, not '
---> 61 '{itype}'.format(itype=instream.__class__.__name__))
62
63 self.instream = instream
TypeError: Parser must be a string or character stream, not float
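The final TypeError (a float instead of a string) suggests the Event Time column contains NaN cells and other non-date entries, so a blanket date_parser cannot work. A quick way to confirm this, assuming the file is first read without any date parsing (which is evidently what the later cells rely on; the variable name and the year-suffix check below are just illustrative):
# Sketch: read without date parsing and look for Event Time cells that are
# missing or do not look like the usual "... UTC 2017" timestamp strings.
raw = pd.read_csv(
    "Occupancy_Monitor_Client_Count_-_Most_Buildings_20171024_000515_566.csv",
    skiprows=7, encoding='macroman')
print(raw['Event Time'].isna().sum())   # NaN cells -> the "float" in the traceback
print(raw.loc[~raw['Event Time'].astype(str).str.endswith('2017'), 'Event Time'].unique())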
In [29]:
# Have to convert row by row when the column does not import cleanly as datetime
datetimes = []
for i in data.index:
    try:
        datetimes.append(pd.to_datetime(data.loc[i, 'Event Time']))
    except Exception:
        print(i)   # report the rows that fail to parse
#data['datetimes']=datetimes
279
In [30]:
#something fishy going on here
data.loc[275:282]
Out[30]:
     Total Count                              Event Time                     Associated Client Count   Authenticated Client Count
275  total                                    Mon Oct 23 23:50:45 UTC 2017   8717                       8590
276  total                                    Mon Oct 23 23:55:56 UTC 2017   8706                       8563
277  total                                    Tue Oct 24 00:00:52 UTC 2017   8637                       8500
278  Client Count                             nan                            NaN                        NaN
279  Key                                      Event Time                     Associated Client Count    Authenticated Client Count
280  Lancaster University > Pendle > A Floor  Mon Oct 23 00:08:35 UTC 2017   73                         71
281  Lancaster University > Pendle > A Floor  Mon Oct 23 00:13:55 UTC 2017   72                         72
282  Lancaster University > Pendle > A Floor  Mon Oct 23 00:19:02 UTC 2017   75                         72
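So rows 278–279 are not data at all: the export concatenates several report sections, and each section brings its own blank separator and header row ('Client Count', 'Key', 'Event Time', ...). One way to strip them would be to drop every row whose Event Time is missing or equals the literal header text; the notebook instead builds an explicit mask in the next cell. A sketch of the filter (is_header and clean are illustrative names):
# Sketch: drop repeated section headers and blank separator rows.
is_header = data['Event Time'].isna() | (data['Event Time'] == 'Event Time')
clean = data[~is_header]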
In [34]:
# We just ignore the bad rows, but need to keep the column the same length as the frame;
# second, we also build a mask of the good row indices.
datetimes = []
datetimes_mask = []
for i in data.index:
    try:
        datetimes.append(pd.to_datetime(data.loc[i, 'Event Time']))
        datetimes_mask.append(i)
    except Exception:
        datetimes.append(i)   # placeholder keeps the list aligned with the index
data['datetimes'] = datetimes
datam = data.loc[datetimes_mask]
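The same result can be had without a Python loop: pd.to_datetime with errors='coerce' turns unparseable entries into NaT, which then serves directly as the mask. A sketch (parsed is an illustrative name):
# Sketch of a vectorized alternative: coerce bad entries to NaT and mask on them.
parsed = pd.to_datetime(data['Event Time'], errors='coerce')
data['datetimes'] = parsed
datam = data[parsed.notna()]
With this route the datetimes column is already datetime64[ns], so the pd.DatetimeIndex reassignment a few cells below would not be needed.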
In [36]:
# Still inherits the object dtype, so reassign
datam.dtypes
Out[36]:
Total Count object
Event Time object
Associated Client Count object
Authenticated Client Count object
datetimes object
dtype: object
In [39]:
datam['datetimes']=pd.DatetimeIndex(datam['datetimes'])
In [40]:
#now AOK
datam.dtypes
Out[40]:
Total Count object
Event Time object
Associated Client Count object
Authenticated Client Count object
datetimes datetime64[ns]
dtype: object
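The datetimes column is now datetime64, but the count columns are still object dtype, since they came through the same mixed file. If they are to be aggregated later they presumably need a numeric conversion too; a sketch, assuming any non-numeric leftovers should become NaN:
# Sketch: convert the count columns to numbers; anything non-numeric becomes NaN.
for col in ['Associated Client Count', 'Authenticated Client Count']:
    datam[col] = pd.to_numeric(datam[col], errors='coerce')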
In [6]:
#no need for this now
#data['Event Time'] = data['Event Time'].str.replace(':',' ')
#data2 = pd.DataFrame([ x.split() for x in data['Event Time'].tolist()])
#data2.head(5)
Out[6]:
     0    1    2   3   4   5   6    7
0    Mon  Oct  23  00  08  35  UTC  2017
1    Mon  Oct  23  00  13  55  UTC  2017
2    Mon  Oct  23  00  19  02  UTC  2017
3    Mon  Oct  23  00  24  10  UTC  2017
4    Mon  Oct  23  00  29  18  UTC  2017
In [41]:
datam.columns
Out[41]:
Index(['Total Count', 'Event Time', 'Associated Client Count',
'Authenticated Client Count', 'datetimes'],
dtype='object')
In [47]:
# Preparation for the multi-level (hierarchical) index
data2 = datam[['Total Count', 'Associated Client Count',
               'Authenticated Client Count', 'datetimes']].copy()
dates = pd.DatetimeIndex(datam['datetimes'])
data2['Year'] = dates.year
data2['Month'] = dates.month
data2['Day'] = dates.day
data2['Hour'] = dates.hour
data2['Minute'] = dates.minute
data2['Second'] = dates.second
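Since datam['datetimes'] is already datetime64, the same component columns could also come straight from the .dt accessor instead of building a separate DatetimeIndex. A sketch of the equivalent:
# Sketch: equivalent extraction via the .dt accessor.
data2['Year'] = datam['datetimes'].dt.year
data2['Month'] = datam['datetimes'].dt.month
# ...and likewise .dt.day, .dt.hour, .dt.minute, .dt.second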
In [49]:
data2 = data2.set_index(['Year','Month','Day','Hour','Minute','Second'])
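With the six-level index in place, slices of the day can be pulled out by partial index selection. A sketch, assuming the index is first sorted (the totals and the per-building rows share the same hour values, so both come back):
# Sketch: partial selection on the six-level index.
data2 = data2.sort_index()      # sort so partial lookups are reliable and fast
data2.loc[(2017, 10, 23, 0)]    # every reading in hour 0 of 2017-10-23
data2.xs(21, level='Hour')      # every reading taken during hour 21, via a cross-section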
In [50]:
data2
Out[50]:
Year    Month  Day    Hour   Minute  Second  Total Count                               Associated Client Count  Authenticated Client Count  datetimes
2017.0  10.0   23.0   0.0    8.0     35.0    total                                     8603                     8487                        2017-10-23 00:08:35
                             13.0    55.0    total                                     8515                     8402                        2017-10-23 00:13:55
                             19.0    2.0     total                                     8475                     8360                        2017-10-23 00:19:02
                             24.0    10.0    total                                     8424                     8307                        2017-10-23 00:24:10
                             29.0    18.0    total                                     8352                     8239                        2017-10-23 00:29:18
                             34.0    37.0    total                                     8283                     8171                        2017-10-23 00:34:37
                             39.0    30.0    total                                     8171                     8071                        2017-10-23 00:39:30
                             44.0    42.0    total                                     8140                     8031                        2017-10-23 00:44:42
                             49.0    49.0    total                                     8073                     7959                        2017-10-23 00:49:49
                             54.0    53.0    total                                     8003                     7907                        2017-10-23 00:54:53
                      1.0    0.0     12.0    total                                     7982                     7880                        2017-10-23 01:00:12
                             5.0     13.0    total                                     7979                     7864                        2017-10-23 01:05:13
                             10.0    22.0    total                                     7874                     7770                        2017-10-23 01:10:22
                             15.0    39.0    total                                     7868                     7751                        2017-10-23 01:15:39
                             20.0    37.0    total                                     7791                     7667                        2017-10-23 01:20:37
                             25.0    46.0    total                                     7738                     7622                        2017-10-23 01:25:46
                             31.0    7.0     total                                     7702                     7589                        2017-10-23 01:31:07
                             35.0    52.0    total                                     7672                     7559                        2017-10-23 01:35:52
                             41.0    1.0     total                                     7645                     7537                        2017-10-23 01:41:01
                             46.0    12.0    total                                     7590                     7482                        2017-10-23 01:46:12
                             51.0    6.0     total                                     7571                     7459                        2017-10-23 01:51:06
                             56.0    12.0    total                                     7543                     7423                        2017-10-23 01:56:12
                      2.0    1.0     18.0    total                                     7501                     7403                        2017-10-23 02:01:18
                             6.0     16.0    total                                     7488                     7366                        2017-10-23 02:06:16
                             11.0    22.0    total                                     7455                     7341                        2017-10-23 02:11:22
                             16.0    23.0    total                                     7398                     7295                        2017-10-23 02:16:23
                             21.0    28.0    total                                     7395                     7280                        2017-10-23 02:21:28
                             26.0    25.0    total                                     7341                     7219                        2017-10-23 02:26:25
                             31.0    30.0    total                                     7259                     7149                        2017-10-23 02:31:30
                             36.0    31.0    total                                     7255                     7113                        2017-10-23 02:36:31
...                                          ...                                       ...                      ...                         ...
                      20.0   29.0    8.0     Lancaster University > SW hse 24-26 > A   38                       37                          2017-10-23 20:29:08
                             34.0    1.0     Lancaster University > SW hse 24-26 > A   42                       39                          2017-10-23 20:34:01
                             39.0    50.0    Lancaster University > SW hse 24-26 > A   40                       37                          2017-10-23 20:39:50
                             44.0    41.0    Lancaster University > SW hse 24-26 > A   40                       40                          2017-10-23 20:44:41
                             49.0    53.0    Lancaster University > SW hse 24-26 > A   37                       37                          2017-10-23 20:49:53
                             55.0    3.0     Lancaster University > SW hse 24-26 > A   39                       39                          2017-10-23 20:55:03
                      21.0   0.0     12.0    Lancaster University > SW hse 24-26 > A   37                       36                          2017-10-23 21:00:12
                             5.0     11.0    Lancaster University > SW hse 24-26 > A   36                       35                          2017-10-23 21:05:11
                             10.0    50.0    Lancaster University > SW hse 24-26 > A   35                       33                          2017-10-23 21:10:50
                             15.0    46.0    Lancaster University > SW hse 24-26 > A   37                       34                          2017-10-23 21:15:46
                             21.0    4.0     Lancaster University > SW hse 24-26 > A   33                       32                          2017-10-23 21:21:04
                             26.0    14.0    Lancaster University > SW hse 24-26 > A   34                       33                          2017-10-23 21:26:14
                             31.0    28.0    Lancaster University > SW hse 24-26 > A   38                       37                          2017-10-23 21:31:28
                             36.0    40.0    Lancaster University > SW hse 24-26 > A   39                       39                          2017-10-23 21:36:40
                             41.0    39.0    Lancaster University > SW hse 24-26 > A   40                       38                          2017-10-23 21:41:39
                             46.0    42.0    Lancaster University > SW hse 24-26 > A   38                       38                          2017-10-23 21:46:42
                             51.0    59.0    Lancaster University > SW hse 24-26 > A   42                       39                          2017-10-23 21:51:59
                             57.0    16.0    Lancaster University > SW hse 24-26 > A   39                       39                          2017-10-23 21:57:16
                      22.0   2.0     44.0    Lancaster University > SW hse 24-26 > A   38                       37                          2017-10-23 22:02:44
                             7.0     31.0    Lancaster University > SW hse 24-26 > A   42                       41                          2017-10-23 22:07:31
                             13.0    15.0    Lancaster University > SW hse 24-26 > A   43                       43                          2017-10-23 22:13:15
                             18.0    31.0    Lancaster University > SW hse 24-26 > A   44                       42                          2017-10-23 22:18:31
                             23.0    9.0     Lancaster University > SW hse 24-26 > A   45                       45                          2017-10-23 22:23:09
                             28.0    48.0    Lancaster University > SW hse 24-26 > A   44                       43                          2017-10-23 22:28:48
                             33.0    31.0    Lancaster University > SW hse 24-26 > A   44                       43                          2017-10-23 22:33:31
                             38.0    39.0    Lancaster University > SW hse 24-26 > A   43                       42                          2017-10-23 22:38:39
                             43.0    55.0    Lancaster University > SW hse 24-26 > A   44                       44                          2017-10-23 22:43:55
                             48.0    48.0    Lancaster University > SW hse 24-26 > A   48                       48                          2017-10-23 22:48:48
                             54.0    2.0     Lancaster University > SW hse 24-26 > A   42                       42                          2017-10-23 22:54:02
                             59.0    35.0    Lancaster University > SW hse 24-26 > A   43                       42                          2017-10-23 22:59:35
15001 rows × 4 columns
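The glob and os imports at the top are presumably there for the amalgamation step itself: reading every exported CSV that shares the filename prefix and concatenating them. A sketch of that step, under the assumptions that all files sit in the working directory, share the 7-line preamble, and that a source_file column (hypothetical) is worth keeping for traceability; frames and alldata are illustrative names:
# Sketch: amalgamate all exported CSVs that share the filename prefix.
frames = []
for path in sorted(glob.glob("Occupancy_Monitor_Client_Count_-_Most_Buildings_*.csv")):
    df = pd.read_csv(path, skiprows=7, encoding='macroman')
    df['datetimes'] = pd.to_datetime(df['Event Time'], errors='coerce')
    df = df[df['datetimes'].notna()]              # drop section headers / separator rows
    df['source_file'] = os.path.basename(path)    # hypothetical column, for traceability
    frames.append(df)
alldata = pd.concat(frames, ignore_index=True)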