In [36]:
%matplotlib inline
In [3]:
import json
path = '../data/book/usagov_bitly_data2012-03-16-1331923249.txt'
# Each line of the file is parsed as its own JSON document. Open the file in a
# context manager so the handle is closed as soon as parsing finishes, and
# decode it as UTF-8 explicitly instead of relying on the platform default.
with open(path, encoding='utf-8') as data_file:
    records = [json.loads(line) for line in data_file]
In [4]:
records[0]
Out[4]:
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'al': 'en-US,en;q=0.8',
'c': 'US',
'cy': 'Danvers',
'g': 'A6qOVH',
'gr': 'MA',
'h': 'wfLQtf',
'hc': 1331822918,
'hh': '1.usa.gov',
'l': 'orofrog',
'll': [42.576698, -70.954903],
'nk': 1,
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
't': 1331923247,
'tz': 'America/New_York',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
In [5]:
records[0]['tz']
Out[5]:
'America/New_York'
In [6]:
# Pull the 'tz' value out of every record that has that key; records without
# a 'tz' key are skipped instead of raising KeyError.
time_zones = [entry['tz'] for entry in records if 'tz' in entry]
In [8]:
time_zones[:10]
Out[8]:
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York',
'America/New_York',
'Europe/Warsaw',
'',
'',
'']
In [9]:
def get_counts(sequence):
    """Count how many times each item occurs in ``sequence``.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences. An empty iterable yields an empty dict.
    """
    counts = {}
    for x in sequence:
        # dict.get supplies 0 the first time an item is seen.
        counts[x] = counts.get(x, 0) + 1
    return counts
In [10]:
get_counts(time_zones)
Out[10]:
{'': 521,
'Africa/Cairo': 3,
'Africa/Casablanca': 1,
'Africa/Ceuta': 2,
'Africa/Johannesburg': 1,
'Africa/Lusaka': 1,
'America/Anchorage': 5,
'America/Argentina/Buenos_Aires': 1,
'America/Argentina/Cordoba': 1,
'America/Argentina/Mendoza': 1,
'America/Bogota': 3,
'America/Caracas': 1,
'America/Chicago': 400,
'America/Chihuahua': 2,
'America/Costa_Rica': 1,
'America/Denver': 191,
'America/Edmonton': 6,
'America/Guayaquil': 2,
'America/Halifax': 4,
'America/Indianapolis': 20,
'America/La_Paz': 1,
'America/Lima': 1,
'America/Los_Angeles': 382,
'America/Managua': 3,
'America/Mazatlan': 1,
'America/Mexico_City': 15,
'America/Monterrey': 1,
'America/Montevideo': 1,
'America/Montreal': 9,
'America/New_York': 1251,
'America/Phoenix': 20,
'America/Puerto_Rico': 10,
'America/Rainy_River': 25,
'America/Recife': 2,
'America/Santo_Domingo': 1,
'America/Sao_Paulo': 33,
'America/St_Kitts': 1,
'America/Tegucigalpa': 1,
'America/Vancouver': 12,
'America/Winnipeg': 4,
'Asia/Amman': 2,
'Asia/Bangkok': 6,
'Asia/Beirut': 4,
'Asia/Calcutta': 9,
'Asia/Dubai': 4,
'Asia/Harbin': 3,
'Asia/Hong_Kong': 10,
'Asia/Istanbul': 9,
'Asia/Jakarta': 3,
'Asia/Jerusalem': 3,
'Asia/Karachi': 3,
'Asia/Kuala_Lumpur': 3,
'Asia/Kuching': 1,
'Asia/Manila': 1,
'Asia/Nicosia': 1,
'Asia/Novosibirsk': 1,
'Asia/Pontianak': 1,
'Asia/Riyadh': 1,
'Asia/Seoul': 5,
'Asia/Tokyo': 37,
'Asia/Yekaterinburg': 1,
'Australia/NSW': 6,
'Australia/Queensland': 1,
'Chile/Continental': 6,
'Europe/Amsterdam': 22,
'Europe/Athens': 6,
'Europe/Belgrade': 2,
'Europe/Berlin': 28,
'Europe/Bratislava': 3,
'Europe/Brussels': 4,
'Europe/Bucharest': 4,
'Europe/Budapest': 5,
'Europe/Copenhagen': 5,
'Europe/Dublin': 3,
'Europe/Helsinki': 10,
'Europe/Lisbon': 8,
'Europe/Ljubljana': 1,
'Europe/London': 74,
'Europe/Madrid': 35,
'Europe/Malta': 2,
'Europe/Moscow': 10,
'Europe/Oslo': 10,
'Europe/Paris': 14,
'Europe/Prague': 10,
'Europe/Riga': 2,
'Europe/Rome': 27,
'Europe/Skopje': 1,
'Europe/Sofia': 1,
'Europe/Stockholm': 14,
'Europe/Uzhgorod': 1,
'Europe/Vienna': 6,
'Europe/Vilnius': 2,
'Europe/Volgograd': 1,
'Europe/Warsaw': 16,
'Europe/Zurich': 4,
'Pacific/Auckland': 11,
'Pacific/Honolulu': 36}
In [12]:
from collections import defaultdict
def get_counts2(sequence):
    """Count how many times each item occurs in ``sequence``.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A ``collections.defaultdict(int)`` mapping each distinct item to its
        number of occurrences. Because it is a defaultdict, looking up an
        item that never occurred returns 0 (and inserts that key).
    """
    counts = defaultdict(int)  # unseen keys start at int() == 0
    for x in sequence:
        counts[x] += 1
    return counts
In [14]:
counts = get_counts2(time_zones)
In [15]:
len(time_zones)
Out[15]:
3440
In [16]:
counts
Out[16]:
defaultdict(int,
{'': 521,
'Africa/Cairo': 3,
'Africa/Casablanca': 1,
'Africa/Ceuta': 2,
'Africa/Johannesburg': 1,
'Africa/Lusaka': 1,
'America/Anchorage': 5,
'America/Argentina/Buenos_Aires': 1,
'America/Argentina/Cordoba': 1,
'America/Argentina/Mendoza': 1,
'America/Bogota': 3,
'America/Caracas': 1,
'America/Chicago': 400,
'America/Chihuahua': 2,
'America/Costa_Rica': 1,
'America/Denver': 191,
'America/Edmonton': 6,
'America/Guayaquil': 2,
'America/Halifax': 4,
'America/Indianapolis': 20,
'America/La_Paz': 1,
'America/Lima': 1,
'America/Los_Angeles': 382,
'America/Managua': 3,
'America/Mazatlan': 1,
'America/Mexico_City': 15,
'America/Monterrey': 1,
'America/Montevideo': 1,
'America/Montreal': 9,
'America/New_York': 1251,
'America/Phoenix': 20,
'America/Puerto_Rico': 10,
'America/Rainy_River': 25,
'America/Recife': 2,
'America/Santo_Domingo': 1,
'America/Sao_Paulo': 33,
'America/St_Kitts': 1,
'America/Tegucigalpa': 1,
'America/Vancouver': 12,
'America/Winnipeg': 4,
'Asia/Amman': 2,
'Asia/Bangkok': 6,
'Asia/Beirut': 4,
'Asia/Calcutta': 9,
'Asia/Dubai': 4,
'Asia/Harbin': 3,
'Asia/Hong_Kong': 10,
'Asia/Istanbul': 9,
'Asia/Jakarta': 3,
'Asia/Jerusalem': 3,
'Asia/Karachi': 3,
'Asia/Kuala_Lumpur': 3,
'Asia/Kuching': 1,
'Asia/Manila': 1,
'Asia/Nicosia': 1,
'Asia/Novosibirsk': 1,
'Asia/Pontianak': 1,
'Asia/Riyadh': 1,
'Asia/Seoul': 5,
'Asia/Tokyo': 37,
'Asia/Yekaterinburg': 1,
'Australia/NSW': 6,
'Australia/Queensland': 1,
'Chile/Continental': 6,
'Europe/Amsterdam': 22,
'Europe/Athens': 6,
'Europe/Belgrade': 2,
'Europe/Berlin': 28,
'Europe/Bratislava': 3,
'Europe/Brussels': 4,
'Europe/Bucharest': 4,
'Europe/Budapest': 5,
'Europe/Copenhagen': 5,
'Europe/Dublin': 3,
'Europe/Helsinki': 10,
'Europe/Lisbon': 8,
'Europe/Ljubljana': 1,
'Europe/London': 74,
'Europe/Madrid': 35,
'Europe/Malta': 2,
'Europe/Moscow': 10,
'Europe/Oslo': 10,
'Europe/Paris': 14,
'Europe/Prague': 10,
'Europe/Riga': 2,
'Europe/Rome': 27,
'Europe/Skopje': 1,
'Europe/Sofia': 1,
'Europe/Stockholm': 14,
'Europe/Uzhgorod': 1,
'Europe/Vienna': 6,
'Europe/Vilnius': 2,
'Europe/Volgograd': 1,
'Europe/Warsaw': 16,
'Europe/Zurich': 4,
'Pacific/Auckland': 11,
'Pacific/Honolulu': 36})
In [17]:
counts['America/Chicago']
Out[17]:
400
In [18]:
from collections import Counter
counts = Counter(time_zones)
In [19]:
counts
Out[19]:
Counter({'': 521,
'Africa/Cairo': 3,
'Africa/Casablanca': 1,
'Africa/Ceuta': 2,
'Africa/Johannesburg': 1,
'Africa/Lusaka': 1,
'America/Anchorage': 5,
'America/Argentina/Buenos_Aires': 1,
'America/Argentina/Cordoba': 1,
'America/Argentina/Mendoza': 1,
'America/Bogota': 3,
'America/Caracas': 1,
'America/Chicago': 400,
'America/Chihuahua': 2,
'America/Costa_Rica': 1,
'America/Denver': 191,
'America/Edmonton': 6,
'America/Guayaquil': 2,
'America/Halifax': 4,
'America/Indianapolis': 20,
'America/La_Paz': 1,
'America/Lima': 1,
'America/Los_Angeles': 382,
'America/Managua': 3,
'America/Mazatlan': 1,
'America/Mexico_City': 15,
'America/Monterrey': 1,
'America/Montevideo': 1,
'America/Montreal': 9,
'America/New_York': 1251,
'America/Phoenix': 20,
'America/Puerto_Rico': 10,
'America/Rainy_River': 25,
'America/Recife': 2,
'America/Santo_Domingo': 1,
'America/Sao_Paulo': 33,
'America/St_Kitts': 1,
'America/Tegucigalpa': 1,
'America/Vancouver': 12,
'America/Winnipeg': 4,
'Asia/Amman': 2,
'Asia/Bangkok': 6,
'Asia/Beirut': 4,
'Asia/Calcutta': 9,
'Asia/Dubai': 4,
'Asia/Harbin': 3,
'Asia/Hong_Kong': 10,
'Asia/Istanbul': 9,
'Asia/Jakarta': 3,
'Asia/Jerusalem': 3,
'Asia/Karachi': 3,
'Asia/Kuala_Lumpur': 3,
'Asia/Kuching': 1,
'Asia/Manila': 1,
'Asia/Nicosia': 1,
'Asia/Novosibirsk': 1,
'Asia/Pontianak': 1,
'Asia/Riyadh': 1,
'Asia/Seoul': 5,
'Asia/Tokyo': 37,
'Asia/Yekaterinburg': 1,
'Australia/NSW': 6,
'Australia/Queensland': 1,
'Chile/Continental': 6,
'Europe/Amsterdam': 22,
'Europe/Athens': 6,
'Europe/Belgrade': 2,
'Europe/Berlin': 28,
'Europe/Bratislava': 3,
'Europe/Brussels': 4,
'Europe/Bucharest': 4,
'Europe/Budapest': 5,
'Europe/Copenhagen': 5,
'Europe/Dublin': 3,
'Europe/Helsinki': 10,
'Europe/Lisbon': 8,
'Europe/Ljubljana': 1,
'Europe/London': 74,
'Europe/Madrid': 35,
'Europe/Malta': 2,
'Europe/Moscow': 10,
'Europe/Oslo': 10,
'Europe/Paris': 14,
'Europe/Prague': 10,
'Europe/Riga': 2,
'Europe/Rome': 27,
'Europe/Skopje': 1,
'Europe/Sofia': 1,
'Europe/Stockholm': 14,
'Europe/Uzhgorod': 1,
'Europe/Vienna': 6,
'Europe/Vilnius': 2,
'Europe/Volgograd': 1,
'Europe/Warsaw': 16,
'Europe/Zurich': 4,
'Pacific/Auckland': 11,
'Pacific/Honolulu': 36})
In [20]:
counts.most_common(10)
Out[20]:
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
In [24]:
from pandas import DataFrame, Series
import pandas as pd
In [25]:
frame = DataFrame(records)
In [26]:
frame
Out[26]:
_heartbeat_
a
al
c
cy
g
gr
h
hc
hh
kw
l
ll
nk
r
t
tz
u
0
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
en-US,en;q=0.8
US
Danvers
A6qOVH
MA
wfLQtf
1331822918
1.usa.gov
NaN
orofrog
[42.576698, -70.954903]
1
http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...
1331923247
America/New_York
http://www.ncbi.nlm.nih.gov/pubmed/22415991
1
NaN
GoogleMaps/RochesterNY
NaN
US
Provo
mwszkS
UT
mwszkS
1308262393
j.mp
NaN
bitly
[40.218102, -111.613297]
0
http://www.AwareMap.com/
1331923249
America/Denver
http://www.monroecounty.gov/etc/911/rss.php
2
NaN
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
en-US
US
Washington
xxr3Qb
DC
xxr3Qb
1331919941
1.usa.gov
NaN
bitly
[38.9007, -77.043098]
1
http://t.co/03elZC4Q
1331923250
America/New_York
http://boxer.senate.gov/en/press/releases/0316...
3
NaN
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...
pt-br
BR
Braz
zCaLwp
27
zUtuOu
1331923068
1.usa.gov
NaN
alelex88
[-23.549999, -46.616699]
0
direct
1331923249
America/Sao_Paulo
http://apod.nasa.gov/apod/ap120312.html
4
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
en-US,en;q=0.8
US
Shrewsbury
9b6kNl
MA
9b6kNl
1273672411
bit.ly
NaN
bitly
[42.286499, -71.714699]
0
http://www.shrewsbury-ma.gov/selco/
1331923251
America/New_York
http://www.shrewsbury-ma.gov/egov/gallery/1341...
5
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
en-US,en;q=0.8
US
Shrewsbury
axNK8c
MA
axNK8c
1273672506
bit.ly
NaN
bitly
[42.286499, -71.714699]
0
http://www.shrewsbury-ma.gov/selco/
1331923252
America/New_York
http://www.shrewsbury-ma.gov/egov/gallery/1341...
6
NaN
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1...
pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4
PL
Luban
wcndER
77
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
[51.116699, 15.2833]
0
http://plus.url.google.com/url?sa=z&n=13319232...
1331923255
Europe/Warsaw
http://www.nasa.gov/mission_pages/nustar/main/...
7
NaN
Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/2...
bg,en-us;q=0.7,en;q=0.3
None
NaN
wcndER
NaN
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
NaN
0
http://www.facebook.com/
1331923255
http://www.nasa.gov/mission_pages/nustar/main/...
8
NaN
Opera/9.80 (X11; Linux zbov; U; en) Presto/2.1...
en-US, en
None
NaN
wcndER
NaN
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
NaN
0
http://www.facebook.com/l.php?u=http%3A%2F%2F1...
1331923254
http://www.nasa.gov/mission_pages/nustar/main/...
9
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4
None
NaN
zCaLwp
NaN
zUtuOu
1331923068
1.usa.gov
NaN
alelex88
NaN
0
http://t.co/o1Pd0WeV
1331923255
http://apod.nasa.gov/apod/ap120312.html
10
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...
en-us,en;q=0.5
US
Seattle
vNJS4H
WA
u0uD9q
1319563556
1.usa.gov
NaN
o_4us71ccioa
[47.5951, -122.332603]
1
direct
1331923258
America/Los_Angeles
https://www.nysdot.gov/rexdesign/design/commun...
11
NaN
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4...
en-us,en;q=0.5
US
Washington
wG7OIH
DC
A0nRz4
1331815838
1.usa.gov
NaN
darrellissa
[38.937599, -77.092796]
0
http://t.co/ND7SoPyo
1331923259
America/New_York
http://oversight.house.gov/wp-content/uploads/...
12
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...
en-us,en;q=0.5
US
Alexandria
vNJS4H
VA
u0uD9q
1319563556
1.usa.gov
NaN
o_4us71ccioa
[38.790901, -77.094704]
1
direct
1331923259
America/New_York
https://www.nysdot.gov/rexdesign/design/commun...
13
1331923261
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
14
NaN
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...
en-us,en;q=0.5
US
Marietta
2rOUYc
GA
2rOUYc
1255769846
1.usa.gov
NaN
bitly
[33.953201, -84.5177]
1
direct
1331923262
America/New_York
http://toxtown.nlm.nih.gov/index.php
15
NaN
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1...
zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4
HK
Central District
nQvgJp
00
rtrrth
1317318030
j.mp
NaN
walkeryuen
[22.2833, 114.150002]
1
http://forum2.hkgolden.com/view.aspx?type=BW&m...
1331923263
Asia/Hong_Kong
http://www.ssd.noaa.gov/PS/TROP/TCFP/data/curr...
16
NaN
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1...
zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4
HK
Central District
XdUNr
00
qWkgbq
1317318039
j.mp
NaN
walkeryuen
[22.2833, 114.150002]
1
http://forum2.hkgolden.com/view.aspx?type=BW&m...
1331923263
Asia/Hong_Kong
http://www.usno.navy.mil/NOOC/nmfc-ph/RSS/jtwc...
17
NaN
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.5; r...
en-us,en;q=0.5
US
Buckfield
zH1BFf
ME
x3jOIv
1331839576
1.usa.gov
NaN
andyzieminski
[44.299702, -70.369797]
0
http://t.co/6Cx4ROLs
1331923264
America/New_York
http://www.usda.gov/wps/portal/usda/usdahome?c...
18
NaN
GoogleMaps/RochesterNY
NaN
US
Provo
mwszkS
UT
mwszkS
1308262393
1.usa.gov
NaN
bitly
[40.218102, -111.613297]
0
http://www.AwareMap.com/
1331923262
America/Denver
http://www.monroecounty.gov/etc/911/rss.php
19
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
it-IT,it;q=0.8,en-US;q=0.6,en;q=0.4
IT
Venice
wcndER
20
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
[45.438599, 12.3267]
0
http://www.facebook.com/
1331923264
Europe/Rome
http://www.nasa.gov/mission_pages/nustar/main/...
20
NaN
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...
es-ES
ES
Alcal
zQ95Hi
51
ytZYWR
1331670549
bitly.com
NaN
jplnews
[37.516701, -5.9833]
0
http://www.facebook.com/
1331923265
Africa/Ceuta
http://voyager.jpl.nasa.gov/imagesvideo/uranus...
21
NaN
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6...
en-us,en;q=0.5
US
Davidsonville
wcndER
MD
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
[38.939201, -76.635002]
0
http://www.facebook.com/
1331923267
America/New_York
http://www.nasa.gov/mission_pages/nustar/main/...
22
NaN
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
en-us
US
Hockessin
y3ZImz
DE
y3ZImz
1331064158
1.usa.gov
NaN
bitly
[39.785, -75.682297]
0
direct
1331923267
America/New_York
http://portal.hud.gov/hudportal/documents/hudd...
23
NaN
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3)...
en-us
US
Lititz
wWiOiD
PA
wWiOiD
1330217829
1.usa.gov
NaN
bitly
[40.174999, -76.3078]
0
http://www.facebook.com/l.php?u=http%3A%2F%2F1...
1331923267
America/New_York
http://www.tricare.mil/mybenefit/ProfileFilter...
24
NaN
Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES...
es-es,es;q=0.8,en-us;q=0.5,en;q=0.3
ES
Bilbao
wcndER
59
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
[43.25, -2.9667]
0
http://www.facebook.com/
1331923268
Europe/Madrid
http://www.nasa.gov/mission_pages/nustar/main/...
25
NaN
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1...
en-GB,en;q=0.8,en-US;q=0.6,en-AU;q=0.4
MY
Kuala Lumpur
wcndER
14
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
[3.1667, 101.699997]
0
http://www.facebook.com/
1331923269
Asia/Kuala_Lumpur
http://www.nasa.gov/mission_pages/nustar/main/...
26
NaN
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1...
ro-RO,ro;q=0.8,en-US;q=0.6,en;q=0.4
CY
Nicosia
wcndER
04
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
[35.166698, 33.366699]
0
http://www.facebook.com/?ref=tn_tnmn
1331923268
Asia/Nicosia
http://www.nasa.gov/mission_pages/nustar/main/...
27
NaN
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...
en-US,en;q=0.8
BR
SPaulo
zCaLwp
27
zUtuOu
1331923068
1.usa.gov
NaN
alelex88
[-23.5333, -46.616699]
0
direct
1331923269
America/Sao_Paulo
http://apod.nasa.gov/apod/ap120312.html
28
NaN
Mozilla/5.0 (iPad; CPU OS 5_0_1 like Mac OS X)...
en-us
None
NaN
vNJS4H
NaN
u0uD9q
1319563556
1.usa.gov
NaN
o_4us71ccioa
NaN
0
direct
1331923270
https://www.nysdot.gov/rexdesign/design/commun...
29
NaN
Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X...
en-us
None
NaN
FPX0IM
NaN
FPX0IL
1331922978
1.usa.gov
NaN
twittershare
NaN
1
http://t.co/5xlp0B34
1331923270
http://www.ed.gov/news/media-advisories/us-dep...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
3530
NaN
Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1...
en-US,en;q=0.8
US
San Francisco
xVZg4P
CA
wqUkTo
1331908247
go.nasa.gov
NaN
nasatwitter
[37.7645, -122.429398]
0
http://www.facebook.com/l.php?u=http%3A%2F%2Fg...
1331926815
America/Los_Angeles
http://www.nasa.gov/multimedia/imagegallery/im...
3531
NaN
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6...
en-US
None
NaN
wcndER
NaN
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
NaN
0
direct
1331926816
http://www.nasa.gov/mission_pages/nustar/main/...
3532
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...
en-us,en;q=0.5
US
Washington
Au3aUS
DC
A9ct6C
1331926420
1.usa.gov
NaN
ncsha
[38.904202, -77.031998]
1
http://www.ncsha.org/
1331926817
America/New_York
http://portal.hud.gov/hudportal/HUD?src=/press...
3533
NaN
Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) A...
en-us
US
Jacksonville
b2UtUJ
FL
ieCdgH
1301393171
go.nasa.gov
NaN
nasatwitter
[30.279301, -81.585098]
1
direct
1331926818
America/New_York
http://apod.nasa.gov/apod/
3534
NaN
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...
en-us
US
Frisco
vNJS4H
TX
u0uD9q
1319563556
1.usa.gov
NaN
o_4us71ccioa
[33.149899, -96.855499]
1
direct
1331926820
America/Chicago
https://www.nysdot.gov/rexdesign/design/commun...
3535
NaN
Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/...
en-us
US
Houston
zIgLx8
TX
yrPaLt
1331903484
aash.to
NaN
aashto
[29.775499, -95.415199]
1
direct
1331926823
America/Chicago
http://ntl.bts.gov/lib/44000/44300/44374/FHWA-...
3536
NaN
Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; e...
en-US,en;q=0.5
None
NaN
xIcyim
NaN
yG1TTf
1331728309
go.nasa.gov
NaN
nasatwitter
NaN
0
http://t.co/g1VKE8zS
1331926824
http://www.nasa.gov/mission_pages/hurricanes/a...
3537
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...
es-es,es;q=0.8,en-us;q=0.5,en;q=0.3
HN
Tegucigalpa
zCaLwp
08
w63FZW
1331546756
1.usa.gov
NaN
bufferapp
[14.1, -87.216698]
0
http://t.co/A8TJyibE
1331926825
America/Tegucigalpa
http://apod.nasa.gov/apod/ap120312.html
3538
NaN
Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma...
en-us
US
Los Angeles
qMac9k
CA
qds1Ge
1310473559
1.usa.gov
NaN
healthypeople
[34.041599, -118.298798]
0
direct
1331926825
America/Los_Angeles
http://healthypeople.gov/2020/connect/webinars...
3539
NaN
Mozilla/5.0 (compatible; Fedora Core 3) FC3 KDE
NaN
US
Bellevue
zu2M5o
WA
zDhdro
1331586192
bit.ly
NaN
glimtwin
[47.615398, -122.210297]
0
direct
1331926827
America/Los_Angeles
http://www.federalreserve.gov/newsevents/press...
3540
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
en-US,en;q=0.8
US
Payson
wcndER
UT
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
[40.014198, -111.738899]
0
http://www.facebook.com/l.php?u=http%3A%2F%2F1...
1331926828
America/Denver
http://www.nasa.gov/mission_pages/nustar/main/...
3541
NaN
Mozilla/5.0 (X11; U; OpenVMS AlphaServer_ES40;...
NaN
US
Bellevue
zu2M5o
WA
zDhdro
1331586192
1.usa.gov
NaN
glimtwin
[47.615398, -122.210297]
0
direct
1331926828
America/Los_Angeles
http://www.federalreserve.gov/newsevents/press...
3542
NaN
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...
en-us
US
Pittsburg
y3reI1
CA
y3reI1
1331926120
1.usa.gov
NaN
bitly
[38.0051, -121.838699]
0
http://www.facebook.com/l.php?u=http%3A%2F%2F1...
1331926829
America/Los_Angeles
http://www.sba.gov/community/blogs/community-b...
3543
1331926831
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
3544
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0.1) ...
en-us,en;q=0.5
US
Wentzville
vNJS4H
MO
u0uD9q
1319563556
1.usa.gov
NaN
o_4us71ccioa
[38.790001, -90.854897]
1
direct
1331926831
America/Chicago
https://www.nysdot.gov/rexdesign/design/commun...
3545
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...
en-us,en;q=0.5
US
Saint Charles
vNJS4H
IL
u0uD9q
1319563556
1.usa.gov
NaN
o_4us71ccioa
[41.9352, -88.290901]
1
direct
1331926832
America/Chicago
https://www.nysdot.gov/rexdesign/design/commun...
3546
NaN
Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma...
en-us
US
Los Angeles
qMac9k
CA
qds1Ge
1310473559
1.usa.gov
NaN
healthypeople
[34.041599, -118.298798]
1
direct
1331926833
America/Los_Angeles
http://healthypeople.gov/2020/connect/webinars...
3547
NaN
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...
en-us
US
Silver Spring
y0jYkg
MD
y0jYkg
1331851811
1.usa.gov
NaN
bitly
[39.052101, -77.014999]
1
direct
1331926836
America/New_York
http://www.epa.gov/otaq/regs/fuels/additive/e1...
3548
NaN
Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma...
en-us
US
Mcgehee
y5rMac
AR
xANY6O
1331916302
1.usa.gov
NaN
twitterfeed
[33.628399, -91.356903]
1
https://twitter.com/fdarecalls/status/18069759...
1331926836
America/Chicago
http://www.fda.gov/Safety/Recalls/ucm296326.htm
3549
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
sv-SE,sv;q=0.8,en-US;q=0.6,en;q=0.4
SE
Sollefte
eH8wu
24
7dtjei
1260316355
1.usa.gov
NaN
tweetdeckapi
[63.166698, 17.266701]
1
direct
1331926834
Europe/Stockholm
http://www.nasa.gov/mission_pages/WISE/main/in...
3550
NaN
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
en-us
US
Conshohocken
A00b72
PA
yGSwzn
1331917632
1.usa.gov
NaN
addthis
[40.0798, -75.2855]
0
http://www.linkedin.com/home?trk=hb_tab_home_top
1331926837
America/New_York
http://www.nlm.nih.gov/medlineplus/news/fullst...
3551
NaN
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
en-US,en;q=0.8
None
NaN
wcndER
NaN
zkpJBR
1331922854
1.usa.gov
NaN
bnjacobs
NaN
0
http://plus.url.google.com/url?sa=z&n=13319268...
1331926837
http://www.nasa.gov/mission_pages/nustar/main/...
3552
NaN
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...
NaN
US
Decatur
rqgJuE
AL
xcz8vt
1331227417
1.usa.gov
NaN
bootsnall
[34.572701, -86.940598]
0
direct
1331926839
America/Chicago
http://travel.state.gov/passport/passport_5535...
3553
NaN
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...
en-us
US
Shrewsbury
9b6kNl
MA
9b6kNl
1273672411
bit.ly
NaN
bitly
[42.286499, -71.714699]
0
http://www.shrewsbury-ma.gov/selco/
1331926840
America/New_York
http://www.shrewsbury-ma.gov/egov/gallery/1341...
3554
NaN
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...
en-us
US
Shrewsbury
axNK8c
MA
axNK8c
1273672506
bit.ly
NaN
bitly
[42.286499, -71.714699]
0
http://www.shrewsbury-ma.gov/selco/
1331926840
America/New_York
http://www.shrewsbury-ma.gov/egov/gallery/1341...
3555
NaN
Mozilla/4.0 (compatible; MSIE 9.0; Windows NT ...
en
US
Paramus
e5SvKE
NJ
fqPSr9
1301298479
1.usa.gov
NaN
tweetdeckapi
[40.9445, -74.07]
1
direct
1331926841
America/New_York
http://www.fda.gov/AdvisoryCommittees/Committe...
3556
NaN
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1...
en-US,en;q=0.8
US
Oklahoma City
jQLtP4
OK
jQLtP4
1307530247
1.usa.gov
NaN
bitly
[35.4715, -97.518997]
0
http://www.facebook.com/l.php?u=http%3A%2F%2F1...
1331926844
America/Chicago
http://www.okc.gov/PublicNotificationSystem/Fo...
3557
NaN
GoogleMaps/RochesterNY
NaN
US
Provo
mwszkS
UT
mwszkS
1308262393
j.mp
NaN
bitly
[40.218102, -111.613297]
0
http://www.AwareMap.com/
1331926846
America/Denver
http://www.monroecounty.gov/etc/911/rss.php
3558
NaN
GoogleProducer
NaN
US
Mountain View
zjtI4X
CA
zjtI4X
1327528527
1.usa.gov
NaN
bitly
[37.419201, -122.057404]
0
direct
1331926847
America/Los_Angeles
http://www.ahrq.gov/qual/qitoolkit/
3559
NaN
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
en-US
US
Mc Lean
qxKrTK
VA
qxKrTK
1312897670
1.usa.gov
NaN
bitly
[38.935799, -77.162102]
0
http://t.co/OEEEvwjU
1331926849
America/New_York
http://herndon-va.gov/Content/public_safety/Pu...
3560 rows × 18 columns
In [27]:
frame['tz']
Out[27]:
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
10 America/Los_Angeles
11 America/New_York
12 America/New_York
13 NaN
14 America/New_York
15 Asia/Hong_Kong
16 Asia/Hong_Kong
17 America/New_York
18 America/Denver
19 Europe/Rome
20 Africa/Ceuta
21 America/New_York
22 America/New_York
23 America/New_York
24 Europe/Madrid
25 Asia/Kuala_Lumpur
26 Asia/Nicosia
27 America/Sao_Paulo
28
29
...
3530 America/Los_Angeles
3531
3532 America/New_York
3533 America/New_York
3534 America/Chicago
3535 America/Chicago
3536
3537 America/Tegucigalpa
3538 America/Los_Angeles
3539 America/Los_Angeles
3540 America/Denver
3541 America/Los_Angeles
3542 America/Los_Angeles
3543 NaN
3544 America/Chicago
3545 America/Chicago
3546 America/Los_Angeles
3547 America/New_York
3548 America/Chicago
3549 Europe/Stockholm
3550 America/New_York
3551
3552 America/Chicago
3553 America/New_York
3554 America/New_York
3555 America/New_York
3556 America/Chicago
3557 America/Denver
3558 America/Los_Angeles
3559 America/New_York
Name: tz, dtype: object
In [28]:
frame['tz'][:10]
Out[28]:
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
Name: tz, dtype: object
In [29]:
tz_counts = frame['tz'].value_counts()
In [30]:
tz_counts
Out[30]:
America/New_York 1251
521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Europe/Berlin 28
Europe/Rome 27
America/Rainy_River 25
Europe/Amsterdam 22
America/Phoenix 20
America/Indianapolis 20
Europe/Warsaw 16
America/Mexico_City 15
Europe/Stockholm 14
Europe/Paris 14
America/Vancouver 12
Pacific/Auckland 11
Asia/Hong_Kong 10
Europe/Oslo 10
Europe/Moscow 10
Europe/Helsinki 10
Europe/Prague 10
America/Puerto_Rico 10
Asia/Calcutta 9
Asia/Istanbul 9
...
Europe/Belgrade 2
America/Lima 1
Europe/Volgograd 1
America/St_Kitts 1
America/Argentina/Mendoza 1
Africa/Lusaka 1
Asia/Manila 1
Africa/Casablanca 1
Australia/Queensland 1
America/Argentina/Cordoba 1
Asia/Nicosia 1
Europe/Skopje 1
America/Tegucigalpa 1
Africa/Johannesburg 1
America/Monterrey 1
Asia/Pontianak 1
America/Costa_Rica 1
Asia/Riyadh 1
America/Mazatlan 1
Asia/Novosibirsk 1
America/Montevideo 1
America/Santo_Domingo 1
America/Caracas 1
America/La_Paz 1
Europe/Ljubljana 1
Europe/Uzhgorod 1
Europe/Sofia 1
Asia/Kuching 1
America/Argentina/Buenos_Aires 1
Asia/Yekaterinburg 1
Name: tz, dtype: int64
In [40]:
# Label the two kinds of absent time zone separately: NaN entries become
# 'Missing' and empty strings become 'Unknown'. Then tally every label.
tz_clean = frame['tz'].fillna('Missing').replace('', 'Unknown')
tz_clean_counts = tz_clean.value_counts()
In [41]:
tz_clean_counts
Out[41]:
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Europe/Berlin 28
Europe/Rome 27
America/Rainy_River 25
Europe/Amsterdam 22
America/Phoenix 20
America/Indianapolis 20
Europe/Warsaw 16
America/Mexico_City 15
Europe/Paris 14
Europe/Stockholm 14
America/Vancouver 12
Pacific/Auckland 11
Europe/Prague 10
Europe/Oslo 10
Europe/Moscow 10
America/Puerto_Rico 10
Asia/Hong_Kong 10
Europe/Helsinki 10
Asia/Calcutta 9
...
Europe/Riga 2
America/Lima 1
Asia/Manila 1
Africa/Lusaka 1
Africa/Casablanca 1
America/Argentina/Mendoza 1
America/Argentina/Cordoba 1
Asia/Pontianak 1
America/Argentina/Buenos_Aires 1
America/St_Kitts 1
Europe/Skopje 1
Asia/Riyadh 1
Australia/Queensland 1
America/Mazatlan 1
America/Tegucigalpa 1
Africa/Johannesburg 1
America/Monterrey 1
Asia/Nicosia 1
America/Costa_Rica 1
Europe/Sofia 1
Asia/Kuching 1
Asia/Novosibirsk 1
America/Montevideo 1
America/Santo_Domingo 1
Europe/Volgograd 1
America/Caracas 1
Europe/Ljubljana 1
Europe/Uzhgorod 1
Asia/Yekaterinburg 1
America/La_Paz 1
Name: tz, dtype: int64
In [42]:
tz_clean_counts[:10].plot(kind='barh', rot=0, title='top time zones in the 1.usa.gov sample data')
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x107686a20>
In [43]:
# Keep the first whitespace-separated token of every non-null user-agent
# string. Building the Series from a list gives it a fresh 0..n-1 index.
results = Series([agent.split()[0] for agent in frame['a'].dropna()])
In [44]:
results[:10]
Out[44]:
0 Mozilla/5.0
1 GoogleMaps/RochesterNY
2 Mozilla/4.0
3 Mozilla/5.0
4 Mozilla/5.0
5 Mozilla/5.0
6 Mozilla/5.0
7 Mozilla/5.0
8 Opera/9.80
9 Mozilla/5.0
dtype: object
In [45]:
browser_counts = results.value_counts()
In [46]:
browser_counts
Out[46]:
Mozilla/5.0 2594
Mozilla/4.0 601
GoogleMaps/RochesterNY 121
Opera/9.80 34
TEST_INTERNET_AGENT 24
GoogleProducer 21
Mozilla/6.0 5
BlackBerry8520/5.0.0.681 4
Dalvik/1.4.0 3
BlackBerry8520/5.0.0.592 3
Goldfire 2
Socialite/7766 2
Acoon 2
BlackBerry9630/5.0.0.975 2
Opera/9.00 1
Vancouver 1
Opera/9.30 1
Mozilla/0.6 1
Opera/9.64(Windows 1
Opera/9.50 1
BlackBerry9700/5.0.0.423 1
sometrik.com 1
BlackBerry8520/5.0.0.1067 1
NokiaC3-00/5.0 1
HTTP_Request2/2.0.0 1
BlackBerry9300/5.0.0.997 1
BlackBerry9700/5.0.0.862 1
BlackBerry9530/5.0.0.328 1
LG-GW382f/V10d 1
MOT-MB525/Blur_Version.34.4.709.MB525.Latam.en.01 1
ICE 1
Vodafone/1.0/LG-KU990i/V10c 1
SAMSUNG-SGH-A887/A887UCIJ1 1
LG-LG220C[TF268435458416597116000000013524223841] 1
BlackBerry8530/5.0.0.654 1
Nokia6790s-1b/ATT.03.22 1
dtype: int64
In [48]:
browser_counts[:10].plot(kind='barh', rot=0, title='top browser type in 1.usa.gov sample data')
Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x106e7c898>
In [49]:
browser_counts[:10].plot(kind='bar', rot=0, title='top browser type in 1.usa.gov sample data')
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x10806dcc0>
In [50]:
cframe = frame[frame.a.notnull()]
In [52]:
frame.shape
Out[52]:
(3560, 18)
In [53]:
cframe.shape
Out[53]:
(3440, 18)
In [54]:
import numpy as np
In [55]:
# Label each row by whether its user-agent string contains 'Window'.
operating_system = np.where(
    cframe['a'].str.contains('Window'),
    'Windows',
    'Not Windows',
)
In [56]:
operating_system[:10]
Out[56]:
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows',
'Windows', 'Windows', 'Windows', 'Not Windows', 'Windows'],
dtype='<U11')
In [57]:
by_tz_os = cframe.groupby(['tz', operating_system])
In [58]:
aggregate_counts = by_tz_os.size().unstack().fillna(0)
In [59]:
aggregate_counts[:10]
Out[59]:
Not Windows
Windows
tz
245
276
Africa/Cairo
0
3
Africa/Casablanca
0
1
Africa/Ceuta
0
2
Africa/Johannesburg
0
1
Africa/Lusaka
0
1
America/Anchorage
4
1
America/Argentina/Buenos_Aires
1
0
America/Argentina/Cordoba
0
1
America/Argentina/Mendoza
0
1
In [60]:
# Sum the columns of each row, then get the integer positions that would
# sort those row totals in ascending order.
indexer = aggregate_counts.sum(axis=1).argsort()
In [61]:
indexer[:10]
Out[61]:
tz
24
Africa/Cairo 20
Africa/Casablanca 21
Africa/Ceuta 92
Africa/Johannesburg 87
Africa/Lusaka 53
America/Anchorage 54
America/Argentina/Buenos_Aires 57
America/Argentina/Cordoba 26
America/Argentina/Mendoza 55
dtype: int64
In [62]:
# Reorder rows by ascending total, then keep the last ten (the largest totals).
count_subset = aggregate_counts.take(indexer).tail(10)
In [63]:
count_subset
Out[63]:
Not Windows
Windows
tz
America/Sao_Paulo
13
20
Europe/Madrid
16
19
Pacific/Honolulu
0
36
Asia/Tokyo
2
35
Europe/London
43
31
America/Denver
132
59
America/Los_Angeles
130
252
America/Chicago
115
285
245
276
America/New_York
339
912
In [67]:
count_subset.plot(kind='barh', stacked=True, title='Top time zones by Windows and non-Windows users')
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c0d3a20>
In [65]:
# Divide every row by its own total so each row sums to 1.
normalized_count_subset = count_subset.div(
    count_subset.sum(axis='columns'),
    axis='index',
)
In [69]:
# Stacked horizontal bars of the normalized Windows / non-Windows counts.
# Title spelling corrected ("occuring" -> "occurring").
normalized_count_subset.plot(kind='barh', stacked=True, title='Percentage Windows and non-Windows users in top-occurring time zones')
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c3a0fd0>
In [ ]:
Content source: nhonaitran/pyML
Similar notebooks: