In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
# Open the raw usa.gov/bit.ly click log (one JSON object per line).
# NOTE(review): opened in binary mode and never closed — a `with` block (or an
# explicit fid.close() after parsing) would be cleaner; confirm before changing
# since the next cell consumes this handle.
fid = open('usagov_bitly_data2013-05-16-1368695937', 'rb')

In [4]:
# Parse every line of the log into a dict; file iteration yields one JSON
# document per line.
records = list(map(json.loads, fid))

In [5]:
# Peek at the first parsed record (a dict of click fields: 'a' = user agent,
# 'tz' = timezone, 'u' = long URL, ...)
records[0]


Out[5]:
{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31',
 u'al': u'en-US,en;q=0.8',
 u'c': u'US',
 u'cy': u'Whittier',
 u'g': u'16hIfSr',
 u'gr': u'CA',
 u'h': u'16hIfSq',
 u'hc': 1368661734,
 u'hh': u'1.usa.gov',
 u'l': u'o_4og9tq48m3',
 u'll': [33.945702, -118.038803],
 u'nk': 0,
 u'r': u'https://www.facebook.com/',
 u't': 1368695936,
 u'tz': u'America/Los_Angeles',
 u'u': u'http://www.jpl.nasa.gov/spaceimages/details.php?id=PIA15817'}

In [6]:
# One row per record; dict keys become columns, keys absent from a record
# become NaN in that row
frame = pd.DataFrame(records)

In [7]:
# Tally how many records fall in each timezone string
tz_column = frame['tz']
tz_counts = tz_column.value_counts()

In [8]:
# Ten most common timezones (value_counts sorts descending); note the empty
# string at the top
tz_counts.head(10)


Out[8]:
                       404
America/New_York       269
America/Chicago        118
Europe/London          105
America/Los_Angeles    102
Europe/Prague           71
Australia/NSW           45
Asia/Calcutta           37
Asia/Tokyo              35
Europe/Berlin           30
dtype: int64

In [9]:
# Make the two flavours of absent timezone explicit: NaN (field not present)
# becomes 'Missing', empty string becomes 'Unknown'
clean_tz = frame['tz'].fillna('Missing')
clean_tz = clean_tz.mask(clean_tz == '', 'Unknown')

In [10]:
# Recompute the tally now that missing/blank timezones carry explicit labels
# (rebinds tz_counts from the earlier cell)
tz_counts = clean_tz.value_counts()

In [11]:
# Top ten again — 'Unknown' (blank tz) and 'Missing' (no tz field) now show
# up as named categories
tz_counts.head(10)


Out[11]:
Unknown                404
America/New_York       269
Missing                120
America/Chicago        118
Europe/London          105
America/Los_Angeles    102
Europe/Prague           71
Australia/NSW           45
Asia/Calcutta           37
Asia/Tokyo              35
dtype: int64

In [12]:
# Horizontal bar chart of the ten most common timezones
tz_counts.head(10).plot(kind='barh', rot=0)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29a2579b50>

In [13]:
# Fields in frame: column names, non-null counts and dtypes.
# 1797 rows total but most columns have 1677 non-null values — the remaining
# 120 rows appear to be the ones where only '_heartbeat_' is set (120
# non-null there); confirm against the raw file.
frame.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1797 entries, 0 to 1796
Data columns (total 18 columns):
_heartbeat_    120 non-null float64
a              1677 non-null object
al             1458 non-null object
c              1291 non-null object
cy             1291 non-null object
g              1677 non-null object
gr             1291 non-null object
h              1677 non-null object
hc             1677 non-null float64
hh             1677 non-null object
kw             23 non-null object
l              1677 non-null object
ll             1276 non-null object
nk             1677 non-null float64
r              1677 non-null object
t              1677 non-null float64
tz             1677 non-null object
u              1677 non-null object
dtypes: float64(4), object(14)
memory usage: 266.7+ KB

In [14]:
# User-agent string of row 1 split on whitespace — shows why naive splitting
# breaks parenthesized groups apart
frame.loc[1, 'a'].split()


Out[14]:
[u'Opera/9.80',
 u'(Windows',
 u'Mobile;',
 u'Opera',
 u'Mini/5.1.21595/29.3530;',
 u'U;',
 u'en)',
 u'Presto/2.8.119',
 u'Version/11.10']

In [15]:
# 'a' column is user agent as space separated fields, ignore spaces between brackets
import re

In [16]:
# First token of each user-agent string, where a parenthesized group (which
# may contain spaces) counts as one token.
# Fixed: the pattern must be a raw string — '\(' and '\S' are invalid escape
# sequences in a plain string literal and raise SyntaxWarning/DeprecationWarning
# on modern Python. Compiling once also avoids re-parsing the pattern per row.
agent_pattern = re.compile(r'\([^)]*\)|\S+')
results = pd.Series([agent_pattern.findall(agent)[0] for agent in frame.a.dropna()])

In [17]:
results.value_counts()[:10]


Out[17]:
Mozilla/5.0           1283
Mozilla/4.0            216
Opera/9.80              50
TVersity                42
Socialite/7845          19
Dalvik/1.6.0            17
CakePHP                  9
Yes                      7
Kml-Google;              6
ShortLinkTranslate       5
dtype: int64

In [18]:
# Cleaned frame: keep only the rows that actually carry a user-agent string
agent_present = frame['a'].notnull()
cframe = frame.loc[agent_present]

In [19]:
# NOTE(review): disabled — writing into cframe (a slice of frame) risks a
# SettingWithCopyWarning; presumably superseded by the clean_tz handling
# above. Confirm before deleting.
#cframe.loc[cframe['tz'].isnull(), 'tz'] = 'Missing'

In [20]:
# NOTE(review): disabled for the same reason as the 'Missing' cell above —
# in-place edit of a slice; verify it is intentionally unused.
#cframe.loc[cframe['tz'] == '', 'tz'] = 'Unknown'

In [21]:
# Partition rows into Windows and Non Windows based on the agent string
has_windows = cframe['a'].str.contains('Windows')
operating_system = np.where(has_windows, 'Windows', 'Not Windows')

In [22]:
# Decompose timezones into Windows and Non Windows.
# groupby accepts a mix of column names and same-length arrays: each row is
# grouped by the pair (its 'tz' value, its operating_system label), with the
# array aligned to the frame by position.
by_tz_os = cframe.groupby(['tz', operating_system])

In [23]:
# Count rows per (tz, os) pair, pivot the os level into columns, and
# zero-fill combinations that never occurred
group_sizes = by_tz_os.size()
agg_counts = group_sizes.unstack().fillna(0)

In [24]:
# First ten timezones (alphabetical index order), split by OS
agg_counts.head(10)


Out[24]:
Not Windows Windows
tz
208 196
Africa/Abidjan 0 1
Africa/Accra 1 0
Africa/Blantyre 0 1
Africa/Cairo 0 1
Africa/Gaborone 0 1
Africa/Johannesburg 4 8
Africa/Kampala 0 2
Africa/Lagos 3 0
America/Anchorage 1 1

In [25]:
# Total clicks per timezone, then argsort to get the positional order that
# would sort the timezones ascending by that total
row_totals = agg_counts.sum(axis=1)
indexer = row_totals.argsort()

In [26]:
# Peek at the sort positions — values are row positions, not counts
indexer.head(10)


Out[26]:
tz
                       102
Africa/Abidjan          21
Africa/Accra            88
Africa/Blantyre         87
Africa/Cairo            24
Africa/Gaborone         50
Africa/Johannesburg     26
Africa/Kampala          20
Africa/Lagos            30
America/Anchorage       33
dtype: int64

In [27]:
# The indexer is ascending, so after reordering the last ten rows are the
# ten busiest timezones
count_subset = agg_counts.take(indexer).iloc[-10:]

In [28]:
# Stacked horizontal bars: absolute Windows vs Not Windows counts per timezone
count_subset.plot(kind='barh', stacked=True)


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29c0166ad0>

In [29]:
# Normalize each timezone's row so the Windows / Not Windows shares sum to 1
row_sums = count_subset.sum(axis=1)
normed_subset = count_subset.div(row_sums, axis=0)

In [30]:
# Same chart as above, but each bar now shows relative OS share (sums to 1)
normed_subset.plot(kind='barh', stacked=True)


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29a22d7250>

In [ ]: