In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import json
In [3]:
# Open the usagov/bitly data file in binary mode; the handle is bound to fid.
fid = open(file='usagov_bitly_data2013-05-16-1368695937', mode='rb')
In [4]:
# Parse one JSON document per line of fid into a list of records.
# Blank lines are skipped because json.loads raises on empty input, and the
# handle is closed once parsing finishes (or fails) so it is not leaked for
# the lifetime of the kernel. Re-running this cell without re-opening the file
# now fails loudly instead of silently yielding an empty list.
try:
    records = [json.loads(line) for line in fid if line.strip()]
finally:
    fid.close()
In [5]:
# Display the first parsed record.
records[0]
Out[5]:
In [6]:
# Build a DataFrame from the parsed records (one row per record).
frame = pd.DataFrame(data=records)
In [7]:
# Tally how often each timezone value occurs; missing values are excluded
# from the tally (value_counts' default, spelled out here).
tz_counts = frame['tz'].value_counts(dropna=True)
In [8]:
# Show the ten most frequent timezone values.
tz_counts.head(10)
Out[8]:
In [9]:
# Label the two kinds of absent timezone separately: a missing value becomes
# 'Missing', while an empty string becomes 'Unknown'.
clean_tz = frame['tz'].fillna('Missing').replace({'': 'Unknown'})
In [10]:
# Recount using the cleaned labels, most frequent first (the default order).
tz_counts = clean_tz.value_counts(sort=True, ascending=False)
In [11]:
# Show the ten most frequent timezone labels after cleaning.
tz_counts.head(10)
Out[11]:
In [12]:
# Horizontal bar chart of the ten most common timezone labels.
tz_counts.head(10).plot.barh(rot=0)
Out[12]:
In [13]:
# Fields in frame: print a concise summary of its columns.
frame.info()
In [14]:
# Split the 'a' value of the row labelled 1 on whitespace to see its pieces.
frame.loc[1, 'a'].split()
Out[14]:
In [15]:
# The 'a' column holds the user-agent string: space-separated fields, where spaces inside parentheses do not split a field
import re
In [16]:
# Take the first token of each non-null 'a' value, where a token is either a
# parenthesised group (which may contain spaces) or a run of non-whitespace.
# The pattern is a raw string so that `\(` and `\S` reach the regex engine
# without relying on Python's deprecated invalid-escape-sequence fallback.
# re.search stops at the first token instead of tokenising the whole string,
# and a value with no token at all (empty or whitespace-only) maps to ''
# rather than raising IndexError.
results = pd.Series([
    hit.group(0) if hit else ''
    for hit in (re.search(r'\([^)]*\)|\S+', obs) for obs in frame.a.dropna())
])
In [17]:
# Show the ten most common leading tokens.
results.value_counts().head(10)
Out[17]:
In [18]:
# Keep only the rows of frame whose 'a' (agent) value is present.
cframe = frame[frame['a'].notna()]
In [19]:
#cframe.loc[cframe['tz'].isnull(), 'tz'] = 'Missing'
In [20]:
#cframe.loc[cframe['tz'] == '', 'tz'] = 'Unknown'
In [21]:
# Label every row of cframe as 'Windows' or 'Not Windows' according to whether
# its agent string contains the literal text 'Windows'.
# na=False matters: str.contains yields NaN for a non-string entry, and
# np.where treats NaN as truthy, which would mislabel such rows as 'Windows'.
# regex=False makes the literal-substring intent explicit.
operating_system = np.where(
    cframe['a'].str.contains('Windows', na=False, regex=False),
    'Windows',
    'Not Windows',
)
In [22]:
# Group the rows of cframe by timezone and by operating-system label.
# The keys are of two kinds: 'tz' names a column of cframe, while
# operating_system is an array holding one label per row of cframe.
by_tz_os = cframe.groupby(by=['tz', operating_system])
In [23]:
# Count rows per (timezone, OS) group, pivot the OS level into columns, and
# put 0 where a timezone has no rows for one of the OS labels.
agg_counts = (
    by_tz_os
    .size()
    .unstack()
    .fillna(0)
)
In [24]:
# Show the first ten rows of the per-timezone OS counts.
agg_counts.head(10)
Out[24]:
In [25]:
# Total the counts across the OS columns for each timezone, then argsort the
# totals to get the row positions that would put them in ascending order.
indexer = agg_counts.sum(axis=1).argsort()
In [26]:
# Show the first ten entries of the sort order.
indexer.head(10)
Out[26]:
In [27]:
# Reorder the rows by ascending total and keep the final ten, which are the
# ten timezones with the largest totals.
count_subset = agg_counts.take(indexer).tail(10)
In [28]:
# Stacked horizontal bars: one bar per timezone, split by OS label.
count_subset.plot.barh(stacked=True)
Out[28]:
In [29]:
# Divide each row by its own total so the OS columns of every timezone
# become fractions of that timezone's count.
normed_subset = count_subset.div(count_subset.sum(axis=1), axis='index')
In [30]:
# Stacked horizontal bars of the per-timezone OS fractions.
normed_subset.plot.barh(stacked=True)
Out[30]:
In [ ]: