In [31]:
import json
# Relative path of the data file to load
path = 'data/usagov_bitly_data2012-05-21-1337634399.txt'
# Every line of the file is parsed as its own JSON document. The `with` block
# closes the file handle as soon as parsing is done; the original bare
# `open(path)` left it open until the garbage collector got to it.
with open(path) as log_file:
    records = [json.loads(line) for line in log_file]

In [32]:
# Peek at the first two parsed records
records[:2]


Out[32]:
[{u'a': u'Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_3 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7E18 Safari/528.16',
  u'al': u'en-us',
  u'c': u'US',
  u'cy': u'Chesapeake',
  u'g': u'JKZUHq',
  u'gr': u'VA',
  u'h': u'J8ZPYk',
  u'hc': 1337629186,
  u'hh': u'go.nasa.gov',
  u'l': u'nasatwitter',
  u'll': [36.755798, -76.292801],
  u'nk': 1,
  u'r': u'http://t.co/JEY40vW4',
  u't': 1337634399,
  u'tz': u'America/New_York',
  u'u': u'http://www.nasa.gov/mission_pages/hinode/eclipse_120520.html'},
 {u'a': u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5',
  u'al': u'en-US,en;q=0.8',
  u'c': u'US',
  u'cy': u'O Fallon',
  u'g': u'vNJS4H',
  u'gr': u'MO',
  u'h': u'u0uD9q',
  u'hc': 1319563556,
  u'hh': u'1.usa.gov',
  u'l': u'o_4us71ccioa',
  u'll': [38.8251, -90.728897],
  u'nk': 1,
  u'r': u'direct',
  u't': 1337634399,
  u'tz': u'America/Chicago',
  u'u': u'https://www.nysdot.gov/rexdesign/design/community.gif'}]

In [33]:
# Show the 'a' field of the first record; in this data it holds the
# browser user-agent string
records[0]["a"]


Out[33]:
u'Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_3 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7E18 Safari/528.16'

In [34]:
# Utility function: get counts for each element from a collection

from collections import defaultdict

def get_counts(sequence):
    """Count how many times each distinct element occurs in ``sequence``.

    Parameters
    ----------
    sequence : iterable of hashable
        The items to tally.

    Returns
    -------
    collections.defaultdict
        Maps each element to its number of occurrences. Because it is a
        ``defaultdict(int)``, looking up an element that never occurred
        yields 0 (and inserts that key).
    """
    counts = defaultdict(int)  # missing keys start at 0
    for x in sequence:
        # Fixed: the original incremented an undefined name `count`,
        # which raised NameError on the first element.
        counts[x] += 1
    return counts

In [35]:
# What a time zone value looks like
records[0]['tz']


Out[35]:
u'America/New_York'

In [36]:
# Check if a time zone is listed for the record
def time_zone_listed(record):
    """Return True when ``record`` contains a 'tz' key, False otherwise."""
    has_tz = 'tz' in record
    return has_tz
    
# Try the helper on the first record
time_zone_listed(records[0])


Out[36]:
True

In [38]:
# Show every record that has no 'tz' key
[
    item
    for item in records
    if not time_zone_listed(item)
]


Out[38]:
[{u'_heartbeat_': 1337634451},
 {u'_heartbeat_': 1337634482},
 {u'_heartbeat_': 1337634512},
 {u'_heartbeat_': 1337634541},
 {u'_heartbeat_': 1337634571},
 {u'_heartbeat_': 1337634601},
 {u'_heartbeat_': 1337634631},
 {u'_heartbeat_': 1337634661},
 {u'_heartbeat_': 1337634691},
 {u'_heartbeat_': 1337634721},
 {u'_heartbeat_': 1337634751},
 {u'_heartbeat_': 1337634781},
 {u'_heartbeat_': 1337634811},
 {u'_heartbeat_': 1337634841},
 {u'_heartbeat_': 1337634871},
 {u'_heartbeat_': 1337634901},
 {u'_heartbeat_': 1337634931},
 {u'_heartbeat_': 1337634961},
 {u'_heartbeat_': 1337634991},
 {u'_heartbeat_': 1337635021},
 {u'_heartbeat_': 1337635051},
 {u'_heartbeat_': 1337635081},
 {u'_heartbeat_': 1337635112},
 {u'_heartbeat_': 1337635141},
 {u'_heartbeat_': 1337635171},
 {u'_heartbeat_': 1337635201},
 {u'_heartbeat_': 1337635231},
 {u'_heartbeat_': 1337635261},
 {u'_heartbeat_': 1337635291},
 {u'_heartbeat_': 1337635321},
 {u'_heartbeat_': 1337635351},
 {u'_heartbeat_': 1337635381},
 {u'_heartbeat_': 1337635411},
 {u'_heartbeat_': 1337635441},
 {u'_heartbeat_': 1337635471},
 {u'_heartbeat_': 1337635501},
 {u'_heartbeat_': 1337635531},
 {u'_heartbeat_': 1337635561},
 {u'_heartbeat_': 1337635591},
 {u'_heartbeat_': 1337635621},
 {u'_heartbeat_': 1337635651},
 {u'_heartbeat_': 1337635681},
 {u'_heartbeat_': 1337635711},
 {u'_heartbeat_': 1337635741},
 {u'_heartbeat_': 1337635771},
 {u'_heartbeat_': 1337635801},
 {u'_heartbeat_': 1337635831},
 {u'_heartbeat_': 1337635861},
 {u'_heartbeat_': 1337635891},
 {u'_heartbeat_': 1337635921},
 {u'_heartbeat_': 1337635951},
 {u'_heartbeat_': 1337635981},
 {u'_heartbeat_': 1337636011},
 {u'_heartbeat_': 1337636041},
 {u'_heartbeat_': 1337636071},
 {u'_heartbeat_': 1337636101},
 {u'_heartbeat_': 1337636131},
 {u'_heartbeat_': 1337636161},
 {u'_heartbeat_': 1337636191},
 {u'_heartbeat_': 1337636221}]

In [42]:
# Collect the 'tz' value of every record that has one, then peek at the first three
time_zones = [
    item['tz']
    for item in records
    if time_zone_listed(item)
]
time_zones[:3]


Out[42]:
[u'America/New_York', u'America/Chicago', u'America/New_York']

In [46]:
from collections import Counter

# Ten most frequent time zone values with their counts, most common first.
# Note: records whose 'tz' is an empty string are counted under u''.
Counter(time_zones).most_common(10)


Out[46]:
[(u'America/Chicago', 643),
 (u'America/New_York', 571),
 (u'', 521),
 (u'America/Los_Angeles', 315),
 (u'Europe/London', 135),
 (u'America/Denver', 77),
 (u'Europe/Amsterdam', 32),
 (u'America/Phoenix', 32),
 (u'Europe/Madrid', 29),
 (u'America/Rainy_River', 26)]