Import the json package

Assign the location of the JSON data file to the `path` variable.


In [2]:
import json
# Raw string (r'...') so the backslashes in the Windows path are kept literally.
# NOTE(review): absolute, machine-specific location — point this at your own copy of the data file before running.
path = r'C:\Users\hrao\Documents\Personal\HK\Books\pydata-book-master\pydata-book-master\ch02\usagov_bitly_data2012-03-16-1331923249.txt'

Open the file at `path`, parse each line as a separate JSON document, and collect the resulting dicts in a list called `records`.


In [3]:
# Parse the file one line at a time; each line is expected to hold one JSON document.
# The with-block closes the file handle as soon as parsing finishes (the bare open()
# call inside a comprehension left it open until garbage collection).
# JSON text is read as UTF-8 explicitly rather than with the platform default encoding.
with open(path, 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f]

In [4]:
# Check the container type of the parsed result.
type(records)


Out[4]:
list

In [5]:
# Inspect the first parsed record.
records[0]


Out[5]:
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'al': 'en-US,en;q=0.8',
 'c': 'US',
 'cy': 'Danvers',
 'g': 'A6qOVH',
 'gr': 'MA',
 'h': 'wfLQtf',
 'hc': 1331822918,
 'hh': '1.usa.gov',
 'l': 'orofrog',
 'll': [42.576698, -70.954903],
 'nk': 1,
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 't': 1331923247,
 'tz': 'America/New_York',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

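Each element of the list is a dict, so a specific key can be looked up on an individual record, here the `'tz'` (time zone) key of the first record.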


In [6]:
# Look up the 'tz' value of the first record.
records[0]['tz']


Out[6]:
'America/New_York'

Collecting all time zone values from the records list.

Here we check whether the key 'tz' is present in each element (dict) of the records list.

If the key is present, the corresponding value of 'tz' for that element is added to the `time_zones` list; records without a 'tz' key are skipped.


In [7]:
# Keep only the records that carry a 'tz' key and pull out that key's value.
time_zones = [
    entry['tz']
    for entry in records
    if 'tz' in entry
]

In [8]:
# Preview the first ten collected time zone values.
time_zones[:10]


Out[8]:
['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']

Counting the frequency of each time zone's occurrence in the list using a dict type in Python


In [11]:
# Tally how often each time zone string occurs, using a plain dict.
counts = {}
for tz in time_zones:
    # get() supplies 0 the first time a value is seen, so no separate
    # membership test / else branch is needed.
    counts[tz] = counts.get(tz, 0) + 1
print(counts)


{'': 521, 'Africa/Ceuta': 2, 'Asia/Kuala_Lumpur': 3, 'America/Recife': 2, 'Europe/London': 74, 'America/Mexico_City': 15, 'America/Argentina/Buenos_Aires': 1, 'America/Sao_Paulo': 33, 'Asia/Calcutta': 9, 'Europe/Paris': 14, 'Asia/Riyadh': 1, 'America/Phoenix': 20, 'America/Vancouver': 12, 'Europe/Copenhagen': 5, 'America/Chicago': 400, 'Asia/Pontianak': 1, 'America/Indianapolis': 20, 'Europe/Madrid': 35, 'Europe/Rome': 27, 'Asia/Bangkok': 6, 'Europe/Helsinki': 10, 'Australia/Queensland': 1, 'America/Denver': 191, 'America/St_Kitts': 1, 'Europe/Vilnius': 2, 'America/Rainy_River': 25, 'America/Lima': 1, 'America/Costa_Rica': 1, 'Asia/Karachi': 3, 'Chile/Continental': 6, 'Europe/Volgograd': 1, 'Asia/Nicosia': 1, 'America/Chihuahua': 2, 'Europe/Skopje': 1, 'America/Santo_Domingo': 1, 'Africa/Casablanca': 1, 'America/Guayaquil': 2, 'America/La_Paz': 1, 'Asia/Jakarta': 3, 'America/Halifax': 4, 'Asia/Yekaterinburg': 1, 'Asia/Tokyo': 37, 'Africa/Johannesburg': 1, 'Europe/Stockholm': 14, 'Africa/Lusaka': 1, 'America/New_York': 1251, 'America/Montevideo': 1, 'Europe/Malta': 2, 'Australia/NSW': 6, 'Europe/Brussels': 4, 'Europe/Zurich': 4, 'Asia/Amman': 2, 'America/Winnipeg': 4, 'Europe/Bratislava': 3, 'Europe/Oslo': 10, 'America/Argentina/Cordoba': 1, 'Asia/Kuching': 1, 'Europe/Budapest': 5, 'Africa/Cairo': 3, 'Europe/Belgrade': 2, 'Europe/Warsaw': 16, 'America/Managua': 3, 'Asia/Beirut': 4, 'Europe/Lisbon': 8, 'Asia/Dubai': 4, 'Asia/Hong_Kong': 10, 'America/Argentina/Mendoza': 1, 'Europe/Ljubljana': 1, 'America/Edmonton': 6, 'America/Los_Angeles': 382, 'America/Monterrey': 1, 'Europe/Athens': 6, 'Europe/Prague': 10, 'America/Bogota': 3, 'Pacific/Honolulu': 36, 'Asia/Seoul': 5, 'Europe/Berlin': 28, 'Asia/Manila': 1, 'Asia/Novosibirsk': 1, 'Europe/Uzhgorod': 1, 'Asia/Harbin': 3, 'Pacific/Auckland': 11, 'Europe/Riga': 2, 'America/Montreal': 9, 'America/Anchorage': 5, 'America/Caracas': 1, 'Europe/Dublin': 3, 'America/Mazatlan': 1, 'America/Puerto_Rico': 10, 'Asia/Istanbul': 9, 
'Europe/Vienna': 6, 'Europe/Moscow': 10, 'Asia/Jerusalem': 3, 'Europe/Bucharest': 4, 'America/Tegucigalpa': 1, 'Europe/Amsterdam': 22, 'Europe/Sofia': 1}

In [14]:
from collections import defaultdict

# Same tally as a plain-dict count, but defaultdict(int) creates a missing
# key with the value 0 on first access, so every value can simply be incremented.
counts = defaultdict(int)
for zone in time_zones:
    counts[zone] = counts[zone] + 1

print(counts)


defaultdict(<class 'int'>, {'': 521, 'Africa/Ceuta': 2, 'Asia/Kuala_Lumpur': 3, 'America/Recife': 2, 'Europe/London': 74, 'America/Mexico_City': 15, 'America/Argentina/Buenos_Aires': 1, 'America/Sao_Paulo': 33, 'Asia/Calcutta': 9, 'Europe/Paris': 14, 'Asia/Riyadh': 1, 'America/Phoenix': 20, 'America/Vancouver': 12, 'Europe/Copenhagen': 5, 'America/Chicago': 400, 'Asia/Pontianak': 1, 'America/Indianapolis': 20, 'Europe/Madrid': 35, 'Europe/Rome': 27, 'Asia/Bangkok': 6, 'Europe/Helsinki': 10, 'Australia/Queensland': 1, 'America/Denver': 191, 'America/St_Kitts': 1, 'Europe/Vilnius': 2, 'America/Rainy_River': 25, 'America/Lima': 1, 'America/Costa_Rica': 1, 'Asia/Karachi': 3, 'Chile/Continental': 6, 'Europe/Volgograd': 1, 'Asia/Nicosia': 1, 'America/Chihuahua': 2, 'Europe/Skopje': 1, 'America/Santo_Domingo': 1, 'Africa/Casablanca': 1, 'America/Guayaquil': 2, 'America/La_Paz': 1, 'Asia/Jakarta': 3, 'America/Halifax': 4, 'Asia/Yekaterinburg': 1, 'Asia/Tokyo': 37, 'Africa/Johannesburg': 1, 'Europe/Stockholm': 14, 'Africa/Lusaka': 1, 'America/New_York': 1251, 'America/Montevideo': 1, 'Europe/Malta': 2, 'Australia/NSW': 6, 'Europe/Brussels': 4, 'Europe/Zurich': 4, 'Asia/Amman': 2, 'America/Winnipeg': 4, 'Europe/Bratislava': 3, 'Europe/Oslo': 10, 'America/Argentina/Cordoba': 1, 'Asia/Kuching': 1, 'Europe/Budapest': 5, 'Africa/Cairo': 3, 'Europe/Belgrade': 2, 'Europe/Warsaw': 16, 'America/Managua': 3, 'Asia/Beirut': 4, 'Europe/Lisbon': 8, 'Asia/Dubai': 4, 'Asia/Hong_Kong': 10, 'America/Argentina/Mendoza': 1, 'Europe/Ljubljana': 1, 'America/Edmonton': 6, 'America/Los_Angeles': 382, 'America/Monterrey': 1, 'Europe/Athens': 6, 'Europe/Prague': 10, 'America/Bogota': 3, 'Pacific/Honolulu': 36, 'Asia/Seoul': 5, 'Europe/Berlin': 28, 'Asia/Manila': 1, 'Asia/Novosibirsk': 1, 'Europe/Uzhgorod': 1, 'Asia/Harbin': 3, 'Pacific/Auckland': 11, 'Europe/Riga': 2, 'America/Montreal': 9, 'America/Anchorage': 5, 'America/Caracas': 1, 'Europe/Dublin': 3, 'America/Mazatlan': 1, 
'America/Puerto_Rico': 10, 'Asia/Istanbul': 9, 'Europe/Vienna': 6, 'Europe/Moscow': 10, 'Asia/Jerusalem': 3, 'Europe/Bucharest': 4, 'America/Tegucigalpa': 1, 'Europe/Amsterdam': 22, 'Europe/Sofia': 1})

In [20]:
# Look up the tally for a single time zone.
counts['America/New_York']


Out[20]:
1251

In [19]:
# Number of collected time zone values (one per record that had a 'tz' key).
len(time_zones)


Out[19]:
3440

List the `n` most frequent time zones as (count, time zone) pairs, sorted in ascending order of count.


In [23]:
def top_counts(count_dict, n):
    """Return the n largest entries of count_dict as (count, key) pairs.

    Parameters
    ----------
    count_dict : mapping of key -> count
    n : int
        How many of the highest-count entries to return.

    Returns
    -------
    list of (count, key) tuples in ascending order, so the most frequent
    entry is last. Ties on count are ordered by key. If n is zero or
    negative an empty list is returned; if n exceeds the number of entries,
    all entries are returned.
    """
    # Guard first: a slice of [-0:] would return the whole list, not nothing.
    if n <= 0:
        return []
    # Put the count first in each tuple so tuple ordering sorts by count.
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

In [24]:
# The ten largest tallies, listed in ascending order of count.
top_counts(counts,10)


Out[24]:
[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]

In [25]:
from collections import Counter

# Counter does the tallying itself; most_common(10) then returns the ten
# (value, count) pairs with the highest counts, largest first.
counts = Counter()
counts.update(time_zones)
counts.most_common(10)


Out[25]:
[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]