In [1]:
import os
os.listdir('.')


Out[1]:
['.ipynb_checkpoints', 'alexa', 'read_from_json.ipynb', 'scrapy.cfg']

In [2]:
fname = 'alexa/cn.json'
with open(fname) as f:
    import json
    alexa_items = []
    for line in f:
        # print line, type(line)
        alexa_item = json.loads(line)
        # print alexa_item, type(alexa_item)
        alexa_items.append(alexa_item)

In [3]:
len(alexa_items)


Out[3]:
97222

In [4]:
print(alexa_items[0])


{u'url': u'/siteinfo/taobao.com', u'category': u'World/Chinese_Simplified_CN/\u8d2d\u7269', u'name': u'Taobao.com', u'description': u"Launched in May 2003, Taobao Marketplace (www.taobao.com) is the online shopping destination of choice for Chinese consumers looking for wide selection, value and convenience. Shoppers choose from a wide range of products and services on Taobao Marketplace, which features hundreds of millions of product and service listings. Taobao Marketplace was China's largest online shopping destination in terms of gross merchandise volume in 2013, according to iResearch. Taobao Marketplace is a business within Alibaba Group."}

In [5]:
category_items = {}
for item in alexa_items:
    c = item['category']
    if c not in category_items:
        category_items[c] = []
    category_items[c].append(item)
print(len(category_items))


4321

In [6]:
counts = {}
for category, items in category_items.items():
    counts[category] = len(items)
import operator
sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
for category, counts in sorted_counts[:10]:
    print category, counts


World/Chinese_Simplified_CN/游戏 525
亚洲/中国/浙江 525
参考/教育/大专院校与研究所 525
亚洲/中国/四川 525
亚洲/中国/江苏 525
Chinese_Simplified_CN/参考/教育 525
中国/浙江/宁波 525
World/Chinese_Simplified_CN/商业 525
亚洲/中国/上海 525
亚洲/中国/北京 525

In [10]:
for k in sorted(category_items, key=lambda k: len(category_items[k]), reverse=True)[:10]:
    print k, len(category_items[k])


World/Chinese_Simplified_CN/游戏 525
亚洲/中国/浙江 525
Chinese_Simplified_CN/计算机/互联网络 525
亚洲/中国/四川 525
亚洲/中国/江苏 525
Chinese_Simplified_CN/参考/教育 525
中国/浙江/宁波 525
World/Chinese_Simplified_CN/商业 525
亚洲/中国/上海 525
亚洲/中国/北京 525

In [ ]: