In [1]:
%matplotlib inline
In [2]:
# Extend the module search path with the absolute form of ../util so that
# modules stored in that directory can be imported by later cells.
import os
import sys

sys.path += [os.path.abspath('../util')]
In [24]:
from operator import itemgetter
from itertools import islice
from nltk.corpus import reuters as re
from matplotlib import pyplot as plt
import clustering as cl
import util
In [4]:
# Corpus metadata taken from the Reuters corpus reader (imported as ``re``):
#   categories - result of re.categories() with no argument
#   fileids    - result of re.fileids()
#   cat_dist   - re.categories(fileid) for every entry of ``fileids``,
#                in the same order as ``fileids``
categories = re.categories()
fileids = re.fileids()
cat_dist = [re.categories(fileid) for fileid in fileids]
In [18]:
# Bar chart: how many articles are tagged with 1, 2, 3, ... categories.

# Number of categories attached to each article. The loop variable is not
# called ``categories`` so it can never be confused with (or, under Python 2
# scoping rules, overwrite) the notebook-level ``categories`` list.
cat_dist_count = [len(article_categories) for article_categories in cat_dist]

# util.distribution(..., pairs=True) is consumed elsewhere in this notebook as
# a sequence of (value, count) pairs; zip(*) transposes it into two parallel
# tuples: distribution[0] holds the values, distribution[1] the counts.
distribution = list(zip(*util.distribution(cat_dist_count, pairs=True)))
bar_positions, bar_heights = distribution

plt.figure()
plt.bar(bar_positions, bar_heights, align='center')
# One tick under every bar. The bars sit at the observed category counts,
# which are not guaranteed to be the contiguous range 0..n-1, so the ticks are
# taken from the bar positions rather than from the number of bars.
plt.xticks(bar_positions)
plt.xlabel('No. of categories covered in a single article')
plt.ylabel('No. of articles')
plt.show()
In [42]:
# Bar chart: number of single-category articles per category.

# (fileid, category) pairs for the articles tagged with exactly one category.
# ``cat_dist`` already holds re.categories(fileid) for every entry of
# ``fileids`` in the same order, so the corpus reader is not queried again
# (the previous version called it twice per article).
single_categories = [
    (fileid, article_categories[0])
    for fileid, article_categories in zip(fileids, cat_dist)
    if len(article_categories) == 1
]

# Pairs whose first item is the category and whose second item is the number
# of single-category articles carrying it.
single_cat_list = util.distribution(single_categories, keyfun=itemgetter(1), pairs=True)
baseline = range(len(single_cat_list))
counts = [pair[1] for pair in single_cat_list]
# NOTE: this rebinds the notebook-level ``categories`` name to the labels of
# the plotted bars only.
categories = [pair[0] for pair in single_cat_list]

plt.figure(figsize=(30, 10), dpi=100)
plt.title('Distribution of articles featuring one category only')
plt.xlabel('Categories')
plt.ylabel('No of articles')
plt.bar(baseline, counts)
# Positions, labels and rotation are set in a single call so the rotation is
# applied to the labels that are actually drawn.
plt.xticks(baseline, categories, rotation=45)
plt.show()
In [47]:
# Bar chart restricted to mid-sized categories: strictly more than
# MIN_ARTICLES and strictly fewer than MAX_ARTICLES single-category articles.
MIN_ARTICLES = 200
MAX_ARTICLES = 600

# Recomputed from ``single_categories`` on every run, so re-executing this
# cell always yields the same filtered list. The filtered ``single_cat_list``
# is what the article-length cell further down reads.
single_cat_list = [
    pair
    for pair in util.distribution(single_categories, keyfun=itemgetter(1), pairs=True)
    if MIN_ARTICLES < pair[1] < MAX_ARTICLES
]
baseline = range(len(single_cat_list))
counts = [pair[1] for pair in single_cat_list]
categories = [pair[0] for pair in single_cat_list]

plt.figure(figsize=(8, 6), dpi=100)
plt.title('Distribution of articles featuring one category only')
plt.xlabel('Categories')
plt.ylabel('No of articles')
plt.bar(baseline, counts)
# Positions, labels and rotation are set in a single call so the rotation is
# applied to the labels that are actually drawn.
plt.xticks(baseline, categories, rotation=45)
plt.show()
In [61]:
# Histograms of article length, one panel per category kept in
# ``single_cat_list``.

# Build the set of selected category names once instead of rebuilding a dict
# for every element of the filter below.
selected_categories = {pair[0] for pair in single_cat_list}
topic_list = [pair for pair in single_categories if pair[1] in selected_categories]

# Group the (fileid, category) pairs by category, then keep only the fileids.
topics = util.group(topic_list, itemgetter(1))
for topic in topics:
    topics[topic] = [pair[0] for pair in topics[topic]]

# For each category: len() of the raw text of every one of its articles.
topics_len = {
    topic: [len(re.raw(fileid)) for fileid in topics[topic]]
    for topic in topics
}
keys = list(topics_len.keys())

plt.figure(figsize=(8, 6), dpi=100)
# One panel per category. The previous version hard-coded four panels and
# plotted keys[1] in three of them; iterating over ``keys`` gives every
# category its own panel. With four categories this is the same 2x2 grid
# (positions 221..224) as before.
n_cols = 2
n_rows = max(1, (len(keys) + n_cols - 1) // n_cols)
for position, key in enumerate(keys, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.title(key + ' articles length')
    plt.hist(topics_len[key], bins=20)
# Keep panel titles from overlapping the axes of the row above.
plt.tight_layout()
plt.show()
In [ ]: