# In [ ]:
#!/usr/bin/env python
import fileinput
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.misc import factorial
#Pandas depends on matplotlib - set up a temp dir for config
import os
import tempfile
os.environ['MPLCONFIGDIR'] = tempfile.mkdtemp()
from pandas import DataFrame
# In [ ]:
def run_kmeans(dataset, max_iterations=100, num_clusters=10, num_seeds=10):
    """Cluster articles by TF-IDF similarity of their text using k-means.

    Args:
        dataset: DataFrame with one row per article and the columns
            "topic", "article_id" and "content".
        max_iterations: Iteration cap for a single k-means run
            (passed to KMeans as ``max_iter``).
        num_clusters: Requested number of clusters. Lowered to the number
            of rows when the dataset has fewer articles than this, because
            KMeans rejects n_clusters > n_samples.
        num_seeds: Number of k-means runs with different centroid seeds
            (passed to KMeans as ``n_init``).

    Returns:
        A new DataFrame with the columns "topic", "article_id" and
        "cluster", one row per input row, where "cluster" is the index of
        the cluster the article was assigned to. An empty DataFrame with
        those columns is returned for an empty dataset.

    NOTE(review): the original left the result layout unspecified; the
    column set and order above were chosen here -- confirm against whatever
    consumes this script's output.
    """
    result_columns = ["topic", "article_id", "cluster"]

    # An empty frame has no "content" column to vectorize, so bail out
    # before touching it. (DataFrame truthiness is ambiguous, hence len().)
    if len(dataset) == 0:
        return DataFrame(columns=result_columns)

    vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
    feature_vectors = vectorizer.fit_transform(dataset["content"])

    # One row per document in the TF-IDF matrix.
    num_documents = feature_vectors.shape[0]
    kmeans = KMeans(n_clusters=min(num_clusters, num_documents),
                    max_iter=max_iterations,
                    n_init=num_seeds)

    # fit_predict computes the cluster centers and returns, for each
    # document, the index of the center it was assigned to.
    cluster_ids = kmeans.fit_predict(feature_vectors)

    # Copy so the caller's frame is not modified.
    results = dataset[["topic", "article_id"]].copy()
    results["cluster"] = cluster_ids
    return results[result_columns]
# In [ ]:
# Translation table that turns every punctuation character into a space.
# Python 3 provides the builder as str.maketrans; Python 2 only has
# string.maketrans, so fall back to it when the str attribute is missing.
try:
    replace_punctuation = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))
except AttributeError:
    replace_punctuation = string.maketrans(
        string.punctuation, ' ' * len(string.punctuation))

# Read tab-separated records (topic, article_id, content) and cluster the
# articles of one topic at a time. A topic is clustered when the first
# record of a different topic is seen.
# NOTE(review): this presumably relies on the input being grouped by
# topic -- confirm with whatever feeds this script.
current_topic = ""
dataset = []
for line in fileinput.input():
    line = line.strip()
    fields = line.split('\t')
    if len(fields) < 3:
        # Blank or truncated record: there is no content to cluster, and
        # indexing fields[2] would raise IndexError.
        continue
    content = fields[2].translate(replace_punctuation)
    if not current_topic:
        current_topic = fields[0]
    elif current_topic != fields[0]:
        # Topic changed: cluster and print everything buffered for the
        # previous topic, then start a fresh buffer.
        clusters = run_kmeans(DataFrame(dataset))
        print(clusters.to_string(header=False, index=False))
        current_topic = fields[0]
        dataset = []
    dataset.append({"topic": fields[0], "article_id": fields[1],
                    "content": content})
# In [ ]:
# Cluster and print the records still buffered in `dataset` once the input
# is exhausted. Skipped when nothing is buffered (e.g. empty input), since
# an empty DataFrame has no "content" column for run_kmeans to read.
if dataset:
    clusters = run_kmeans(DataFrame(dataset))
    print(clusters.to_string(header=False, index=False))