In [102]:
#!/usr/bin/python
"""
Skeleton code for k-means clustering mini-project.
"""
import pickle
import numpy
import matplotlib.pyplot as plt
import sys
from feature_format import featureFormat, targetFeatureSplit
def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature 1", f2_name="feature 2"):
    """Scatter-plot the first two features of every point, colored by cluster.

    Arguments:
        pred: sequence of integer cluster labels, one per point.
        features: sequence of per-point feature sequences; only the values
            at index 0 (x) and index 1 (y) are plotted.
        poi: sequence of truthy/falsy flags, one per point; only read when
            mark_poi is true.
        mark_poi: when true, draw a red star over every flagged point.
        name: file name handed to plt.savefig.
        f1_name: label for the x-axis.
        f2_name: label for the y-axis.

    The figure is written to `name` and then displayed with plt.show().
    """
    ### one color per cluster label; the palette wraps around, so more than
    ### five clusters reuse colors instead of raising IndexError
    colors = ["b", "c", "k", "m", "g"]
    for ii, label in enumerate(pred):
        point = features[ii]
        plt.scatter(point[0], point[1], color=colors[label % len(colors)])

    ### if you like, place red stars over points that are POIs (just for funsies)
    if mark_poi:
        for ii, _ in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")

    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    plt.savefig(name)
    plt.show()
### load in the dict of dicts containing all the data on each person in the dataset
### NOTE: unpickling can run arbitrary code -- only load dataset files that
### come from a trusted source
### (the "with" block closes the file handle as soon as loading is done;
### the original "r" open mode is kept unchanged)
with open("final_project_dataset_modified.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)
### there's an outlier--remove it!
### (the 0 default makes pop() a no-op when the "TOTAL" key is absent)
data_dict.pop("TOTAL", 0)
### names of the person-level dictionary keys to pull out as input
### features (salary, director_fees, etc. are all valid choices)
feature_1, feature_2, feature_3 = "salary", "exercised_stock_options", "total_payments"
### "poi" is listed first; presumably targetFeatureSplit treats the first
### column as the label -- confirm against feature_format
poi = "poi"
features_list = [poi] + [feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
### from here on `poi` holds the first item returned by targetFeatureSplit
### rather than the key name
split_result = targetFeatureSplit(data)
poi, finance_features = split_result
### quick look at the data before clustering: plot the first value of
### each row against the second one
### (every row has to unpack into exactly three values; the third is
### not drawn -- adjust the unpacking when the number of features changes)
for x_value, y_value, _unplotted in finance_features:
    plt.scatter(x_value, y_value)
plt.show()
### report the smallest and largest recorded salary, skipping people
### whose salary is the placeholder string 'NaN'
salaries = [person['salary'] for person in data_dict.values() if person['salary'] != 'NaN']
salary_report = 'min = {}, max = {}'.format(min(salaries), max(salaries))
print(salary_report)

### same for exercised_stock_options: collect every non-'NaN' value,
### dump the whole list, then print its extremes
exercised_stock_options = []
for person in data_dict.values():
    amount = person["exercised_stock_options"]
    if amount != "NaN":
        exercised_stock_options.append(amount)
print(exercised_stock_options)
print(" max of exercised_stock_options is {}".format(max(exercised_stock_options)))
print(" min of exercised_stock_options is {}".format(min(exercised_stock_options)))
### cluster here: fit a two-cluster k-means model on the finance
### features, then label those same points and keep the labels in pred
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=2, random_state=0, max_iter=500)
clf.fit(finance_features)
pred = clf.predict(finance_features)
print(pred)
print(clf.cluster_centers_)
### alternative left over from earlier experiments: pred = clf.labels_
### give "name" a new value whenever the number of features changes,
### so each figure ends up in its own file
try:
    ### built inside the try so that a missing feature name is reported
    ### the same way as a missing pred
    draw_options = {
        "mark_poi": False,
        "name": "clusters2.pdf",
        "f1_name": feature_1,
        "f2_name": feature_2,
    }
    Draw(pred, finance_features, poi, **draw_options)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
In [ ]:
In [ ]: