notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [3]:

    
# Min-max rescale of the value 140 worked by hand, with 115 as the minimum and
# 175 as the maximum. The trailing "." makes the denominator a float, so the
# division is a true division rather than an integer one.
(140-115) / (175-115.)









    Out[3]:





0.4166666666666667



In [5]:

    
""" quiz materials for feature scaling clustering """

### FYI, the most straightforward implementation might 
### throw a divide-by-zero error, if the min and max
### values are the same
### but think about this for a second--that means that every
### data point has the same value for that feature!  
### why would you rescale it?  Or even use it at all?
def featureScaling(arr):
    """Min-max rescale the numbers in ``arr`` onto the range [0, 1].

    Every element x is mapped to (x - min) / (max - min).

    Parameters
    ----------
    arr : iterable of numbers

    Returns
    -------
    list of float
        The rescaled values, in the same order as the input.
    list
        An empty list when ``arr`` is empty.
    str
        The message "All data points are the same value!" when every element
        is equal, because the denominator (max - min) would be zero.
    """
    values = list(arr)
    if not values:
        return []

    # min/max are computed once here rather than once per element.
    lowest = min(values)
    spread = max(values) - lowest

    # A zero spread means every value is identical; the original guard
    # compared a set to the integer 1, which is never true, so this case
    # used to fall through to a division by zero.
    if spread == 0:
        return "All data points are the same value!"

    return [(float(x) - lowest) / spread for x in values]

# tests of your feature scaler--line below is input data
data = [115, 140, 175]
# Call-style print with a single argument produces the same text under
# Python 2 and also parses under Python 3.
print(featureScaling(data))









    



[0.0, 0.4166666666666667, 1.0]



In [11]:

    
from sklearn.preprocessing import MinMaxScaler
import numpy

# Scaler constructed with no arguments; it is fitted later, when fit_transform
# is called on the data.
scaler = MinMaxScaler()



In [12]:

    
# The three example weights as a 3x1 float array: one row per value, a single column.
weights = numpy.array([[115.], [140.], [175.]])



In [14]:

    
# Fit the scaler on the weights and get the rescaled values back in one call.
rescaled_weight = scaler.fit_transform(weights)



In [15]:

    
# Bare expression as the last line of the cell so the notebook displays the array.
rescaled_weight









    Out[15]:





array([[ 0.        ],
       [ 0.41666667],
       [ 1.        ]])



In [16]:

    
%load ../ud120-projects/k_means/k_means_cluster.py



In [29]:

    
#!/usr/bin/python 

""" 
    skeleton code for k-means clustering mini-project

"""




import pickle
import numpy
import matplotlib.pyplot as plt
import sys
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit




def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature 1", f2_name="feature 2"):
    """Scatter-plot the first two features of every point, coloured by cluster.

    pred      -- sequence of integer cluster labels, one per point
    features  -- sequence of points; only elements [0] and [1] of each point
                 are plotted
    poi       -- sequence of truthy/falsy flags, one per point; only read when
                 mark_poi is true
    mark_poi  -- when true, overlay a red star on each point whose poi flag
                 is truthy
    name      -- path passed to plt.savefig
    f1_name, f2_name -- labels for the x and y axes

    The figure is saved to ``name`` and then shown; nothing is returned.
    """

    ### plot each cluster with a different color; a label past the end of the
    ### palette wraps around instead of raising IndexError--add more colors
    ### if you need more than 5 visually distinct clusters
    colors = ["b", "c", "k", "m", "g"]
    for ii, label in enumerate(pred):
        plt.scatter(features[ii][0], features[ii][1], color=colors[label % len(colors)])

    ### if you like, place red stars over points that are POIs (just for funsies)
    if mark_poi:
        for ii, _ in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")
    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    ### save before show so the figure is written while it is still current
    plt.savefig(name)
    plt.show()



### load in the dict of dicts containing all the data on each person in the dataset
### NOTE: unpickling can execute arbitrary code--only load a dataset file you trust
### the open mode ("r") is unchanged; the with-block closes the file handle
### as soon as loading finishes instead of leaving it open
with open("../ud120-projects/final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)
### there's an outlier--remove it! 
### (the default of 0 makes the pop a no-op if the key is already gone)
data_dict.pop("TOTAL", 0)


### features fed to the clustering; any key of the person-level dictionary
### (salary, director_fees, etc.) can be used, with "poi" listed first
feature_1, feature_2 = "salary", "exercised_stock_options"
features_list = ["poi", feature_1, feature_2]
data = featureFormat(data_dict, features_list)
### presumably poi receives the first listed column and finance_features the
### remaining ones--confirm against tools/feature_format.py
poi, finance_features = targetFeatureSplit(data)


### plot the first two features of every point
### indexing each point (rather than unpacking it into exactly two names)
### means this loop needs no edit for the "clustering with 3 features"
### part of the mini-project
for point in finance_features:
    plt.scatter( point[0], point[1] )
plt.show()



from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features_list = ["poi", feature_1, feature_2]
data2 = featureFormat(data_dict, features_list )
poi, finance_features = targetFeatureSplit( data2 )
### cluster on a min-max rescaled copy; finance_features itself is left untouched
scaled_finance_features = scaler.fit_transform(finance_features)
### a fixed random_state makes the centroid initialisation, and therefore the
### cluster labels and plot colours, repeatable from run to run
clf = KMeans(n_clusters=2, random_state=42)
pred = clf.fit_predict( scaled_finance_features )
### NOTE(review): pred comes from the *scaled* features, yet the points are drawn
### at their unscaled coordinates and the file name says "before_scaling"--
### confirm that this is the intended plot and name
Draw(pred, finance_features, poi, name="clusters_before_scaling.pdf", f1_name=feature_1, f2_name=feature_2)


### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred

### draw the clusters if a predictions object exists; a missing `pred`
### surfaces as NameError and is reported instead of stopping the cell
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
    ### call-style print emits the same text on Python 2 and parses on Python 3
    print("no predictions object named pred found, no clusters to plot")



In [49]:

    
import numpy as np
# NOTE(review): this result is neither assigned nor the last expression of the
# cell, so it is computed and then discarded without being displayed.
np.min(finance_features, axis=0)
# Maximum taken along axis 0 of finance_features.
temp_arr = np.max(finance_features, axis=0)



In [50]:

    
# Rebuild the array from the feature maxima on every run instead of stacking
# onto the previous temp_arr, so re-running this cell cannot keep appending rows.
# Rows: the maxima along axis 0, an all-zero row, and the point (200000, 1000000).
temp_arr = np.vstack([np.max(finance_features, axis=0), [[0.,0.], [200000, 1000000]] ])



In [51]:

    
# Use a throwaway scaler so the shared `scaler` is not re-fitted on these three rows.
MinMaxScaler().fit_transform(temp_arr)









    Out[51]:





array([[ 1.        ,  1.        ],
       [ 0.        ,  0.        ],
       [ 0.17997621,  0.02911345]])



In [ ]: