notebook.community

Edit and run



In [2]:

    
%matplotlib inline



In [7]:

    
%load ../ud120-projects/outliers/outlier_cleaner.py



In [29]:

    
#!/usr/bin/python


def outlierCleaner(predictions, ages, net_worths):
    """
        Clean away the 10% of points that have the largest
        residual errors (difference between the prediction
        and the actual net worth).

        predictions, ages and net_worths are equal-length
        sequences; numpy arrays and plain Python lists are
        both accepted.

        Return a list of tuples named cleaned_data where
        each tuple is of the form (age, net_worth, error),
        error being the squared residual.  The list is
        ordered from the smallest error to the largest.
    """
    from operator import itemgetter

    import numpy

    # Coerce to arrays so the element-wise subtraction also works when
    # the caller hands in plain lists (list - list raises TypeError).
    errors = (numpy.asarray(net_worths) - numpy.asarray(predictions)) ** 2

    # sorted() is stable, so points with equal error keep their input order.
    combined = sorted(zip(ages, net_worths, errors), key=itemgetter(2))

    # Number of points to keep: the 90% with the smallest squared error.
    keep = int(len(ages) * .9)

    cleaned_data = combined[:keep]

    return cleaned_data



In [1]:

    
%load ../ud120-projects/outliers/outlier_removal_regression.py



In [34]:

    
#%%writefile ../ud120-projects/outliers/outlier_removal_regression.py
#!/usr/bin/python

import random
import numpy
import matplotlib.pyplot as plt
import pickle

# from outlier_cleaner import outlierCleaner


### load up some practice data with outliers in it
### (the with-blocks close each pickle file as soon as it has been read)
### NOTE: pickle.load can execute arbitrary code -- only load files you trust
with open("../ud120-projects/outliers/practice_outliers_ages.pkl", "r") as ages_file:
    ages = pickle.load(ages_file)
with open("../ud120-projects/outliers/practice_outliers_net_worths.pkl", "r") as net_worths_file:
    net_worths = pickle.load(net_worths_file)


### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))

### train_test_split lives in sklearn.model_selection in current scikit-learn;
### fall back to the old sklearn.cross_validation module on legacy installs
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

### hold out 10% of the points for testing; fixed seed keeps the split repeatable
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)

### Fit a linear regression on the training split.  The object has to be
### called reg, because the plotting and cleaning code below refers to it
### by that name.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)

### Draw the fitted line over all points; if reg does not exist the
### NameError is swallowed and only the scatter plot is shown.
try:
    fitted_net_worths = reg.predict(ages)
    plt.plot(ages, fitted_net_worths, color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()


### Drop the training points that the current fit explains worst.
### NOTE: the NameError handler also fires when outlierCleaner itself is
### undefined, not only when reg is missing.
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
except NameError:
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")


### Without any cleaned points there is nothing to refit on.
if len(cleaned_data) == 0:
    print("outlierCleaner() is returning an empty list, no refitting to be done")
else:
    ### unzip the (age, net_worth, error) tuples back into column arrays;
    ### this rebinds ages / net_worths to the cleaned training points
    ages, net_worths, errors = zip(*cleaned_data)
    ages       = numpy.array(ages).reshape(len(ages), 1)
    net_worths = numpy.array(net_worths).reshape(len(net_worths), 1)

    ### refit on the cleaned data and draw the new regression line
    try:
        reg.fit(ages, net_worths)
        plt.plot(ages, reg.predict(ages), color="blue")
    except NameError:
        print("you don't seem to have regression imported/created,")
        print("   or else your regression object isn't named reg")
        print("   either way, only draw the scatter plot of the cleaned data")
    plt.scatter(ages, net_worths)
    plt.xlabel("ages")
    plt.ylabel("net worths")
    plt.show()



In [14]:

    
# Slope of the fitted line: the coefficient of the single "age" feature.
reg.coef_[0][0]









    Out[14]:





5.0779306434402516



In [15]:

    
# Score of the current fit on the held-out test split (R^2 for LinearRegression).
reg.score(ages_test, net_worths_test)









    Out[15]:





0.87826247883513953



In [32]:

    
## Slope after retraining with a working outlierCleaner()
reg.coef_[0][0]









    Out[32]:





6.3685948069436629



In [35]:

    
# Test-split score of the current reg; presumably after the refit on cleaned data -- confirm cell order.
reg.score(ages_test, net_worths_test)









    Out[35]:





0.98318945568560667



In [36]:

    
%load ../ud120-projects/outliers/enron_outliers.py



In [50]:

    
# %%writefile ../ud120-projects/outliers/enron_outliers.py
#!/usr/bin/python

import pickle
import sys
import matplotlib.pyplot
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit


### read in data dictionary, convert to numpy array
### (the with-block closes the dataset file once the pickle has been read)
### NOTE: pickle.load can execute arbitrary code -- only load files you trust
with open("../ud120-projects/final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)

# Remove the outlier - 'TOTAL' entry
# (the default argument keeps pop() from raising if the key is already gone)
data_dict.pop('TOTAL', 0)

features = ["salary", "bonus"]
data = featureFormat(data_dict, features)


### one scatter call per data point: column 0 is read as salary, column 1 as bonus
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter( salary, bonus )

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()



In [43]:

    
# Row index of the largest value in column 0, which the plotting loop treats as salary.
data.argmax(axis=0)[0]









    Out[43]:





67



In [47]:

    
import numpy as np
# Column-wise maxima of data; columns were requested as ["salary", "bonus"].
np.max(data, axis=0)









    Out[47]:





array([ 26704229.,  97343619.])



In [68]:

    
# Names of the people whose salary exceeds 1,000,000 and whose bonus exceeds 5,500,000.
# The 'NaN' placeholder is tested first so the numeric comparison only ever
# sees real numbers, instead of relying on Python 2's str-vs-int ordering
# (which raises TypeError on Python 3).
sal = [k for k, v in data_dict.items() if v['salary'] != 'NaN' and v['salary'] > 1000000]
bonus = [k for k, v in data_dict.items() if v['bonus'] != 'NaN' and v['bonus'] > 5500000]
print(set(sal) & set(bonus))









    



set(['SKILLING JEFFREY K', 'LAY KENNETH L'])



In [ ]: