In [1]:
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [5]:
import numpy
import random

def ageNetWorthData():

    random.seed(42)
    numpy.random.seed(42)

    ages = []
    for ii in range(100):
        ages.append( random.randint(20,65) )
    net_worths = [ii * 6.25 + numpy.random.normal(scale=40.) for ii in ages]
### need to massage the lists into 2-D numpy arrays so LinearRegression will accept them
    ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))

    from sklearn.cross_validation import train_test_split
    ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths)

    return ages_train, ages_test, net_worths_train, net_worths_test

In [8]:
reg = LinearRegression()

ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()

In [12]:
reg.fit(ages_train, net_worths_train)
reg.predict([27])
reg.coef_
reg.intercept_
print reg.score(ages_test, net_worths_test)   # R^2 on the test data
print reg.score(ages_train, net_worths_train) # R^2 on the training data


0.812365730575
0.874588235822
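
The score reported here is the coefficient of determination, R^2 = 1 - SS_res/SS_tot. A quick sanity check of that formula against the test-set score above -- a minimal sketch, reusing reg and the test arrays from the cells above:

In [ ]:
### recompute R^2 by hand; should match reg.score(ages_test, net_worths_test)
import numpy
preds = reg.predict(ages_test)
ss_res = numpy.sum((net_worths_test - preds) ** 2)
ss_tot = numpy.sum((net_worths_test - numpy.mean(net_worths_test)) ** 2)
print 1 - ss_res / ss_tot   # ~0.8124, same as the test score above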

In [25]:
import numpy
import matplotlib.pyplot as plt

# from ages_net_worths import ageNetWorthData

ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()



from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)

### get Katie's net worth (she's 27)
### sklearn predictions are returned in an array,
### so you'll want to do something like net_worth = predict([27])[0]
### (not exact syntax; the point is the [0] at the end)
km_net_worth = reg.predict([27])[0] 

### get the slope
### again, you'll get a 2-D array, so stick the [0][0] at the end
slope = reg.coef_[0][0] 

### get the intercept
### here you get a 1-D array, so stick [0] on the end to access
### the info we want
intercept = reg.intercept_[0] 


### get the score on test data
test_score = reg.score(ages_test, net_worths_test)


### get the score on the training data
training_score = reg.score(ages_train, net_worths_train)

def submitFit():
    return {"networth":km_net_worth,
            "slope":slope,
            "intercept":intercept,
            "stats on test":test_score,
            "stats on training": training_score}

In [26]:
print submitFit()


{'slope': 6.4735495495770534, 'stats on training': 0.87458823582171819, 'intercept': -14.353781332184781, 'stats on test': 0.81236573057499983, 'networth': 160.43205650639567}
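
Sanity check on those numbers: ageNetWorthData generates net worth as 6.25 * age plus Gaussian noise, so a fitted slope of ~6.47 is in the right ballpark; the gap from 6.25 is down to the noise and the modest sample size.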

In [27]:
%load ../ud120-projects/regression/finance_regression.py

In [11]:
# %%writefile ../ud120-projects/regression/finance_regression.py
#!/usr/bin/python

"""
    starter code for the regression mini-project
    
    loads up/formats a modified version of the dataset
    (why modified?  we've removed some trouble points
    that you'll find yourself removing in the outliers mini-project)

    draws a little scatterplot of the training/testing data

    you fill in the regression code where indicated

"""    


import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load( open("../ud120-projects/final_project/final_project_dataset_modified.pkl", "r") )

### list the features you want to look at--first item in the
### list will be the "target" feature (see the targetFeatureSplit
### sketch after this cell)
features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True )
target, features = targetFeatureSplit( data )

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"



### your regression goes here!
### please name it reg, so that the plotting code below picks it up and 
### plots it correctly

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(feature_train, target_train)




### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color ) 
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color ) 

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")




### draw the regression line, once it's coded
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    print "reg is not defined yet -- no regression line to draw"
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()


Overwriting ../ud120-projects/regression/finance_regression.py
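
For reference, the "first item in the list is the target" convention comes from targetFeatureSplit in the course tools. Roughly, it peels the first column off each row -- a minimal sketch of the idea (illustrative only; the real implementation lives in tools/feature_format.py):

In [ ]:
### sketch: split a featureFormat-style array into target and features
def target_feature_split_sketch(data):
    target = []
    features = []
    for row in data:
        target.append(row[0])     # first column is the target ("bonus" here)
        features.append(row[1:])  # remaining columns are the inputs
    return target, features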

In [5]:
reg.intercept_


Out[5]:
-102360.54329387937

In [8]:
reg.coef_[0]


Out[8]:
5.4481402888055159

In [9]:
reg.score(feature_train, target_train)


Out[9]:
0.045509192699524359

In [10]:
reg.score(feature_test, target_test)


Out[10]:
-1.4849924173685092
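
A negative R^2 is legitimate: on held-out data the score drops below zero whenever the fit does worse than a flat line at the mean of the targets. A quick check of that reading -- a minimal sketch reusing reg, feature_test, and target_test from above:

In [ ]:
### negative R^2 <=> the model's squared error exceeds that of predicting the mean
import numpy
y = numpy.array(target_test)
ss_res = numpy.sum((y - reg.predict(feature_test)) ** 2)
ss_tot = numpy.sum((y - y.mean()) ** 2)
print ss_res > ss_tot   # True, consistent with the negative score above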

In [12]:
%load ../ud120-projects/regression/finance_regression.py

In [13]:
#!/usr/bin/python

"""
    starter code for the regression mini-project
    
    loads up/formats a modified version of the dataset
    (why modified?  we've removed some trouble points
    that you'll find yourself removing in the outliers mini-project)

    draws a little scatterplot of the training/testing data

    you fill in the regression code where indicated

"""    


import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load( open("../ud120-projects/final_project/final_project_dataset_modified.pkl", "r") )

### list the features you want to look at--first item in the 
### list will be the "target" feature
features_list = ["bonus", "long_term_incentive"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True )
target, features = targetFeatureSplit( data )

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"



### your regression goes here!
### please name it reg, so that the plotting code below picks it up and 
### plots it correctly

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(feature_train, target_train)




### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color ) 
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color ) 

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")




### draw the regression line, once it's coded
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    print "reg is not defined yet -- no regression line to draw"
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()



In [14]:
reg.score(feature_test, target_test)


Out[14]:
-0.59271289994986387
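
So the choice of input feature matters: regressing bonus on long_term_incentive scores about -0.59 on the test set, versus about -1.48 when using salary, i.e. long_term_incentive generalizes noticeably better as a predictor of bonus here.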

In [15]:
%load ../ud120-projects/regression/finance_regression.py

In [16]:
#!/usr/bin/python

"""
    starter code for the regression mini-project
    
    loads up/formats a modified version of the dataset
    (why modified?  we've removed some trouble points
    that you'll find yourself removing in the outliers mini-project)

    draws a little scatterplot of the training/testing data

    you fill in the regression code where indicated

"""    


import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load( open("../ud120-projects/final_project/final_project_dataset_modified.pkl", "r") )

### list the features you want to look at--first item in the 
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True )
target, features = targetFeatureSplit( data )

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"



### your regression goes here!
### please name it reg, so that the plotting code below picks it up and 
### plots it correctly

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(feature_train, target_train)




### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color ) 
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color ) 

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")




### draw the regression line, once it's coded
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    print "reg is not defined yet -- no regression line to draw"
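### refit on the test points and overplot the second line for comparison;
### the jump in slope shows how strongly a few points can pull the fit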
reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color='r')
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()



In [18]:
reg.coef_[0]


Out[18]:
2.2741011412666916
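
Note the swing: the line fit on the training points had slope ~5.45 (Out[8] above), while the same model refit on the test points has slope ~2.27. A handful of influential points is dragging the fit around -- exactly the kind of trouble the outliers mini-project takes up next.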

In [ ]: