In [1]:
from sklearn.linear_model import LinearRegression
%matplotlib inline
In [5]:
import numpy
import random
def ageNetWorthData():
    random.seed(42)
    numpy.random.seed(42)
    ages = []
    for ii in range(100):
        ages.append( random.randint(20,65) )
    net_worths = [ii * 6.25 + numpy.random.normal(scale=40.) for ii in ages]
    ### need to massage the lists into 2-D numpy arrays to get them to work in LinearRegression
    ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))

    from sklearn.cross_validation import train_test_split
    ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths)

    return ages_train, ages_test, net_worths_train, net_worths_test
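As the comment in ageNetWorthData notes, LinearRegression treats rows as samples and columns as features, so a flat Python list needs to become a column vector first. A minimal sketch of the shape requirement (the names here are illustrative, not part of the project code):
import numpy
ages_flat = numpy.array([20, 30, 40])  # shape (3,)  -- a bare 1-D sequence
ages_2d = ages_flat.reshape(-1, 1)     # shape (3, 1) -- three samples, one feature column
print ages_2d.shape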
In [8]:
reg = LinearRegression()
ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()
In [12]:
reg.fit(ages_train, net_worths_train)
reg.predict([27])  # predicted net worth at age 27
reg.coef_          # slope of the fitted line
reg.intercept_     # intercept of the fitted line

print reg.score(ages_test, net_worths_test)   # R^2 on the test data
print reg.score(ages_train, net_worths_train) # R^2 on the training data
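reg.score returns the coefficient of determination, R^2 = 1 - SS_res/SS_tot. A minimal sketch checking the test score by hand (a sanity check, not part of the assignment):
predictions = reg.predict(ages_test)
ss_res = ((net_worths_test - predictions) ** 2).sum()
ss_tot = ((net_worths_test - net_worths_test.mean()) ** 2).sum()
print 1 - ss_res / ss_tot  # should match reg.score(ages_test, net_worths_test)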
In [25]:
import numpy
import matplotlib.pyplot as plt
# from ages_net_worths import ageNetWorthData
ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
### get Katie's net worth (she's 27)
### sklearn predictions are returned in an array,
### so you'll want to do something like net_worth = predict([27])[0]
### (not exact syntax; the point is the [0] at the end)
km_net_worth = reg.predict([27])[0]
### get the slope
### again, you'll get a 2-D array, so stick the [0][0] at the end
slope = reg.coef_[0][0]
### get the intercept
### here you get a 1-D array, so stick [0] on the end to access
### the info we want
intercept = reg.intercept_[0]
### get the score on test data
test_score = reg.score(ages_test, net_worths_test)
### get the score on the training data
training_score = reg.score(ages_train, net_worths_train)
def submitFit():
    return {"networth": km_net_worth,
            "slope": slope,
            "intercept": intercept,
            "stats on test": test_score,
            "stats on training": training_score}
In [26]:
print submitFit()
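The fit is also worth eyeballing; a minimal sketch of plotting the line over the training points (illustrative, not required by the grader):
plt.scatter(ages_train, net_worths_train, label="training data")
plt.plot(ages_train, reg.predict(ages_train), color="r", label="fit")
plt.xlabel("age")
plt.ylabel("net worth")
plt.legend()
plt.show()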
In [27]:
%load ../ud120-projects/regression/finance_regression.py
In [11]:
# %%writefile ../ud120-projects/regression/finance_regression.py
#!/usr/bin/python
"""
starter code for the regression mini-project
loads up/formats a modified version of the dataset
(why modified? we've removed some trouble points
that you'll find yourself in the outliers mini-project)
draws a little scatterplot of the training/testing data
you fill in the regression code where indicated
"""
import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load( open("../ud120-projects/final_project/final_project_dataset_modified.pkl", "r") )
### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True )
target, features = targetFeatureSplit( data )
### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"
### your regression goes here!
### please name it reg, so that the plotting code below picks it up and
### plots it correctly
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color )
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color )
### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")
### draw the regression line, once it's coded
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    print "Exception"
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()
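The next few cells read the fitted parameters straight off reg. With "bonus" as the target and "salary" as the feature, reg.coef_[0] is the extra bonus predicted per extra dollar of salary. A hedged sketch of using the parameters directly (the salary figure is made up for illustration):
slope = reg.coef_[0]
intercept = reg.intercept_
salary = 250000.  # illustrative value, not from the dataset
print slope * salary + intercept  # by-hand prediction
print reg.predict([[salary]])[0]  # should match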
In [5]:
reg.intercept_
Out[5]:
In [8]:
reg.coef_[0]
Out[8]:
In [9]:
reg.score(feature_train, target_train)
Out[9]:
In [10]:
reg.score(feature_test, target_test)
Out[10]:
In [12]:
%load ../ud120-projects/regression/finance_regression.py
In [13]:
#!/usr/bin/python
"""
starter code for the regression mini-project
loads up/formats a modified version of the dataset
(why modified? we've removed some trouble points
that you'll find yourself in the outliers mini-project)
draws a little scatterplot of the training/testing data
you fill in the regression code where indicated
"""
import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load( open("../ud120-projects/final_project/final_project_dataset_modified.pkl", "r") )
### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "long_term_incentive"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True )
target, features = targetFeatureSplit( data )
### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"
### your regression goes here!
### please name it reg, so that the plotting code below picks it up and
### plots it correctly
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color )
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color )
### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")
### draw the regression line, once it's coded
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    print "Exception"
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()
In [14]:
reg.score(feature_test, target_test)
Out[14]:
In [15]:
%load ../ud120-projects/regression/finance_regression.py
In [16]:
#!/usr/bin/python
"""
starter code for the regression mini-project
loads up/formats a modified version of the dataset
(why modified? we've removed some trouble points
that you'll find yourself in the outliers mini-project)
draws a little scatterplot of the training/testing data
you fill in the regression code where indicated
"""
import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load( open("../ud120-projects/final_project/final_project_dataset_modified.pkl", "r") )
### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True )
target, features = targetFeatureSplit( data )
### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"
### your regression goes here!
### please name it reg, so that the plotting code below picks it up and
### plots it correctly
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color )
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color )
### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")
### draw the regression line, once it's coded
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    print "Exception"

### refit on the test data, where the big outlier lives, to see how far
### one point can drag the fitted line
reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color="r")
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()
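The cell below reads the slope of that second fit, made on the test data. To see the two slopes side by side, a minimal sketch fitting each split fresh (illustrative):
reg_train = LinearRegression().fit(feature_train, target_train)
reg_test = LinearRegression().fit(feature_test, target_test)
print "slope fit on train:", reg_train.coef_[0]
print "slope fit on test: ", reg_test.coef_[0]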
In [18]:
reg.coef_[0]
Out[18]: