In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

import xgboost as xgb

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from sklearn.linear_model import LassoCV, Lasso

from math import sqrt

import seaborn as sns


np.set_printoptions(suppress=True, precision=4)
plt.rcParams['figure.figsize'] = 10, 6
%matplotlib inline

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/abulbasar/data/master/startups.csv")
df


Out[2]:
R&D Spend Administration Marketing Spend State Profit
0 165349.20 136897.80 471784.10 New York 192261.83
1 162597.70 151377.59 443898.53 California 191792.06
2 153441.51 101145.55 407934.54 Florida 191050.39
3 144372.41 118671.85 383199.62 New York 182901.99
4 142107.34 91391.77 366168.42 Florida 166187.94
5 131876.90 99814.71 362861.36 New York 156991.12
6 134615.46 147198.87 127716.82 California 156122.51
7 130298.13 145530.06 323876.68 Florida 155752.60
8 120542.52 148718.95 311613.29 New York 152211.77
9 123334.88 108679.17 304981.62 California 149759.96
10 101913.08 110594.11 229160.95 Florida 146121.95
11 100671.96 91790.61 249744.55 California 144259.40
12 93863.75 127320.38 249839.44 Florida 141585.52
13 91992.39 135495.07 252664.93 California 134307.35
14 119943.24 156547.42 256512.92 Florida 132602.65
15 114523.61 122616.84 261776.23 New York 129917.04
16 78013.11 121597.55 264346.06 California 126992.93
17 94657.16 145077.58 282574.31 New York 125370.37
18 91749.16 114175.79 294919.57 Florida 124266.90
19 86419.70 153514.11 NaN New York 122776.86
20 76253.86 113867.30 298664.47 California 118474.03
21 78389.47 153773.43 299737.29 New York 111313.02
22 73994.56 122782.75 303319.26 Florida 110352.25
23 67532.53 105751.03 304768.73 Florida 108733.99
24 77044.01 99281.34 140574.81 New York 108552.04
25 64664.71 139553.16 137962.62 California 107404.34
26 75328.87 144135.98 134050.07 Florida 105733.54
27 72107.60 127864.55 353183.81 New York 105008.31
28 66051.52 182645.56 118148.20 Florida 103282.38
29 65605.48 153032.06 107138.38 New York 101004.64
30 61994.48 115641.28 91131.24 Florida 99937.59
31 61136.38 152701.92 88218.23 New York 97483.56
32 63408.86 129219.61 46085.25 California 97427.84
33 55493.95 103057.49 214634.81 Florida 96778.92
34 46426.07 157693.92 210797.67 California 96712.80
35 46014.02 85047.44 205517.64 New York 96479.51
36 28663.76 127056.21 201126.82 Florida 90708.19
37 44069.95 51283.14 197029.42 California 89949.14
38 20229.59 65947.93 185265.10 New York 81229.06
39 38558.51 82982.09 174999.30 California 81005.76
40 28754.33 118546.05 172795.67 California 78239.91
41 27892.92 84710.77 164470.71 Florida 77798.83
42 23640.93 96189.63 148001.11 California 71498.49
43 15505.73 127382.30 35534.17 New York 69758.98
44 22177.74 154806.14 28334.72 California 65200.33
45 1000.23 124153.04 1903.93 New York 64926.08
46 1315.46 115816.21 297114.46 Florida 49490.75
47 NaN 135426.92 NaN California 42559.73
48 542.05 51743.15 NaN New York 35673.41
49 NaN 116983.80 45173.06 California 14681.40

In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          48 non-null float64
Administration     50 non-null float64
Marketing Spend    47 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB

There are 50 observations and 5 columns. Four columns - R&D Spend, Administration, Marketing Spend, and Profit - are numeric, and one - State - is categorical. There are 2 null values in the R&D Spend feature and 3 in Marketing Spend.

Replace the null values with the median for the respective state.


In [4]:
df_null_idx = df[df.isnull().sum(axis = 1) > 0].index
df.iloc[df_null_idx]


Out[4]:
R&D Spend Administration Marketing Spend State Profit
19 86419.70 153514.11 NaN New York 122776.86
47 NaN 135426.92 NaN California 42559.73
48 542.05 51743.15 NaN New York 35673.41
49 NaN 116983.80 45173.06 California 14681.40

In [5]:
median_values = df.groupby("State")[["R&D Spend", "Marketing Spend"]].median()
median_values


Out[5]:
R&D Spend Marketing Spend
State
California 64664.710 186014.36
Florida 74661.715 253176.18
New York 77044.010 261776.23

In [6]:
df["R&D Spend"] = df.apply(lambda row: median_values.loc[row["State"], "R&D Spend"]  if np.isnan(row["R&D Spend"]) else row["R&D Spend"], axis = 1 )
df["Marketing Spend"] = df.apply(lambda row: median_values.loc[row["State"], "Marketing Spend"]  if np.isnan(row["Marketing Spend"]) else row["Marketing Spend"], axis = 1 )
df.iloc[df_null_idx]


Out[6]:
R&D Spend Administration Marketing Spend State Profit
19 86419.70 153514.11 261776.23 New York 122776.86
47 64664.71 135426.92 186014.36 California 42559.73
48 542.05 51743.15 261776.23 New York 35673.41
49 64664.71 116983.80 45173.06 California 14681.40
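
A more concise alternative to the row-wise apply is to map each state to its median and fill the gaps (a sketch, equivalent to the fill above):

In [ ]:
# Vectorized alternative: map State to its per-state median, then fill nulls
for col in ["R&D Spend", "Marketing Spend"]:
    df[col] = df[col].fillna(df["State"].map(median_values[col]))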

In [7]:
# Check if there are any more null values.
df.isnull().sum()


Out[7]:
R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

Let's see the distribution of Profit using a histogram and check whether there are any outliers in the data using a boxplot.


In [8]:
plt.figure(figsize = (8, 6))
plt.subplot(2, 1, 1)
df.Profit.plot.hist(bins = 10, density = True)  # `normed` was removed in newer matplotlib
df.Profit.plot.kde(title = "Histogram of Profit")

plt.subplot(2, 1, 2)
df.Profit.plot.box(vert = False, title = "Boxplot of Profit")

plt.tight_layout()


Profit has one outlier. We could apply a log transform to dampen the outlier before doing any prediction, but for now, let's ignore it.
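
If we did want to tame it, here is a minimal sketch of that log transform (not applied in the rest of this notebook):

In [ ]:
# Sketch only: np.log1p computes log(1 + x), which compresses large values
# and behaves well for values near zero
profit_log = np.log1p(df.Profit)
profit_log.plot.box(vert = False, title = "Boxplot of log(1 + Profit)")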

Let's plot the association between each pair of columns.


In [9]:
sns.pairplot(df)


Out[9]:
<seaborn.axisgrid.PairGrid at 0x11a8eb630>

The pairplot displays only the numeric columns. Let's see how the average Profit varies by State.


In [10]:
df.groupby("State").Profit.mean().sort_values().plot.bar(title = "Avg Profit by State")
plt.xlabel("State")
plt.ylabel("Profit")


Out[10]:
Text(0,0.5,'Profit')

Average Profit is highest in Florida and lowest in California.

Let's create the y vector containing the outcome column.


In [11]:
y = df.Profit.values
y


Out[11]:
array([ 192261.83,  191792.06,  191050.39,  182901.99,  166187.94,
        156991.12,  156122.51,  155752.6 ,  152211.77,  149759.96,
        146121.95,  144259.4 ,  141585.52,  134307.35,  132602.65,
        129917.04,  126992.93,  125370.37,  124266.9 ,  122776.86,
        118474.03,  111313.02,  110352.25,  108733.99,  108552.04,
        107404.34,  105733.54,  105008.31,  103282.38,  101004.64,
         99937.59,   97483.56,   97427.84,   96778.92,   96712.8 ,
         96479.51,   90708.19,   89949.14,   81229.06,   81005.76,
         78239.91,   77798.83,   71498.49,   69758.98,   65200.33,
         64926.08,   49490.75,   42559.73,   35673.41,   14681.4 ])

Create dummy variables for the categorical feature.


In [12]:
df_features = df.iloc[:, 0:4]
df_dummied = pd.get_dummies(df_features, columns=["State"], drop_first=True)
df_dummied.sample(10)


Out[12]:
R&D Spend Administration Marketing Spend State_Florida State_New York
17 94657.16 145077.58 282574.31 0 1
13 91992.39 135495.07 252664.93 0 0
38 20229.59 65947.93 185265.10 0 1
35 46014.02 85047.44 205517.64 0 1
42 23640.93 96189.63 148001.11 0 0
11 100671.96 91790.61 249744.55 0 0
31 61136.38 152701.92 88218.23 0 1
22 73994.56 122782.75 303319.26 1 0
44 22177.74 154806.14 28334.72 0 0
41 27892.92 84710.77 164470.71 1 0

The State column has been replaced by two additional columns - one for Florida and one for New York. The first category, California, has been dropped to avoid the collinearity issue (the dummy variable trap).

Now, let's create X feature matrix and y outcome vector.


In [13]:
X = df_dummied.values
X[0, :]


Out[13]:
array([ 165349.2,  136897.8,  471784.1,       0. ,       1. ])

Let's standardize the feature values to bring them to a similar scale.


In [14]:
scaler = StandardScaler() 
X_std = scaler.fit_transform(X)
pd.DataFrame(X_std).head()


Out[14]:
0 1 2 3 4
0 2.073486 0.560753 2.260465 -0.685994 1.393261
1 2.009412 1.082807 2.004818 -0.685994 -0.717741
2 1.796193 -0.728257 1.675110 1.457738 -0.717741
3 1.585002 -0.096365 1.448347 -0.685994 1.393261
4 1.532256 -1.079919 1.292210 1.457738 -0.717741

Split the X and y into training and test sets.


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, 
                                                    test_size = 0.3, random_state = 100)

In [16]:
print("Training set: ", X_train.shape, y_train.shape)


Training set:  (35, 5) (35,)

In [17]:
print("Test set: ", X_test.shape, y_test.shape)


Test set:  (15, 5) (15,)

Ratio of the size of the training set to the full dataset:


In [18]:
X_train.shape[0] / df.shape[0]


Out[18]:
0.7

Fit a linear regression model.


In [19]:
lr = LinearRegression()
lr.fit(X_train, y_train)


Out[19]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
lr.intercept_, lr.coef_


Out[20]:
(111827.94141541519,
 array([ 36952.9979,  -1074.4562,   1885.608 ,    894.426 ,   -442.6833]))

Because the features are standardized, the coefficients are directly comparable: R&D Spend has the highest influence on the outcome variable.
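
To make this easier to read, here is a small sketch pairing each coefficient with its feature name:

In [ ]:
# Pair each coefficient with its feature name (names from df_dummied)
pd.Series(lr.coef_, index = df_dummied.columns).sort_values()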

Predict the outcome on the test set using the model.


In [21]:
y_test_pred = lr.predict(X_test)

In [22]:
output = pd.DataFrame({"actual": y_test, "prediction": y_test_pred})
output["error"] = output.actual - output.prediction
output


Out[22]:
actual prediction error
0 156122.51 159019.628029 -2897.118029
1 90708.19 71812.853004 18895.336996
2 89949.14 86017.265670 3931.874330
3 103282.38 100397.875646 2884.504354
4 69758.98 54763.007516 14995.972484
5 14681.40 98569.217498 -83887.817498
6 156991.12 161629.007963 -4637.887963
7 96778.92 96063.934484 714.985516
8 118474.03 113044.675267 5429.354733
9 71498.49 65850.551449 5647.938551
10 192261.83 190878.953160 1382.876840
11 144259.40 134066.474864 10192.925136
12 124266.90 128219.404807 -3952.504807
13 64926.08 41824.481655 23101.598345
14 81229.06 63796.209758 17432.850242

The simplest prediction model would have been the average of y; a quick sketch of that baseline follows. Then let's see how the model did overall against one feature.
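
A minimal sketch of that mean baseline for comparison:

In [ ]:
# Baseline that always predicts the training-set mean
baseline = np.full_like(y_test, y_train.mean())
print("Baseline rmse:", sqrt(mean_squared_error(y_test, baseline)))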


In [23]:
X_test_inv = scaler.inverse_transform(X_test)
plt.scatter(X_test_inv[:, 0], y_test, alpha = 0.3, c = "blue", label = "Actual")
plt.scatter(X_test_inv[:, 0], y_test_pred, c = "red", label = "Predicted")

plt.xlabel("R&D Spend")
plt.ylabel("Profit")
plt.title("Profit Actual vs Estimate")
plt.legend()


Out[23]:
<matplotlib.legend.Legend at 0x11f0cb518>

In [24]:
np.mean((y_test_pred - y_test) ** 2)


Out[24]:
579577930.69277871

In [25]:
y_train_pred = lr.predict(X_train)

Compare the root mean squared error (RMSE) on the test dataset against the training dataset.


In [26]:
print("Test rmse: ", sqrt(mean_squared_error(y_test, y_test_pred)), 
      "\nTraining rmse:", sqrt(mean_squared_error(y_train, y_train_pred)))


Test rmse:  24074.424825793423 
Training rmse: 12413.672826747377

The R2 score has a maximum value of 1; a negative R2 means the model performs worse than simply predicting the mean.


In [27]:
r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)


Out[27]:
(0.69192474547582505, 0.89284947669793391)

On the training set, both RMSE and R2 are naturally better than on the test dataset.

Let's calculate R2 score manually.


In [28]:
SSR = np.sum((y_train - y_train_pred) ** 2)  # Sum of squared residuals
SST = np.sum((y_train - np.mean(y_train)) ** 2) # Total sum of squares
R2 = 1 - SSR/SST
R2


Out[28]:
0.89284947669793391

R2 can be viewed as 1 - MSE/Var(y), since dividing SSR and SST by n gives the MSE and the variance of y, respectively.
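
A quick check of that identity on the training set:

In [ ]:
# 1 - MSE / Var(y) should reproduce the r2_score value above
mse = mean_squared_error(y_train, y_train_pred)
print(1 - mse / np.var(y_train))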

Significance scores for feature selection


In [29]:
from sklearn.feature_selection import f_regression

In [30]:
_, p_vals = f_regression(X_train, y_train)
p_vals


Out[30]:
array([ 0.    ,  0.2718,  0.0001,  0.3377,  0.9174])

In [31]:
pd.DataFrame({"feature": df_dummied.columns, "p_value": p_vals})


Out[31]:
feature p_value
0 R&D Spend 2.608341e-17
1 Administration 2.718478e-01
2 Marketing Spend 6.395756e-05
3 State_Florida 3.377138e-01
4 State_New York 9.174009e-01

The p-value indicates the significance of each feature; a p-value < 0.05 indicates the corresponding feature is statistically significant. We can rebuild the model, excluding the non-significant features one by one, until all remaining features are significant. Note that f_regression tests each feature individually against the outcome, so for a proper stepwise procedure the p-values should be recomputed from the full model after each removal, as sketched below.
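
A minimal backward-elimination sketch, assuming statsmodels is available (it is not imported above); its OLS p-values, unlike f_regression's univariate scores, change as features are removed:

In [ ]:
import statsmodels.api as sm

cols = list(df_dummied.columns)
X_sel = pd.DataFrame(X_train, columns = cols)
while True:
    # Refit OLS on the remaining features and inspect their p-values
    ols = sm.OLS(y_train, sm.add_constant(X_sel)).fit()
    pvals = ols.pvalues.drop("const")
    if pvals.max() < 0.05:
        break
    worst = pvals.idxmax()
    print("Dropping", worst, "p =", round(pvals[worst], 4))
    X_sel = X_sel.drop(columns = [worst])
print("Remaining features:", list(X_sel.columns))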

Power Plant Dataset

Let's look at another dataset


In [32]:
df = pd.read_csv("/data/Combined_Cycle_Power_Plant.csv")
df.head()


Out[32]:
AT V AP RH PE
0 14.96 41.76 1024.07 73.17 463.26
1 25.18 62.96 1020.04 59.08 444.37
2 5.11 39.40 1012.16 92.14 488.56
3 20.86 57.32 1010.24 76.64 446.48
4 10.82 37.50 1009.23 96.62 473.90

In [33]:
X = df.iloc[:, 0:4].values
y = df.PE.values

In [34]:
sns.pairplot(df)


Out[34]:
<seaborn.axisgrid.PairGrid at 0x11ee225c0>

In [35]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.3, random_state = 1)

In [37]:
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

In [38]:
lr = LinearRegression()  # `normalize` was the default False and has been removed from newer scikit-learn
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
rmse(y_test, y_test_pred)


Out[38]:
4.56908403075697

In [39]:
from scipy import stats

In [40]:
residuals = y_test - y_test_pred

plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt.scatter(y_test, residuals)
plt.xlabel("y_test")
plt.ylabel("Residuals")
plt.hlines([0], xmin = 420, xmax = 500, linestyles = "dashed")

plt.subplot(1, 2, 2)
stats.probplot(residuals, plot=plt)


Out[40]:
((array([-3.899 , -3.6787, -3.5579, ...,  3.5579,  3.6787,  3.899 ]),
  array([-43.4127, -43.029 , -43.029 , ...,  16.7918,  17.4058,  17.4058])),
 (4.5192590374114339, -0.023764787618388999, 0.98885330775966918))

The residual plot shows there are outliers at the lower end of the y_test values. The Q-Q plot shows that the residuals do not exhibit normality, indicating non-linearity in the model.
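
For a more formal check, here is a quick sketch (an addition, not part of the original analysis) using scipy's Shapiro-Wilk test:

In [ ]:
# Shapiro-Wilk test: a small p-value rejects the hypothesis of normality
stat, p = stats.shapiro(residuals)
print("Shapiro-Wilk statistic:", stat, "p-value:", p)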


In [41]:
poly = PolynomialFeatures(degree=2)

X = df.iloc[:, 0:4].values
X_poly = poly.fit_transform(X)
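# Note: include_bias defaults to True in PolynomialFeatures, so the first
# column of X_poly is a constant 1; after standardization it becomes all
# zeros (column 0 in the table below). The pipeline defined later avoids
# this with include_bias=False.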
X_poly_train, X_poly_test, y_train, y_test = train_test_split(X_poly, y, test_size = 0.3, random_state = 100)
X_poly_train_std = scaler.fit_transform(X_poly_train)
X_poly_test_std = scaler.transform(X_poly_test)

pd.DataFrame(X_poly_train_std).head()


Out[41]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
0 0.0 -0.922813 -1.038771 1.548754 0.887904 -0.961410 -0.981797 -0.911073 -0.588861 -1.003315 -1.017924 -0.360645 1.551684 0.938456 0.898371
1 0.0 0.806093 -0.641344 -0.375735 -1.079505 0.747957 0.058652 0.807495 0.199676 -0.692063 -0.653966 -1.211656 -0.377966 -1.085846 -1.101707
2 0.0 -0.838247 -0.839664 0.500565 -0.016857 -0.904417 -0.887329 -0.836248 -0.847481 -0.851878 -0.836163 -0.703084 0.497996 -0.002738 -0.120483
3 0.0 -1.512091 -1.077333 1.134197 -0.278400 -1.282547 -1.273221 -1.512930 -1.687473 -1.031601 -1.065114 -1.062435 1.134184 -0.246762 -0.383576
4 0.0 1.232950 1.498469 -0.315068 -0.995069 1.346129 1.527732 1.238003 0.660068 1.602163 1.506417 0.371953 -0.317467 -1.000394 -1.032252

In [42]:
lr.fit(X_poly_train_std, y_train)
print("Train rmse: ", rmse(y_train, lr.predict(X_poly_train_std)))
print("Test rmse: ", rmse(y_test, lr.predict(X_poly_test_std)))


Train rmse:  4.246283890430433
Test rmse:  4.276988298022075

In [43]:
print(lr.intercept_, lr.coef_)


454.309651218 [  0.     -39.9701 -42.9304  92.6406  61.3886   4.9224   7.0201  20.0357
  -2.6156  -1.2008  36.9954   0.0829 -91.48   -57.9414  -3.7064]

Polynomial regression generally suffers from overfitting. Let's regularize the model using Lasso, which adds an L1 penalty (alpha times the sum of the absolute coefficient values) to the least-squares objective and drives some coefficients exactly to zero.


In [44]:
lasso = Lasso(alpha=0.03, max_iter=10000, random_state=100)
lasso.fit(X_poly_train_std, y_train)
print("Train rmse: ", rmse(y_train, lasso.predict(X_poly_train_std)))
print("Test rmse: ", rmse(y_test, lasso.predict(X_poly_test_std)))
print(lasso.intercept_, lasso.coef_)


Train rmse:  4.330404075518093
Test rmse:  4.35914090833783
454.309651218 [  0.     -17.3624  -1.4063   0.4679   0.       4.471    0.      -0.
  -1.0568  -0.      -0.435   -1.6751   0.       0.      -0.    ]
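
To see which polynomial terms survived, a small sketch mapping the non-zero coefficients to feature names (get_feature_names is get_feature_names_out in newer scikit-learn versions):

In [ ]:
# Map surviving (non-zero) coefficients to polynomial feature names
names = poly.get_feature_names(["AT", "V", "AP", "RH"])
print([(n, round(c, 4)) for n, c in zip(names, lasso.coef_) if c != 0])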

Let's compute the cross-validation score, which is more reliable in the sense that every observation is used for both training and testing across the folds.


In [45]:
X_poly_std = scaler.fit_transform(X_poly)
lasso = Lasso(alpha=0.03, max_iter=10000, random_state=100)
scores = cross_val_score(lasso, X_poly_std, y, cv = 10, scoring="neg_mean_squared_error")
scores = np.sqrt(-scores)
print("RMSE scores", scores)
print("Mean rmse: ", np.mean(scores))


RMSE scores [ 4.307   4.371   4.3509  4.3279  4.3506  4.3315  4.4165  4.2604  4.3743
  4.3033]
Mean rmse:  4.33935182329

Encapsulate the steps in a pipeline


In [46]:
from sklearn.pipeline import Pipeline

In [47]:
pipeline = Pipeline(steps = [
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=0.03, max_iter=10000, normalize=False, random_state=1))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
pipeline.fit(X_train, y_train)
rmse(y_test, pipeline.predict(X_test))


Out[47]:
4.353479757142914

LassoCV helps find the best alpha. We could also use model tuning techniques such as grid search to find the best alpha.


In [48]:
# Find best alpha
lassocv = LassoCV(cv = 10, max_iter=10000, tol=1e-5)
lassocv.fit(X_poly_std, y)
print("Lassocv alpha: ", lassocv.alpha_)

# Apply the best alpha to find cross validation score
lasso = Lasso(alpha = lassocv.alpha_, max_iter=10000, random_state=100)
scores = cross_val_score(lasso, X_poly_std, y, cv = 10, scoring="neg_mean_squared_error")
print("Mean rmse: ", np.mean(np.sqrt(-scores)))


Lassocv alpha:  0.0161806999848
Mean rmse:  4.32254217026

Look at the coefficient values. Many of them are exactly zero, making the model parsimonious and hence more robust - that is, less prone to overfitting.

Let's plot how the coefficients shrink to 0 as we vary the alpha value.


In [49]:
coefs = []
alphas = 10 ** np.linspace(-5, 5, 20)
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000, tol=1e-5,random_state=100)
    lasso.fit(X_poly_std, y)
    coefs.append(lasso.coef_)

plt.plot(alphas, coefs)
plt.xscale("log")
plt.xlabel("Alpha (penalty term on the coefficients)")
plt.ylabel("Coefficients of the features")


/Users/abulbasar/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Out[49]:
Text(0,0.5,'Coefficients of the features')

From this graph, which alpha value should we select? That question can be answered by checking which alpha value gives the best performance (RMSE, for example). LassoCV does that for us, or we can use model tuning via grid search, as sketched below; grid search will be explained in detail later.
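
A minimal GridSearchCV sketch over alpha (a preview, using the same data and settings as the LassoCV cell above):

In [ ]:
from sklearn.model_selection import GridSearchCV

# Search an alpha grid similar to the one in the coefficient-path plot
grid = GridSearchCV(Lasso(max_iter=10000, random_state=100),
                    param_grid={"alpha": 10 ** np.linspace(-4, 1, 20)},
                    scoring="neg_mean_squared_error", cv=10)
grid.fit(X_poly_std, y)
print("Best alpha:", grid.best_params_["alpha"])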

XGBoost


In [52]:
poly = PolynomialFeatures(degree=2)

X = df.iloc[:, 0:4].values
X_poly = poly.fit_transform(X)
X_poly_train, X_poly_test, y_train, y_test = train_test_split(X_poly, y, test_size = 0.3, random_state = 100)
X_poly_train_std = scaler.fit_transform(X_poly_train)
X_poly_test_std = scaler.transform(X_poly_test)

In [53]:
gbm = xgb.XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=100, 
                     objective='reg:linear', booster='gbtree', 
                     reg_alpha=0.01, reg_lambda=1, random_state=0)
gbm.fit(X_poly_train_std, y_train)
print("rmse:", rmse(y_test, gbm.predict(X_poly_test_std)))


rmse: 1.8581129118988502

In [55]:
param = {'silent':1, 
         'objective':'reg:linear', 
         'booster':'gbtree',
         'alpha': 0.01, 
         'lambda': 1
        }

dtrain = xgb.DMatrix(X_poly_train_std, label=y_train)
dtest = xgb.DMatrix(X_poly_test_std, label=y_test)
watchlist  = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 100
bst = xgb.train(param, dtrain, num_round, watchlist, verbose_eval=False)
print("rmse:", rmse(y_test, bst.predict(dtest)))

fig, ax = plt.subplots(figsize=(8, 10))
xgb.plot_importance(bst, ax=ax)  # draw on the created axes instead of opening a second, empty figure


rmse: 2.019509243562749
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x1208e6278>

In [ ]: