# Data is for fifty startups - we are trying to find the best startup to invest in.
# For each startup we have its Profit, Marketing spend, Administration spend and R&D spend.

Initializing and Referencing needed Libraries


In [107]:
# import Libraries
from IPython.display import clear_output

from io import StringIO
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries

# Import the classes by name only; instances are created where they are
# used (e.g. LabelEncoder()). Parentheses are not valid in an import statement.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import StratifiedKFold

In [108]:
# Load the 50-startups CSV from its URL into a DataFrame.

url='https://raw.githubusercontent.com/saqibmujtaba/Machine-Learning/DataFiles/50_Startups.csv'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (e.g. 404) instead of parsing an error page as CSV.
response.raise_for_status()
s = response.text

dataset = pd.read_csv(StringIO(s))

In [109]:
# Dimensions of the data as (number of rows, number of columns)

dataset_dimensions = dataset.shape
print(dataset_dimensions)


(50, 5)

In [110]:
# Data type of each column - distinguishes the qualitative (object) columns
# from the quantitative (numeric) ones
column_types = dataset.dtypes
print(column_types)


R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [111]:
# Peek at two random rows; the fixed random_state makes the displayed
# rows identical on every run of the notebook.
dataset.sample(2, random_state=42)


Out[111]:
R&D Spend Administration Marketing Spend State Profit
26 75328.87 144135.98 134050.07 Florida 105733.54
20 76253.86 113867.30 298664.47 California 118474.03

Clearly, we have to apply Label Encoding to the State column, as it is a categorical variable.


In [119]:
# Correlation between the different variables.

pd.set_option('display.width',100)
# Use the fully-qualified option name; the bare 'precision' pattern can
# match more than one option in newer pandas versions and is then rejected.
pd.set_option('display.precision',3)

# Pearson correlation is only defined for numeric columns, so the
# categorical 'State' column is excluded explicitly before calling corr().
corr = dataset.select_dtypes(include='number').corr(method='pearson', min_periods=1)

# Display the matrix computed above instead of computing it a second time.
corr


Out[119]:
R&D Spend Administration Marketing Spend Profit
R&D Spend 1.000 0.242 0.724 0.973
Administration 0.242 1.000 -0.032 0.201
Marketing Spend 0.724 -0.032 1.000 0.748
Profit 0.973 0.201 0.748 1.000

In [147]:
# Draw the correlation matrix as a heatmap.
# The color scale is pinned to [-1, 1] and centered on 0.
heatmap_options = dict(
    cmap='jet',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
)
sns.heatmap(corr, **heatmap_options)
plt.show()

# Other cmap choices: Paired, rainbow, jet, Greens, Blues, copper, seismic.
# jet is used here because it offers a wide variation of colors.


# We find that R&D Spend has a very strong correlation with Profit (about 0.97), followed by Marketing Spend (about 0.75).
# Administration spend has the weakest correlation with the profit of these startups (about 0.20).

Preprocessing


In [155]:
# Feature matrix: every column except the target column 'Profit'.
# .loc with a boolean column mask is used because the .ix indexer has been
# removed from pandas.
X = dataset.loc[:, dataset.columns != 'Profit']
# Function-call form of print, matching the earlier cells (valid on Python 3).
print(X)


    R&D Spend  Administration  Marketing Spend       State
0   165349.20       136897.80        471784.10    New York
1   162597.70       151377.59        443898.53  California
2   153441.51       101145.55        407934.54     Florida
3   144372.41       118671.85        383199.62    New York
4   142107.34        91391.77        366168.42     Florida
5   131876.90        99814.71        362861.36    New York
6   134615.46       147198.87        127716.82  California
7   130298.13       145530.06        323876.68     Florida
8   120542.52       148718.95        311613.29    New York
9   123334.88       108679.17        304981.62  California
10  101913.08       110594.11        229160.95     Florida
11  100671.96        91790.61        249744.55  California
12   93863.75       127320.38        249839.44     Florida
13   91992.39       135495.07        252664.93  California
14  119943.24       156547.42        256512.92     Florida
15  114523.61       122616.84        261776.23    New York
16   78013.11       121597.55        264346.06  California
17   94657.16       145077.58        282574.31    New York
18   91749.16       114175.79        294919.57     Florida
19   86419.70       153514.11             0.00    New York
20   76253.86       113867.30        298664.47  California
21   78389.47       153773.43        299737.29    New York
22   73994.56       122782.75        303319.26     Florida
23   67532.53       105751.03        304768.73     Florida
24   77044.01        99281.34        140574.81    New York
25   64664.71       139553.16        137962.62  California
26   75328.87       144135.98        134050.07     Florida
27   72107.60       127864.55        353183.81    New York
28   66051.52       182645.56        118148.20     Florida
29   65605.48       153032.06        107138.38    New York
30   61994.48       115641.28         91131.24     Florida
31   61136.38       152701.92         88218.23    New York
32   63408.86       129219.61         46085.25  California
33   55493.95       103057.49        214634.81     Florida
34   46426.07       157693.92        210797.67  California
35   46014.02        85047.44        205517.64    New York
36   28663.76       127056.21        201126.82     Florida
37   44069.95        51283.14        197029.42  California
38   20229.59        65947.93        185265.10    New York
39   38558.51        82982.09        174999.30  California
40   28754.33       118546.05        172795.67  California
41   27892.92        84710.77        164470.71     Florida
42   23640.93        96189.63        148001.11  California
43   15505.73       127382.30         35534.17    New York
44   22177.74       154806.14         28334.72  California
45    1000.23       124153.04          1903.93    New York
46    1315.46       115816.21        297114.46     Florida
47       0.00       135426.92             0.00  California
48     542.05        51743.15             0.00    New York
49       0.00       116983.80         45173.06  California

In [ ]: