In [35]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
In [2]:
# Show the current working directory; the CSV files below are loaded via relative paths.
os.getcwd()
Out[2]:
In [3]:
# Load the three per_scholas CSV files. For the first and third file, the
# line at index 1 (the one directly after the header row) is skipped; the
# second file is read unchanged.
a = pd.read_csv('per_scholas_data.csv', skiprows=[1])
b = pd.read_csv('per_scholas_data2.csv')
c = pd.read_csv('per_scholas_data3.csv', skiprows=[1])
In [4]:
# Preview the first rows of `a`.
a.head()
Out[4]:
In [5]:
# Preview the first rows of `b`.
b.head()
Out[5]:
In [6]:
# Preview the first rows of `c`.
c.head()
Out[6]:
In [7]:
# Summary statistics for `a` (pandas `describe` defaults).
a.describe()
Out[7]:
In [8]:
# Summary statistics for `b` (pandas `describe` defaults).
b.describe()
Out[8]:
In [9]:
# Summary statistics for `c` (pandas `describe` defaults).
c.describe()
Out[9]:
In [10]:
# (rows, columns) of `a`.
a.shape
Out[10]:
In [11]:
# (rows, columns) of `b`.
b.shape
Out[11]:
In [12]:
# (rows, columns) of `c`.
c.shape
Out[12]:
In [13]:
# Join the three tables on their shared 'Record ID' column. merge() defaults
# to an inner join, so only IDs present in all three tables are kept.
merged_df = pd.merge(
    pd.merge(a, b, on='Record ID'),
    c,
    on='Record ID',
)
In [14]:
# Preview the first rows of the three-way join.
merged_df.head()
Out[14]:
In [15]:
# Summary statistics for the three-way join (pandas `describe` defaults).
merged_df.describe()
Out[15]:
In [16]:
# Display the full joined frame to see how many records survived the join.
merged_df
Out[16]:
The merged dataframe contains only 9 records, so it may not be very useful. Go back to the first dataframe.
In [17]:
# Column names of `a` as a NumPy array.
labels = a.columns.values
In [18]:
# Show the column names of `a`.
labels
Out[18]:
In [19]:
# Total of the 'Placed' column. Series.sum() is vectorised and skips missing
# values, whereas the builtin sum() walks the Series element by element and
# returns NaN as soon as a single value is missing.
# NOTE(review): assumes 'Placed' is a numeric or boolean flag column — confirm.
a['Placed'].sum()
Out[19]:
In [20]:
# Restrict `a` to the three wage/retention columns analysed below.
a_num = a.loc[
    :,
    ['First Post Training Wage', 'Retained (Months)', 'Current Wage'],
]
In [21]:
# Replace every missing value in the selected columns with 0.
# NOTE(review): a missing wage or retention value becomes a literal 0 in the
# plots and the regression below — confirm this is intended rather than
# dropping the incomplete rows.
a_num_na = a_num.fillna(value=0)
In [22]:
# First post-training wage column (missing values already zero-filled).
first_wage = a_num_na.loc[:, 'First Post Training Wage']
In [23]:
# Retention column, in months per its column name (missing values already zero-filled).
retained = a_num_na.loc[:, 'Retained (Months)']
In [24]:
# Current wage column (missing values already zero-filled).
current_wage = a_num_na.loc[:, 'Current Wage']
In [25]:
# Joint plot (seaborn's default kind) of first post-training wage against
# current wage, with the marginal distribution of each variable.
sns.jointplot(
    x=first_wage,
    y=current_wage,
)
plt.show()
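There seems to be a good correlation between first post-training wage and current wage. (Note that missing values were filled with 0 above, so any record with a missing wage is plotted at 0 on that axis.)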
In [27]:
# Kernel-density joint plot of first post-training wage against months retained.
sns.jointplot(
    x=first_wage,
    y=retained,
    kind="kde",
)
plt.show()
There is not much of a correlation between first wage and retained duration.
Let's go back to the initial and current wage and see if we can find a relationship.
In [29]:
# Predictor (first post-training wage) and target (current wage) for the regression.
x, y = first_wage, current_wage
In [31]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
In [36]:
# Linear regression estimator with scikit-learn's default settings.
lr = linear_model.LinearRegression()
In [41]:
# scikit-learn expects a 2-D feature matrix, so turn each split into a column
# vector of shape (n_samples, 1).
# train_test_split returns pandas Series here; Series.reshape is deprecated
# and no longer exists in current pandas, so convert through NumPy first.
# np.asarray accepts both a Series and an ndarray, which also makes this cell
# safe to re-run after the names have already been rebound to arrays.
X_train, X_test, y_train, y_test = (
    np.asarray(split).reshape(-1, 1)
    for split in (X_train, X_test, y_train, y_test)
)
In [42]:
# Check the target's shape after the reshape to a single column.
y_train.shape
Out[42]:
In [43]:
# Fit the regression on the training split.
lr.fit(X_train, y_train)
Out[43]:
In [44]:
# Evaluate on the held-out split (for LinearRegression, score() is the R^2 coefficient).
lr.score(X_test, y_test)
Out[44]:
In [46]:
# Plot the held-out points together with the fitted regression line.
# The axes are labelled and the ticks are left visible so the wage values can
# be read off the figure; the original hid both, leaving the plot
# uninterpretable on its own.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='black', label='Test data')
ax.plot(X_test, lr.predict(X_test), color='blue', linewidth=3, label='Linear fit')
ax.set_xlabel('First Post Training Wage')
ax.set_ylabel('Current Wage')
ax.set_title('Current wage vs. first post-training wage (test split)')
ax.legend()
plt.show()
Not a very good model -- need more samples and features!