This is a simple example of multivariate linear regression. Repository data was scraped from GitHub in the following format:
{
"forks": Number,
"contributors": Number,
"releases": Number,
"commits": Number,
"issues": Number,
"stars": Number
}
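For concreteness, a single scraped record might look something like this (the numbers here are purely illustrative, not taken from the dataset):
{
"forks": 1342,
"contributors": 87,
"releases": 24,
"commits": 5610,
"issues": 312,
"stars": 18250
}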
We will attempt to predict the number of stars a repository should have from the remaining features (forks, contributors, releases, and commits); the issues column is dropped in the next cell.
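Concretely, with every column standardized, the fitted model can be written as

$$\widehat{\text{stars}} = \beta_1\,\text{forks} + \beta_2\,\text{contributors} + \beta_3\,\text{releases} + \beta_4\,\text{commits}$$

where the intercept is close to zero because all features have zero mean; the $\beta$ symbols are just labels for the values reported by `regr.coef_` below.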
In [ ]:
import pandas as pd
df = pd.read_json('./data/clean.json')
# drop the issues feature
df = df.drop(columns=['issues'])
# standardize features
means = df.mean()
stds = df.std()
standardized = (df - means) / stds
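As a quick sanity check, every standardized column should now have a mean of roughly 0 and a standard deviation of roughly 1:
In [ ]:
# verify standardization: means should be ~0 and standard deviations ~1
print(standardized.mean().round(3))
print(standardized.std().round(3))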
In [ ]:
x_data = standardized.drop(columns=['stars']).values
y_data = standardized['stars'].values
# split into train and test sets using alternating rows
x_train, x_test = x_data[::2], x_data[1::2]
y_train, y_test = y_data[::2], y_data[1::2]
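Equivalently, the split could be done with scikit-learn's train_test_split, which shuffles the rows before splitting; test_size=0.5 mirrors the 50/50 split above:
In [ ]:
# optional alternative: a shuffled 50/50 split via scikit-learn
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.5, random_state=0)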
In [ ]:
from sklearn import linear_model
# fit the data
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)
# inspect the learned coefficients
regr.coef_
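The bare coefficient array is hard to interpret on its own; pairing each value with its column name (using the same standardized DataFrame from above) shows which feature each coefficient belongs to:
In [ ]:
# label each coefficient with the feature it corresponds to
feature_names = standardized.drop(columns=['stars']).columns
list(zip(feature_names, regr.coef_))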
In [ ]:
import numpy as np
# mean squared error on the held-out test set
mse = np.mean((regr.predict(x_test) - y_test) ** 2)
# coefficient of determination (R^2) on the test set
r2 = regr.score(x_test, y_test)
print('Mean Squared Error: %.2f' % mse)
print('R^2 Score: %.2f' % r2)
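The same two numbers can also be obtained from scikit-learn's metrics module, which is a convenient cross-check:
In [ ]:
# cross-check the metrics with scikit-learn's built-in functions
from sklearn.metrics import mean_squared_error, r2_score
y_pred = regr.predict(x_test)
print('Mean Squared Error: %.2f' % mean_squared_error(y_test, y_pred))
print('R^2 Score: %.2f' % r2_score(y_test, y_pred))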
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
# rescale predictions and targets back to raw star counts
predicted = (regr.predict(x_test) * stds['stars']) + means['stars']
actual = (y_test * stds['stars']) + means['stars']
# scatter of predicted vs. actual star counts
plt.scatter(predicted, actual)
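A y = x reference line makes the scatter easier to judge, since points that fall on the line are predicted exactly:
In [ ]:
# redraw the scatter with a y = x reference line for comparison
low = min(predicted.min(), actual.min())
high = max(predicted.max(), actual.max())
plt.scatter(predicted, actual)
plt.plot([low, high], [low, high], 'r--')
plt.xlabel('Predicted stars')
plt.ylabel('Actual stars')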