In [2]:
import pandas as pd
In [7]:
# Read in the data with pandas http://pandas.pydata.org
data = pd.read_csv('train.csv')
# Replace nulls (missing comments) with empty strings
data['comments'] = data['comments'].fillna('')
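As a quick optional check that the fill worked, we can count the remaining nulls per column; comments should now show zero:
In [ ]:
# Count nulls per column to confirm the fill worked
data.isnull().sum()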
In [8]:
# Let's take a look at the data
data.head()
Out[8]:
In training, the id column doesn't matter, but it is the column to keep for predictions on test.csv. Aside from that, quality is the most important column, since it's what we are trying to predict.
For this tutorial/baseline, let's use a simple unigram (bag-of-words) model of the comments with linear regression.
It's also important that we separate training data from test data, so we can detect overfitting rather than just rewarding memorization of the training set.
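To make the unigram idea concrete, here is a minimal sketch on two made-up strings (not our data) of the features CountVectorizer produces: one column per distinct word, one row per document, with word counts as values. It assumes a recent scikit-learn for get_feature_names_out.
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy example on invented strings, just to show what unigram features look like
toy = CountVectorizer()
counts = toy.fit_transform(['great stay great host', 'terrible stay'])
print(toy.get_feature_names_out())  # the learned vocabulary
print(counts.toarray())             # per-document word counts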
In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
In [9]:
# Hold out a test split (25% by default) to evaluate on unseen data; fixed seed for reproducibility
train, test = train_test_split(data, random_state=0)
In [10]:
# Make a pipeline to do unigrams then run linear regression
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
pipeline = Pipeline([
    ('cv', CountVectorizer()),
    ('regression', LinearRegression())
])
# Use the cross-validation feature of sklearn to get a good estimate of the error.
# The comments already had their nulls filled with empty strings above.
# An empty parameter grid means no search; we just cross-validate the default pipeline.
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
cv = GridSearchCV(
    pipeline, {}
).fit(train['comments'], train['quality'])
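If we later wanted to actually tune the pipeline, a sketch like this (hypothetical parameter values, not tuned for this data) would search over vectorizer settings, using the step__parameter naming convention:
In [ ]:
# Hypothetical grid: unigrams vs. unigrams+bigrams, and two rare-word cutoffs
param_grid = {
    'cv__ngram_range': [(1, 1), (1, 2)],
    'cv__min_df': [1, 5],
}
tuned = GridSearchCV(pipeline, param_grid).fit(train['comments'], train['quality'])
print(tuned.best_params_)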
In [12]:
# Output the best score; this is the mean R^2 on held-out folds from cross-validation
print("R Squared: {}".format(cv.best_score_))
# Output the Mean Squared Error on our held-out test split
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test['quality'], cv.predict(test['comments']))
print("MSE: {}".format(mse))
In [13]:
# Load the kaggle test data
kaggle_test = pd.read_csv('test.csv')
kaggle_test['comments'] = kaggle_test['comments'].fillna('')
In [14]:
# Make predictions for the Kaggle test set
predictions = cv.predict(kaggle_test['comments'])
# Let's take a quick look at the predictions to make sure they are sensible; they seem to be
predictions
Out[14]:
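Eyeballing is fine, but summary statistics are a more systematic sanity check; an unregularized linear model can predict outside the valid quality range, so extreme min/max values would be a red flag:
In [ ]:
# Distribution of the predictions (count, mean, min, max, quartiles)
pd.Series(predictions).describe()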
In [15]:
# Finally, let's write out the predictions with their ids
with open('predictions.csv', 'w') as f:
    f.write("id,quality\n")
    for row_id, prediction in zip(kaggle_test['id'], predictions):
        f.write('{},{}\n'.format(row_id, prediction))
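Equivalently, the same file can be written with pandas, which we already have imported; to_csv handles the header and formatting for us:
In [ ]:
# Same output file via pandas
pd.DataFrame({'id': kaggle_test['id'], 'quality': predictions}).to_csv('predictions.csv', index=False)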
In [16]:
# Save the model for later
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(cv, f)
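For later reuse, the pickled model loads back the same way; a minimal sketch (the fitted GridSearchCV predicts directly from raw comment strings):
In [ ]:
# Reload the saved model and predict on a new comment
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
model.predict(['an example comment'])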
In [ ]: