In [2]:
import pandas as pd

In [7]:
# Read in the data with pandas
data = pd.read_csv('train.csv')

# Remove nulls and replace them with blanks
data['comments'] = data['comments'].fillna('')

In [8]:
# Lets take a look at the data

id tid dept date forcredit attendance textbookuse interest grade tags comments helpcount nothelpcount online profgender profhotness helpfulness clarity easiness quality
0 24228248 916674 Business 01/05/2015 Yes NaN It's a must have Really into it NaN ["Would take again", "Hilarious", "Tests are t... Great Professor My wife took this class twice ... 0 10 NaN 0 0 4 5 3 9
1 24218909 916674 Business 01/02/2015 Yes Mandatory It's a must have Sorta interested A ["Skip class? You won't pass.", "Tests are tou... Great Professor Study the notes from class and... 0 1 NaN 0 0 4 4 2 8
2 24215795 916674 Business 01/02/2015 Yes NaN Essential to passing Really into it NaN ["Hilarious", "Would take again", "Skip class?... Brother Brau is a great guy He gives great spi... 1 2 NaN 0 0 4 4 3 8
3 24204179 916674 Business 12/30/2014 Yes Not Mandatory Essential to passing Sorta interested NaN ["Tests are tough", "Get ready to read"] People rave about Brau but I personally dont g... 18 6 NaN 0 0 3 1 2 4
4 24198463 916674 Business 12/28/2014 Yes Not Mandatory You need it sometimes Sorta interested A ["Inspirational", "Hilarious", "Skip class? Yo... This class doesnt have much homework which was... 1 0 NaN 0 0 4 4 4 8


In training the id doesn't matter, but this is the column to keep for predictions on test.csv. Aside from that quality is the most important column since its what we are trying to predict

For this tutorial/baseline lets use a simple unigram model of the comments with linear regresssion

Its also important we separate training data from test data to prevent overfitting.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split

In [9]:
train, test = train_test_split(data)

In [10]:
# Make a pipeline to do unigrams then run linear regression
pipeline = Pipeline([
        ('cv', CountVectorizer()),
        ('regression', LinearRegression())

# Use the cross validation feature of sklearn to get a good estimate of the error.
# Before feeding in the comments we fill any nulls as empty strings
cv = GridSearchCV(
    pipeline, {}
).fit(train['comments'], train['quality'])

In [12]:
# Output the best score, this is based on held out data in cross validation
print("R Squared: {}".format(cv.best_score_))

# Output the Mean Squared Error using our held out training data
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test['quality'], cv.predict(test['comments']))
print("MSE: {}".format(mse))

R Squared: 0.1397619335236992
MSE: 5.238645822808021

In [13]:
# Load the kaggle test data
kaggle_test = pd.read_csv('test.csv')
kaggle_test['comments'] = kaggle_test['comments'].fillna('')

In [14]:
# Make training predictions
predictions = cv.predict(kaggle_test['comments'])

# Lets take a quick look at the predictions to make sure they are sensible, seems like it

array([ 11.11033054,   8.27763007,   7.23343964, ...,   8.33615437,
         8.68797534,   6.32335439])

In [15]:
# Finally lets write out the predictions with their id's

with open('predictions.csv', 'w') as f:
    for row_id, prediction in zip(kaggle_test['id'], predictions):
        f.write('{},{}\n'.format(row_id, prediction))

In [16]:
# Save the model for later
import pickle
with open('model.pkl', 'bw') as f:
    pickle.dump(cv, f)

In [ ]: