In [2]:
import pandas as pd

In [7]:
# Read in the data with pandas http://pandas.pydata.org
data = pd.read_csv('train.csv')

# Replace missing comments (NaN) with empty strings so text processing won't choke
data['comments'] = data['comments'].fillna('')
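
A quick sanity check that no missing comments remain after the fill:

# Should print 0 now that missing comments have been replaced
print(data['comments'].isnull().sum())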

In [8]:
# Let's take a look at the data
data.head()


Out[8]:
  | id | tid | dept | date | forcredit | attendance | textbookuse | interest | grade | tags | comments | helpcount | nothelpcount | online | profgender | profhotness | helpfulness | clarity | easiness | quality
0 | 24228248 | 916674 | Business | 01/05/2015 | Yes | NaN | It's a must have | Really into it | NaN | ["Would take again", "Hilarious", "Tests are t... | Great Professor My wife took this class twice ... | 0 | 10 | NaN | 0 | 0 | 4 | 5 | 3 | 9
1 | 24218909 | 916674 | Business | 01/02/2015 | Yes | Mandatory | It's a must have | Sorta interested | A | ["Skip class? You won't pass.", "Tests are tou... | Great Professor Study the notes from class and... | 0 | 1 | NaN | 0 | 0 | 4 | 4 | 2 | 8
2 | 24215795 | 916674 | Business | 01/02/2015 | Yes | NaN | Essential to passing | Really into it | NaN | ["Hilarious", "Would take again", "Skip class?... | Brother Brau is a great guy He gives great spi... | 1 | 2 | NaN | 0 | 0 | 4 | 4 | 3 | 8
3 | 24204179 | 916674 | Business | 12/30/2014 | Yes | Not Mandatory | Essential to passing | Sorta interested | NaN | ["Tests are tough", "Get ready to read"] | People rave about Brau but I personally dont g... | 18 | 6 | NaN | 0 | 0 | 3 | 1 | 2 | 4
4 | 24198463 | 916674 | Business | 12/28/2014 | Yes | Not Mandatory | You need it sometimes | Sorta interested | A | ["Inspirational", "Hilarious", "Skip class? Yo... | This class doesnt have much homework which was... | 1 | 0 | NaN | 0 | 0 | 4 | 4 | 4 | 8

Notes

In training, the id column doesn't matter, but it's the one to keep for predictions on test.csv. Aside from that, quality is the most important column, since it's what we are trying to predict.

For this tutorial/baseline, let's use a simple unigram model of the comments with linear regression (the short sketch below shows what unigram features look like).

It's also important to hold out a test split from the training data, so that our error estimate reflects performance on unseen reviews rather than overfitting.
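
As a quick illustration of what a unigram model means, CountVectorizer tokenizes each comment and counts word occurrences, producing one feature column per vocabulary word. A tiny example (toy sentences, not competition data):

from sklearn.feature_extraction.text import CountVectorizer

toy = ["great professor great class", "tough tests"]
vec = CountVectorizer()
counts = vec.fit_transform(toy)
print(sorted(vec.vocabulary_))  # ['class', 'great', 'professor', 'tests', 'tough']
print(counts.toarray())         # [[1 2 1 0 0] [0 0 0 1 1]]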


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# In scikit-learn >= 0.18 these live in sklearn.model_selection
# (formerly sklearn.grid_search and sklearn.cross_validation)
from sklearn.model_selection import GridSearchCV, train_test_split

In [9]:
train, test = train_test_split(data)
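
By default train_test_split shuffles and holds out 25% of the rows; passing test_size and random_state explicitly makes the split reproducible. The values here are just examples:

# Reproducible 75/25 split; random_state pins the shuffle
train, test = train_test_split(data, test_size=0.25, random_state=0)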

In [10]:
# Build a pipeline: unigram counts, then linear regression
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
pipeline = Pipeline([
        ('cv', CountVectorizer()),
        ('regression', LinearRegression())
])

# Use GridSearchCV's cross-validation machinery to get a good estimate of the error.
# The empty parameter grid means we aren't tuning anything yet; we just want
# cross-validated scores plus a final refit on the full training split.
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
cv = GridSearchCV(
    pipeline, {}
).fit(train['comments'], train['quality'])
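
If you wanted to actually tune hyperparameters, GridSearchCV addresses pipeline steps with the step__param syntax. A minimal sketch ('cv' is the step name from the pipeline above; the ngram_range values are just illustrative):

# Hypothetical grid: compare plain unigrams against unigrams plus bigrams
param_grid = {'cv__ngram_range': [(1, 1), (1, 2)]}
tuned = GridSearchCV(pipeline, param_grid).fit(train['comments'], train['quality'])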

In [12]:
# Output the best score; for LinearRegression this is R squared, estimated on held-out folds during cross-validation
print("R Squared: {}".format(cv.best_score_))

# Output the Mean Squared Error on our held-out test split
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test['quality'], cv.predict(test['comments']))
print("MSE: {}".format(mse))


R Squared: 0.1397619335236992
MSE: 5.238645822808021
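
To judge whether an MSE around 5.24 is any good, it helps to compare against a trivial baseline that always predicts the training mean; a minimal sketch:

import numpy as np
from sklearn.metrics import mean_squared_error

# Constant-mean baseline: any useful model should beat this MSE
baseline = np.full(len(test), train['quality'].mean())
print("Baseline MSE: {}".format(mean_squared_error(test['quality'], baseline)))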

In [13]:
# Load the kaggle test data
kaggle_test = pd.read_csv('test.csv')
kaggle_test['comments'] = kaggle_test['comments'].fillna('')

In [14]:
# Make predictions on the Kaggle test set
predictions = cv.predict(kaggle_test['comments'])

# Let's take a quick look at the predictions to make sure they are sensible
predictions


Out[14]:
array([ 11.11033054,   8.27763007,   7.23343964, ...,   8.33615437,
         8.68797534,   6.32335439])
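
One caveat: LinearRegression is unbounded, so predictions can drift outside the observed rating scale (the first value above, 11.11, already exceeds every quality value in the sample rows). An optional sketch of clipping to the observed range, assuming the training data spans the true bounds:

import numpy as np

# Clip predictions to the range of quality seen in the training data
# (assumption: the observed min/max reflect the true rating bounds)
clipped = np.clip(predictions, data['quality'].min(), data['quality'].max())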

In [15]:
# Finally, let's write out the predictions with their ids

with open('predictions.csv', 'w') as f:
    f.write("id,quality\n")
    for row_id, prediction in zip(kaggle_test['id'], predictions):
        f.write('{},{}\n'.format(row_id, prediction))
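
An equivalent way to produce the same file with pandas, if you prefer to avoid manual string formatting:

# Build a two-column frame and let pandas handle the CSV writing
submission = pd.DataFrame({'id': kaggle_test['id'], 'quality': predictions})
submission.to_csv('predictions.csv', index=False)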

In [16]:
# Save the model for later
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(cv, f)
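
Loading it back later is the mirror image (note the matching binary read mode):

# Restore the fitted search/pipeline; it can then predict on new comments
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)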
