In [14]:
# Import the libraries you need
import scipy as sp
import pandas as pd
import numpy as np
from pattern.en import parsetree, Sentence, modality, sentiment

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Define a dictionary for interest values. Since interest values appear hierarchical, assign increasing numeric values
# from least to most interested.
INTEREST_DICT = {"Meh" : 0, "Low" : 1, "It's my life" : 4,
                 "Sorta interested" : 2, "Really into it" : 3}

In [12]:
# A function to get train/test feature matrices based on what features we want to use
def get_features(train, test, features):
    train_feature_list = []
    test_feature_list = []
    for feature in features:
        if feature == 'comments':
            train[feature].fillna("", inplace=True)
            test[feature].fillna('', inplace=True)
            train_comments_revised = train[feature]
            test_comments_revised = test[feature]
            tfv = TfidfVectorizer(ngram_range=(1,3))
            tfv =
            comments_transformed = tfv.transform(train_comments_revised)
            test_comments_transformed = tfv.transform(test_comments_revised)
        elif feature == 'dept':
            cv = CountVectorizer()
            cv =[feature])
            comments_transformed = cv.transform(train[feature])
            test_comments_transformed = cv.transform(test[feature])
        elif feature == 'interest':
            interest_list = np.array([[INTEREST_DICT[x] if x in INTEREST_DICT else -1]
                             for x in train[feature]])
            test_interest_list = np.array([[INTEREST_DICT[x] if x in INTEREST_DICT else -1]
                             for x in test[feature]])
        # Assuming all other features will not need to be pre-processed
            if feature in train:
                train_feature_list.append(np.array([[x] for x in train[feature].values]))
                test_feature_list.append(np.array([[x] for x in test[feature].values]))
    if len(features) == 1:
        return train_feature_list[0], test_feature_list[0]
    # Need to use a sparse matrix if Count or TfIdf Vectorizers are used
    if 'comments' in features or 'dept' in features:
        return sp.sparse.hstack((train_feature_list), format='csr'), sp.sparse.hstack((test_feature_list), format='csr')
        return np.concatenate((train_feature_list), axis=1), np.concatenate((test_feature_list), axis=1)

In [19]:
data = pd.read_csv('train.csv')
        train, test = train_test_split(data, test_size=0.20)
        features = ['comments', 'interest']
        model = Ridge(solver='auto')
        # This could take a while due to the n_gram range of (1,3) in the TfIdfVectorizer
        train_data, test_data = get_features(train, test, features)
        # Make lists for subjectivity, polarity, and modality features
        train_subjectivities = []
        test_subjectivities = []
        train_polarities = []
        test_polarities = []
        train_modalities = []
        test_modalities = []
        for comment in train['comments']:
            comment_sentiment = sentiment(comment)
            # sentiment returns a tuple of (polarity, subjectivity)
        train_polarities = np.array(train_polarities)
        train_subjectivities = np.array(train_subjectivities)
        train_modalities = np.array(train_modalities)
        for comment in test['comments']:
            comment_sentiment = sentiment(comment)
            # sentiment returns a tuple of (polarity, subjectivity)
        test_polarities = np.array(test_polarities)
        test_subjectivities = np.array(test_subjectivities)
        test_modalities = np.array(test_modalities)
        train_data = sp.sparse.hstack((train_data, train_polarities, train_subjectivities,
                                       train_modalities), format='csr')
        test_data = sp.sparse.hstack((test_data, test_polarities, test_subjectivities,
                                       test_modalities), format='csr')
        regressor = Ridge(solver='auto')
        model =, train['quality'])

In [20]:
predictions = model.predict(test_data)
        # Apply a floor/ceiling function to predictions to keep them in the range (2, 10)
        new_predictions = [2 if x < 2 else 10 if x > 10 else x for x in predictions]
        new_predictions = np.array(new_predictions)
        mse = mean_squared_error(test['quality'], new_predictions)
        print("MSE: {}".format(mse))

MSE: 2.52939253864