In [14]:
# Import the libraries you need
import scipy as sp
import pandas as pd
import numpy as np
from pattern.en import parsetree, Sentence, modality, sentiment
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# Ordinal encoding for the free-text "interest" answers. The labels are
# treated as an ordered scale, so each one maps to its rank from least (0)
# to most (4) interested.
INTEREST_DICT = {
    "Meh": 0,
    "Low": 1,
    "Sorta interested": 2,
    "Really into it": 3,
    "It's my life": 4,
}
In [12]:
# A function to get train/test feature matrices based on what features we want to use
def get_features(train, test, features):
    """Build aligned train/test feature matrices for the requested columns.

    Every name in ``features`` contributes one block of columns:

    * ``'comments'`` -- TF-IDF weights over 1- to 3-grams. The vectorizer is
      fitted on the training comments only and then applied to both splits.
    * ``'dept'`` -- token counts from a ``CountVectorizer`` fitted on the
      training split only.
    * ``'interest'`` -- the numeric code from ``INTEREST_DICT``; any value
      that is not a key of that dictionary is encoded as -1.
    * any other name -- the raw column values as a single column. Names that
      are not columns of ``train`` are skipped.

    Side effect: missing values in the ``'comments'`` column of both
    ``train`` and ``test`` are replaced with empty strings on the frames that
    were passed in, so code that reads the comments afterwards sees no NaN.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and held-out rows; both must contain the requested columns.
    features : sequence of str
        Column names to turn into features, in output column order.

    Returns
    -------
    (train_matrix, test_matrix)
        If ``features`` has exactly one entry, the single block is returned
        as built. Otherwise the blocks are stacked column-wise: as SciPy CSR
        matrices when ``'comments'`` or ``'dept'`` was requested, as dense
        NumPy arrays when not.

    Raises
    ------
    ValueError
        If none of the requested features produced a block of columns.
    """
    train_blocks = []
    test_blocks = []
    for feature in features:
        if feature == 'comments':
            # Assign the filled column back instead of calling
            # ``frame[col].fillna(..., inplace=True)``: the chained in-place
            # form acts on a temporary Series and is not guaranteed to reach
            # the frame itself.
            train[feature] = train[feature].fillna("")
            test[feature] = test[feature].fillna("")
            tfv = TfidfVectorizer(ngram_range=(1, 3)).fit(train[feature])
            train_blocks.append(tfv.transform(train[feature]))
            test_blocks.append(tfv.transform(test[feature]))
        elif feature == 'dept':
            cv = CountVectorizer().fit(train[feature])
            train_blocks.append(cv.transform(train[feature]))
            test_blocks.append(cv.transform(test[feature]))
        elif feature == 'interest':
            # Labels missing from the lookup table get the sentinel -1.
            train_blocks.append(
                np.array([[INTEREST_DICT.get(x, -1)] for x in train[feature]]))
            test_blocks.append(
                np.array([[INTEREST_DICT.get(x, -1)] for x in test[feature]]))
        elif feature in train:
            # Assuming all other features will not need to be pre-processed
            train_blocks.append(np.array([[x] for x in train[feature].values]))
            test_blocks.append(np.array([[x] for x in test[feature].values]))

    if not train_blocks:
        # Previously this surfaced as a bare IndexError (single unknown
        # feature) or an opaque concatenate error (empty feature list).
        raise ValueError(
            "No feature columns could be built from features={!r}".format(
                list(features)))

    if len(features) == 1:
        return train_blocks[0], test_blocks[0]

    # Need to use a sparse matrix if Count or TfIdf Vectorizers are used
    if 'comments' in features or 'dept' in features:
        return (sp.sparse.hstack(train_blocks, format='csr'),
                sp.sparse.hstack(test_blocks, format='csr'))
    return (np.concatenate(train_blocks, axis=1),
            np.concatenate(test_blocks, axis=1))
In [19]:
# Fixed seed so the train/test split -- and therefore the reported MSE -- is
# the same on every Restart & Run All.
SPLIT_SEED = 42


def comment_sentiment_features(comments):
    """Score each comment with pattern.en's sentiment and modality.

    Missing comments are scored as empty strings, so this does not depend on
    another function having filled the column beforehand.

    Parameters
    ----------
    comments : pandas.Series
        The raw comment text, one entry per row.

    Returns
    -------
    (polarities, subjectivities, modalities)
        Three arrays with one single-element row per comment, ready to be
        stacked as extra feature columns.
    """
    polarities = []
    subjectivities = []
    modalities = []
    for comment in comments.fillna(""):
        # sentiment returns a tuple of (polarity, subjectivity)
        comment_sentiment = sentiment(comment)
        polarities.append([comment_sentiment[0]])
        subjectivities.append([comment_sentiment[1]])
        modalities.append([modality(comment)])
    return np.array(polarities), np.array(subjectivities), np.array(modalities)


data = pd.read_csv('train.csv')
train, test = train_test_split(data, test_size=0.20, random_state=SPLIT_SEED)
features = ['comments', 'interest']

# This could take a while due to the n_gram range of (1,3) in the TfIdfVectorizer
train_data, test_data = get_features(train, test, features)

# Subjectivity, polarity, and modality features for each split
train_polarities, train_subjectivities, train_modalities = \
    comment_sentiment_features(train['comments'])
test_polarities, test_subjectivities, test_modalities = \
    comment_sentiment_features(test['comments'])

# Append the three sentiment columns to the text/interest features.
train_data = sp.sparse.hstack((train_data, train_polarities, train_subjectivities,
                               train_modalities), format='csr')
test_data = sp.sparse.hstack((test_data, test_polarities, test_subjectivities,
                              test_modalities), format='csr')

regressor = Ridge(solver='auto')
model = regressor.fit(train_data, train['quality'])
In [20]:
# Score the held-out split. Predictions are clamped into the interval
# [2, 10] before the error is measured: anything below 2 becomes 2 and
# anything above 10 becomes 10.
predictions = model.predict(test_data)
new_predictions = np.clip(predictions, 2, 10)
mse = mean_squared_error(test['quality'], new_predictions)
print("MSE: {}".format(mse))