In [66]:
#Importing required libraries: Regex operations, pandas, numpy, SVC model
import re
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.svm import SVC
In [67]:
def add_essay_training(data, essay_set, essay, score):
    """Append one (essay, score) pair to the bucket for `essay_set`.

    Creates the {"essay": [], "score": []} bucket on first use, so the two
    parallel lists always stay the same length.
    """
    bucket = data.setdefault(essay_set, {"essay": [], "score": []})
    bucket["essay"].append(essay)
    bucket["score"].append(score)
def add_essay_test(data, essay_set, essay, prediction_id):
    """Append one (essay, prediction_id) pair to the bucket for `essay_set`.

    Mirrors add_essay_training, but tracks the submission prediction id
    instead of a score.
    """
    bucket = data.setdefault(essay_set, {"essay": [], "prediction_id": []})
    bucket["essay"].append(essay)
    bucket["prediction_id"].append(prediction_id)
def read_training_data(training_file):
    """Parse the ASAP training TSV into {essay_set: {"essay": [...], "score": [...]}}.

    Column layout (0-indexed): row[1] = essay_set, row[2] = essay text,
    row[6] = domain-1 score, row[9] = domain-2 score.

    Essay set "2" is scored on two domains, so each set-2 essay is recorded
    twice: under "2_1" with its domain-1 score and under "2_2" with its
    domain-2 score (this mirrors the original "2" -> "2_1" -> "2_2" renaming).

    FIX: the original opened the file and never closed it; `with` guarantees
    the handle is released even if a row fails to parse.
    """
    training_data = {}
    with open(training_file) as f:
        next(f)  # skip the header row
        for line in f:
            row = line.strip().split("\t")
            essay_set, essay = row[1], row[2]
            if essay_set == "2":
                entries = [("2_1", int(row[6])), ("2_2", int(row[9]))]
            else:
                entries = [(essay_set, int(row[6]))]
            for set_id, score in entries:
                bucket = training_data.setdefault(set_id, {"essay": [], "score": []})
                bucket["essay"].append(essay)
                bucket["score"].append(score)
    return training_data
def read_test_data(test_file):
    """Parse the ASAP validation TSV into {essay_set: {"essay": [...], "prediction_id": [...]}}.

    Column layout (0-indexed): row[1] = essay_set, row[2] = essay text,
    row[3] = domain-1 prediction id, row[4] = domain-2 prediction id
    (present only for essay set "2", which is split into "2_1" / "2_2"
    to match read_training_data).

    FIX: the original opened the file and never closed it; `with` guarantees
    the handle is released.
    """
    test_data = {}
    with open(test_file) as f:
        next(f)  # skip the header row
        for line in f:
            row = line.strip().split("\t")
            essay_set, essay = row[1], row[2]
            if essay_set == "2":
                entries = [("2_1", int(row[3])), ("2_2", int(row[4]))]
            else:
                entries = [(essay_set, int(row[3]))]
            for set_id, pred_id in entries:
                bucket = test_data.setdefault(set_id, {"essay": [], "prediction_id": []})
                bucket["essay"].append(essay)
                bucket["prediction_id"].append(pred_id)
    return test_data
def get_character_count(essay):
    """Length feature: total number of characters in the essay (whitespace included)."""
    total = sum(1 for _ in essay)
    return total
def get_word_count(essay):
    """Length feature: number of whitespace-separated words in the essay.

    FIX: the original returned len(re.findall(r"\s", essay)) + 1, which
    counts every whitespace character — so "a  b" counted as 3 words and an
    empty essay as 1. str.split() collapses whitespace runs and yields 0 for
    an empty string.
    """
    return len(essay.split())
def extract_features(essays, feature_functions):
    """Build a feature matrix: one row per essay, one column per feature function.

    Returns a list of lists; row i is [f(essays[i]) for each f], in the order
    the functions appear in `feature_functions`.
    """
    matrix = []
    for essay in essays:
        matrix.append([feature(essay) for feature in feature_functions])
    return matrix
In [68]:
# Paths to the ASAP-AES release files (named once so both reads share them).
TRAINING_PATH = "Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv"
VALID_PATH = "Desktop/hackathon/ASAP-AES/Data/valid_set.tsv"

print("Reading Training Data")
training = read_training_data(TRAINING_PATH)
print("Reading Validation Data")
test = read_test_data(VALID_PATH)
In [69]:
# FIX: DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv(..., sep="\t", index_col=0) is the documented replacement.
strain = pd.read_csv("Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv", sep="\t", index_col=0)
strain.head()
Out[69]:
In [70]:
# BUG FIX: this cell re-read training_set_rel3.tsv, so `stest` was a copy of
# the training frame; it should preview the validation set. Also replaces the
# removed DataFrame.from_csv with pd.read_csv.
stest = pd.read_csv("Desktop/hackathon/ASAP-AES/Data/valid_set.tsv", sep="\t", index_col=0)
stest.head()
Out[70]:
In [71]:
# Prediction accumulator (prediction_id -> rounded score), the ordered list
# of essay sets seen in training, and the feature extractors applied to
# every essay.
predictions = {}
essay_sets = sorted(training)
feature_functions = [get_character_count, get_word_count]
In [72]:
# Per essay set: fit a random forest on the length features of the training
# essays, predict scores for the validation essays, and record each rounded
# prediction under its submission prediction_id.
# FIX: the trailing `print features` was Python 2 syntax (a SyntaxError under
# the Python 3 prints used elsewhere in this notebook) and dumped the entire
# feature matrix into the output; removed. Train/test matrices are also given
# distinct names instead of reusing `features`.
for es_set in essay_sets:
    print("Making Predictions for Essay Set %s" % es_set)
    train_features = extract_features(training[es_set]["essay"], feature_functions)
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(train_features, training[es_set]["score"])
    test_features = extract_features(test[es_set]["essay"], feature_functions)
    predicted_scores = rf.predict(test_features)
    for pred_id, pred_score in zip(test[es_set]["prediction_id"], predicted_scores):
        predictions[pred_id] = round(pred_score)
In [73]:
# FIX: Python 2 print statement — SyntaxError in Python 3, which the rest of
# this notebook targets. Shows the scores from the LAST essay set only.
print(predicted_scores)
In [75]:
# Write the random-forest submission: one "prediction_id,predicted_score"
# row per prediction, sorted by id for a stable file.
output_file = "length_benchmark_rf.csv"
print("Writing submission to %s" % output_file)
# FIX: context manager guarantees the file is flushed and closed even if a
# write raises (the original relied on a bare open()/close() pair).
with open(output_file, "w") as f:
    f.write("prediction_id,predicted_score\n")
    for key in sorted(predictions.keys()):
        f.write("%d,%d\n" % (key, predictions[key]))
In [76]:
# Per essay set: fit an SVC classifier (each integer score treated as a class
# label) on the same length features, predict validation scores, and
# overwrite the entries in `predictions` with the SVC results.
# FIX: the trailing `print features` was Python 2 syntax (SyntaxError under
# Python 3) and flooded the output with the full feature matrix; removed.
# The dead commented-out SVC(...) repr block is also deleted, and the
# train/test matrices get distinct names instead of reusing `features`.
for es_set in essay_sets:
    print("Making Predictions for Essay Set %s" % es_set)
    train_features = extract_features(training[es_set]["essay"], feature_functions)
    clf = SVC()
    clf.fit(train_features, training[es_set]["score"])
    test_features = extract_features(test[es_set]["essay"], feature_functions)
    predicted_scores = clf.predict(test_features)
    for pred_id, pred_score in zip(test[es_set]["prediction_id"], predicted_scores):
        predictions[pred_id] = round(pred_score)
In [77]:
# FIX: Python 2 print statement — SyntaxError in Python 3. Shows the scores
# from the LAST essay set only.
print(predicted_scores)
In [78]:
# Write the SVC submission in the same "prediction_id,predicted_score" format
# as the random-forest file, sorted by id.
output_file = "length_benchmark_svc.csv"
print("Writing submission to %s" % output_file)
# FIX: context manager guarantees the file is flushed and closed even if a
# write raises (the original relied on a bare open()/close() pair).
with open(output_file, "w") as f:
    f.write("prediction_id,predicted_score\n")
    for key in sorted(predictions.keys()):
        f.write("%d,%d\n" % (key, predictions[key]))
In [87]:
#Word_to_vec implementation (TF-IDF bag-of-words + classifier showdown)
#loading all required libraries
from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# BUG FIX: train_test_split was called below but never imported (NameError).
from sklearn.model_selection import train_test_split

#loading test and train data
print("loading data...")
if __name__ == '__main__':
    train = pd.read_csv('Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv('Desktop/hackathon/ASAP-AES/Data/valid_set.tsv', header=0, delimiter='\t', quoting=3)

    # NOTE(review): the "review"/"sentiment"/"id" column names below come from
    # the Kaggle IMDB word2vec tutorial, not the ASAP TSVs loaded above —
    # confirm and remap (e.g. "essay"/"domain1_score"/"essay_id") before running.
    # NOTE(review): the stock KaggleWord2VecUtility names this method
    # `review_to_wordlist`; verify `reviewto_wordlist` against the local copy.

    #word2vec: clean and re-join every training essay
    print("creating word vectors...")
    clean_train_reviews = []
    for i in range(len(train["review"])):  # FIX: xrange is Python 2 only
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.reviewto_wordlist(train["review"][i], True)))

    #create Bag of Words (TF-IDF over unigrams and bigrams)
    print("creating a vector...")
    vector = TfidfVectorizer(analyzer="word", max_features=50000, sublinear_tf=True,
                             stop_words='english', ngram_range=(1, 2), use_idf=1,
                             smooth_idf=1, strip_accents='unicode', min_df=3)

    #tokenizing the vectors
    print("tokenizing the vector...")
    vector = vector.fit(clean_train_reviews)
    train_data = vector.transform(clean_train_reviews)
    y = train["sentiment"]

    #splitting train data for testing purposes
    print("splitting training data for testing purposes...")
    X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.2, random_state=42)

    showdown = False
    op = True

    #showdown (removed Gaussian as it performed poorly)
    if showdown:
        print("Classifier Tasks")
        classifiers = [
            RandomForestClassifier(n_estimators=150),
            MultinomialNB(alpha=0.0001),
            SGDClassifier(loss='modified_huber', warm_start="True"),
            LogisticRegression(penalty="l2", C=1),
        ]
        for count, clf in enumerate(classifiers, start=1):
            print("training ", count)
            clf.fit(X_train, y_train)
            print("testing ", count)
            y_pred = clf.predict(X_test)
            print("result ", count, ":", accuracy_score(y_test, y_pred))

    if op:
        print("training classifier")
        # BUG FIX: predict_proba below requires probability=True — a default
        # SVC() raises AttributeError when predict_proba is called.
        clf = SVC(probability=True)  #performing better than others
        clf.fit(train_data, y)
        print("training complete")

        clean_test_reviews = []
        print("creating test data")
        for i in range(len(test["review"])):
            clean_test_reviews.append(" ".join(
                KaggleWord2VecUtility.reviewto_wordlist(test["review"][i], True)))
        test_data = vector.transform(clean_test_reviews)

        print("testing...")
        y_pred = clf.predict_proba(test_data)[:, 1]
        print("testing complete")

        print("preparing submission file")
        submission = pd.DataFrame(data={"id": test['id'], "sentiment": y_pred})
        submission.to_csv('asap_word_to_vec.csv', quoting=3, index=False)
In [ ]: