In [ ]:
#Importing required libraries: Regex operations, pandas, numpy, SVC model
import re                  
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn.svm import SVC

def add_essay_training(data, essay_set, essay, score):
    if essay_set not in data:
        data[essay_set] = {"essay":[],"score":[]}
    data[essay_set]["essay"].append(essay)
    data[essay_set]["score"].append(score)

def add_essay_test(data, essay_set, essay, prediction_id):
    if essay_set not in data:
        data[essay_set] = {"essay":[], "prediction_id":[]}
    data[essay_set]["essay"].append(essay)
    data[essay_set]["prediction_id"].append(prediction_id)

def read_training_data(training_file):
    f = open(training_file)
    f.readline()

    training_data = {}
    for row in f:
        row = row.strip().split("\t")
        essay_set = row[1]
        essay = row[2]
        domain1_score = int(row[6])
        if essay_set == "2":
            essay_set = "2_1"
        add_essay_training(training_data, essay_set, essay, domain1_score)
        
        if essay_set == "2_1":
            essay_set = "2_2"
            domain2_score = int(row[9])
            add_essay_training(training_data, essay_set, essay, domain2_score)
    
    return training_data

def read_test_data(test_file):
    f = open(test_file)
    f.readline()

    test_data = {}
    for row in f:
        row = row.strip().split("\t")
        essay_set = row[1]
        essay = row[2]
        domain1_predictionid = int(row[3])
        if essay_set == "2": 
            domain2_predictionid = int(row[4])
            add_essay_test(test_data, "2_1", essay, domain1_predictionid)
            add_essay_test(test_data, "2_2", essay, domain2_predictionid)
        else:
            add_essay_test(test_data, essay_set, essay, domain1_predictionid)
    return test_data

def get_character_count(essay):
    return len(essay)

def get_word_count(essay):
    return len(re.findall(r"\s", essay))+1

def extract_features(essays, feature_functions):
    return [[f(es) for f in feature_functions] for es in essays]

def main():
    print("Reading Training Data")
    training = read_training_data("../Data/training_set_rel3.tsv")
    print("Reading Validation Data")
    test = read_test_data("../Data/valid_set.tsv")
    
    feature_functions = [get_character_count, get_word_count]

    essay_sets = sorted(training.keys())
    predictions = {}

'''    for es_set in essay_sets:
        print("Making Predictions for Essay Set %s" % es_set)
        features = extract_features(training[es_set]["essay"],feature_functions)
        
    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([1, 1, 2, 2])
    clf = SVC()
    clf.fit(X, y) 
    SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
    print(clf.predict([[-0.8, -1]]))
'''    
    
    for es_set in essay_sets:
        print("Making Predictions for Essay Set %s" % es_set)
        features = extract_features(training[es_set]["essay"],
                                    feature_functions)
        rf = RandomForestRegressor(n_estimators = 100)
        rf.fit(features,training[es_set]["score"])
        features = extract_features(test[es_set]["essay"], feature_functions)
        predicted_scores = rf.predict(features)
        for pred_id, pred_score in zip(test[es_set]["prediction_id"], 
                                       predicted_scores):
            predictions[pred_id] = round(pred_score)
    
    output_file = "../Submissions/length_benchmark.csv"
    print("Writing submission to %s" % output_file)
    f = open(output_file, "w")
    f.write("prediction_id,predicted_score\n")
    for key in sorted(predictions.keys()):
        f.write("%d,%d\n" % (key,predictions[key]))
    f.close()
    
if __name__=="__main__":
    main()