In [66]:
#Importing required libraries: Regex operations, pandas, numpy, SVC model
import re
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.svm import SVC
In [67]:
def add_essay_training(data, essay_set, essay, score):
    """Append one (essay, score) pair to the bucket for `essay_set`.

    Creates the {"essay": [], "score": []} bucket on first use, so the two
    parallel lists always stay the same length.
    """
    bucket = data.setdefault(essay_set, {"essay": [], "score": []})
    bucket["essay"].append(essay)
    bucket["score"].append(score)
def add_essay_test(data, essay_set, essay, prediction_id):
    """Append one (essay, prediction_id) pair to the bucket for `essay_set`.

    Mirrors add_essay_training, but tracks the submission prediction id
    instead of a score.
    """
    bucket = data.setdefault(essay_set, {"essay": [], "prediction_id": []})
    bucket["essay"].append(essay)
    bucket["prediction_id"].append(prediction_id)
def read_training_data(training_file):
    """Parse the ASAP training TSV into {essay_set: {"essay": [...], "score": [...]}}.

    Column layout (0-indexed): row[1] = essay_set, row[2] = essay text,
    row[6] = domain-1 score, row[9] = domain-2 score.

    Essay set "2" is scored on two domains, so each set-2 essay is recorded
    twice: under "2_1" with its domain-1 score and under "2_2" with its
    domain-2 score (this mirrors the original "2" -> "2_1" -> "2_2" renaming).

    FIX: the original opened the file and never closed it; `with` guarantees
    the handle is released even if a row fails to parse.
    """
    training_data = {}
    with open(training_file) as f:
        next(f)  # skip the header row
        for line in f:
            row = line.strip().split("\t")
            essay_set, essay = row[1], row[2]
            if essay_set == "2":
                entries = [("2_1", int(row[6])), ("2_2", int(row[9]))]
            else:
                entries = [(essay_set, int(row[6]))]
            for set_id, score in entries:
                bucket = training_data.setdefault(set_id, {"essay": [], "score": []})
                bucket["essay"].append(essay)
                bucket["score"].append(score)
    return training_data
def read_test_data(test_file):
    """Parse the ASAP validation TSV into {essay_set: {"essay": [...], "prediction_id": [...]}}.

    Column layout (0-indexed): row[1] = essay_set, row[2] = essay text,
    row[3] = domain-1 prediction id, row[4] = domain-2 prediction id
    (present only for essay set "2", which is split into "2_1" / "2_2"
    to match read_training_data).

    FIX: the original opened the file and never closed it; `with` guarantees
    the handle is released.
    """
    test_data = {}
    with open(test_file) as f:
        next(f)  # skip the header row
        for line in f:
            row = line.strip().split("\t")
            essay_set, essay = row[1], row[2]
            if essay_set == "2":
                entries = [("2_1", int(row[3])), ("2_2", int(row[4]))]
            else:
                entries = [(essay_set, int(row[3]))]
            for set_id, pred_id in entries:
                bucket = test_data.setdefault(set_id, {"essay": [], "prediction_id": []})
                bucket["essay"].append(essay)
                bucket["prediction_id"].append(pred_id)
    return test_data
def get_character_count(essay):
    """Length feature: total number of characters in the essay (whitespace included)."""
    total = sum(1 for _ in essay)
    return total
def get_word_count(essay):
    """Length feature: number of whitespace-separated words in the essay.

    FIX: the original returned len(re.findall(r"\s", essay)) + 1, which
    counts every whitespace character — so "a  b" counted as 3 words and an
    empty essay as 1. str.split() collapses whitespace runs and yields 0 for
    an empty string.
    """
    return len(essay.split())
def extract_features(essays, feature_functions):
    """Build a feature matrix: one row per essay, one column per feature function.

    Returns a list of lists; row i is [f(essays[i]) for each f], in the order
    the functions appear in `feature_functions`.
    """
    matrix = []
    for essay in essays:
        matrix.append([feature(essay) for feature in feature_functions])
    return matrix
In [68]:
# Paths to the ASAP-AES release files (named once so both reads share them).
TRAINING_PATH = "Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv"
VALID_PATH = "Desktop/hackathon/ASAP-AES/Data/valid_set.tsv"

print("Reading Training Data")
training = read_training_data(TRAINING_PATH)
print("Reading Validation Data")
test = read_test_data(VALID_PATH)
In [69]:
# FIX: DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv(..., sep="\t", index_col=0) is the documented replacement.
strain = pd.read_csv("Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv", sep="\t", index_col=0)
strain.head()
Out[69]:
In [70]:
# BUG FIX: this cell re-read training_set_rel3.tsv, so `stest` was a copy of
# the training frame; it should preview the validation set. Also replaces the
# removed DataFrame.from_csv with pd.read_csv.
stest = pd.read_csv("Desktop/hackathon/ASAP-AES/Data/valid_set.tsv", sep="\t", index_col=0)
stest.head()
Out[70]:
In [71]:
# Prediction accumulator (prediction_id -> rounded score), the ordered list
# of essay sets seen in training, and the feature extractors applied to
# every essay.
predictions = {}
essay_sets = sorted(training)
feature_functions = [get_character_count, get_word_count]
In [72]:
# Per essay set: fit a random forest on the length features of the training
# essays, predict scores for the validation essays, and record each rounded
# prediction under its submission prediction_id.
# FIX: the trailing `print features` was Python 2 syntax (a SyntaxError under
# the Python 3 prints used elsewhere in this notebook) and dumped the entire
# feature matrix into the output; removed. Train/test matrices are also given
# distinct names instead of reusing `features`.
for es_set in essay_sets:
    print("Making Predictions for Essay Set %s" % es_set)
    train_features = extract_features(training[es_set]["essay"], feature_functions)
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(train_features, training[es_set]["score"])
    test_features = extract_features(test[es_set]["essay"], feature_functions)
    predicted_scores = rf.predict(test_features)
    for pred_id, pred_score in zip(test[es_set]["prediction_id"], predicted_scores):
        predictions[pred_id] = round(pred_score)
In [73]:
# FIX: Python 2 print statement — SyntaxError in Python 3, which the rest of
# this notebook targets. Shows the scores from the LAST essay set only.
print(predicted_scores)
In [75]:
# Write the random-forest submission: one "prediction_id,predicted_score"
# row per prediction, sorted by id for a stable file.
output_file = "length_benchmark_rf.csv"
print("Writing submission to %s" % output_file)
# FIX: context manager guarantees the file is flushed and closed even if a
# write raises (the original relied on a bare open()/close() pair).
with open(output_file, "w") as f:
    f.write("prediction_id,predicted_score\n")
    for key in sorted(predictions.keys()):
        f.write("%d,%d\n" % (key, predictions[key]))
In [76]:
# Per essay set: fit an SVC classifier (each integer score treated as a class
# label) on the same length features, predict validation scores, and
# overwrite the entries in `predictions` with the SVC results.
# FIX: the trailing `print features` was Python 2 syntax (SyntaxError under
# Python 3) and flooded the output with the full feature matrix; removed.
# The dead commented-out SVC(...) repr block is also deleted, and the
# train/test matrices get distinct names instead of reusing `features`.
for es_set in essay_sets:
    print("Making Predictions for Essay Set %s" % es_set)
    train_features = extract_features(training[es_set]["essay"], feature_functions)
    clf = SVC()
    clf.fit(train_features, training[es_set]["score"])
    test_features = extract_features(test[es_set]["essay"], feature_functions)
    predicted_scores = clf.predict(test_features)
    for pred_id, pred_score in zip(test[es_set]["prediction_id"], predicted_scores):
        predictions[pred_id] = round(pred_score)
In [77]:
# FIX: Python 2 print statement — SyntaxError in Python 3. Shows the scores
# from the LAST essay set only.
print(predicted_scores)
In [78]:
# Write the SVC submission in the same "prediction_id,predicted_score" format
# as the random-forest file, sorted by id.
output_file = "length_benchmark_svc.csv"
print("Writing submission to %s" % output_file)
# FIX: context manager guarantees the file is flushed and closed even if a
# write raises (the original relied on a bare open()/close() pair).
with open(output_file, "w") as f:
    f.write("prediction_id,predicted_score\n")
    for key in sorted(predictions.keys()):
        f.write("%d,%d\n" % (key, predictions[key]))
In [87]:
#Word_to_vec implementation (TF-IDF bag-of-words + classifier showdown)
#loading all required libraries
from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# BUG FIX: train_test_split was called below but never imported (NameError).
from sklearn.model_selection import train_test_split

#loading test and train data
print("loading data...")
if __name__ == '__main__':
    train = pd.read_csv('Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv('Desktop/hackathon/ASAP-AES/Data/valid_set.tsv', header=0, delimiter='\t', quoting=3)

    # NOTE(review): the "review"/"sentiment"/"id" column names below come from
    # the Kaggle IMDB word2vec tutorial, not the ASAP TSVs loaded above —
    # confirm and remap (e.g. "essay"/"domain1_score"/"essay_id") before running.
    # NOTE(review): the stock KaggleWord2VecUtility names this method
    # `review_to_wordlist`; verify `reviewto_wordlist` against the local copy.

    #word2vec: clean and re-join every training essay
    print("creating word vectors...")
    clean_train_reviews = []
    for i in range(len(train["review"])):  # FIX: xrange is Python 2 only
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.reviewto_wordlist(train["review"][i], True)))

    #create Bag of Words (TF-IDF over unigrams and bigrams)
    print("creating a vector...")
    vector = TfidfVectorizer(analyzer="word", max_features=50000, sublinear_tf=True,
                             stop_words='english', ngram_range=(1, 2), use_idf=1,
                             smooth_idf=1, strip_accents='unicode', min_df=3)

    #tokenizing the vectors
    print("tokenizing the vector...")
    vector = vector.fit(clean_train_reviews)
    train_data = vector.transform(clean_train_reviews)
    y = train["sentiment"]

    #splitting train data for testing purposes
    print("splitting training data for testing purposes...")
    X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.2, random_state=42)

    showdown = False
    op = True

    #showdown (removed Gaussian as it performed poorly)
    if showdown:
        print("Classifier Tasks")
        classifiers = [
            RandomForestClassifier(n_estimators=150),
            MultinomialNB(alpha=0.0001),
            SGDClassifier(loss='modified_huber', warm_start="True"),
            LogisticRegression(penalty="l2", C=1),
        ]
        for count, clf in enumerate(classifiers, start=1):
            print("training ", count)
            clf.fit(X_train, y_train)
            print("testing ", count)
            y_pred = clf.predict(X_test)
            print("result ", count, ":", accuracy_score(y_test, y_pred))

    if op:
        print("training classifier")
        # BUG FIX: predict_proba below requires probability=True — a default
        # SVC() raises AttributeError when predict_proba is called.
        clf = SVC(probability=True)  #performing better than others
        clf.fit(train_data, y)
        print("training complete")

        clean_test_reviews = []
        print("creating test data")
        for i in range(len(test["review"])):
            clean_test_reviews.append(" ".join(
                KaggleWord2VecUtility.reviewto_wordlist(test["review"][i], True)))
        test_data = vector.transform(clean_test_reviews)

        print("testing...")
        y_pred = clf.predict_proba(test_data)[:, 1]
        print("testing complete")

        print("preparing submission file")
        submission = pd.DataFrame(data={"id": test['id'], "sentiment": y_pred})
        submission.to_csv('asap_word_to_vec.csv', quoting=3, index=False)
In [ ]: