notebook.community

Edit and run



In [1]:

    
import pandas as pd



In [2]:

    
train = pd.read_csv("labeledTrainData.tsv", sep='\t')
test = pd.read_csv("testData.tsv", sep='\t')
sample = pd.read_csv("sample.csv")



In [3]:

    
print train.head(2)
print test.head(2)
print sample.head(2)
print train.count()
print test.count()









    



       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
         id                                             review
0  12311_10  Naturally in a film who's main themes are of m...
1    8348_2  This movie is a disaster within a disaster fil...
         id  sentiment
0  12311_10          0
1    8348_2          0
id           25000
sentiment    25000
review       25000
dtype: int64
id        25000
review    25000
dtype: int64



In [46]:

    
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk

def review_to_wordlist(review, remove_stopwords=False):    
    # Function to convert a document to a sequence of words,    
    # optionally removing stop words.  Returns a list of words.    
    #    
    # 1. Remove HTML    
    review_text = BeautifulSoup(review).get_text()    
    #      
    # 2. Remove non-letters    
    review_text = re.sub("[^a-zA-Z]"," ", review_text)    
    #    
    # 3. Convert words to lower case and split them    
    words = review_text.lower().split()    
    #    
    # 4. Optionally remove stop words (false by default)    
    if remove_stopwords:        
        stops = set(stopwords.words("english"))        
        words = [w for w in words if not w in stops]    
        #    
        # 5. Return a list of words    
    return(words)



In [8]:

    
# nltk.download('punkt')









    



[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.






    Out[8]:





True



In [33]:

    
# from nltk import word_tokenize
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')



In [50]:

    
# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):    
    # Function to split a review into parsed sentences. Returns a     
    # list of sentences, where each sentence is a list of words    
    #    
    # 1. Use the NLTK tokenizer to split the paragraph into sentences 
    raw_sentences = tokenizer.tokenize(review.strip().decode('utf-8'))
    
    #    
    # 2. Loop over each sentence    
    sentences = []    
    for raw_sentence in raw_sentences:        
        # If a sentence is empty, skip it        
        if len(raw_sentence) > 0:            
            # Otherwise, call review_to_wordlist to get a list of words            
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))    
            #    
            # Return the list of sentences (each sentence is a list of words,    
            # so this returns a list of lists    
    return sentences



In [18]:

    
# unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", sep='\t')



In [41]:

    
nltk.download()









    



NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords
    Downloading package stopwords to /home/ubuntu/nltk_data...
      Unzipping corpora/stopwords.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q






    Out[41]:





True



In [51]:

    
sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)









    



/home/ubuntu/anaconda/lib/python2.7/site-packages/bs4/__init__.py:182: UserWarning: "." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
/home/ubuntu/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://www.happierabroad.com" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)






    



Parsing sentences from training set



In [52]:

    
print len(sentences)
print sentences[0]









    



266885
[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']



In [53]:

    
from gensim.models import word2vec

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)from gensim.models import word2vec
print "Training model..."

model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

# If you don't plan to train the model any further, calling # init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context.pkl"
model.save(model_name)









    



Training model...






    



Using gpu device 0: GRID K520



In [54]:

    
print model.doesnt_match("man woman child kitchen".split())
print model.doesnt_match("france england germany berlin".split())
print model.doesnt_match("paris berlin london austria".split())









    



kitchen
berlin
paris



In [55]:

    
model.most_similar("man")









    Out[55]:





[(u'woman', 0.6002646684646606),
 (u'guy', 0.4918012022972107),
 (u'boy', 0.4908530116081238),
 (u'lady', 0.48227813839912415),
 (u'named', 0.4763108491897583),
 (u'girl', 0.45790719985961914),
 (u'doctor', 0.4178612232208252),
 (u'soldier', 0.4119168519973755),
 (u'son', 0.40765318274497986),
 (u'businessman', 0.4072432518005371)]



In [56]:

    
model.most_similar("movie")









    Out[56]:





[(u'film', 0.7240103483200073),
 (u'it', 0.49680161476135254),
 (u'movies', 0.49463677406311035),
 (u'show', 0.428367555141449),
 (u'flick', 0.4108177423477173),
 (u'really', 0.3872818946838379),
 (u'thing', 0.38200855255126953),
 (u'just', 0.37946921586990356),
 (u'sequel', 0.37930771708488464),
 (u'because', 0.3668331503868103)]



In [57]:

    
model.most_similar("kill")









    Out[57]:





[(u'killing', 0.5700429081916809),
 (u'killed', 0.5425001382827759),
 (u'kills', 0.5362086296081543),
 (u'protect', 0.5012085437774658),
 (u'decide', 0.49004626274108887),
 (u'shoot', 0.48850518465042114),
 (u'jail', 0.4791141152381897),
 (u'die', 0.47435376048088074),
 (u'drive', 0.4721546173095703),
 (u'steal', 0.4617525339126587)]



In [58]:

    
type(model.syn0)









    Out[58]:





numpy.ndarray



In [59]:

    
model.syn0.shape









    Out[59]:





(8306, 300)



In [60]:

    
model.syn0









    Out[60]:





array([[ 0.11301138,  0.09196355,  0.02227202, ...,  0.0932686 ,
         0.07330729,  0.05409562],
       [-0.10085322,  0.00798355,  0.06586684, ...,  0.07878271,
         0.07288806,  0.05811012],
       [-0.1237859 ,  0.02547193,  0.08878306, ...,  0.11291546,
         0.07088824,  0.13749835],
       ..., 
       [ 0.08134036,  0.19218518, -0.04442903, ...,  0.09148403,
        -0.01917326, -0.02962955],
       [ 0.0599947 ,  0.16439657, -0.03100142, ...,  0.12783648,
        -0.04256465,  0.12394594],
       [ 0.03108146,  0.02801377,  0.05658953, ...,  0.07385855,
        -0.03732267,  0.00617132]], dtype=float32)



In [61]:

    
print model["flower"].shape
model["flower"]









    



(300,)






    Out[61]:





array([  9.19017661e-03,   6.54114336e-02,   1.33776793e-03,
        -1.81574468e-02,   4.02462948e-03,  -5.59470057e-03,
        -1.72537216e-03,  -7.38028288e-02,   6.71816012e-03,
        -3.69991884e-02,   4.19161059e-02,  -2.58125439e-02,
         8.52258317e-03,   9.11382437e-02,  -1.23168685e-01,
         1.05112046e-01,  -7.69874081e-02,  -7.29553998e-02,
        -4.31609750e-02,  -9.53138396e-02,  -2.66510025e-02,
         8.86965543e-02,   6.93313628e-02,  -1.32601196e-02,
         4.21601236e-02,   5.42824268e-02,   4.79070656e-02,
        -8.12596604e-02,  -8.10332149e-02,  -2.31435173e-03,
        -3.37034203e-02,  -2.09816005e-02,   3.05269510e-02,
         4.79896180e-03,  -1.06143104e-02,   4.57150824e-02,
         8.17497000e-02,   2.96192449e-02,  -5.13435528e-03,
        -3.79809141e-02,  -7.90826380e-02,  -6.15077466e-02,
        -1.99032500e-02,  -1.64032474e-01,  -3.05936057e-02,
        -7.30207562e-02,   9.87990350e-02,   2.07166597e-02,
        -3.08413655e-02,   1.42323568e-01,  -3.17578241e-02,
        -2.00305302e-02,   7.44638443e-02,   1.63359381e-02,
        -1.79643948e-02,   2.62650568e-02,   7.95819387e-02,
         3.30082513e-02,  -8.87906700e-02,  -4.00113761e-02,
        -1.85798365e-03,   3.61026935e-02,   1.78475101e-02,
        -1.54231712e-02,  -3.75902131e-02,   3.27626690e-02,
        -2.43896581e-02,  -9.22908483e-04,   6.77397847e-02,
        -1.61202319e-04,   3.44642028e-02,  -5.72613105e-02,
         5.25838397e-02,   3.25910859e-02,  -8.41945931e-02,
         4.32095304e-03,   6.38286173e-02,  -5.83400838e-02,
        -4.41564247e-02,   2.28474531e-02,  -2.60690395e-02,
         4.02334891e-02,   4.89793122e-02,  -9.84984040e-02,
        -1.53396586e-02,   8.76839682e-02,  -3.21209920e-03,
        -2.03806497e-02,  -4.85607870e-02,  -4.75146435e-02,
        -3.95433307e-02,  -1.12077566e-02,  -2.20387504e-02,
         5.25889248e-02,  -8.73170979e-03,  -3.35193947e-02,
        -7.34655978e-03,   4.61649671e-02,  -2.35073566e-02,
         4.25066501e-02,  -1.18835606e-01,   3.09316143e-02,
        -1.16189392e-02,   3.57255107e-03,   4.01408505e-03,
         6.55354327e-03,   2.09548958e-02,   3.32682580e-02,
         4.16323841e-02,  -7.75999948e-02,  -6.90751001e-02,
        -4.33408432e-02,   1.48383409e-01,   9.46911424e-02,
         5.98912016e-02,  -4.94176485e-02,   3.97704868e-03,
         1.76540669e-02,   3.30831297e-02,  -1.71052422e-02,
        -5.07994629e-02,  -1.24006369e-03,  -7.27377012e-02,
        -5.76486485e-03,  -7.00146658e-03,  -8.59979261e-03,
         4.58529592e-02,  -4.76779044e-03,   5.45258448e-02,
        -4.69582044e-02,   3.17930616e-02,   6.20730035e-02,
        -8.98051634e-02,   2.90602483e-02,   2.10950635e-02,
         5.03623635e-02,   3.36718000e-02,  -7.40320906e-02,
         2.97545176e-02,  -1.37950303e-02,  -4.98948880e-02,
        -2.69349683e-02,   6.43363893e-02,  -5.82120828e-02,
         3.59887742e-02,   4.44662161e-02,   2.48444844e-02,
         2.57890634e-02,  -4.90085483e-02,   3.07832174e-02,
         2.22949013e-02,   5.30341920e-03,   8.31956863e-02,
        -6.21707505e-03,  -7.20037669e-02,   6.95003346e-02,
        -7.41811246e-02,   4.19695303e-03,   8.83872211e-02,
        -7.04506645e-03,  -3.42773534e-02,  -2.92306785e-02,
        -3.37861553e-02,  -4.59951684e-02,  -1.40411202e-02,
        -6.13155356e-03,   3.76843810e-02,  -4.59368937e-02,
         1.00071765e-02,   2.80338037e-03,  -3.98323871e-02,
         4.48476747e-02,   8.01434144e-02,  -1.39008481e-02,
         6.27132058e-02,   2.93352511e-02,  -1.22893248e-02,
        -2.56101359e-02,   3.40031981e-02,   3.05552292e-03,
         2.39802804e-02,  -5.55145321e-03,   6.80891937e-03,
         1.84586793e-02,   4.39051688e-02,  -1.27430782e-01,
         1.06349215e-02,  -3.02398186e-02,   1.43760189e-01,
         9.86641794e-02,   1.10159919e-01,   3.21253017e-02,
        -7.16447458e-03,   1.27157748e-01,   2.25632004e-02,
         9.38136354e-02,   1.10070622e-02,   1.41717084e-02,
         9.51233879e-03,  -9.05933138e-03,  -7.41828233e-03,
         1.66966731e-03,  -1.15900442e-01,   2.78077871e-02,
         4.37698588e-02,  -1.30047062e-02,  -7.37289945e-03,
         3.69571485e-02,  -1.20272972e-01,  -4.39815372e-02,
        -5.01154326e-02,  -8.50610733e-02,  -8.66879597e-02,
         7.02069402e-02,   1.33885726e-01,   7.03556761e-02,
         4.03005108e-02,   5.07070161e-02,   7.16587454e-02,
        -1.37249738e-01,   7.61254430e-02,   8.49524885e-02,
        -4.35745157e-02,  -5.80381304e-02,  -4.59821485e-02,
        -9.84015316e-02,   1.07166268e-01,  -4.21592183e-02,
         3.12481839e-02,  -1.54463798e-02,  -5.50422026e-03,
         1.97067782e-02,   3.64014730e-02,  -7.47649930e-03,
        -1.58287883e-02,  -8.86944085e-02,   2.60410029e-02,
        -9.63460431e-02,  -7.54706562e-02,   5.03473803e-02,
        -3.87354777e-03,   8.63794796e-03,   2.12353002e-03,
         2.39198320e-02,  -1.50680974e-01,   1.26310840e-01,
        -8.85575116e-02,   1.01039365e-01,   1.09208956e-01,
         2.15202868e-02,  -1.72488783e-02,  -1.14340864e-01,
        -6.65742829e-02,   2.41953041e-03,  -1.73636172e-02,
        -4.76870127e-02,  -6.11959584e-02,   6.52838200e-02,
        -3.90024930e-02,  -9.98545885e-02,  -2.84016877e-03,
        -1.11836223e-02,  -9.80024263e-02,  -4.31969985e-02,
        -1.54170953e-02,  -2.67540589e-02,   9.50384587e-02,
         8.71185139e-02,   9.38164815e-03,  -7.24252537e-02,
         1.00458406e-01,  -5.28579466e-02,   2.26406064e-02,
        -5.18779494e-02,  -1.01900645e-01,  -6.39918968e-02,
         2.01029312e-02,   5.83033152e-02,   2.77105905e-02,
         1.95493605e-02,  -7.51767308e-02,   8.77631009e-02,
         8.31894726e-02,   6.73510972e-03,   3.16922627e-02,
        -1.93729959e-02,  -7.00921267e-02,   2.92901210e-02,
        -6.13672957e-02,  -6.14573732e-02,   3.41361687e-02,
         1.13238141e-01,   6.56837672e-02,   2.25104876e-02,
        -4.85451743e-02,   2.95406058e-02,  -9.96120274e-02,
         7.33703077e-02,  -1.43731041e-02,   3.27502303e-02], dtype=float32)



In [64]:

    
import numpy as np



In [65]:

    
def makeFeatureVec(words, model, num_features):    
    # Function to average all of the word vectors in a given    
    # paragraph    
    #    
    # Pre-initialize an empty numpy array (for speed)    
    featureVec = np.zeros((num_features,),dtype="float32")    
    #    
    nwords = 0.    
    #     
    # Index2word is a list that contains the names of the words in     
    # the model's vocabulary. Convert it to a set, for speed     
    index2word_set = set(model.index2word)    
    #    
    # Loop over each word in the review and, if it is in the model's    
    # vocaublary, add its feature vector to the total    
    for word in words:        
        if word in index2word_set:             
            nwords = nwords + 1.            
            featureVec = np.add(featureVec,model[word])    
    #     
    # Divide the result by the number of words to get the average    
    featureVec = np.divide(featureVec,nwords)    
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):    
    # Given a set of reviews (each one a list of words), calculate     
    # the average feature vector for each one and return a 2D numpy array     
    #     
    # Initialize a counter    
    counter = 0.    
    #     
    # Preallocate a 2D numpy array, for speed    
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")    
    #     
    # Loop through the reviews    
    for review in reviews:       
        #       
        # Print a status message every 1000th review       
        if counter%1000. == 0.:           
            print "Review %d of %d" % (counter, len(reviews))       
            #        
            # Call the function (defined above) that makes average feature vectors       
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)       
        #       
        # Increment the counter       
        counter = counter + 1.    
    return reviewFeatureVecs



In [66]:

    
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.

clean_train_reviews = []
for review in train["review"]:    
    clean_train_reviews.append( review_to_wordlist( review, remove_stopwords=True ))
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

print "Creating average feature vecs for test reviews"
clean_test_reviews = []
for review in test["review"]:    
    clean_test_reviews.append( review_to_wordlist( review, remove_stopwords=True ))
testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )









    



Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Creating average feature vecs for test reviews
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000



In [67]:

    
print trainDataVecs.shape
print testDataVecs.shape









    



(25000, 300)
(25000, 300)



In [68]:

    
# LB : 0.81744 / 482nd/510

# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 100)

print "Fitting a random forest to labeled training data..."
clf.fit(trainDataVecs, train["sentiment"])

# Test & extract results 
result = clf.predict(testDataVecs)

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv("submission.csv", index=False, quoting=3)









    



Fitting a random forest to labeled training data...



In [74]:

    
# LB : 0.73900
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

clf.fit(trainDataVecs, train["sentiment"])
result = clf.predict(testDataVecs)

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv("submission.csv", index=False, quoting=3)



In [75]:

    
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

parameters = {'C' : [1, 10, 10, 1000]}
model = GridSearchCV(SVC(cache_size=2000, kernel="rbf"), parameters, n_jobs=4, verbose=1)

# Fit Grid Search Model
model.fit(trainDataVecs, train["sentiment"])









    



Fitting 3 folds for each of 4 candidates, totalling 12 fits






    



[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:  5.1min
[Parallel(n_jobs=4)]: Done   6 out of  12 | elapsed:  9.0min remaining:  9.0min
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed: 11.8min finished






    



Best score: 0.858
Best parameters set:






    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-75-a68515dacf07> in <module>()
     11 print("Best parameters set:")
     12 best_parameters = model.best_estimator_.get_params()
---> 13 for param_name in sorted(param_grid.keys()):
     14     print("\t%s: %r" % (param_name, best_parameters[param_name]))

NameError: name 'param_grid' is not defined



In [80]:

    
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

clf = model.best_estimator_
    
clf.fit(trainDataVecs, train["sentiment"])
result = clf.predict(testDataVecs)

# LB : 0.85348 / 259nd/510

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv("submission.csv", index=False, quoting=3)









    



Best score: 0.858
Best parameters set:
	C: 1000
	cache_size: 2000
	class_weight: None
	coef0: 0.0
	degree: 3
	gamma: 0.0
	kernel: 'rbf'
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False



In [81]:

    
parameters = {'C' : [1000, 1500, 2500, 3000, 3500, 4000, 4500, 5000]}
model = GridSearchCV(SVC(cache_size=2000, kernel="rbf"), parameters, n_jobs=4, verbose=1)

# Fit Grid Search Model
model.fit(trainDataVecs, train["sentiment"])

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
clf = model.best_estimator_
    
clf.fit(trainDataVecs, train["sentiment"])
result = clf.predict(testDataVecs)

# LB : 0.85600 / 250nd/510

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv("submission.csv", index=False, quoting=3)









    



Fitting 3 folds for each of 8 candidates, totalling 24 fits






    



[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done  18 out of  24 | elapsed:  9.7min remaining:  3.2min
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed: 11.7min finished






    



Best score: 0.860
Best parameters set:
	C: 5000
	cache_size: 2000
	class_weight: None
	coef0: 0.0
	degree: 3
	gamma: 0.0
	kernel: 'rbf'
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False



In [82]:

    
parameters = {'C' : [5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000]}
model = GridSearchCV(SVC(cache_size=2000, kernel="rbf"), parameters, n_jobs=4, verbose=1)

# Fit Grid Search Model
model.fit(trainDataVecs, train["sentiment"])

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
clf = model.best_estimator_
    
clf.fit(trainDataVecs, train["sentiment"])
result = clf.predict(testDataVecs)

# LB : 0.85684 248nd/510

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv("submission.csv", index=False, quoting=3)









    



Fitting 3 folds for each of 8 candidates, totalling 24 fits






    



[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done  18 out of  24 | elapsed:  9.3min remaining:  3.1min
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed: 11.1min finished






    



Best score: 0.861
Best parameters set:
	C: 12000
	cache_size: 2000
	class_weight: None
	coef0: 0.0
	degree: 3
	gamma: 0.0
	kernel: 'rbf'
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False



In [83]:

    
parameters = {'C' : [12000, 20000, 30000, 40000, 50000, 60000, 70000, 80000,
                    90000, 100000, 110000, 120000, 130000, 140000, 150000]}
model = GridSearchCV(SVC(cache_size=2000, kernel="rbf"), parameters, n_jobs=5, verbose=1)

# Fit Grid Search Model
model.fit(trainDataVecs, train["sentiment"])

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))









    



Fitting 3 folds for each of 15 candidates, totalling 45 fits






    



[Parallel(n_jobs=5)]: Done   1 jobs       | elapsed:  2.2min
[Parallel(n_jobs=5)]: Done  37 out of  45 | elapsed: 20.4min remaining:  4.4min
[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed: 23.7min finished






    



Best score: 0.861
Best parameters set:
	C: 120000
	cache_size: 2000
	class_weight: None
	coef0: 0.0
	degree: 3
	gamma: 0.0
	kernel: 'rbf'
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False



In [84]:

    
parameters = {'gamma' : [0.1, 0.3, 0.5, 0.7, 0.9, 0.01, 0.03, 0.05, 0.07, 0.09,
                         0.001, 0.003, 0.005, 0.007, 0.009, 0.0001, 0.0003, 0.0005, 0.0007, 0.0009]}
model = GridSearchCV(SVC(cache_size=2000, kernel="rbf", C=12000), parameters, n_jobs=5, verbose=1)

# Fit Grid Search Model
model.fit(trainDataVecs, train["sentiment"])

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))









    



Fitting 3 folds for each of 20 candidates, totalling 60 fits






    



[Parallel(n_jobs=5)]: Done   1 jobs       | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done  50 jobs       | elapsed: 39.1min
[Parallel(n_jobs=5)]: Done  52 out of  60 | elapsed: 40.8min remaining:  6.3min
[Parallel(n_jobs=5)]: Done  60 out of  60 | elapsed: 43.3min finished






    



Best score: 0.863
Best parameters set:
	C: 12000
	cache_size: 2000
	class_weight: None
	coef0: 0.0
	degree: 3
	gamma: 0.09
	kernel: 'rbf'
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False



In [85]:

    
parameters = {'gamma' : [0.085, 0.086, 0.087, 0.088, 0.089, 0.09, 0.091, 0.092, 0.093, 0.094, 0.095]}
model = GridSearchCV(SVC(cache_size=2000, kernel="rbf", C=12000), parameters, n_jobs=6, verbose=1)

# Fit Grid Search Model
model.fit(trainDataVecs, train["sentiment"])

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))









    



Fitting 3 folds for each of 11 candidates, totalling 33 fits






    



[Parallel(n_jobs=6)]: Done   1 jobs       | elapsed:  4.6min
[Parallel(n_jobs=6)]: Done  33 out of  33 | elapsed: 25.4min finished






    



Best score: 0.864
Best parameters set:
	C: 12000
	cache_size: 2000
	class_weight: None
	coef0: 0.0
	degree: 3
	gamma: 0.085
	kernel: 'rbf'
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False



In [86]:

    
clf = model.best_estimator_
    
clf.fit(trainDataVecs, train["sentiment"])
result = clf.predict(testDataVecs)

# LB : 

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv("submission.csv", index=False, quoting=3)