notebook.community

Edit and run

Copyright (C) 2017 Constantine Savenkov

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.



In [1]:

    
import pandas as pd
import numpy  as np



In [2]:

    
train   = pd.read_csv('products_sentiment_train.tsv', sep = '\t')
test    = pd.read_csv('products_sentiment_test.tsv' , sep = '\t')
answers = pd.read_csv('products_sentiment_sample_submission.csv')

test.insert(2, 'Sentiment', answers['Sentiment'].values)
dataframe = train.append(test.drop(['Id'], axis = 1), ignore_index=True)

del train, test, answers



In [3]:

    
dataframe.head()









    Out[3]:







  
    
      
      Review
      Sentiment
    
  
  
    
      0
      2 . take around 10,000 640x480 pictures .
      1
    
    
      1
      i downloaded a trial version of computer assoc...
      1
    
    
      2
      the wrt54g plus the hga7t is a perfect solutio...
      1
    
    
      3
      i dont especially like how music files are uns...
      0
    
    
      4
      i was using the cheapie pail ... and it worked...
      1



In [ ]:



In [4]:

    
from re import split, findall

words_dict = dict()
for i in range(dataframe.shape[0]):
    review = split(r'[^a-z]', dataframe.loc[i, 'Review'].lower())
    for word in filter(lambda w: w != '', review):
        if word not in words_dict:
            words_dict[word] = 1
        else:
            words_dict[word] += 1

print(len(words_dict))



In [ ]:



In [5]:

    
mean = np.mean(list(words_dict.values()))
std  = np.sqrt(mean)
left_border, right_border = mean - 2.0 * std, mean + 2.0 * std

print('borders: [', left_border, '<-', mean, '->', right_border, ']')

for word in words_dict.copy():
    if words_dict[word] < left_border or right_border < words_dict[word]:
        del words_dict[word]

print('number of new words :', len(words_dict))









    



borders: [ 3.73263851709 <- 10.0835641736 -> 16.4344898301 ]
number of new words : 890



In [ ]:



In [6]:

    
for i in range(dataframe.shape[0]):
    tmp_review = split(r'[^a-z]', dataframe.loc[i, 'Review'].lower())
    new_review = list()
    
    for word in filter(lambda w: w != '', tmp_review):
        if word in words_dict:
            new_review.append(word)
    
    dataframe.loc[i, 'Review'] = ' '.join(new_review)



In [7]:

    
dataframe.head()









    Out[7]:







  
    
      
      Review
      Sentiment
    
  
  
    
      0
      
      1
    
    
      1
      downloaded version firewall antivirus fell
      1
    
    
      2
      wrt area house
      1
    
    
      3
      dont basically folder might explorer folders
      0
    
    
      4
      ok until opening fell
      1



In [ ]:



In [8]:

    
list_of_empty_strings = [i for i in range(dataframe.shape[0]) if dataframe.loc[i, 'Review'] == '']

percentage_of_zeros, percentage_of_ones = [], []
for i in list_of_empty_strings:
    percentage_of_zeros.append(dataframe.loc[i, 'Sentiment'] == 0)
    percentage_of_ones .append(dataframe.loc[i, 'Sentiment'] == 1)

print('number of null strings is', len(list_of_empty_strings))
print('percentage of null strings with zeros in sentiment:', sum(percentage_of_zeros) / len(percentage_of_zeros))
print('percentage of null strings with ones  in sentiment:', sum(percentage_of_ones ) / len(percentage_of_ones ))









    



number of null strings is 333
percentage of null strings with zeros in sentiment: 0.276276276276
percentage of null strings with ones  in sentiment: 0.723723723724



In [9]:

    
print(len(list(filter(lambda i: i > 2000, list_of_empty_strings))))



In [10]:

    
train, test = dataframe.loc[0 : 2000], dataframe.loc[2000 : 2500]



In [ ]:



In [11]:

    
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(sublinear_tf = True, use_idf = True)

X_train = vect.fit_transform(train['Review'])
X_test  = vect.transform(test['Review'])



In [ ]:



In [12]:

    
#X_coded = np.zeros(shape=(dataframe.shape[0], len_words_dict), dtype=int)
#y_coded = np.zeros(shape=(dataframe.shape[0],              1), dtype=int)

#for i in range(dataframe.shape[0]):
#    review = dataframe.loc[i, 'Review'].lower()

#    for j, word in enumerate(words_dict):
#        X_coded[i, j] = len(findall('\s?' + word + '\s?', review))

#    y_coded[i] = dataframe.loc[i, 'Sentiment']



In [ ]:



In [13]:

    
#X_train, X_test = X_coded[0 : 2000, :], X_coded[2000 : 2500, :]
#y_train, y_test = y_coded[0 : 2000, 0], y_coded[2000 : 2500, 0]

y_train = train['Sentiment']
y_test  = test ['Sentiment']



In [ ]:



In [14]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, average_precision_score, accuracy_score, \
                            recall_score, roc_auc_score, f1_score, log_loss, mean_absolute_error, \
                            mean_squared_error

classifier = RandomForestClassifier(n_estimators      =  100,
                                    max_depth         =   10,
                                    min_samples_split =    5,
                                    min_samples_leaf  =    2,
                                    max_leaf_nodes    =   10,
                                    random_state      = 2017)
classifier.fit(X_train, y_train)
answers = classifier.predict(X_test)

print('average_precision_score:', average_precision_score(answers, y_test))
print('accuracy_score         :', accuracy_score         (answers, y_test))
print('precision_score        :', precision_score        (answers, y_test))
print('recall_score           :', recall_score           (answers, y_test))
print('roc_auc_score          :', roc_auc_score          (answers, y_test))
print('log_loss               :', log_loss               (answers, y_test))
print('f1_score               :', f1_score               (answers, y_test))
print('mean_absolute_error    :', mean_absolute_error    (answers, y_test))
print('mean_squared_error     :', mean_squared_error     (answers, y_test))









    



average_precision_score: 0.999501002004
accuracy_score         : 0.502
precision_score        : 1.0
recall_score           : 0.501002004008
roc_auc_score          : 0.750501002004
log_loss               : 17.2003106447
f1_score               : 0.667556742323
mean_absolute_error    : 0.498
mean_squared_error     : 0.498



In [15]:

    
print('PREDICTIONS:')
print('percentage of zeros:', sum(answers == 0) / len(answers))
print('percentage of  ones:', sum(answers == 1) / len(answers))
print()
print('TRAIN VALUES:')
print('percentage of zeros:', sum(y_train == 0) / len(y_train))
print('percentage of  ones:', sum(y_train == 1) / len(y_train))
print()
print('TEST VALUES:')
print('percentage of zeros:', sum(y_test  == 0) / len(y_test ))
print('percentage of  ones:', sum(y_test  == 1) / len(y_test ))









    



PREDICTIONS:
percentage of zeros: 0.002
percentage of  ones: 0.998

TRAIN VALUES:
percentage of zeros: 0.36331834083
percentage of  ones: 0.63668165917

TEST VALUES:
percentage of zeros: 0.5
percentage of  ones: 0.5



In [ ]:

	Review	Sentiment
0	2 . take around 10,000 640x480 pictures .	1
1	i downloaded a trial version of computer assoc...	1
2	the wrt54g plus the hga7t is a perfect solutio...	1
3	i dont especially like how music files are uns...	0
4	i was using the cheapie pail ... and it worked...	1

	Review	Sentiment
0		1
1	downloaded version firewall antivirus fell	1
2	wrt area house	1
3	dont basically folder might explorer folders	0
4	ok until opening fell	1