Copyright (C) 2017 Constantine Savenkov

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import pandas as pd
import numpy  as np

In [2]:
train   = pd.read_csv('products_sentiment_train.tsv', sep = '\t')
test    = pd.read_csv('products_sentiment_test.tsv' , sep = '\t')
answers = pd.read_csv('products_sentiment_sample_submission.csv')

test.insert(2, 'Sentiment', answers['Sentiment'].values)
dataframe = train.append(test.drop(['Id'], axis = 1), ignore_index=True)

del train, test, answers

In [3]:
dataframe.head()


Out[3]:
Review Sentiment
0 2 . take around 10,000 640x480 pictures . 1
1 i downloaded a trial version of computer assoc... 1
2 the wrt54g plus the hga7t is a perfect solutio... 1
3 i dont especially like how music files are uns... 0
4 i was using the cheapie pail ... and it worked... 1

In [ ]:


In [4]:
from re import split, findall

words_dict = dict()
for i in range(dataframe.shape[0]):
    review = split(r'[^a-z]', dataframe.loc[i, 'Review'].lower())
    for word in filter(lambda w: w != '', review):
        if word not in words_dict:
            words_dict[word] = 1
        else:
            words_dict[word] += 1

print(len(words_dict))


4332

In [ ]:


In [5]:
mean = np.mean(list(words_dict.values()))
std  = np.sqrt(mean)
left_border, right_border = mean - 2.0 * std, mean + 2.0 * std

print('borders: [', left_border, '<-', mean, '->', right_border, ']')

for word in words_dict.copy():
    if words_dict[word] < left_border or right_border < words_dict[word]:
        del words_dict[word]

print('number of new words :', len(words_dict))


borders: [ 3.73263851709 <- 10.0835641736 -> 16.4344898301 ]
number of new words : 890

In [ ]:


In [6]:
for i in range(dataframe.shape[0]):
    tmp_review = split(r'[^a-z]', dataframe.loc[i, 'Review'].lower())
    new_review = list()
    
    for word in filter(lambda w: w != '', tmp_review):
        if word in words_dict:
            new_review.append(word)
    
    dataframe.loc[i, 'Review'] = ' '.join(new_review)

In [7]:
dataframe.head()


Out[7]:
Review Sentiment
0 1
1 downloaded version firewall antivirus fell 1
2 wrt area house 1
3 dont basically folder might explorer folders 0
4 ok until opening fell 1

In [ ]:


In [8]:
list_of_empty_strings = [i for i in range(dataframe.shape[0]) if dataframe.loc[i, 'Review'] == '']

percentage_of_zeros, percentage_of_ones = [], []
for i in list_of_empty_strings:
    percentage_of_zeros.append(dataframe.loc[i, 'Sentiment'] == 0)
    percentage_of_ones .append(dataframe.loc[i, 'Sentiment'] == 1)

print('number of null strings is', len(list_of_empty_strings))
print('percentage of null strings with zeros in sentiment:', sum(percentage_of_zeros) / len(percentage_of_zeros))
print('percentage of null strings with ones  in sentiment:', sum(percentage_of_ones ) / len(percentage_of_ones ))


number of null strings is 333
percentage of null strings with zeros in sentiment: 0.276276276276
percentage of null strings with ones  in sentiment: 0.723723723724

In [9]:
print(len(list(filter(lambda i: i > 2000, list_of_empty_strings))))


71

In [10]:
train, test = dataframe.loc[0 : 2000], dataframe.loc[2000 : 2500]

In [ ]:


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(sublinear_tf = True, use_idf = True)

X_train = vect.fit_transform(train['Review'])
X_test  = vect.transform(test['Review'])

In [ ]:


In [12]:
#X_coded = np.zeros(shape=(dataframe.shape[0], len_words_dict), dtype=int)
#y_coded = np.zeros(shape=(dataframe.shape[0],              1), dtype=int)

#for i in range(dataframe.shape[0]):
#    review = dataframe.loc[i, 'Review'].lower()

#    for j, word in enumerate(words_dict):
#        X_coded[i, j] = len(findall('\s?' + word + '\s?', review))

#    y_coded[i] = dataframe.loc[i, 'Sentiment']

In [ ]:


In [13]:
#X_train, X_test = X_coded[0 : 2000, :], X_coded[2000 : 2500, :]
#y_train, y_test = y_coded[0 : 2000, 0], y_coded[2000 : 2500, 0]

y_train = train['Sentiment']
y_test  = test ['Sentiment']

In [ ]:


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, average_precision_score, accuracy_score, \
                            recall_score, roc_auc_score, f1_score, log_loss, mean_absolute_error, \
                            mean_squared_error

classifier = RandomForestClassifier(n_estimators      =  100,
                                    max_depth         =   10,
                                    min_samples_split =    5,
                                    min_samples_leaf  =    2,
                                    max_leaf_nodes    =   10,
                                    random_state      = 2017)
classifier.fit(X_train, y_train)
answers = classifier.predict(X_test)

print('average_precision_score:', average_precision_score(answers, y_test))
print('accuracy_score         :', accuracy_score         (answers, y_test))
print('precision_score        :', precision_score        (answers, y_test))
print('recall_score           :', recall_score           (answers, y_test))
print('roc_auc_score          :', roc_auc_score          (answers, y_test))
print('log_loss               :', log_loss               (answers, y_test))
print('f1_score               :', f1_score               (answers, y_test))
print('mean_absolute_error    :', mean_absolute_error    (answers, y_test))
print('mean_squared_error     :', mean_squared_error     (answers, y_test))


average_precision_score: 0.999501002004
accuracy_score         : 0.502
precision_score        : 1.0
recall_score           : 0.501002004008
roc_auc_score          : 0.750501002004
log_loss               : 17.2003106447
f1_score               : 0.667556742323
mean_absolute_error    : 0.498
mean_squared_error     : 0.498

In [15]:
print('PREDICTIONS:')
print('percentage of zeros:', sum(answers == 0) / len(answers))
print('percentage of  ones:', sum(answers == 1) / len(answers))
print()
print('TRAIN VALUES:')
print('percentage of zeros:', sum(y_train == 0) / len(y_train))
print('percentage of  ones:', sum(y_train == 1) / len(y_train))
print()
print('TEST VALUES:')
print('percentage of zeros:', sum(y_test  == 0) / len(y_test ))
print('percentage of  ones:', sum(y_test  == 1) / len(y_test ))


PREDICTIONS:
percentage of zeros: 0.002
percentage of  ones: 0.998

TRAIN VALUES:
percentage of zeros: 0.36331834083
percentage of  ones: 0.63668165917

TEST VALUES:
percentage of zeros: 0.5
percentage of  ones: 0.5

In [ ]: