Copyright (C) 2017 Constantine Savenkov
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Load the labelled training reviews, the test reviews and the sample
# submission file (all expected in the working directory).
train = pd.read_csv('products_sentiment_train.tsv', sep='\t')
test = pd.read_csv('products_sentiment_test.tsv', sep='\t')
answers = pd.read_csv('products_sentiment_sample_submission.csv')

# Give the test rows a 'Sentiment' column taken from the sample submission.
# NOTE(review): presumably these are placeholder labels, not ground truth --
# confirm before trusting any metric computed on the test rows.
test.insert(2, 'Sentiment', answers['Sentiment'].values)

# Stack train and test into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and also works on older pandas versions.
dataframe = pd.concat([train, test.drop(['Id'], axis=1)], ignore_index=True)

# The separate frames are no longer needed once they are combined.
del train, test, answers
In [3]:
# Preview the first rows of the combined train + test frame.
dataframe.head()
Out[3]:
In [ ]:
In [4]:
from re import split, findall
from collections import Counter

# Count how often every word occurs over all reviews.  A "word" is a maximal
# run of the letters a-z in the lower-cased review text, which is exactly what
# splitting on every non a-z character and dropping empty pieces yields.
# The result is converted back to a plain dict (first-seen key order) so that
# later cells can keep treating `words_dict` as an ordinary dictionary.
words_dict = dict(Counter(
    word
    for review in dataframe['Review']
    for word in findall(r'[a-z]+', review.lower())
))
print(len(words_dict))
In [ ]:
In [5]:
# Keep only the words whose total count lies inside a band around the mean
# word count; everything rarer or more frequent is removed from `words_dict`.
mean = np.mean(list(words_dict.values()))
# NOTE(review): the spread is taken as sqrt(mean), not as the sample standard
# deviation of the counts -- presumably a Poisson-style estimate; confirm.
std = np.sqrt(mean)
left_border = mean - 2.0 * std
right_border = mean + 2.0 * std
print('borders: [', left_border, '<-', mean, '->', right_border, ']')

# Iterate over a snapshot of the items so entries can be deleted in place.
for word, count in list(words_dict.items()):
    if not (left_border <= count <= right_border):
        del words_dict[word]
print('number of new words :', len(words_dict))
In [ ]:
In [6]:
def _keep_known_words(text):
    """Return *text* lower-cased and reduced to the words left in ``words_dict``.

    Words are maximal runs of the letters a-z; the surviving words are joined
    with single spaces in their original order.
    """
    return ' '.join(
        token for token in findall(r'[a-z]+', text.lower()) if token in words_dict
    )


# Rewrite every review so that it only contains words from the vocabulary.
dataframe['Review'] = dataframe['Review'].apply(_keep_known_words)
In [7]:
# Preview the first rows again, now that 'Review' holds only vocabulary words.
dataframe.head()
Out[7]:
In [ ]:
In [8]:
# Row labels of the reviews that are empty strings, i.e. reviews in which no
# word belongs to the vocabulary.
list_of_empty_strings = dataframe.index[dataframe['Review'] == ''].tolist()

# One boolean per empty review: does it carry sentiment 0 / sentiment 1?
empty_sentiment = dataframe.loc[list_of_empty_strings, 'Sentiment']
percentage_of_zeros = (empty_sentiment == 0).tolist()
percentage_of_ones = (empty_sentiment == 1).tolist()

n_empty = len(list_of_empty_strings)
print('number of null strings is', n_empty)

# With no empty review at all the shares are undefined; report NaN instead of
# failing with ZeroDivisionError.
share_of_zeros = sum(percentage_of_zeros) / n_empty if n_empty else float('nan')
share_of_ones = sum(percentage_of_ones) / n_empty if n_empty else float('nan')
print('percentage of null strings with zeros in sentiment:', share_of_zeros)
print('percentage of null strings with ones in sentiment:', share_of_ones)
In [9]:
# Number of empty reviews that belong to the test part of the frame.  The test
# rows start at row label 2000, so the boundary row itself must be counted
# (`>= 2000`, not `> 2000`).
print(sum(1 for i in list_of_empty_strings if i >= 2000))
In [10]:
# Split the combined frame back into its parts: the first 2000 rows are the
# training reviews, the following 500 rows the test reviews.
# Positional `.iloc` slices are half-open.  Label-based `.loc[0:2000]` would
# include row 2000 as well and so put the first test row into the training
# set too.
train, test = dataframe.iloc[:2000], dataframe.iloc[2000:2500]
In [ ]:
In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the filtered review texts into TF-IDF feature matrices.
# sublinear_tf applies logarithmic scaling to the raw term frequencies.
vect = TfidfVectorizer(sublinear_tf = True, use_idf = True)
# The vocabulary and IDF weights are fitted on the training reviews only ...
X_train = vect.fit_transform(train['Review'])
# ... and reused unchanged to encode the test reviews.
X_test = vect.transform(test['Review'])
In [ ]:
In [12]:
#X_coded = np.zeros(shape=(dataframe.shape[0], len_words_dict), dtype=int)
#y_coded = np.zeros(shape=(dataframe.shape[0], 1), dtype=int)
#for i in range(dataframe.shape[0]):
# review = dataframe.loc[i, 'Review'].lower()
# for j, word in enumerate(words_dict):
# X_coded[i, j] = len(findall('\s?' + word + '\s?', review))
# y_coded[i] = dataframe.loc[i, 'Sentiment']
In [ ]:
In [13]:
#X_train, X_test = X_coded[0 : 2000, :], X_coded[2000 : 2500, :]
#y_train, y_test = y_coded[0 : 2000, 0], y_coded[2000 : 2500, 0]
# Target vectors: the 'Sentiment' column of the train and test parts.
y_train = train['Sentiment']
y_test = test ['Sentiment']
In [ ]:
In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    log_loss,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Train a small, heavily regularised random forest on the TF-IDF features.
# random_state is fixed so that repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_leaf_nodes=10,
    random_state=2017,
)
classifier.fit(X_train, y_train)

# Hard 0/1 class predictions for the test reviews.
answers = classifier.predict(X_test)

# scikit-learn metrics expect the arguments as (y_true, y_pred).  Passing them
# the other way round swaps precision and recall and distorts the other
# asymmetric scores (average precision, ROC AUC, log loss).
# NOTE(review): average_precision_score, roc_auc_score and log_loss are
# designed for scores / probabilities; feeding them hard labels works but is
# coarse -- consider classifier.predict_proba(X_test) for those three.
print('average_precision_score:', average_precision_score(y_test, answers))
print('accuracy_score :', accuracy_score(y_test, answers))
print('precision_score :', precision_score(y_test, answers))
print('recall_score :', recall_score(y_test, answers))
print('roc_auc_score :', roc_auc_score(y_test, answers))
print('log_loss :', log_loss(y_test, answers))
print('f1_score :', f1_score(y_test, answers))
print('mean_absolute_error :', mean_absolute_error(y_test, answers))
print('mean_squared_error :', mean_squared_error(y_test, answers))
In [15]:
def _print_class_shares(caption, labels):
    """Print *caption*, then the fraction of entries in *labels* equal to 0 and to 1."""
    print(caption)
    print('percentage of zeros:', sum(labels == 0) / len(labels))
    print('percentage of ones:', sum(labels == 1) / len(labels))


# Compare the class balance of the predictions with that of the train and
# test targets; the three reports are separated by blank lines.
_print_class_shares('PREDICTIONS:', answers)
print()
_print_class_shares('TRAIN VALUES:', y_train)
print()
_print_class_shares('TEST VALUES:', y_test)
In [ ]: