In [58]:
# Load packages
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [59]:
# Load MS Excel file
dataset=pd.read_excel('matrix.xlsx')
dataset=dataset[['tag','Filename','MDA']]

# Split the dataset in train/test ratio: 0.20
train_set, test_set = train_test_split(dataset, test_size = 0.20)

In [60]:
# Baseline all BUY
buy=[]
for i in range(0, len(test_set)):
    buy.append(1)
allBuy = pd.Series(buy)

# Print accuracy, precision, recall, F measure
tab = pd.crosstab(test_set_pred, test_set.tag, rownames=['Predicted'], colnames=['Actual'], margins=True) # Print confusion matrix
print(tab)
classification_report(allBuy, test_set.tag)
a=accuracy_score(allBuy, test_set.tag)
p=precision_score(allBuy, test_set.tag)
r=recall_score(allBuy, test_set.tag)
f=f1_score(allBuy, test_set.tag)
print "Accuracy = ",a,"\nPrecision =",p,"\nRecall = ",r,"\nF-Score = ",f


Actual     -1.0  0.0  1.0   All
Predicted                      
0.0           1    5    2    28
1.0           7   83  113  1034
All          32  437  593  1913
Accuracy =  0.558380414313 
Precision = 1.0 
Recall =  0.558380414313 
F-Score =  0.716616314199
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)

In [54]:
# Create the Count classifier
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(train_set.MDA.values)
classifier = MultinomialNB(fit_prior="False")

# Train the classifier
classifier.fit(counts, train_set.tag)

# Test the classifier
predictions = classifier.predict(vectorizer.transform(test_set.MDA.values)) 
test_set_pred = pd.Series(predictions, index=test_set.index)
tab = pd.crosstab(test_set_pred, test_set.tag, rownames=['Predicted'], colnames=['Actual'], margins=True) # Print confusion matrix
print(tab)

# Print accuracy, precision, recall, F measure
classification_report(test_set_pred, test_set.tag)
a=accuracy_score(test_set_pred, test_set.tag)
p=precision_score(test_set_pred, test_set.tag)
r=recall_score(test_set_pred, test_set.tag)
f=f1_score(test_set_pred, test_set.tag)
print "Accuracy = ",a,"\nPrecision =",p,"\nRecall = ",r,"\nF-Score = ",f


Actual     -1    0    1   All
Predicted                    
-1          4   10    2    16
0           5  228  166   399
1           9  216  422   647
All        18  454  590  1062
Accuracy =  0.61581920904 
Precision = 0.627781451701 
Recall =  0.61581920904 
F-Score =  0.62006505017
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)

In [55]:
# Create the TFIDF classifier
vectorizer = TfidfVectorizer(stop_words="english")
counts = vectorizer.fit_transform(train_set.MDA.values)
classifier = MultinomialNB(fit_prior="False")

# Train the classifier
classifier.fit(counts, train_set.tag)

# Test the classifier
predictions = classifier.predict(vectorizer.transform(test_set.MDA.values)) 
test_set_pred = pd.Series(predictions, index=test_set.index)
tab = pd.crosstab(test_set_pred, test_set.tag, rownames=['Predicted'], colnames=['Actual'], margins=True) # Print confusion matrix
print(tab)

# Print accuracy, precision, recall, F measure
classification_report(test_set_pred, test_set.tag)
a=accuracy_score(test_set_pred, test_set.tag)
p=precision_score(test_set_pred, test_set.tag)
r=recall_score(test_set_pred, test_set.tag)
f=f1_score(test_set_pred, test_set.tag)
print "Accuracy = ",a,"\nPrecision =",p,"\nRecall = ",r,"\nF-Score = ",f


Actual     -1    0    1   All
Predicted                    
0           1   21    6    28
1          17  433  584  1034
All        18  454  590  1062
Accuracy =  0.569679849341 
Precision = 0.96495282481 
Recall =  0.569679849341 
F-Score =  0.702546947886
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)