In [157]:
# Load packages
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans

In [158]:
# Function to create Classification Report (Group 6)
def fit_model(train, test, target, variables, classifier):
    tarTrain = train.as_matrix(target)
    varTrain = train.as_matrix(variables)
    classifier.fit(varTrain,tarTrain)
    varTest = test.as_matrix(variables)
    predictions = classifier.predict(varTest)
    # Print confusion matrix
    tab = pd.crosstab(test['tag'], predictions, rownames=['tag'], colnames=['Predicted'], margins=True)
    print(tab)
    # Print accuracy, precision, recall, F measure
    print(classification_report(test['tag'], predictions))
    a=accuracy_score(test['tag'],predictions)
    p=precision_score(test['tag'],predictions, pos_label = "pos")
    r=recall_score(test['tag'].values,predictions, pos_label = "pos")
    f=f1_score(test['tag'].values,predictions, pos_label = "pos")
    print "Accuracy = ",a,"\nPrecision =",p,"\nRecall = ",r,"\nF-Score = ",f

In [175]:
# Load MS Excel file
dataset=pd.read_excel('matrix.xlsx')
dataset = dataset.fillna(0)

# Split the dataset in train/test ratio: 0.20
train_set, test_set = train_test_split(dataset, test_size = 0.20)

# Define the target and the variables
variables = ['all', '100', 'bigrams', 'wordCount', 'textBlob', 'fyear', 'at', 'ceq', 'ni', 'oiadp', 
             'sale', 'ggroup', 'label_roe', 'op_margin']
target = ['tag']

In [176]:
# MODEL 1 - Support Vector Machine
sv = SVC()
fit_model(train_set, test_set, target, variables, sv)


C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\svm\base.py:514: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y_ = column_or_1d(y, warn=True)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
Predicted  -1  0     1   All
tag                         
-1          0  0    29    29
0           1  6   450   457
1           0  1   575   576
All         1  7  1054  1062
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        29
          0       0.86      0.01      0.03       457
          1       0.55      1.00      0.71       576

avg / total       0.66      0.55      0.39      1062

Accuracy =  0.547080979284 
Precision = 0.664732377369 
Recall =  0.547080979284 
F-Score =  0.393784683254
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)

In [177]:
# MODEL 2 - Logistic Regression
lr = LogisticRegression()
fit_model(train_set, test_set, target, variables, lr)


C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\utils\validation.py:515: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
Predicted  -1    0    1   All
tag                          
-1          0    1   28    29
0           2   64  391   457
1           0   65  511   576
All         2  130  930  1062
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        29
          0       0.49      0.14      0.22       457
          1       0.55      0.89      0.68       576

avg / total       0.51      0.54      0.46      1062

Accuracy =  0.54143126177 
Precision = 0.509863406747 
Recall =  0.54143126177 
F-Score =  0.461899182993
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)

In [178]:
# MODEL 3 - Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100)
fit_model(train_set, test_set, target, variables, rf)


C:\Users\Cedric\Anaconda2\lib\site-packages\ipykernel\__main__.py:5: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
Predicted  -1    0    1   All
tag                          
-1          0   17   12    29
0           1  230  226   457
1           1  143  432   576
All         2  390  670  1062
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        29
          0       0.59      0.50      0.54       457
          1       0.64      0.75      0.69       576

avg / total       0.60      0.62      0.61      1062

Accuracy =  0.623352165725 
Precision = 0.603487632099 
Recall =  0.623352165725 
F-Score =  0.609795606731
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)

In [179]:
# MODEL 4 - Gradient Boosting Classifier
gb = GradientBoostingClassifier(n_estimators=100)
fit_model(train_set, test_set, target, variables, gb)


C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\utils\validation.py:515: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
Predicted  -1    0    1   All
tag                          
-1          0   10   19    29
0           1  184  272   457
1           3  120  453   576
All         4  314  744  1062
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        29
          0       0.59      0.40      0.48       457
          1       0.61      0.79      0.69       576

avg / total       0.58      0.60      0.58      1062

Accuracy =  0.599811676083 
Precision = 0.582397227649 
Recall =  0.599811676083 
F-Score =  0.577657779847
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)

In [180]:
# MODEL 5 - KMeans
km = KMeans(n_clusters=3)
fit_model(train_set, test_set, target, variables, km)


Predicted   0     1  2   All
tag                         
-1          0    29  0    29
0          16   441  0   457
1          30   545  1   576
All        46  1015  1  1062
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        29
          0       0.35      0.04      0.06       457
          1       0.54      0.95      0.69       576
          2       0.00      0.00      0.00         0

avg / total       0.44      0.53      0.40      1062

Accuracy =  0.528248587571 
Precision = 0.440901421767 
Recall =  0.528248587571 
F-Score =  0.398957904579
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1203: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:1304: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
C:\Users\Cedric\Anaconda2\lib\site-packages\sklearn\metrics\classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)