Now we run this a second time, on the second (b) feature table, from which all epithets with fewer than 27 representative documents have been removed. The results are better (overall F1 score for decision tree is 0.44, random forest is 0.47; on the first (a) feature table these were 0.33 and 0.40, respectively).


In [1]:
import os
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier


/root/venv/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
from sklearn import clone
from sklearn import preprocessing
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

import datetime as dt

In [3]:
# Load the pickled feature table (presumably bag-of-words counts, going by the
# file name) from the user's CLTK data directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load a pickle that you produced yourself.
fp_df = os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle')
dataframe_bow = joblib.load(fp_df)

In [4]:
# Labels to predict: the 'epithet' column of the feature table.
Y = dataframe_bow['epithet']

In [5]:
# Feature matrix: everything except the label ('epithet') and the two
# identifying columns ('id', 'author'). The axis is passed by keyword because
# the positional form is deprecated and rejected by recent pandas releases.
X = dataframe_bow.drop(['epithet', 'id', 'author'], axis=1)

In [6]:
# Split features and labels into train and test partitions. random_state=0
# pins the shuffle, so the same rows land in each partition on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

In [7]:
def scale_data(X_train, X_test, Y_train, Y_test):
    """Standardize the train and test feature matrices.

    A ``StandardScaler`` is fitted on ``X_train`` only and pickled to
    ``~/cltk_data/user_data/tlg_bow_scaler.pickle`` so the same scaling can
    later be applied to testing/prediction data. Both ``X_train`` and
    ``X_test`` are then transformed with that fitted scaler (scaled data has
    zero mean and unit variance). Progress and elapsed time are printed.

    Args:
        X_train: training features; the scaler is fitted on these.
        X_test: test features; only transformed, never fitted on.
        Y_train: training labels, passed through unchanged.
        Y_test: test labels, passed through unchanged.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, Y_train, Y_test)``.
    """
    print('Scaling data ...')
    started_at = dt.datetime.utcnow()

    scaler_path = os.path.expanduser('~/cltk_data/user_data/tlg_bow_scaler.pickle')
    fitted_scaler = preprocessing.StandardScaler().fit(X_train)
    # Persist the fitted scaler before it is applied to either partition.
    joblib.dump(fitted_scaler, scaler_path)

    scaled_train, scaled_test = (
        fitted_scaler.transform(partition) for partition in (X_train, X_test)
    )

    elapsed = dt.datetime.utcnow() - started_at
    print('... finished in {} secs.'.format(elapsed))
    print()

    return scaled_train, scaled_test, Y_train, Y_test

In [8]:
# Scale both partitions with a scaler fitted on the training partition only.
X_train_scaled, X_test_scaled, Y_train, Y_test = scale_data(X_train, X_test, Y_train, Y_test)


Scaling data ...
... finished in 0:00:02.469028 secs.

Decision tree


In [12]:
def run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test,
             max_depth=None, random_state=None):
    """Fit a scikit-learn decision tree, pickle it, and print a test-set report.

    The fitted model is written to
    ``~/cltk_data/user_data/tlg_bow_dt.pickle``. The predictions, the expected
    labels and a ``classification_report`` for the test partition are printed.

    Args:
        X_train_scaled: scaled training features.
        X_test_scaled: scaled test features.
        Y_train: training labels.
        Y_test: test labels, used as the expected values in the report.
        max_depth: forwarded to ``DecisionTreeClassifier``; the default
            ``None`` keeps the previous behaviour (no depth limit set here).
        random_state: forwarded to ``DecisionTreeClassifier``; pass an int to
            make repeated runs produce the same tree. The default ``None``
            keeps the previous, unseeded behaviour.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    print('Defining and fitting models ...')
    t0 = dt.datetime.utcnow()
    dec_tree = DecisionTreeClassifier(max_depth=max_depth,
                                      random_state=random_state)
    dec_tree.fit(X_train_scaled, Y_train)

    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_dt.pickle')
    joblib.dump(dec_tree, fp_model_pickle)

    # The reported time covers fitting and pickling, not prediction.
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()

    Y_prediction_tree = dec_tree.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction_tree)

    expected = Y_test
    print('actual_values   ', expected)

    print()
    print('----Tree_report--------------------------------')
    print(classification_report(expected, Y_prediction_tree))

    return dec_tree

In [13]:
# Train and evaluate the decision tree; the fitted classifier is kept because
# the Graphviz export further down reads it.
clf_dec_tree = run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test)


Defining and fitting models ...
... finished in 0:00:10.446019 secs.

tree_predictions  ['Grammatici' 'Scriptores Ecclesiastici' 'Comici' 'Philosophici/-ae'
 'Comici' 'Theologici' 'Philosophici/-ae' 'Tragici' 'Lyrici/-ae'
 'Scriptores Ecclesiastici' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Comici' 'Historici/-ae' 'Medici'
 'Tragici' 'Tragici' 'Scriptores Ecclesiastici' 'Philosophici/-ae'
 'Tragici' 'Scriptores Ecclesiastici' 'Tragici' 'Philosophici/-ae'
 'Tragici' 'Historici/-ae' 'Philosophici/-ae' 'Tragici' 'Tragici' 'Comici'
 'Historici/-ae' 'Scriptores Ecclesiastici' 'Comici'
 'Scriptores Ecclesiastici' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Grammatici' 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Tragici'
 'Poetae' 'Philosophici/-ae' 'Tragici' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Grammatici' 'Lyrici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Philosophici/-ae' 'Epici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Rhetorici'
 'Philosophici/-ae' 'Comici' 'Comici' 'Philosophici/-ae' 'Tragici' 'Comici'
 'Medici' 'Philosophici/-ae' 'Tragici' 'Historici/-ae' 'Grammatici'
 'Comici' 'Historici/-ae' 'Comici' 'Tragici' 'Historici/-ae' 'Tragici'
 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Epici/-ae'
 'Grammatici' 'Philosophici/-ae' 'Tragici' 'Scriptores Ecclesiastici'
 'Rhetorici' 'Rhetorici' 'Philosophici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Tragici' 'Lyrici/-ae' 'Tragici' 'Historici/-ae'
 'Historici/-ae' 'Scriptores Ecclesiastici' 'Alchemistae' 'Historici/-ae'
 'Epici/-ae' 'Theologici' 'Historici/-ae' 'Historici/-ae' 'Tragici'
 'Historici/-ae' 'Theologici' 'Comici' 'Comici' 'Tragici' 'Tragici'
 'Elegiaci' 'Medici' 'Comici' 'Tragici' 'Historici/-ae' 'Tragici' 'Tragici'
 'Medici' 'Tragici' 'Philosophici/-ae' 'Philosophici/-ae' 'Tragici'
 'Historici/-ae' 'Rhetorici' 'Tragici' 'Historici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Medici' 'Theologici' 'Comici' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Sophistae' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Theologici' 'Tragici' 'Comici'
 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Medici' 'Lyrici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Sophistae'
 'Philosophici/-ae' 'Historici/-ae' 'Grammatici' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Sophistae' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Tragici' 'Tragici' 'Philosophici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Medici' 'Historici/-ae' 'Elegiaci' 'Medici' 'Comici'
 'Tragici' 'Historici/-ae' 'Philosophici/-ae' 'Comici' 'Sophistae' 'Comici'
 'Comici' 'Historici/-ae' 'Tragici' 'Tragici' 'Tragici' 'Historici/-ae'
 'Grammatici' 'Rhetorici' 'Medici' 'Comici' 'Tragici' 'Historici/-ae'
 'Epici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Comici'
 'Philosophici/-ae' 'Medici' 'Historici/-ae' 'Philosophici/-ae' 'Comici'
 'Tragici' 'Comici' 'Historici/-ae' 'Sophistae' 'Philosophici/-ae'
 'Tragici' 'Philosophici/-ae' 'Tragici' 'Tragici' 'Historici/-ae' 'Tragici'
 'Medici' 'Grammatici' 'Tragici' 'Theologici' 'Epici/-ae'
 'Philosophici/-ae' 'Historici/-ae' 'Tragici' 'Historici/-ae'
 'Historici/-ae' 'Tragici' 'Tragici' 'Philosophici/-ae' 'Tragici'
 'Historici/-ae' 'Comici' 'Historici/-ae' 'Historici/-ae' 'Theologici'
 'Philosophici/-ae' 'Scriptores Ecclesiastici' 'Comici' 'Sophistae'
 'Lyrici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Rhetorici' 'Tragici' 'Historici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Tragici' 'Theologici' 'Historici/-ae' 'Tragici'
 'Epici/-ae' 'Rhetorici' 'Historici/-ae' 'Comici' 'Poetae' 'Historici/-ae'
 'Grammatici' 'Tragici' 'Philosophici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Rhetorici' 'Lyrici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Comici' 'Comici' 'Comici'
 'Historici/-ae' 'Tragici' 'Comici' 'Historici/-ae' 'Tragici' 'Comici'
 'Historici/-ae' 'Tragici' 'Philosophici/-ae' 'Medici' 'Medici'
 'Historici/-ae' 'Historici/-ae' 'Grammatici' 'Philosophici/-ae'
 'Philosophici/-ae' 'Comici' 'Tragici' 'Lyrici/-ae' 'Tragici'
 'Historici/-ae' 'Tragici' 'Historici/-ae' 'Tragici' 'Tragici' 'Tragici'
 'Comici' 'Historici/-ae' 'Comici' 'Comici' 'Historici/-ae' 'Epici/-ae'
 'Tragici' 'Historici/-ae' 'Tragici' 'Philosophici/-ae']
actual_values    7               Philosophici/-ae
1108    Scriptores Ecclesiastici
67                        Comici
1233            Philosophici/-ae
238                       Comici
23      Scriptores Ecclesiastici
1228                   Rhetorici
514             Philosophici/-ae
467                   Lyrici/-ae
470                   Theologici
1291               Historici/-ae
1399                    Elegiaci
1163                     Tragici
1505                      Comici
1370                 Alchemistae
657                    Epici/-ae
314             Philosophici/-ae
1461                      Comici
104                      Tragici
695                Historici/-ae
599             Philosophici/-ae
161                       Comici
949                   Theologici
753                       Comici
1460                      Medici
267                       Comici
1586                      Comici
1570                 Alchemistae
929                      Tragici
811                       Comici
                  ...           
1199               Historici/-ae
503                   Lyrici/-ae
871                    Rhetorici
673                       Medici
100                       Comici
593                Historici/-ae
1005               Historici/-ae
952             Philosophici/-ae
1077    Scriptores Ecclesiastici
622             Philosophici/-ae
1113                      Comici
944                    Epici/-ae
139                      Tragici
1568                    Elegiaci
316                Historici/-ae
512                       Comici
342                Historici/-ae
1419               Historici/-ae
351                       Comici
1400                      Comici
688                       Comici
159                    Rhetorici
1339                      Comici
940                       Comici
1253               Historici/-ae
1338                   Epici/-ae
1492                     Tragici
981                Historici/-ae
1136                     Tragici
1094                 Alchemistae
Name: epithet, dtype: object

----Tree_report--------------------------------
                          precision    recall  f1-score   support

             Alchemistae       1.00      0.14      0.25         7
                  Comici       0.68      0.51      0.58        51
                Elegiaci       0.00      0.00      0.00         8
               Epici/-ae       0.29      0.11      0.15        19
              Grammatici       0.40      0.20      0.27        20
           Historici/-ae       0.66      0.75      0.70        84
              Lyrici/-ae       0.29      0.17      0.21        12
                  Medici       0.15      0.29      0.20         7
        Philosophici/-ae       0.45      0.48      0.46        46
                  Poetae       0.00      0.00      0.00         7
               Rhetorici       0.38      0.25      0.30        12
Scriptores Ecclesiastici       0.44      0.31      0.36        13
               Sophistae       0.33      0.25      0.29         8
              Theologici       0.25      0.40      0.31         5
                 Tragici       0.15      0.60      0.24        15

             avg / total       0.49      0.45      0.45       314


In [39]:
from sklearn.tree import export_graphviz  # also `apt-get install graphviz` | also? `pip install graphviz`
from sklearn.externals.six import StringIO  
import pydotplus # pydot is not py3 compatible

In [80]:
# Write the top 5 levels of the fitted tree to a Graphviz .dot file.
# Feature names must come from X, the frame the tree was trained on:
# dataframe_bow still contains 'epithet', 'id' and 'author', so its column
# list is longer than the feature matrix and can label split nodes with the
# wrong column names.
dot_tree = os.path.expanduser('~/cltk_data/user_data/decision_tree_depth5.dot')
export_graphviz(clf_dec_tree, out_file=dot_tree, max_depth=5, feature_names=list(X.columns))

In [81]:
# Read the .dot file back in as a pydotplus graph so it can be rendered.
graph = pydotplus.graph_from_dot_file(dot_tree)

In [82]:
# Render the depth-limited tree to an SVG file.
dot_tree_svg = os.path.expanduser('~/cltk_data/user_data/decision_tree_depth5.svg')
graph.write_svg(dot_tree_svg)

# also png  -- this was breaking for full tree
# NOTE(review): the disabled call below uses write_svg with a .png path;
# write_png is presumably what was intended -- confirm before re-enabling.
# dot_tree_png = os.path.expanduser('~/cltk_data/user_data/decision_tree.png')
# graph.write_svg(dot_tree_png)


Out[82]:
True

In [83]:
# Show the rendered SVG inline in the notebook.
from IPython.display import SVG, display, Image
display(SVG(filename=dot_tree_svg))


Tree 0 ιστορεω <= -0.1299 gini = 0.8616 samples = 939 value = [19, 97, 16, 44, 46, 253, 45, 31, 178, 20, 38, 43 23, 17, 69] 1 φημι <= -0.1776 gini = 0.8855 samples = 638 value = [19, 96, 16, 41, 24, 112, 45, 20, 117, 18, 28, 16 15, 6, 65] 0->1 True 442 μηδεν <= -0.1735 gini = 0.7213 samples = 301 value = [0, 1, 0, 3, 22, 141, 0, 11, 61, 2, 10, 27, 8 11, 4] 0->442 False 2 πυθαγορειου <= 0.9731 gini = 0.8734 samples = 362 value = [8, 73, 13, 33, 9, 39, 36, 6, 49, 18, 5, 4, 4 0, 65] 1->2 321 ειμι <= -0.1928 gini = 0.8424 samples = 276 value = [11, 23, 3, 8, 15, 73, 9, 14, 68, 0, 23, 12, 11 6, 0] 1->321 3 φησι <= -0.1046 gini = 0.8731 samples = 351 value = [8, 73, 13, 33, 9, 39, 36, 6, 38, 18, 5, 4, 4 0, 65] 2->3 320 gini = 0.0 samples = 11 value = [0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0 0] 2->320 4 ευρησεις <= -0.0838 gini = 0.871 samples = 334 value = [8, 71, 13, 33, 8, 27, 36, 6, 36, 18, 5, 4, 4 0, 65] 3->4 311 πανυ <= -0.1498 gini = 0.4706 samples = 17 value = [0, 2, 0, 0, 1, 12, 0, 0, 2, 0, 0, 0, 0, 0 0] 3->311 5 παιαν <= 0.5108 gini = 0.8659 samples = 326 value = [1, 71, 13, 33, 8, 27, 36, 6, 35, 18, 5, 4, 4 0, 65] 4->5 308 μοχθηρος <= -0.0655 gini = 0.2188 samples = 8 value = [7, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] 4->308 6 (...) 5->6 307 (...) 5->307 309 (...) 308->309 310 (...) 308->310 312 λογω <= -0.1912 gini = 0.3378 samples = 15 value = [0, 0, 0, 0, 1, 12, 0, 0, 2, 0, 0, 0, 0, 0 0] 311->312 319 gini = 0.0 samples = 2 value = [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 311->319 313 (...) 312->313 316 (...) 
312->316 322 λογου <= -0.1964 gini = 0.4362 samples = 66 value = [1, 1, 0, 0, 1, 48, 2, 0, 12, 0, 0, 1, 0, 0 0] 321->322 345 περιττωματα <= 0.0618 gini = 0.8715 samples = 210 value = [10, 22, 3, 8, 14, 25, 7, 14, 56, 0, 23, 11, 11 6, 0] 321->345 323 φιλος <= -0.1627 gini = 0.3444 samples = 60 value = [0, 1, 0, 0, 1, 48, 2, 0, 7, 0, 0, 1, 0, 0 0] 322->323 342 ιον <= -0.0962 gini = 0.2778 samples = 6 value = [1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0] 322->342 324 εγω <= -0.1349 gini = 0.2838 samples = 57 value = [0, 1, 0, 0, 1, 48, 2, 0, 4, 0, 0, 1, 0, 0 0] 323->324 341 gini = 0.0 samples = 3 value = [0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0] 323->341 325 (...) 324->325 336 (...) 324->336 343 gini = 0.0 samples = 5 value = [0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0] 342->343 344 gini = 0.0 samples = 1 value = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 342->344 346 στραταω <= 0.0516 gini = 0.8616 samples = 199 value = [10, 22, 3, 8, 14, 25, 7, 3, 56, 0, 23, 11, 11 6, 0] 345->346 441 gini = 0.0 samples = 11 value = [0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0 0] 345->441 347 μειδιου <= 0.2915 gini = 0.8562 samples = 187 value = [10, 22, 3, 8, 14, 14, 7, 3, 56, 0, 23, 10, 11 6, 0] 346->347 438 εισηκουσθη <= 0.7248 gini = 0.1528 samples = 12 value = [0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 1, 0, 0 0] 346->438 348 (...) 347->348 435 (...) 347->435 439 (...) 438->439 440 (...) 
438->440 443 γραφω <= -0.1646 gini = 0.2583 samples = 112 value = [0, 0, 0, 1, 6, 96, 0, 0, 7, 1, 0, 1, 0, 0 0] 442->443 468 χριστον <= 0.0049 gini = 0.8236 samples = 189 value = [0, 1, 0, 2, 16, 45, 0, 11, 54, 1, 10, 26, 8 11, 4] 442->468 444 περιστασιν <= 0.136 gini = 0.1363 samples = 97 value = [0, 0, 0, 0, 1, 90, 0, 0, 5, 0, 0, 1, 0, 0 0] 443->444 455 ελλην <= -0.1641 gini = 0.7022 samples = 15 value = [0, 0, 0, 1, 5, 6, 0, 0, 2, 1, 0, 0, 0, 0, 0] 443->455 445 ελεγξας <= 0.2365 gini = 0.1013 samples = 95 value = [0, 0, 0, 0, 1, 90, 0, 0, 3, 0, 0, 1, 0, 0 0] 444->445 454 gini = 0.0 samples = 2 value = [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0] 444->454 446 παιζει <= 1.0614 gini = 0.0631 samples = 93 value = [0, 0, 0, 0, 1, 90, 0, 0, 1, 0, 0, 1, 0, 0 0] 445->446 453 gini = 0.0 samples = 2 value = [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0] 445->453 447 (...) 446->447 452 (...) 446->452 456 βιβλιω <= -0.0719 gini = 0.5 samples = 6 value = [0, 0, 0, 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] 455->456 461 αθηνας <= -0.0381 gini = 0.5185 samples = 9 value = [0, 0, 0, 0, 1, 6, 0, 0, 1, 1, 0, 0, 0, 0, 0] 455->461 457 gini = 0.0 samples = 4 value = [0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 456->457 458 ακουω <= -0.0917 gini = 0.5 samples = 2 value = [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] 456->458 459 (...) 458->459 460 (...) 458->460 462 gini = 0.0 samples = 6 value = [0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0] 461->462 463 λινος <= 1.4049 gini = 0.6667 samples = 3 value = [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0] 461->463 464 (...) 463->464 467 (...) 
463->467 469 αισθητα <= -0.0313 gini = 0.78 samples = 151 value = [0, 1, 0, 2, 16, 39, 0, 11, 54, 1, 9, 6, 8, 0 4] 468->469 530 σαρδεις <= 0.298 gini = 0.6136 samples = 38 value = [0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 1, 20, 0, 11 0] 468->530 470 βασιλειας <= -0.0016 gini = 0.815 samples = 121 value = [0, 1, 0, 2, 16, 39, 0, 11, 25, 1, 8, 6, 8, 0 4] 469->470 527 ζημιουμεθα <= 2.1885 gini = 0.0644 samples = 30 value = [0, 0, 0, 0, 0, 0, 0, 0, 29, 0, 1, 0, 0, 0 0] 469->527 471 στομαχω <= 0.2519 gini = 0.8455 samples = 96 value = [0, 1, 0, 2, 16, 18, 0, 11, 24, 1, 8, 5, 6, 0 4] 470->471 520 πιθηκων <= 4.5966 gini = 0.2848 samples = 25 value = [0, 0, 0, 0, 0, 21, 0, 0, 1, 0, 0, 1, 2, 0 0] 470->520 472 (...) 471->472 517 (...) 471->517 521 (...) 520->521 522 (...) 520->522 528 gini = 0.0 samples = 29 value = [0, 0, 0, 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, 0 0] 527->528 529 gini = 0.0 samples = 1 value = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] 527->529 531 πλασματι <= 3.2031 gini = 0.4444 samples = 30 value = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 10 0] 530->531 538 επιβηση <= 1.5373 gini = 0.4062 samples = 8 value = [0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 1, 0, 0, 1, 0] 530->538 532 πηγαζω <= -0.1082 gini = 0.32 samples = 25 value = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 5 0] 531->532 537 gini = 0.0 samples = 5 value = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0] 531->537 533 (...) 532->533 534 (...) 532->534 539 gini = 0.0 samples = 6 value = [0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0] 538->539 540 τρισσην <= 2.7234 gini = 0.5 samples = 2 value = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0] 538->540 541 (...) 540->541 542 (...) 540->542

Random forest


In [86]:
def run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test,
                      n_estimators=30, random_state=None):
    """Fit a scikit-learn random forest, pickle it, and print a test-set report.

    The predictions, the expected labels and a ``classification_report`` for
    the test partition are printed.

    Args:
        X_train_scaled: scaled training features.
        X_test_scaled: scaled test features.
        Y_train: training labels.
        Y_test: test labels, used as the expected values in the report.
        n_estimators: number of trees, forwarded to
            ``RandomForestClassifier``. Defaults to 30, the value that was
            previously hard-coded.
        random_state: forwarded to ``RandomForestClassifier``; pass an int to
            make repeated runs reproducible. The default ``None`` keeps the
            previous, unseeded behaviour.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    t0 = dt.datetime.utcnow()

    # Train
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 random_state=random_state)
    clf.fit(X_train_scaled, Y_train)

    # NOTE(review): 'fandom' in the file name looks like a typo for 'random';
    # it is left as is so anything already reading this path keeps working.
    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_fandom_forest.pickle')
    joblib.dump(clf, fp_model_pickle)

    # The reported time covers fitting and pickling, not prediction.
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()

    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)

    expected = Y_test
    print('actual_values   ', expected)

    print()
    print('----Random forest report--------------------------------')
    print(classification_report(expected, Y_prediction))

    return clf

In [87]:
# Train and evaluate the random forest on the same scaled split as the
# decision tree, keeping the fitted model.
clf_random_forest = run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test)


... finished in 0:00:04.907945 secs.

tree_predictions  ['Philosophici/-ae' 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Comici'
 'Historici/-ae' 'Philosophici/-ae' 'Lyrici/-ae' 'Comici'
 'Scriptores Ecclesiastici' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Comici' 'Historici/-ae' 'Historici/-ae' 'Comici'
 'Tragici' 'Tragici' 'Scriptores Ecclesiastici' 'Philosophici/-ae'
 'Tragici' 'Scriptores Ecclesiastici' 'Tragici' 'Philosophici/-ae'
 'Tragici' 'Comici' 'Philosophici/-ae' 'Tragici' 'Tragici' 'Historici/-ae'
 'Historici/-ae' 'Scriptores Ecclesiastici' 'Philosophici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Grammatici' 'Historici/-ae' 'Comici' 'Historici/-ae' 'Tragici'
 'Historici/-ae' 'Historici/-ae' 'Tragici' 'Historici/-ae'
 'Philosophici/-ae' 'Historici/-ae' 'Scriptores Ecclesiastici' 'Comici'
 'Philosophici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Epici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Comici' 'Historici/-ae'
 'Philosophici/-ae' 'Tragici' 'Comici' 'Medici' 'Historici/-ae' 'Comici'
 'Historici/-ae' 'Grammatici' 'Historici/-ae' 'Historici/-ae' 'Comici'
 'Tragici' 'Historici/-ae' 'Tragici' 'Historici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Grammatici' 'Philosophici/-ae' 'Tragici'
 'Philosophici/-ae' 'Philosophici/-ae' 'Philosophici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Tragici' 'Philosophici/-ae' 'Tragici'
 'Grammatici' 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Epici/-ae' 'Scriptores Ecclesiastici' 'Historici/-ae'
 'Historici/-ae' 'Tragici' 'Philosophici/-ae' 'Philosophici/-ae' 'Comici'
 'Comici' 'Tragici' 'Tragici' 'Grammatici' 'Philosophici/-ae' 'Comici'
 'Tragici' 'Historici/-ae' 'Tragici' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae' 'Tragici'
 'Historici/-ae' 'Historici/-ae' 'Tragici' 'Historici/-ae' 'Comici'
 'Philosophici/-ae' 'Comici' 'Scriptores Ecclesiastici' 'Tragici' 'Comici'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Scriptores Ecclesiastici' 'Epici/-ae' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Grammatici'
 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Comici'
 'Historici/-ae' 'Historici/-ae' 'Tragici' 'Comici' 'Philosophici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae'
 'Comici' 'Comici' 'Comici' 'Tragici' 'Historici/-ae' 'Philosophici/-ae'
 'Comici' 'Philosophici/-ae' 'Epici/-ae' 'Comici' 'Historici/-ae' 'Comici'
 'Tragici' 'Tragici' 'Epici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Medici' 'Comici' 'Comici' 'Historici/-ae' 'Epici/-ae' 'Historici/-ae'
 'Lyrici/-ae' 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Tragici' 'Comici' 'Epici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Comici'
 'Philosophici/-ae' 'Tragici' 'Tragici' 'Historici/-ae' 'Tragici'
 'Historici/-ae' 'Grammatici' 'Tragici' 'Scriptores Ecclesiastici'
 'Epici/-ae' 'Historici/-ae' 'Historici/-ae' 'Tragici' 'Historici/-ae'
 'Historici/-ae' 'Tragici' 'Tragici' 'Tragici' 'Historici/-ae'
 'Historici/-ae' 'Comici' 'Historici/-ae' 'Historici/-ae'
 'Scriptores Ecclesiastici' 'Philosophici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Epici/-ae' 'Tragici' 'Historici/-ae' 'Rhetorici'
 'Philosophici/-ae' 'Comici' 'Tragici' 'Historici/-ae' 'Historici/-ae'
 'Comici' 'Philosophici/-ae' 'Philosophici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Comici'
 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Comici' 'Philosophici/-ae' 'Comici'
 'Historici/-ae' 'Tragici' 'Philosophici/-ae' 'Historici/-ae' 'Tragici'
 'Comici' 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Medici'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Comici' 'Lyrici/-ae' 'Tragici'
 'Lyrici/-ae' 'Historici/-ae' 'Tragici' 'Historici/-ae' 'Tragici' 'Tragici'
 'Tragici' 'Comici' 'Rhetorici' 'Comici' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Tragici'
 'Philosophici/-ae']
actual_values    7               Philosophici/-ae
1108    Scriptores Ecclesiastici
67                        Comici
1233            Philosophici/-ae
238                       Comici
23      Scriptores Ecclesiastici
1228                   Rhetorici
514             Philosophici/-ae
467                   Lyrici/-ae
470                   Theologici
1291               Historici/-ae
1399                    Elegiaci
1163                     Tragici
1505                      Comici
1370                 Alchemistae
657                    Epici/-ae
314             Philosophici/-ae
1461                      Comici
104                      Tragici
695                Historici/-ae
599             Philosophici/-ae
161                       Comici
949                   Theologici
753                       Comici
1460                      Medici
267                       Comici
1586                      Comici
1570                 Alchemistae
929                      Tragici
811                       Comici
                  ...           
1199               Historici/-ae
503                   Lyrici/-ae
871                    Rhetorici
673                       Medici
100                       Comici
593                Historici/-ae
1005               Historici/-ae
952             Philosophici/-ae
1077    Scriptores Ecclesiastici
622             Philosophici/-ae
1113                      Comici
944                    Epici/-ae
139                      Tragici
1568                    Elegiaci
316                Historici/-ae
512                       Comici
342                Historici/-ae
1419               Historici/-ae
351                       Comici
1400                      Comici
688                       Comici
159                    Rhetorici
1339                      Comici
940                       Comici
1253               Historici/-ae
1338                   Epici/-ae
1492                     Tragici
981                Historici/-ae
1136                     Tragici
1094                 Alchemistae
Name: epithet, dtype: object

----Random forest report--------------------------------
                          precision    recall  f1-score   support

             Alchemistae       0.00      0.00      0.00         7
                  Comici       0.70      0.63      0.66        51
                Elegiaci       0.00      0.00      0.00         8
               Epici/-ae       0.44      0.21      0.29        19
              Grammatici       0.71      0.25      0.37        20
           Historici/-ae       0.65      0.95      0.77        84
              Lyrici/-ae       0.00      0.00      0.00        12
                  Medici       0.67      0.29      0.40         7
        Philosophici/-ae       0.51      0.70      0.59        46
                  Poetae       0.00      0.00      0.00         7
               Rhetorici       1.00      0.17      0.29        12
Scriptores Ecclesiastici       0.50      0.38      0.43        13
               Sophistae       0.00      0.00      0.00         8
              Theologici       0.00      0.00      0.00         5
                 Tragici       0.22      0.67      0.33        15

             avg / total       0.52      0.55      0.49       314

/root/venv/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

AdaBoost


In [90]:
def run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test, n_estimators=30):
    """Train, persist and evaluate a scikit-learn AdaBoost classifier.

    Boosts depth-3 decision trees, dumps the fitted model with joblib to
    ``~/cltk_data/user_data/tlg_bow_ada_boost.pickle``, then prints the
    test-set predictions, the true labels, a classification report and the
    elapsed wall-clock time.

    For plotting see:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_iris.html

    Args:
        X_train_scaled: Training features, passed to ``fit``.
        X_test_scaled: Test features, passed to ``predict``.
        Y_train: Training labels, passed to ``fit``.
        Y_test: True test labels, compared against the predictions.
        n_estimators: Number of boosting rounds. Defaults to 30, the value
            that used to be hard-coded; experiment with it.

    Returns:
        None. Results are printed and the model is written to disk.
    """
    t0 = dt.datetime.utcnow()

    ada_classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                        n_estimators=n_estimators)

    # Train. ``fit`` returns the estimator itself, so ``clf`` and
    # ``ada_classifier`` are the same fitted object.
    clf = ada_classifier.fit(X_train_scaled, Y_train)

    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_ada_boost.pickle')
    joblib.dump(clf, fp_model_pickle)

    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)

    expected = Y_test
    print('actual_values   ', expected)

    print()
    print(classification_report(expected, Y_prediction))

    # The elapsed value is a ``timedelta``, so it renders as H:MM:SS.ffffff.
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()

In [91]:
# Fit and evaluate the AdaBoost model on the train/test split.
run_ada_boost(X_train_scaled=X_train_scaled,
              X_test_scaled=X_test_scaled,
              Y_train=Y_train,
              Y_test=Y_test)


tree_predictions  ['Comici' 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Comici'
 'Historici/-ae' 'Philosophici/-ae' 'Comici' 'Philosophici/-ae'
 'Scriptores Ecclesiastici' 'Historici/-ae' 'Historici/-ae' 'Comici'
 'Philosophici/-ae' 'Comici' 'Historici/-ae' 'Comici' 'Comici' 'Comici'
 'Scriptores Ecclesiastici' 'Philosophici/-ae' 'Comici'
 'Scriptores Ecclesiastici' 'Comici' 'Philosophici/-ae' 'Comici'
 'Historici/-ae' 'Alchemistae' 'Comici' 'Comici' 'Comici' 'Historici/-ae'
 'Scriptores Ecclesiastici' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae' 'Sophistae' 'Comici'
 'Philosophici/-ae' 'Comici' 'Philosophici/-ae' 'Philosophici/-ae' 'Comici'
 'Philosophici/-ae' 'Philosophici/-ae' 'Historici/-ae'
 'Scriptores Ecclesiastici' 'Comici' 'Sophistae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Theologici' 'Epici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Medici' 'Philosophici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Comici' 'Comici' 'Sophistae' 'Philosophici/-ae'
 'Comici' 'Comici' 'Philosophici/-ae' 'Comici' 'Historici/-ae' 'Comici'
 'Comici' 'Historici/-ae' 'Comici' 'Historici/-ae' 'Grammatici'
 'Historici/-ae' 'Historici/-ae' 'Grammatici' 'Philosophici/-ae' 'Comici'
 'Grammatici' 'Philosophici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Philosophici/-ae' 'Comici' 'Comici' 'Comici'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Medici' 'Historici/-ae'
 'Comici' 'Scriptores Ecclesiastici' 'Comici' 'Historici/-ae' 'Comici'
 'Medici' 'Philosophici/-ae' 'Comici' 'Comici' 'Comici' 'Comici'
 'Philosophici/-ae' 'Philosophici/-ae' 'Comici' 'Comici' 'Philosophici/-ae'
 'Comici' 'Rhetorici' 'Historici/-ae' 'Comici' 'Historici/-ae'
 'Philosophici/-ae' 'Comici' 'Historici/-ae' 'Philosophici/-ae' 'Comici'
 'Historici/-ae' 'Comici' 'Medici' 'Comici' 'Scriptores Ecclesiastici'
 'Comici' 'Comici' 'Historici/-ae' 'Historici/-ae' 'Grammatici'
 'Historici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Comici' 'Philosophici/-ae' 'Historici/-ae'
 'Theologici' 'Comici' 'Comici' 'Philosophici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Comici' 'Comici' 'Comici' 'Philosophici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Comici' 'Historici/-ae' 'Comici' 'Comici' 'Comici'
 'Philosophici/-ae' 'Historici/-ae' 'Epici/-ae' 'Alchemistae'
 'Historici/-ae' 'Comici' 'Comici' 'Comici' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Comici' 'Philosophici/-ae'
 'Historici/-ae' 'Comici' 'Comici' 'Comici' 'Historici/-ae' 'Historici/-ae'
 'Medici' 'Medici' 'Comici' 'Comici' 'Historici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae' 'Comici'
 'Philosophici/-ae' 'Philosophici/-ae' 'Historici/-ae' 'Philosophici/-ae'
 'Comici' 'Comici' 'Comici' 'Comici' 'Sophistae' 'Philosophici/-ae'
 'Comici' 'Philosophici/-ae' 'Comici' 'Comici' 'Historici/-ae' 'Comici'
 'Comici' 'Grammatici' 'Comici' 'Theologici' 'Comici' 'Comici'
 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Historici/-ae' 'Comici'
 'Comici' 'Comici' 'Comici' 'Historici/-ae' 'Philosophici/-ae'
 'Philosophici/-ae' 'Historici/-ae' 'Scriptores Ecclesiastici'
 'Philosophici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae'
 'Comici' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Philosophici/-ae' 'Comici' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Comici' 'Historici/-ae' 'Philosophici/-ae' 'Comici'
 'Comici' 'Philosophici/-ae' 'Comici' 'Comici' 'Comici' 'Historici/-ae'
 'Theologici' 'Comici' 'Philosophici/-ae' 'Philosophici/-ae'
 'Historici/-ae' 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae'
 'Comici' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae' 'Historici/-ae'
 'Historici/-ae' 'Philosophici/-ae' 'Philosophici/-ae' 'Philosophici/-ae'
 'Comici' 'Philosophici/-ae' 'Comici' 'Comici' 'Historici/-ae' 'Comici'
 'Comici' 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Philosophici/-ae'
 'Comici' 'Historici/-ae' 'Historici/-ae' 'Sophistae' 'Comici'
 'Philosophici/-ae' 'Comici' 'Philosophici/-ae' 'Comici' 'Comici'
 'Historici/-ae' 'Comici' 'Philosophici/-ae' 'Comici' 'Comici' 'Comici'
 'Philosophici/-ae' 'Philosophici/-ae' 'Comici' 'Historici/-ae'
 'Historici/-ae' 'Comici' 'Comici' 'Sophistae' 'Comici' 'Historici/-ae']
actual_values    7               Philosophici/-ae
1108    Scriptores Ecclesiastici
67                        Comici
1233            Philosophici/-ae
238                       Comici
23      Scriptores Ecclesiastici
1228                   Rhetorici
514             Philosophici/-ae
467                   Lyrici/-ae
470                   Theologici
1291               Historici/-ae
1399                    Elegiaci
1163                     Tragici
1505                      Comici
1370                 Alchemistae
657                    Epici/-ae
314             Philosophici/-ae
1461                      Comici
104                      Tragici
695                Historici/-ae
599             Philosophici/-ae
161                       Comici
949                   Theologici
753                       Comici
1460                      Medici
267                       Comici
1586                      Comici
1570                 Alchemistae
929                      Tragici
811                       Comici
                  ...           
1199               Historici/-ae
503                   Lyrici/-ae
871                    Rhetorici
673                       Medici
100                       Comici
593                Historici/-ae
1005               Historici/-ae
952             Philosophici/-ae
1077    Scriptores Ecclesiastici
622             Philosophici/-ae
1113                      Comici
944                    Epici/-ae
139                      Tragici
1568                    Elegiaci
316                Historici/-ae
512                       Comici
342                Historici/-ae
1419               Historici/-ae
351                       Comici
1400                      Comici
688                       Comici
159                    Rhetorici
1339                      Comici
940                       Comici
1253               Historici/-ae
1338                   Epici/-ae
1492                     Tragici
981                Historici/-ae
1136                     Tragici
1094                 Alchemistae
Name: epithet, dtype: object

                          precision    recall  f1-score   support

             Alchemistae       1.00      0.29      0.44         7
                  Comici       0.34      0.78      0.48        51
                Elegiaci       0.00      0.00      0.00         8
               Epici/-ae       0.00      0.00      0.00        19
              Grammatici       0.40      0.10      0.16        20
           Historici/-ae       0.60      0.58      0.59        84
              Lyrici/-ae       0.00      0.00      0.00        12
                  Medici       0.17      0.14      0.15         7
        Philosophici/-ae       0.30      0.54      0.39        46
                  Poetae       0.00      0.00      0.00         7
               Rhetorici       0.00      0.00      0.00        12
Scriptores Ecclesiastici       0.50      0.31      0.38        13
               Sophistae       0.00      0.00      0.00         8
              Theologici       0.25      0.20      0.22         5
                 Tragici       0.00      0.00      0.00        15

             avg / total       0.34      0.39      0.34       314

... finished in 0:02:21.417754 secs.

/root/venv/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

SVC


In [93]:
def run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Fit a scikit-learn linear SVC, save it, and print an evaluation.

    The fitted model is dumped with joblib to
    ``~/cltk_data/user_data/tlg_bow_svc.pickle``. Predictions for the test
    features, the true labels, a classification report and the elapsed
    wall-clock time are printed; nothing is returned.

    Args:
        X_train_scaled: Training features, passed to ``fit``.
        X_test_scaled: Test features, passed to ``predict``.
        Y_train: Training labels, passed to ``fit``.
        Y_test: True test labels, compared against the predictions.
    """
    print('Defining and fitting SVC model ...')
    started_at = dt.datetime.utcnow()

    # The model is defined with a hand-picked parameter (C=100.); it could
    # instead be learned from the data.
    classifier = svm.LinearSVC(C=100.).fit(X_train_scaled, Y_train)

    model_path = os.path.expanduser('~/cltk_data/user_data/tlg_bow_svc.pickle')
    joblib.dump(classifier, model_path)

    predicted_labels = classifier.predict(X_test_scaled)
    print('svc_predictions ', predicted_labels)
    print('actual_values   ', Y_test)

    print()
    print('----SVC_report--------------------------------')
    report = classification_report(Y_test, predicted_labels)
    print(report)

    # The elapsed value is a ``timedelta`` and is formatted as such.
    elapsed = dt.datetime.utcnow() - started_at
    print('... finished in {} secs.'.format(elapsed))
    print()

In [ ]:
# Fit and evaluate the linear SVC on the train/test split.
# NOTE(review): this cell has no execution count and its saved output stops
# after the first progress message, so the run appears not to have
# completed -- confirm before relying on any SVC results.
run_svc(X_train_scaled=X_train_scaled,
        X_test_scaled=X_test_scaled,
        Y_train=Y_train,
        Y_test=Y_test)


Defining and fitting SVC model ...

In [ ]: