In [28]:
# Load packages
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [29]:
# Load MS Excel file
dataset=pd.read_excel('matrix.xlsx')
dataset=dataset[['ggroup','Filename','MDA']]
print(dataset)


      ggroup                                           Filename  \
0       2030  1994_Q1_100885_UNION PACIFIC CORP_10-K_1994-03-29   
1       2010  1994_Q1_101829_UNITED TECHNOLOGIES CORP -DE-_1...   
2       2010            1994_Q1_12927_BOEING CO_10-K_1994-03-17   
3       2010  1994_Q1_15840_BUTLER MANUFACTURING CO_10-K_199...   
4       2010      1994_Q1_18230_CATERPILLAR INC_10-K_1994-03-02   
5       2010         1994_Q1_217346_TEXTRON INC_10-K_1994-03-30   
6       2010  1994_Q1_26172_CUMMINS ENGINE CO INC_10-K_1994-...   
7       2010    1994_Q1_277135_GRAINGER W W INC_10-K_1994-03-28   
8       2010  1994_Q1_277509_FEDERAL SIGNAL CORP -DE-_10-K_1...   
9       2030  1994_Q1_278041_INTERNATIONAL SHIPHOLDING CORP_...   
10      2020          1994_Q1_27996_DELUXE CORP_10-K_1994-03-31   
11      2020  1994_Q1_310431_CBI INDUSTRIES INC -DE-_10-K_19...   
12      2020  1994_Q1_315213_HALF ROBERT INTERNATIONAL INC -...   
13      2010  1994_Q1_33619_ESTERLINE TECHNOLOGIES CORP_10-K...   
14      2010  1994_Q1_40533_GENERAL DYNAMICS CORP_10-K_1994-...   
15      2010  1994_Q1_40545_GENERAL ELECTRIC CO_10-K_1994-03-11   
16      2020    1994_Q1_45599_HARLAND JOHN H CO_10-K_1994-03-30   
17      2010          1994_Q1_45876_HARSCO CORP_10-K_1994-03-29   
18      2010        1994_Q1_48305_HONEYWELL INC_10-K_1994-03-08   
19      2010          1994_Q1_48898_HUBBELL INC_10-K_1994-03-25   
20      2010  1994_Q1_49826_ILLINOIS TOOL WORKS INC_10-K_199...   
21      2020           1994_Q1_52466_IONICS INC_10-K_1994-03-29   
22      2010           1994_Q1_54381_KAMAN CORP_10-K_1994-03-11   
23      2020   1994_Q1_55135_KELLY SERVICES INC_10-K_1994-03-14   
24      2030           1994_Q1_56047_KIRBY CORP_10-K_1994-03-15   
25      2010      1994_Q1_62996_MASCO CORP -DE-_10-K_1994-03-25   
26      2030  1994_Q1_700674_AIR EXPRESS INTERNATIONAL CORP ...   
27      2030  1994_Q1_702165_NORFOLK SOUTHERN CORP_10-K_1994...   
28      2020  1994_Q1_714278_INFORMATION RESOURCES INC_10-K_...   
29      2010         1994_Q1_72331_NORDSON CORP_10-K_1994-01-28   
...      ...                                                ...   
5277    2010  2014_Q4_1423221_Quanex Building Products CORP_...   
5278    2020           2014_Q4_1546640_ADT Corp_10-K_2014-11-12   
5279    2020         2014_Q4_25212_COURIER Corp_10-K_2014-12-01   
5280    2010      2014_Q4_26076_CUBIC CORP -DE-_10-K_2014-11-26   
5281    2010  2014_Q4_32604_EMERSON ELECTRIC CO_10-K_2014-11-19   
5282    2010  2014_Q4_33619_ESTERLINE TECHNOLOGIES CORP_10-K...   
5283    2010           2014_Q4_46619_HEICO CORP_10-K_2014-12-18   
5284    2010         2014_Q4_50725_GRIFFON CORP_10-K_2014-11-13   
5285    2010  2014_Q4_52988_JACOBS ENGINEERING GROUP INC -DE...   
5286    2020  2014_Q4_63296_MATTHEWS INTERNATIONAL CORP_10-K...   
5287    2010  2014_Q4_64472_GENCOR INDUSTRIES INC_10-K_2014-...   
5288    2010            2014_Q4_67887_MOOG INC._10-K_2014-11-10   
5289    2010          2014_Q4_6955_ACTUANT CORP_10-K_2014-10-27   
5290    2020       2014_Q4_717954_UNIFIRST CORP_10-K_2014-10-29   
5291    2010         2014_Q4_72331_NORDSON CORP_10-K_2014-12-15   
5292    2010             2014_Q4_737758_TORO CO_10-K_2014-12-22   
5293    2010  2014_Q4_764401_INSTEEL INDUSTRIES INC_10-K_201...   
5294    2020  2014_Q4_771497_ABM INDUSTRIES INC -DE-_10-K_20...   
5295    2010        2014_Q4_775158_OSHKOSH CORP_10-K_2014-11-13   
5296    2010      2014_Q4_801898_JOY GLOBAL INC_10-K_2014-12-19   
5297    2010  2014_Q4_80420_POWELL INDUSTRIES INC_10-K_2014-...   
5298    2010  2014_Q4_808450_NAVISTAR INTERNATIONAL CORP_10-...   
5299    2020      2014_Q4_831641_TETRA TECH INC_10-K_2014-11-19   
5300    2020  2014_Q4_833444_TYCO INTERNATIONAL LTD_10-K_201...   
5301    2010  2014_Q4_866706_ESCO TECHNOLOGIES INC_10-K_2014...   
5302    2010  2014_Q4_868857_AECOM TECHNOLOGY CORP_10-K_2014...   
5303    2010  2014_Q4_883902_NCI BUILDING SYSTEMS INC_10-K_2...   
5304    2020   2014_Q4_886206_FRANKLIN COVEY CO_10-K_2014-11-14   
5305    2010  2014_Q4_906193_KEY TECHNOLOGY INC_10-K_2014-12-12   
5306    2010  2014_Q4_923120_GREENBRIER COMPANIES INC_10-K_2...   

                                                    MDA  
0     Item Management s Discussion and Analysis of F...  
1     Item Management s Discussion and Analysis of R...  
2     Item Management s Discussion and Analysis of F...  
3     Item Management s Discussion and Analysis of F...  
4     ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5     ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
6     ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF R...  
7     Item Management s Discussion and Analysis of F...  
8     Item Management s Discussion and Analysis of F...  
9     ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
10    Item Management s Discussion and Analysis of F...  
11    Item Management s Discussion and Analysis of F...  
12    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
13    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
14    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
15    Item Management s Discussion and Analysis of F...  
16    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
17    Item Management s Discussion of Financial Cond...  
18    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
19    Item Management s Discussion and Analysis of F...  
20    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
21    Item MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
22    Item Management s Discussion and Analysis of F...  
23    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
24    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
25    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
26    Item Management s Discussion and Analysis of F...  
27    Item Management s Discussion and Analysis of F...  
28    ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
29    Item Management s Discussion and Analysis of F...  
...                                                 ...  
5277  Item Management s Discussion and Analysis of F...  
5278   Table of Contents Item Management s Discussio...  
5279  Item Management s Discussion and Analysis of F...  
5280  Item MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5281  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5282  Item Management s Discussion and Analysis of F...  
5283   Index Item MANAGEMENT S DISCUSSION AND ANALYS...  
5284  Item Management s Discussion and Analysis of F...  
5285   Table of Contents Item MANAGEMENT S DISCUSSIO...  
5286  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS cont...  
5287  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5288  Item Management s Discussion and Analysis of F...  
5289   Table of Contents Item Management s Discussio...  
5290   ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF ...  
5291   Management s Discussion and Analysis of Finan...  
5292  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5293  Item Management s Discussion and Analysis of F...  
5294  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5295  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5296   Item Management s Discussion and Analysis of ...  
5297   Payments Due by Period Payments Due by Period...  
5298   Item Management s Discussion and Analysis of ...  
5299  Item Management s Discussion and Analysis of F...  
5300   Table of Contents Item Management s Discussio...  
5301   Filtration PTI Technologies Inc PTI VACCO Ind...  
5302  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5303  Item Management s Discussion and Analysis of F...  
5304  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5305  ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  
5306  Item MANAGEMENT S DISCUSSION AND ANALYSIS OF F...  

[5307 rows x 3 columns]

In [30]:
# Split the dataset in train/test ratio: 0.20
train_set, test_set = train_test_split(dataset, test_size = 0.20)

In [31]:
# Create the classifier
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(train_set.MDA.values)
classifier = MultinomialNB(fit_prior="False")

In [32]:
# Train the classifier
classifier.fit(counts, train_set.ggroup)


Out[32]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior='False')

In [33]:
# Test the classifier
predictions = classifier.predict(vectorizer.transform(test_set.MDA.values)) 
test_set_pred = pd.Series(predictions, index=test_set.index)

tab = pd.crosstab(test_set.ggroup, test_set_pred, rownames=['Actual'], colnames=['Predicted'], margins=True) # Print confusion matrix
print(tab)


Predicted  2010  2020  2030   All
Actual                           
2010        607    55     5   667
2020         19   213     1   233
2030         15     9   138   162
All         641   277   144  1062

In [ ]: