In [28]:
# Load packages
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
In [29]:
# Load MS Excel file
dataset=pd.read_excel('matrix.xlsx')
dataset=dataset[['ggroup','Filename','MDA']]
print(dataset)
ggroup Filename \
0 2030 1994_Q1_100885_UNION PACIFIC CORP_10-K_1994-03-29
1 2010 1994_Q1_101829_UNITED TECHNOLOGIES CORP -DE-_1...
2 2010 1994_Q1_12927_BOEING CO_10-K_1994-03-17
3 2010 1994_Q1_15840_BUTLER MANUFACTURING CO_10-K_199...
4 2010 1994_Q1_18230_CATERPILLAR INC_10-K_1994-03-02
5 2010 1994_Q1_217346_TEXTRON INC_10-K_1994-03-30
6 2010 1994_Q1_26172_CUMMINS ENGINE CO INC_10-K_1994-...
7 2010 1994_Q1_277135_GRAINGER W W INC_10-K_1994-03-28
8 2010 1994_Q1_277509_FEDERAL SIGNAL CORP -DE-_10-K_1...
9 2030 1994_Q1_278041_INTERNATIONAL SHIPHOLDING CORP_...
10 2020 1994_Q1_27996_DELUXE CORP_10-K_1994-03-31
11 2020 1994_Q1_310431_CBI INDUSTRIES INC -DE-_10-K_19...
12 2020 1994_Q1_315213_HALF ROBERT INTERNATIONAL INC -...
13 2010 1994_Q1_33619_ESTERLINE TECHNOLOGIES CORP_10-K...
14 2010 1994_Q1_40533_GENERAL DYNAMICS CORP_10-K_1994-...
15 2010 1994_Q1_40545_GENERAL ELECTRIC CO_10-K_1994-03-11
16 2020 1994_Q1_45599_HARLAND JOHN H CO_10-K_1994-03-30
17 2010 1994_Q1_45876_HARSCO CORP_10-K_1994-03-29
18 2010 1994_Q1_48305_HONEYWELL INC_10-K_1994-03-08
19 2010 1994_Q1_48898_HUBBELL INC_10-K_1994-03-25
20 2010 1994_Q1_49826_ILLINOIS TOOL WORKS INC_10-K_199...
21 2020 1994_Q1_52466_IONICS INC_10-K_1994-03-29
22 2010 1994_Q1_54381_KAMAN CORP_10-K_1994-03-11
23 2020 1994_Q1_55135_KELLY SERVICES INC_10-K_1994-03-14
24 2030 1994_Q1_56047_KIRBY CORP_10-K_1994-03-15
25 2010 1994_Q1_62996_MASCO CORP -DE-_10-K_1994-03-25
26 2030 1994_Q1_700674_AIR EXPRESS INTERNATIONAL CORP ...
27 2030 1994_Q1_702165_NORFOLK SOUTHERN CORP_10-K_1994...
28 2020 1994_Q1_714278_INFORMATION RESOURCES INC_10-K_...
29 2010 1994_Q1_72331_NORDSON CORP_10-K_1994-01-28
... ... ...
5277 2010 2014_Q4_1423221_Quanex Building Products CORP_...
5278 2020 2014_Q4_1546640_ADT Corp_10-K_2014-11-12
5279 2020 2014_Q4_25212_COURIER Corp_10-K_2014-12-01
5280 2010 2014_Q4_26076_CUBIC CORP -DE-_10-K_2014-11-26
5281 2010 2014_Q4_32604_EMERSON ELECTRIC CO_10-K_2014-11-19
5282 2010 2014_Q4_33619_ESTERLINE TECHNOLOGIES CORP_10-K...
5283 2010 2014_Q4_46619_HEICO CORP_10-K_2014-12-18
5284 2010 2014_Q4_50725_GRIFFON CORP_10-K_2014-11-13
5285 2010 2014_Q4_52988_JACOBS ENGINEERING GROUP INC -DE...
5286 2020 2014_Q4_63296_MATTHEWS INTERNATIONAL CORP_10-K...
5287 2010 2014_Q4_64472_GENCOR INDUSTRIES INC_10-K_2014-...
5288 2010 2014_Q4_67887_MOOG INC._10-K_2014-11-10
5289 2010 2014_Q4_6955_ACTUANT CORP_10-K_2014-10-27
5290 2020 2014_Q4_717954_UNIFIRST CORP_10-K_2014-10-29
5291 2010 2014_Q4_72331_NORDSON CORP_10-K_2014-12-15
5292 2010 2014_Q4_737758_TORO CO_10-K_2014-12-22
5293 2010 2014_Q4_764401_INSTEEL INDUSTRIES INC_10-K_201...
5294 2020 2014_Q4_771497_ABM INDUSTRIES INC -DE-_10-K_20...
5295 2010 2014_Q4_775158_OSHKOSH CORP_10-K_2014-11-13
5296 2010 2014_Q4_801898_JOY GLOBAL INC_10-K_2014-12-19
5297 2010 2014_Q4_80420_POWELL INDUSTRIES INC_10-K_2014-...
5298 2010 2014_Q4_808450_NAVISTAR INTERNATIONAL CORP_10-...
5299 2020 2014_Q4_831641_TETRA TECH INC_10-K_2014-11-19
5300 2020 2014_Q4_833444_TYCO INTERNATIONAL LTD_10-K_201...
5301 2010 2014_Q4_866706_ESCO TECHNOLOGIES INC_10-K_2014...
5302 2010 2014_Q4_868857_AECOM TECHNOLOGY CORP_10-K_2014...
5303 2010 2014_Q4_883902_NCI BUILDING SYSTEMS INC_10-K_2...
5304 2020 2014_Q4_886206_FRANKLIN COVEY CO_10-K_2014-11-14
5305 2010 2014_Q4_906193_KEY TECHNOLOGY INC_10-K_2014-12-12
5306 2010 2014_Q4_923120_GREENBRIER COMPANIES INC_10-K_2...
MDA
0 Item Management s Discussion and Analysis of F...
1 Item Management s Discussion and Analysis of R...
2 Item Management s Discussion and Analysis of F...
3 Item Management s Discussion and Analysis of F...
4 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
6 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF R...
7 Item Management s Discussion and Analysis of F...
8 Item Management s Discussion and Analysis of F...
9 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
10 Item Management s Discussion and Analysis of F...
11 Item Management s Discussion and Analysis of F...
12 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
13 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
14 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
15 Item Management s Discussion and Analysis of F...
16 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
17 Item Management s Discussion of Financial Cond...
18 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
19 Item Management s Discussion and Analysis of F...
20 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
21 Item MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
22 Item Management s Discussion and Analysis of F...
23 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
24 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
25 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
26 Item Management s Discussion and Analysis of F...
27 Item Management s Discussion and Analysis of F...
28 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
29 Item Management s Discussion and Analysis of F...
... ...
5277 Item Management s Discussion and Analysis of F...
5278 Table of Contents Item Management s Discussio...
5279 Item Management s Discussion and Analysis of F...
5280 Item MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5281 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5282 Item Management s Discussion and Analysis of F...
5283 Index Item MANAGEMENT S DISCUSSION AND ANALYS...
5284 Item Management s Discussion and Analysis of F...
5285 Table of Contents Item MANAGEMENT S DISCUSSIO...
5286 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS cont...
5287 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5288 Item Management s Discussion and Analysis of F...
5289 Table of Contents Item Management s Discussio...
5290 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF ...
5291 Management s Discussion and Analysis of Finan...
5292 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5293 Item Management s Discussion and Analysis of F...
5294 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5295 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5296 Item Management s Discussion and Analysis of ...
5297 Payments Due by Period Payments Due by Period...
5298 Item Management s Discussion and Analysis of ...
5299 Item Management s Discussion and Analysis of F...
5300 Table of Contents Item Management s Discussio...
5301 Filtration PTI Technologies Inc PTI VACCO Ind...
5302 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5303 Item Management s Discussion and Analysis of F...
5304 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5305 ITEM MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
5306 Item MANAGEMENT S DISCUSSION AND ANALYSIS OF F...
[5307 rows x 3 columns]
In [30]:
# Split the dataset in train/test ratio: 0.20
train_set, test_set = train_test_split(dataset, test_size = 0.20)
In [31]:
# Create the classifier
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(train_set.MDA.values)
classifier = MultinomialNB(fit_prior="False")
In [32]:
# Train the classifier
classifier.fit(counts, train_set.ggroup)
Out[32]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior='False')
In [33]:
# Test the classifier
predictions = classifier.predict(vectorizer.transform(test_set.MDA.values))
test_set_pred = pd.Series(predictions, index=test_set.index)
tab = pd.crosstab(test_set.ggroup, test_set_pred, rownames=['Actual'], colnames=['Predicted'], margins=True) # Print confusion matrix
print(tab)
Predicted 2010 2020 2030 All
Actual
2010 607 55 5 667
2020 19 213 1 233
2030 15 9 138 162
All 641 277 144 1062
In [ ]:
Content source: CedricVallee/pythonFinancialAnalyst
Similar notebooks: