In [1]:
import ml
import nlp
import json_io
import pickle
from itertools import chain
from dvs import DictVectorizerPartial
import numpy as np

Process and randomize the comment JSON


In [ ]:
path = ml.JSON_DIR+"reddit/"
sarcastic_path = path+"sarcastic/"
serious_path = path+"serious/"
source = '-reddit-'
features_path = 'features/'
n=10

In [ ]:
json_io.processRandomizeJson(sarcastic=True,
                     json_path=sarcastic_path,
                     features_path=features_path,
                     source=source,
                     n=n,
                     cleanTokens=nlp.cleanTokensReddit)
json_io.processRandomizeJson(sarcastic=False,
                     json_path=serious_path,
                     features_path=features_path,
                     source=source,
                     n=n,
                     cleanTokens=nlp.cleanTokensReddit)

Load the processed feature sets


In [ ]:
sarcasticFeats = json_io.loadProcessedFeatures(features_path,
                                       source,
                                       sarcastic=True,
                                       n=5,
                                       random=False)
seriousFeats = json_io.loadProcessedFeatures(features_path,
                                     source,
                                     sarcastic=False,
                                     n=3,
                                     random=False)
features = chain(sarcasticFeats, seriousFeats)

Flatten the feature dictionaries; if leaveout names a feature, that feature is omitted.
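
ml.split_feat and ml.flatten are project helpers from ml.py. As a rough illustration of the flattening step described above (the nested layout and the leaveout handling here are assumptions, not the actual implementation), the idea is roughly:

def flatten_feature_dicts(records, leaveout=None):
    """Merge nested feature-group dicts into one flat dict per comment.

    records is assumed to be an iterable of (feature_groups, label) pairs,
    where feature_groups maps a group name (e.g. 'unigrams') to a dict of
    feature -> value. A group named by leaveout is skipped. Hypothetical
    layout; the real format is defined in ml.py.
    """
    for groups, label in records:
        flat = {}
        for name, feats in groups.items():
            if name == leaveout:
                continue
            flat.update(feats)
        yield flat, label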


In [ ]:
dvp = DictVectorizerPartial()

In [ ]:
(X,y) = ml.split_feat(features, 2)

In [ ]:
(X,y) = ml.flatten(X,y)

In [ ]:
(X,y) = (dvp.partial_fit_transform(X), np.array(list(y)))
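
DictVectorizerPartial comes from the project's dvs module; sklearn's stock DictVectorizer has no partial-fit API, so the class presumably accumulates its vocabulary incrementally across batches. Purely for comparison, a single-batch equivalent with stock sklearn would look roughly like this (sketch only; flat_dicts is a hypothetical in-memory list of the flattened feature dicts, and the resulting feature indices would not match the project's vectorizer):

from sklearn.feature_extraction import DictVectorizer

# Learn the vocabulary and build the sparse matrix in a single pass.
# Only feasible when every flattened feature dict fits in memory.
dv = DictVectorizer(sparse=True)
X_alt = dv.fit_transform(flat_dicts)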

In [ ]:
with open('pickled/-reddit-dvp.pickle', 'wb') as f:
    pickle.dump(dvp, f)
with open('pickled/-reddit-y.pickle', 'wb') as f:
    pickle.dump(y, f)
with open('pickled/-reddit-X.pickle', 'wb') as f:
    pickle.dump(X, f)

In [2]:
with open('pickled/-reddit-X.pickle', 'rb') as f:
    X = pickle.load(f)
with open('pickled/-reddit-y.pickle', 'rb') as f:
    y = pickle.load(f)

Train and test, then report the results
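
ml.trainTest is project code in ml.py. Judging from its call signature, the traceback further down, and the "Features before/after reduction" log lines, it appears to split the data, optionally keep only the top-k features via univariate selection, then fit and score each classifier on every split. A minimal sketch of that flow, assuming sklearn's ShuffleSplit and SelectKBest (illustrative only; the real function also returns the selector's support mask):

from datetime import datetime
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import ShuffleSplit

def train_test_sketch(X, y, classifiers, reduce=0, splits=2,
                      trainsize=0.8, testsize=0.2):
    # Hypothetical stand-in for ml.trainTest; sketch only.
    results = []
    splitter = ShuffleSplit(n_splits=splits, train_size=trainsize,
                            test_size=testsize)
    for train_idx, test_idx in splitter.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        if reduce:
            print("Features before reduction:", X_train.shape)
            selector = SelectKBest(f_classif, k=reduce).fit(X_train, y_train)
            X_train = selector.transform(X_train)
            X_test = selector.transform(X_test)
            print("Features after reduction:", X_train.shape)
        for classifier in classifiers:
            print("Starting to train %s" % str(type(classifier)))
            s = datetime.now()
            classifier.fit(X_train, y_train)
            traintime = (datetime.now() - s).total_seconds()
            score = classifier.score(X_test, y_test)
            results.append((classifier, traintime, score))
    return results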


In [4]:
from sklearn.naive_bayes import MultinomialNB

results = []
for reduceamount in [0, 2500000, 1000000, 500000, 100000]: #, 50000, 25000, 10000, 7500, 5000, 2500, 1500, 1000, 750, 500, 250, 100, 50, 10, 5]:
    print("\n\t\tReduction: "+str(reduceamount))
    for trainsize in [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8]:
        print("\n\t\tTraining size: "+str(trainsize))
        results.append((reduceamount,
                       trainsize,
                       ml.trainTest(X,
                                    y,
                                    classifiers=[MultinomialNB()],
                                    reduce=reduceamount,
                                    splits=2,
                                    trainsize=trainsize,
                                    testsize=0.2)))
pickle.dump(results, open('pickled/-reddit-trained-mnbayes.pickle', 'wb'))
print(results)


		Reduction: 0

		Training size: 0.01
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.568462
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.619744

		Training size: 0.05
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.673811
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.691526

		Training size: 0.1
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.700810
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.702373

		Training size: 0.2
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.715615
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.714991

		Training size: 0.4
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 2	Score:	0.725965
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 2	Score:	0.727018

		Training size: 0.6
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 2	Score:	0.730205
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 3	Score:	0.732338

		Training size: 0.8
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 3	Score:	0.734310
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 3	Score:	0.733505

		Reduction: 2500000

		Training size: 0.01
Features before reduction: (7453, 12490143)
C:\Program Files\Anaconda3\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [0 0 0 ..., 0 0 0] are constant.
  UserWarning)
C:\Program Files\Anaconda3\lib\site-packages\sklearn\feature_selection\univariate_selection.py:114: RuntimeWarning: invalid value encountered in true_divide
  f = msb / msw
Features after reduction: (7453, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.639258
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.640526

		Training size: 0.05
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.695404
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.696115

		Training size: 0.1
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.708484
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.706619

		Training size: 0.2
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.685355
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.683658

		Training size: 0.4
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.655384
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.655250

		Training size: 0.6
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.701991
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.700321

		Training size: 0.8
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.720042
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.720961

		Reduction: 1000000

		Training size: 0.01
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.662112
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.650440

		Training size: 0.05
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.693438
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.696611

		Training size: 0.1
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.569112
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.568321

		Training size: 0.2
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.671322
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.671020

		Training size: 0.4
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.719666
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.720840

		Training size: 0.6
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.728608
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.729869

		Training size: 0.8
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.733693
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 1000000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.732143

		Reduction: 500000

		Training size: 0.01
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.658597
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.658885

		Training size: 0.05
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.551175
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.556629

		Training size: 0.1
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.662910
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.661837

		Training size: 0.2
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.715749
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.715306

		Training size: 0.4
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.722906
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.725992

		Training size: 0.6
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.726401
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.729252

		Training size: 0.8
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.732063
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.729534

		Reduction: 100000

		Training size: 0.01
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.536954
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.531903

		Training size: 0.05
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.698476
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.697812

		Training size: 0.1
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.704721
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.706968

		Training size: 0.2
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.715903
Features before reduction: (149075, 12490143)
Features after reduction: (149075, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.715655

		Training size: 0.4
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.722041
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.720941

		Training size: 0.6
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.724308
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.724476

		Training size: 0.8
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.725066
Features before reduction: (596301, 12490143)
Features after reduction: (596301, 100000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.724436
[(0, 0.01, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.629998, 0.56846172422120256), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.696998, 0.61974429150232091)]), (0, 0.05, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.880995, 0.6738106737503019), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.031438, 0.69152646972014276)]), (0, 0.1, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.186007, 0.70081032493493256), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.079014, 0.70237328610909866)]), (0, 0.2, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.6125, 0.71561485416834369), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.532959, 0.71499101129625153)]), (0, 0.4, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 2.226014, 0.72596527945477474), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 2.223394, 0.72701843355067219)]), (0, 0.6, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 2.945503, 0.73020472778985213), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 3.008574, 0.73233786793313482)]), (0, 0.8, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 3.837503, 0.73431001636749038), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 3.767503, 0.73350505782285547)]), (2500000, 0.01, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.210503, 0.63925782822184662, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.169001, 0.64052563792964667, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (2500000, 0.05, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.345001, 0.69540368671013442, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.289004, 0.69611473342456198, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (2500000, 0.1, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.438005, 0.70848426306045242, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.439999, 0.70661944243204811, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (2500000, 0.2, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.569001, 0.68535512087794148, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.593003, 0.68365799994633614, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (2500000, 0.4, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.028001, 0.65538383106603348, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.024999, 0.65524967130859424, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (2500000, 0.6, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.506999, 0.70199093080039709, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.512, 0.70032064182027964, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (2500000, 0.8, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.965998, 0.72004212616383589, array([ True,  True,  
True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.924, 0.72096112050229411, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (1000000, 0.01, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.083008, 0.66211194290160724, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.078998, 0.65044004400440047, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (1000000, 0.05, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.163, 0.6934382462636508, array([ True,  True, False, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.151999, 0.69661112452708684, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (1000000, 0.1, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.233001, 0.5691123990447825, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.239001, 0.56832085647589148, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (1000000, 0.2, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.411001, 0.67132201024980542, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.418003, 0.67102015079556732, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (1000000, 0.4, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.827001, 0.71966647884300627, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.823998, 0.72084037672059886, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (1000000, 0.6, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.244996, 0.72860822667632619, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.239, 0.72986932839625429, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (1000000, 0.8, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.685997, 0.73369288148327028, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.598995, 0.73214333628484796, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (500000, 0.01, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.055975, 0.65859695725670131, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.055999, 0.65888540073519553, array([ True,  True,  True, ...,  True,  True,  True], dtype=bool))]), (500000, 0.05, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.106999, 0.551175239475167, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.108001, 0.55662883361506887, array([ True, False, False, ..., False, False, False], dtype=bool))]), (500000, 0.1, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.188, 0.66291019345837021, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.182, 0.66183691539885692, array([ True,  True,  True, ..., 
False, False, False], dtype=bool))]), (500000, 0.2, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.363, 0.71574901392578283, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.374, 0.71530628672623364, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (500000, 0.4, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.663999, 0.72290643698516188, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.668, 0.72599211140626263, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (500000, 0.6, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.023995, 0.72640129866645198, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.032004, 0.72925219351203407, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (500000, 0.8, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.400002, 0.7320628404303845, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 1.367005, 0.72953392900265635, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (100000, 0.01, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.019003, 0.53695430518661624, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.018002, 0.53190319031903188, array([ True, False, False, ..., False, False, False], dtype=bool))]), (100000, 0.05, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.064993, 0.69847594515549116, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.063012, 0.69781185435616733, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (100000, 0.1, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.118004, 0.70472108186428395, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.118006, 0.70696825780138994, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (100000, 0.2, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.229002, 0.7159032976468378, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.235001, 0.71565510209557537, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (100000, 0.4, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.471995, 0.72204110654967935, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.480001, 0.72094099653867827, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (100000, 0.6, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.731993, 0.72430840645040118, array([ True,  True,  True, ..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.726, 0.72447610614720004, array([ True,  True,  True, ..., False, False, False], dtype=bool))]), (100000, 0.8, [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.991001, 0.72506640907993236, array([ True,  True,  True, 
..., False, False, False], dtype=bool)), (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 0.974999, 0.72443585821996836, array([ True,  True,  True, ..., False, False, False], dtype=bool))])]

In [5]:
xyz = []
for red, train, res in results:
    acc = [r[2] for r in res]
    xyz.append((red, train, sum(acc)/len(acc)))
json_io.list_to_json(xyz, "-reddit-reduction-trainsize-accuracy-mnbayes.json", old_format=True)
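
Judging from the printed results above, each entry of res is a tuple of (classifier, training time in seconds, accuracy, and, when a reduction was applied, the selector's boolean support mask), so r[2] selects the accuracy. To eyeball the averaged triples in xyz, a quick plot along these lines would work (a sketch, assuming matplotlib is installed; it is not part of the original pipeline):

from collections import defaultdict
import matplotlib.pyplot as plt

# Group mean accuracy by reduction level and plot it against training size.
curves = defaultdict(list)
for red, train, acc in xyz:
    curves[red].append((train, acc))

for red, points in sorted(curves.items()):
    points.sort()
    plt.plot([p[0] for p in points], [p[1] for p in points],
             marker='o', label="reduce=%d" % red)
plt.xlabel("training size (fraction)")
plt.ylabel("mean accuracy")
plt.legend()
plt.show()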

In [6]:
from sklearn.linear_model import LogisticRegression

results = []
for reduceamount in [0, 2500000, 1000000]: #, 50000, 25000, 10000, 7500, 5000, 2500, 1500, 1000, 750, 500, 250, 100, 50, 10, 5]:
    print("\n\t\tReduction: "+str(reduceamount))
    for trainsize in [0.8]:
        print("\n\t\tTraining size: "+str(trainsize))
        results.append((reduceamount,
                       trainsize,
                       ml.trainTest(X,
                                    y,
                                    classifiers=[LogisticRegression(n_jobs=-1)],
                                    reduce=reduceamount,
                                    splits=2,
                                    trainsize=trainsize,
                                    testsize=0.2)))
pickle.dump(results, open('pickled/-reddit-trained-log.pickle', 'wb'))
print(results)


		Reduction: 0

		Training size: 0.01
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 64	Score:	0.660871
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 65	Score:	0.662199

		Training size: 0.05
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 187	Score:	0.706700
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 186	Score:	0.706814

		Training size: 0.1
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 314	Score:	0.718493
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 314	Score:	0.721115

		Training size: 0.2
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 1209	Score:	0.733203
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 1256	Score:	0.732888

		Training size: 0.4
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 2596	Score:	0.749369
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 3403	Score:	0.745895

		Training size: 0.6
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 2612	Score:	0.755266
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 4809	Score:	0.750805

		Training size: 0.8
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 2364	Score:	0.760840
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-af0fb0031a1c> in <module>()
     20                                     splits=2,
     21                                     trainsize=trainsize,
---> 22                                     testsize=0.2)))
     23 pickle.dump(results, open('pickled/-reddit-trained-log.pickle', 'wb'))
     24 print(results)

C:\dev\CSC393 Sr Design\src\ml.py in trainTest(X, y, classifiers, reduce, splits, trainsize, testsize)
     53             print("Starting to train %s"%str(type(classifier)))
     54             s = datetime.now()
---> 55             classifier.fit(X_train, y_train)
     56             traintime = (datetime.now() - s).total_seconds()
     57             score = classifier.score(X_test, y_test)

C:\Program Files\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1184                 self.class_weight, self.penalty, self.dual, self.verbose,
   1185                 self.max_iter, self.tol, self.random_state,
-> 1186                 sample_weight=sample_weight)
   1187             self.n_iter_ = np.array([n_iter_])
   1188             return self

C:\Program Files\Anaconda3\lib\site-packages\sklearn\svm\base.py in _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)
    910         X, y_ind, sp.isspmatrix(X), solver_type, tol, bias, C,
    911         class_weight_, max_iter, rnd.randint(np.iinfo('i').max),
--> 912         epsilon, sample_weight)
    913     # Regarding rnd.randint(..) in the above signature:
    914     # seed for srand in range [0..INT_MAX); due to limitations in Numpy

KeyboardInterrupt: 

In [ ]:
xyz = []
for red, train, res in results:
    acc = [r[2] for r in res]
    xyz.append((red, train, sum(acc)/len(acc)))
json_io.list_to_json(xyz, "-reddit-reduction-trainsize-accuracy-log.json", old_format=True)

In [ ]: