In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib

In [3]:
posts = load_files("./posts")

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(posts.data, posts.target)

In [5]:
vectorizer = CountVectorizer()

In [6]:
vectorizer.fit(X_train)


Out[6]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [8]:
clf = MLPClassifier()

In [9]:
clf.fit(X_train_vectorized, Y_train)


Out[9]:
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [10]:
clf.score(X_test_vectorized, Y_test)


Out[10]:
0.96980552712384849

In [11]:
joblib.dump(vectorizer, 'count_vectorizer.joblib')


Out[11]:
['count_vectorizer.joblib']

In [12]:
joblib.dump(clf, 'mlp_post_classifier.joblib')


Out[12]:
['mlp_post_classifier.joblib']

In [ ]: