In [1]:
from warnings import simplefilter
from pandas import read_sql_table
from numpy import unique
from sqlalchemy import create_engine
from urllib import quote_plus
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer
In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter — sklearn convergence/deprecation
# warnings are hidden too; consider narrowing to specific warning categories.
simplefilter("ignore")
In [3]:
from preprocess import PreProcessor
from postprocess_comp import PostProcessor
In [4]:
# SQLAlchemy engine for MySQL via the PyMySQL driver. quote_plus URL-escapes
# the password so special characters survive the connection string.
# NOTE(review): never commit real credentials here — load them from
# environment variables or a secrets manager.
# NOTE(review): the `encoding` keyword was removed in SQLAlchemy 2.0 — TODO
# confirm the pinned SQLAlchemy version still accepts it.
engine = create_engine("mysql+pymysql://<username>:%s@<hostname>/<database>?charset=utf8" % quote_plus(<password>), encoding="utf-8")
In [5]:
with engine.connect() as con, con.begin():
questions = read_sql_table(<table_name>, con)
In [6]:
questions = questions[questions.status == "answered"]
questions = questions[questions.lang == "en"]
questions = questions[["question_id", "body", "tags_2"]]
questions = questions.dropna()
In [7]:
questions.head()
Out[7]:
In [8]:
# Distinct raw tag values; numpy.unique also returns them sorted.
unique_tags = unique(questions["tags_2"])
In [9]:
# Python 2 print statement — the notebook targets Python 2
# (see the `from urllib import quote_plus` import above).
print unique_tags
In [10]:
# Project-local text pre-processor configured for English.
pre = PreProcessor("en")
In [11]:
# Clean the question bodies — presumably strips markup/noise; the actual
# semantics live in preprocess.PreProcessor, verify there.
questions["body"] = pre.clean(questions["body"])
In [12]:
# Derive the label column from the raw "tags_2" values.
# NOTE(review): cell In[15] below reads questions["tags"], which apparently
# only exists after this step — confirm process_tag creates it.
questions = pre.process_tag(questions)
In [13]:
questions.head()
Out[13]:
In [14]:
# Fixed seed so the train/test split is reproducible.
seed = 101
In [15]:
# Features are the cleaned bodies; targets are the processed tag labels.
data, labels = questions["body"], questions["tags"]
In [16]:
# 80/20 hold-out split, seeded for reproducibility.
train_data, test_data, train_target, test_target = train_test_split(data, labels, test_size = 0.2, random_state = seed)
In [17]:
# Indicator-encode the label sets. Passing `classes` explicitly keeps the
# column order identical for the train and test matrices.
# NOTE(review): unique_tags was computed from the raw "tags_2" column BEFORE
# pre.process_tag ran — confirm those values still match the labels in
# questions["tags"]; MultiLabelBinarizer silently ignores unseen labels.
mlb = MultiLabelBinarizer(classes = unique_tags)
train_mlb = mlb.fit_transform(train_target)
test_mlb = mlb.transform(test_target)
In [18]:
print mlb.classes_
In [19]:
# Sanity check: both matrices should have one column per tag class.
print "Training :", train_mlb.shape, " & Test :", test_mlb.shape
In [20]:
tfidf_vect = TfidfVectorizer(analyzer = "word", stop_words = pre.stopwords(),
tokenizer = pre.tokenize, lowercase = False)
train_dtm = tfidf_vect.fit_transform(train_data)
test_dtm = tfidf_vect.transform(test_data)
In [21]:
print "Training :", train_dtm.shape, " & Test :", test_dtm.shape
In [22]:
classifiers = {"LogisticRegression": OneVsRestClassifier(LogisticRegression()),
"RandomForestClassifier": OneVsRestClassifier(RandomForestClassifier()),
"LinearSVC": OneVsRestClassifier(LinearSVC()),
"XGBClassifier": OneVsRestClassifier(XGBClassifier()),
"DecisionTreeClassifier": OneVsRestClassifier(DecisionTreeClassifier()),
"SGDClassifier": OneVsRestClassifier(SGDClassifier())}
In [23]:
# Project-local evaluation helper.
post = PostProcessor()
In [24]:
# Compare all candidate models on the training matrix — presumably
# cross-validated with 5 folds; the meaning of the last argument lives in
# postprocess_comp.PostProcessor, verify there.
post.compare_classifiers(classifiers, train_dtm, train_mlb, 5)
In [25]:
param = {"estimator__loss": ["log", "modified_huber"],
"estimator__penalty": [None, "l1", "elasticnet"],
"estimator__class_weight": [None, "balanced"]}
In [33]:
search = GridSearchCV(OneVsRestClassifier(SGDClassifier()), param)
In [34]:
search.fit(train_dtm, train_mlb)
Out[34]:
In [35]:
# Best model found by the grid search.
clf = search.best_estimator_
In [36]:
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training data — this second fit is redundant
# (harmless, but costs an extra full training pass). Confirm refit setting.
clf.fit(train_dtm, train_mlb)
Out[36]:
In [30]:
# NOTE(review): execution counts are out of order here (In[30] after In[36])
# — re-run the notebook top-to-bottom before sharing.
print clf.classes_
In [31]:
# Final evaluation of the tuned model on the held-out test split.
post.compare_results(clf, test_dtm, mlb, test_mlb)