Import libraries


In [1]:
from warnings import simplefilter
from pandas import read_sql_table
from numpy import unique
from sqlalchemy import create_engine
from urllib import quote_plus
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer

Ignore warnings


In [2]:
simplefilter("ignore")

Load PreProcessor and PostProcessor Classes


In [3]:
from preprocess import PreProcessor
from postprocess_comp import PostProcessor

Loading questions from database


In [4]:
engine = create_engine("mysql+pymysql://<username>:%s@<hostname>/<database>?charset=utf8" % quote_plus(<password>), encoding="utf-8")

In [5]:
with engine.connect() as con, con.begin():
    questions = read_sql_table(<table_name>, con)

In [6]:
questions = questions[questions.status == "answered"]
questions = questions[questions.lang == "en"]
questions = questions[["question_id", "body", "tags_2"]]
questions = questions.dropna()

In [7]:
questions.head()


Out[7]:
question_id body tags_2
0 240 hi i am 27 my height is 5 2 bt my weight is on... Basic Health
10 429 what is the normal weight lenght of 27weeks ag... Basic Health
11 427 hello apu i have a question how much solid foo... Basic Health
17 212 dear maya apa how much vegetable protein vitam... Basic Health
18 261 apa i am a 18years old boy but my height is on... Basic Health

In [8]:
unique_tags = unique(questions["tags_2"])

In [9]:
print unique_tags


[u'ASK' u'BLAST' u'BMC' u'BRAC IED' u'Basic Health' u'Basic sex education '
 u'Beauty and Care' u'Cardiology' u'Career' u'Child/Forced Marriage'
 u'Communicable Diseases ' u'Contraception and Family Planning'
 u'Cybercrime' u'Dermatology' u'ENT' u'Elopement' u'Endocrinology'
 u'Family Law' u'Fitness' u'Gastroenterology' u'Gender Violence'
 u'Geriatric' u'Marie Stopes' u"Men's Health" u'Mental Health' u'Neurology'
 u'Oncology' u'Ophthalmology' u'Orthopedics ' u'Others' u'Parenting'
 u'Pediatrics/Child Care' u'Property Law' u'Relationships' u'Respiratory'
 u'STIs/STDs' u'Sajida Foundation' u'Sexuality' u'Technical Query'
 u'Teen Health' u'Urology/Nephrology' u'User Query'
 u'Womens Health - Labour and Post Pregnancy' u'Womens Health - Pregnancy'
 u'Womens Health and Physiology']

Cleaning


In [10]:
pre = PreProcessor("en")

In [11]:
questions["body"] = pre.clean(questions["body"])

In [12]:
questions = pre.process_tag(questions)

In [13]:
questions.head()


Out[13]:
question_id body tags
0 240 hi i am my height is bt my weight is only kg i... [Basic Health]
1 429 what is the normal weight lenght of weeks aged... [Basic Health, Womens Health - Pregnancy]
3 427 hello apu i have a question how much solid foo... [Basic Health, Pediatrics/Child Care]
5 212 dear maya apa how much vegetable protein vitam... [Basic Health, Parenting, Pediatrics/Child Care]
8 261 apa i am a years old boy but my height is only... [Basic Health]

Setting seed


In [14]:
seed = 101

Splitting data between training and test sets


In [15]:
data, labels = questions["body"], questions["tags"]

In [16]:
train_data, test_data, train_target, test_target = train_test_split(data, labels, test_size = 0.2, random_state = seed)

Converting labels into binary form


In [17]:
mlb = MultiLabelBinarizer(classes = unique_tags)
train_mlb = mlb.fit_transform(train_target)
test_mlb = mlb.transform(test_target)

In [18]:
print mlb.classes_


[u'ASK' u'BLAST' u'BMC' u'BRAC IED' u'Basic Health' u'Basic sex education '
 u'Beauty and Care' u'Cardiology' u'Career' u'Child/Forced Marriage'
 u'Communicable Diseases ' u'Contraception and Family Planning'
 u'Cybercrime' u'Dermatology' u'ENT' u'Elopement' u'Endocrinology'
 u'Family Law' u'Fitness' u'Gastroenterology' u'Gender Violence'
 u'Geriatric' u'Marie Stopes' u"Men's Health" u'Mental Health' u'Neurology'
 u'Oncology' u'Ophthalmology' u'Orthopedics ' u'Others' u'Parenting'
 u'Pediatrics/Child Care' u'Property Law' u'Relationships' u'Respiratory'
 u'STIs/STDs' u'Sajida Foundation' u'Sexuality' u'Technical Query'
 u'Teen Health' u'Urology/Nephrology' u'User Query'
 u'Womens Health - Labour and Post Pregnancy' u'Womens Health - Pregnancy'
 u'Womens Health and Physiology']

In [19]:
print "Training :", train_mlb.shape, " & Test :", test_mlb.shape


Training : (3353, 45)  & Test : (839, 45)

Feature Extraction


In [20]:
tfidf_vect = TfidfVectorizer(analyzer = "word", stop_words = pre.stopwords(), 
                             tokenizer = pre.tokenize, lowercase = False) 
train_dtm = tfidf_vect.fit_transform(train_data)
test_dtm = tfidf_vect.transform(test_data)

In [21]:
print "Training :", train_dtm.shape, " & Test :", test_dtm.shape


Training : (3353, 6035)  & Test : (839, 6035)

Comparing classifiers


In [22]:
classifiers = {"LogisticRegression": OneVsRestClassifier(LogisticRegression()),
              "RandomForestClassifier": OneVsRestClassifier(RandomForestClassifier()),
              "LinearSVC": OneVsRestClassifier(LinearSVC()),
              "XGBClassifier": OneVsRestClassifier(XGBClassifier()),
              "DecisionTreeClassifier": OneVsRestClassifier(DecisionTreeClassifier()),
              "SGDClassifier": OneVsRestClassifier(SGDClassifier())}

In [23]:
post = PostProcessor()

In [24]:
post.compare_classifiers(classifiers, train_dtm, train_mlb, 5)


Testing  LinearSVC
Score:  0.365

Testing  LogisticRegression
Score:  0.2138

Testing  RandomForestClassifier
Score:  0.2443

Testing  DecisionTreeClassifier
Score:  0.2824

Testing  SGDClassifier
Score:  0.3788

Testing  XGBClassifier
Score:  0.3242

Best classifier : SGDClassifier

Parameter tuning of SGDClassifier


In [25]:
param = {"estimator__loss": ["log", "modified_huber"],
         "estimator__penalty": [None, "l1", "elasticnet"],
         "estimator__class_weight": [None, "balanced"]}

In [33]:
search = GridSearchCV(OneVsRestClassifier(SGDClassifier()), param)

In [34]:
search.fit(train_dtm, train_mlb)


Out[34]:
GridSearchCV(cv=None, error_score='raise',
       estimator=OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          n_jobs=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'estimator__class_weight': [None, 'balanced'], 'estimator__penalty': [None, 'l1', 'elasticnet'], 'estimator__loss': ['log', 'modified_huber']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Fitting tuned classifier


In [35]:
clf = search.best_estimator_

In [36]:
clf.fit(train_dtm, train_mlb)


Out[36]:
OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty=None, power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          n_jobs=1)

In [30]:
print clf.classes_


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]

Evaluating performance


In [31]:
post.compare_results(clf, test_dtm, mlb, test_mlb)


Using default prediction function: 
Accuracy:  49.1696464044 
Failure:  29.4398092968

Classification Report (default function):
                                           precision    recall  f1-score   support

                                      ASK       0.00      0.00      0.00         1
                                    BLAST       0.00      0.00      0.00         0
                                      BMC       0.00      0.00      0.00         1
                                 BRAC IED       0.00      0.00      0.00         0
                             Basic Health       0.59      0.42      0.49       109
                     Basic sex education        0.69      0.58      0.63        96
                          Beauty and Care       0.73      0.62      0.67        39
                               Cardiology       0.00      0.00      0.00         1
                                   Career       0.00      0.00      0.00        10
                    Child/Forced Marriage       0.00      0.00      0.00         0
                   Communicable Diseases        0.00      0.00      0.00         2
        Contraception and Family Planning       0.80      0.60      0.69        91
                               Cybercrime       0.00      0.00      0.00         1
                              Dermatology       0.40      0.12      0.18        17
                                      ENT       0.00      0.00      0.00         1
                                Elopement       0.00      0.00      0.00         0
                            Endocrinology       0.00      0.00      0.00         9
                               Family Law       1.00      0.20      0.33         5
                                  Fitness       0.55      0.35      0.43        17
                         Gastroenterology       1.00      0.14      0.25         7
                          Gender Violence       0.00      0.00      0.00         2
                                Geriatric       0.00      0.00      0.00         0
                             Marie Stopes       0.00      0.00      0.00         2
                             Men's Health       0.60      0.28      0.38        32
                            Mental Health       0.50      0.16      0.24        25
                                Neurology       0.00      0.00      0.00         0
                                 Oncology       0.00      0.00      0.00         1
                            Ophthalmology       0.00      0.00      0.00         1
                             Orthopedics        0.00      0.00      0.00         2
                                   Others       0.79      0.36      0.49        64
                                Parenting       0.00      0.00      0.00        10
                    Pediatrics/Child Care       0.56      0.36      0.44        25
                             Property Law       0.00      0.00      0.00         1
                            Relationships       0.75      0.30      0.43        20
                              Respiratory       0.00      0.00      0.00         2
                                STIs/STDs       0.75      0.50      0.60         6
                        Sajida Foundation       0.00      0.00      0.00         0
                                Sexuality       0.00      0.00      0.00         3
                          Technical Query       0.00      0.00      0.00         3
                              Teen Health       0.00      0.00      0.00         3
                       Urology/Nephrology       1.00      0.17      0.29         6
                               User Query       0.91      0.75      0.82        28
Womens Health - Labour and Post Pregnancy       0.00      0.00      0.00         4
                Womens Health - Pregnancy       0.77      0.69      0.73       188
             Womens Health and Physiology       0.69      0.53      0.60       160

                              avg / total       0.67      0.48      0.55       995


Using custom prediction function 
Accuracy:  56.3102332709 
Failure:  0.0

Classification Report (custom function):
                                           precision    recall  f1-score   support

                                      ASK       0.00      0.00      0.00         1
                                    BLAST       0.00      0.00      0.00         0
                                      BMC       0.00      0.00      0.00         1
                                 BRAC IED       0.00      0.00      0.00         0
                             Basic Health       0.41      0.70      0.52       109
                     Basic sex education        0.49      0.74      0.59        96
                          Beauty and Care       0.51      0.90      0.65        39
                               Cardiology       0.00      0.00      0.00         1
                                   Career       0.67      0.40      0.50        10
                    Child/Forced Marriage       0.00      0.00      0.00         0
                   Communicable Diseases        0.00      0.00      0.00         2
        Contraception and Family Planning       0.57      0.69      0.63        91
                               Cybercrime       0.00      0.00      0.00         1
                              Dermatology       0.40      0.24      0.30        17
                                      ENT       0.00      0.00      0.00         1
                                Elopement       0.00      0.00      0.00         0
                            Endocrinology       0.00      0.00      0.00         9
                               Family Law       0.50      0.40      0.44         5
                                  Fitness       0.31      0.47      0.37        17
                         Gastroenterology       0.33      0.14      0.20         7
                          Gender Violence       0.00      0.00      0.00         2
                                Geriatric       0.00      0.00      0.00         0
                             Marie Stopes       0.00      0.00      0.00         2
                             Men's Health       0.27      0.41      0.32        32
                            Mental Health       0.29      0.52      0.37        25
                                Neurology       0.00      0.00      0.00         0
                                 Oncology       0.00      0.00      0.00         1
                            Ophthalmology       0.00      0.00      0.00         1
                             Orthopedics        1.00      0.50      0.67         2
                                   Others       0.42      0.73      0.54        64
                                Parenting       0.00      0.00      0.00        10
                    Pediatrics/Child Care       0.37      0.64      0.47        25
                             Property Law       0.00      0.00      0.00         1
                            Relationships       0.38      0.70      0.49        20
                              Respiratory       0.00      0.00      0.00         2
                                STIs/STDs       0.60      0.50      0.55         6
                        Sajida Foundation       0.00      0.00      0.00         0
                                Sexuality       0.00      0.00      0.00         3
                          Technical Query       0.00      0.00      0.00         3
                              Teen Health       0.20      0.33      0.25         3
                       Urology/Nephrology       1.00      0.17      0.29         6
                               User Query       0.53      0.82      0.65        28
Womens Health - Labour and Post Pregnancy       0.00      0.00      0.00         4
                Womens Health - Pregnancy       0.59      0.84      0.69       188
             Womens Health and Physiology       0.50      0.72      0.59       160

                              avg / total       0.47      0.67      0.54       995