In [1]:
from warnings import simplefilter
from pandas import read_sql_table
from numpy import unique
from sqlalchemy import create_engine
from urllib import quote_plus
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer
In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter — sklearn convergence/deprecation
# warnings are hidden too; consider narrowing to specific warning categories.
simplefilter("ignore")
In [3]:
from preprocess import PreProcessor
from postprocess_comp import PostProcessor
In [4]:
# SQLAlchemy engine for MySQL via the PyMySQL driver. quote_plus URL-escapes
# the password so special characters survive the connection string.
# NOTE(review): never commit real credentials here — load them from
# environment variables or a secrets manager.
# NOTE(review): the `encoding` keyword was removed in SQLAlchemy 2.0 — TODO
# confirm the pinned SQLAlchemy version still accepts it.
engine = create_engine("mysql+pymysql://<username>:%s@<hostname>/<database>?charset=utf8" % quote_plus(<password>), encoding="utf-8")
In [5]:
with engine.connect() as con, con.begin():
questions = read_sql_table(<table_name>, con)
In [6]:
questions = questions[questions.status == "answered"]
questions = questions[questions.lang == "en"]
questions = questions[["question_id", "body", "tags_2"]]
questions = questions.dropna()
In [7]:
questions.head()
Out[7]:
In [8]:
# Distinct raw tag values; numpy.unique also returns them sorted.
unique_tags = unique(questions["tags_2"])
In [9]:
# Python 2 print statement — the notebook targets Python 2
# (see the `from urllib import quote_plus` import above).
print unique_tags
In [10]:
# Project-local text pre-processor configured for English.
pre = PreProcessor("en")
In [11]:
# Clean the question bodies — presumably strips markup/noise; the actual
# semantics live in preprocess.PreProcessor, verify there.
questions["body"] = pre.clean(questions["body"])
In [12]:
# Derive the label column from the raw "tags_2" values.
# NOTE(review): cell In[15] below reads questions["tags"], which apparently
# only exists after this step — confirm process_tag creates it.
questions = pre.process_tag(questions)
In [13]:
questions.head()
Out[13]:
In [14]:
# Fixed seed so the train/test split is reproducible.
seed = 101
In [15]:
# Features are the cleaned bodies; targets are the processed tag labels.
data, labels = questions["body"], questions["tags"]
In [16]:
# 80/20 hold-out split, seeded for reproducibility.
train_data, test_data, train_target, test_target = train_test_split(data, labels, test_size = 0.2, random_state = seed)
In [17]:
# Indicator-encode the label sets. Passing `classes` explicitly keeps the
# column order identical for the train and test matrices.
# NOTE(review): unique_tags was computed from the raw "tags_2" column BEFORE
# pre.process_tag ran — confirm those values still match the labels in
# questions["tags"]; MultiLabelBinarizer silently ignores unseen labels.
mlb = MultiLabelBinarizer(classes = unique_tags)
train_mlb = mlb.fit_transform(train_target)
test_mlb = mlb.transform(test_target)
In [18]:
print mlb.classes_
In [19]:
# Sanity check: both matrices should have one column per tag class.
print "Training :", train_mlb.shape, " & Test :", test_mlb.shape
In [20]:
tfidf_vect = TfidfVectorizer(analyzer = "word", stop_words = pre.stopwords(),
tokenizer = pre.tokenize, lowercase = False)
train_dtm = tfidf_vect.fit_transform(train_data)
test_dtm = tfidf_vect.transform(test_data)
In [21]:
print "Training :", train_dtm.shape, " & Test :", test_dtm.shape
In [22]:
classifiers = {"LogisticRegression": OneVsRestClassifier(LogisticRegression()),
"RandomForestClassifier": OneVsRestClassifier(RandomForestClassifier()),
"LinearSVC": OneVsRestClassifier(LinearSVC()),
"XGBClassifier": OneVsRestClassifier(XGBClassifier()),
"DecisionTreeClassifier": OneVsRestClassifier(DecisionTreeClassifier()),
"SGDClassifier": OneVsRestClassifier(SGDClassifier())}
In [23]:
# Project-local evaluation helper.
post = PostProcessor()
In [24]:
# Compare all candidate models on the training matrix — presumably
# cross-validated with 5 folds; the meaning of the last argument lives in
# postprocess_comp.PostProcessor, verify there.
post.compare_classifiers(classifiers, train_dtm, train_mlb, 5)
In [25]:
param = {"estimator__loss": ["log", "modified_huber"],
"estimator__penalty": [None, "l1", "elasticnet"],
"estimator__class_weight": [None, "balanced"]}
In [33]:
search = GridSearchCV(OneVsRestClassifier(SGDClassifier()), param)
In [34]:
search.fit(train_dtm, train_mlb)
Out[34]:
In [35]:
# Best model found by the grid search.
clf = search.best_estimator_
In [36]:
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training data — this second fit is redundant
# (harmless, but costs an extra full training pass). Confirm refit setting.
clf.fit(train_dtm, train_mlb)
Out[36]:
In [30]:
# NOTE(review): execution counts are out of order here (In[30] after In[36])
# — re-run the notebook top-to-bottom before sharing.
print clf.classes_
In [31]:
# Final evaluation of the tuned model on the held-out test split.
post.compare_results(clf, test_dtm, mlb, test_mlb)