Objectives

  • Learn how to parse html.
  • Create models that capture different aspects of the problem.
  • How to learn processes in parallel ?

In [1]:
import pandas as pd
import numpy as np
import os, sys
import re, json

from urllib.parse import urlparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(4)

from data import load_datasets
from models import train_test_split, cross_val_scheme

In [2]:
# Initialize Stemmer
sns = SnowballStemmer(language='english')

In [3]:
train, test, sample_sub = load_datasets.load_dataset()

In [4]:
train['is_news'] = train.is_news.fillna(-999)
test['is_news'] = test.is_news.fillna(-999)
  • Text Features based on the boiler plate
  • Text Features based on the parsed raw html
  • Numerical features
  • Train different models on different datasets and then use their predictions in the next stage of classifier and predict.

In [11]:
def extract_top_level_domain(url):
        parsed_url = urlparse(url)
        top_level = parsed_url[1].split('.')[-1]
    
        return top_level
    
def get_tlds(urls):
    return np.array([extract_top_level_domain(url) for url in urls])

train['tlds'] = get_tlds(train.url)
test['tlds'] = get_tlds(test.url)

In [12]:
ohe = pd.get_dummies(list(train.tlds) + list(test.tlds))
train = pd.concat((train, ohe.iloc[:len(train)]), axis=1)
test = pd.concat((test, ohe.iloc[len(train):]), axis=1)

In [19]:
class NumericalFeatures(BaseEstimator, TransformerMixin):
    
    @staticmethod
    def url_depth(url):
        parsed_url = urlparse(url)
        path = parsed_url.path

        return len(list(filter(lambda x: len(x)> 0, path.split('/'))))
    
    @staticmethod
    def get_url_depths(urls):
        return np.array([NumericalFeatures.url_depth(url) for url in urls])
    
    def __init__(self, numerical_features):
        self.features = numerical_features
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        df['url_depth'] = self.get_url_depths(df.url)
        
        numeric_features = self.features + ['url_depth']
        df_numeric = df[numeric_features]
        
        return df_numeric

Split into training and test sets.


In [7]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

X_train = train.iloc[itrain]
X_test = train.iloc[itest]

y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label

In [8]:
numeric_features = list(train.select_dtypes(exclude=['object']).columns[1:])
numeric_features.remove('label')

In [9]:
pipeline = Pipeline([
            ('feature_extractor', NumericalFeatures(numeric_features)),
            ('imputer', Imputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('model', xgb.XGBClassifier(learning_rate=.08, max_depth=6))
            ])

In [10]:
pipeline.fit(X_train, y_train)


Out[10]:
Pipeline(steps=[('feature_extractor', NumericalFeatures(numerical_features=None)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', XGBClassifier(base_score=0.5, colsample_bylevel=1, cols...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [12]:
# cross validation
params = {
    'n_folds': 5,
    'shuffle': True,
    'random_state': 3
}

scores, mean_score, std_score = cross_val_scheme.cv_scheme(pipeline, X_train, y_train, train.iloc[itrain].is_news, **params)

print('CV Scores: %s'%(scores))
print('Mean CV Score: %f'%(mean_score))
print('Std Cv Scoes: %f'%(std_score))


CV Scores: [ 0.76471212  0.73756162  0.76209055  0.73137451  0.72021488]
Mean CV Score: 0.743191
Std Cv Scoes: 0.017433

In [13]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on the test set ', roc_auc_score(y_test, y_preds))


ROC AUC score on the test set  0.753914951989

In [17]:
joblib.dump(pipeline, os.path.join(basepath, 'data/processed/pipeline_numeric/pipeline_numeric.pkl'))


Out[17]:
['/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_01.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_02.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_03.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_04.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_05.npy']

Load Textual Features Prepared from raw content


In [7]:
train = joblib.load(os.path.join(basepath, 'data/processed/train_raw_content.pkl'))
test = joblib.load(os.path.join(basepath, 'data/processed/test_raw_content.pkl'))

Text features from Boilerplate


In [5]:
train_json = list(map(json.loads, train.boilerplate))
test_json = list(map(json.loads, test.boilerplate))

In [6]:
train['boilerplate'] = train_json
test['boilerplate'] = test_json

In [9]:
def get_component(boilerplate, key):
    """
    Get value for a particular key in boilerplate json,
    if present return the value else return an empty string
    
    boilerplate: list of boilerplate text in json format
    key: key for which we want to fetch value e.g. body, title and url
    """
    
    return np.array([bp[key] if key in bp and bp[key] else u'' for bp in boilerplate])

In [10]:
train['body_bp'] = get_component(train.boilerplate, 'body')
test['body_bp'] = get_component(test.boilerplate, 'body')

train['title_bp'] = get_component(train.boilerplate, 'title')
test['title_bp'] = get_component(test.boilerplate, 'title')

train['url_component'] = get_component(train.boilerplate, 'url')
test['url_component'] = get_component(test.boilerplate, 'url')

In [11]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

class VarSelect(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        return df[self.keys]

class StemTokenizer(object):
    def __init__(self):
        self.sns = sns
    
    def __call__(self, doc):
        return [self.sns.stem(t) for t in word_tokenize(doc)]

In [12]:
def remove_non_alphanumeric(df):
    return df.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

strip_non_words = FunctionTransformer(remove_non_alphanumeric, validate=False)

In [15]:
# Lemma Tokenizer

pipeline_lemma = Pipeline([
    ('strip', strip_non_words),
    ('union', FeatureUnion([
        ('body', Pipeline([
            ('var', VarSelect(keys='body_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=LemmaTokenizer(),
                                     ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('title', Pipeline([
            ('var', VarSelect(keys='title_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=LemmaTokenizer(),
                                     ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('url', Pipeline([
            ('var', VarSelect(keys='url_component')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=LemmaTokenizer(),
                                     ngram_range=(1,2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=50))
        ]))
    ])),
    ('scaler', MinMaxScaler()),
    ('selection', SelectKBest(chi2, k=100)),
    ('model', LogisticRegression())
])

In [14]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

features = ['url_component', 'body_bp', 'title_bp']

X_train = train.iloc[itrain][features]
X_test = train.iloc[itest][features]

y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label

In [67]:
pipeline.fit(X_train, y_train)


Out[67]:
Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function remove_non_alphanumeric at 0x7f85f4503730>,
          pass_y=False, validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('body', Pipeline(steps=[('var', VarSelect(keys='body')), ('tfidf', Tfidf...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [68]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('AUC score on unseen examples are: ', roc_auc_score(y_test, y_preds))


AUC score on unseen examples are:  0.868649291267

In [16]:
# train on full dataset
X = train[features]
y = train.label

pipeline_lemma.fit(X, y)


Out[16]:
Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function remove_non_alphanumeric at 0x7f96d5457ae8>,
          pass_y=False, validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('body', Pipeline(steps=[('var', VarSelect(keys='body_bp')), ('tfidf', Tf...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [17]:
# save this model to disk
joblib.dump(pipeline_lemma, os.path.join(basepath, 'data/processed/pipeline_boilerplate_lemma/model_lemma.pkl'))


Out[17]:
['/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_01.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_02.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_03.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_04.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_05.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_06.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_07.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_08.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_09.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_10.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_11.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_12.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_13.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_14.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_15.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_16.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_17.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_18.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_19.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_20.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_21.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_22.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_23.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_24.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_25.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_26.npy']

Pipeline involving Stemming


In [18]:
# Stemming Tokenizer

pipeline_stemming = Pipeline([
    ('strip', strip_non_words),
    ('union', FeatureUnion([
        ('body', Pipeline([
            ('var', VarSelect(keys='body_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=StemTokenizer(),
                                     ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('title', Pipeline([
            ('var', VarSelect(keys='title_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=StemTokenizer(),
                                     ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('url', Pipeline([
            ('var', VarSelect(keys='url_component')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=StemTokenizer(),
                                     ngram_range=(1,2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=50))
        ]))
    ])),
    ('scaler', MinMaxScaler()),
    ('selection', SelectKBest(chi2, k=100)),
    ('model', LogisticRegression())
])

In [14]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

features = ['url_component', 'body', 'title']

X_train = train.iloc[itrain][features]
X_test = train.iloc[itest][features]

y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label

In [15]:
pipeline_stemming.fit(X_train, y_train)


Out[15]:
Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function remove_non_alphanumeric at 0x7fbfd7a7ed08>,
          pass_y=False, validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('body', Pipeline(steps=[('var', VarSelect(keys='body')), ('tfidf', Tfidf...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [16]:
y_preds = pipeline_stemming.predict_proba(X_test)[:, 1]
print('AUC score on unseen examples are: ', roc_auc_score(y_test, y_preds))


AUC score on unseen examples are:  0.869247370828

In [19]:
# train on full dataset
X = train[features]
y = train.label

pipeline_stemming.fit(X, y)


Out[19]:
Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function remove_non_alphanumeric at 0x7f96d5457ae8>,
          pass_y=False, validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('body', Pipeline(steps=[('var', VarSelect(keys='body_bp')), ('tfidf', Tf...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [20]:
# save this model to disk
joblib.dump(pipeline_stemming, os.path.join(basepath, 'data/processed/pipeline_boilerplate_stem/model_stem.pkl'))


Out[20]:
['/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_01.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_02.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_03.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_04.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_05.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_06.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_07.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_08.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_09.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_10.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_11.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_12.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_13.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_14.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_15.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_16.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_17.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_18.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_19.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_20.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_21.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_22.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_23.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_24.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_25.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_stem/model_stem.pkl_26.npy']

Blending


In [21]:
class Blending(object):
    def __init__(self, models):
        self.models = models # dict
        
    def predict(self, X, X_test, y=None):
        cv = KFold(len(X), n_folds=3, shuffle=True, random_state=10)
        
        dataset_blend_train = np.zeros((X.shape[0], len(self.models.keys())))
        dataset_blend_test = np.zeros((X_test.shape[0], len(self.models.keys())))
        
        for index, key in enumerate(self.models.keys()):
            dataset_blend_test_index = np.zeros((X_test.shape[0], len(cv)))
            
            model = self.models[key][1]
            feature_list = self.models[key][0]
            
            print('Training model of type: ', key)
            
            for i , (itrain, itest) in enumerate(cv):
                Xtr = X.iloc[itrain][feature_list]
                ytr = y.iloc[itrain]

                Xte = X.iloc[itest][feature_list]
                yte = y.iloc[itest]

                y_preds = model.predict_proba(Xte)[:, 1]
                
                dataset_blend_train[itest, index] = y_preds
                dataset_blend_test_index[:, i] = model.predict_proba(X_test)[:, 1]
                
            dataset_blend_test[:, index] = dataset_blend_test_index.mean(1)
             
        print('\nBlending')
        clf = LogisticRegression()
        clf.fit(dataset_blend_train, y)
        
        y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
        y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
        
        return y_submission

In [22]:
def stem_tokens(x):
    return ' '.join([sns.stem(word) for word in word_tokenize(x)])

def preprocess_string(s):
    return stem_tokens(s)

class Weights(BaseEstimator, TransformerMixin):
    def __init__(self, weight):
        self.weight = weight
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return self.weight * X

In [23]:
# load all the models from the disk
# pipeline_numeric = joblib.load(os.path.join(basepath, 'data/processed/pipeline_numeric/pipeline_numeric.pkl'))
# pipeline_lemma = joblib.load(os.path.join(basepath, 'data/processed/pipeline_boilerplate_lemma/model_lemma.pkl'))
# pipeline_stemming = joblib.load(os.path.join(basepath, 'data/processed/pipeline_boilerplate_stem/model_stem.pkl'))
pipeline_raw = joblib.load(os.path.join(basepath, 'data/processed/pipeline_raw/model_raw.pkl'))

In [24]:
numeric_features = list(train.select_dtypes(exclude=['object']).columns[1:]) + ['url']
numeric_features.remove('label')

boilerplate_features = ['body_bp', 'title_bp', 'url_component']
raw_features = ['body', 'title', 'h1', 'h2', 'h3', 'h4', 'span', 'a', 'label_',\
       'meta-title', 'meta-description', 'li']

models = {
#     'numeric': [numeric_features, pipeline_numeric],
    'boilerplate_lemma': [boilerplate_features, pipeline_lemma],
    'boilerplate_stem': [boilerplate_features, pipeline_stemming],
    'boilerplate_raw': [raw_features, pipeline_raw]
}

In [26]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

features = list(boilerplate_features) + list(raw_features)

X_train = train.iloc[itrain][features]
X_test = train.iloc[itest][features]

y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label

In [27]:
blend = Blending(models)
y_blend = blend.predict(X_train, X_test, y_train)


Training model of type:  boilerplate_raw
Training model of type:  boilerplate_lemma
Training model of type:  boilerplate_stem

Blending

In [28]:
print('AUC score after blending ', roc_auc_score(y_test, y_blend))


AUC score after blending  0.874679899909

Train on full dataset.


In [33]:
X = train[features]
X_test = test[features]

y = train.label

In [34]:
assert X.shape[1] == X_test.shape[1]

In [35]:
blend = Blending(models)
predictions = blend.predict(X, X_test, y)


Training model of type:  boilerplate_raw
Training model of type:  boilerplate_lemma
Training model of type:  boilerplate_stem

Blending

Submissions


In [36]:
sample_sub['label'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/blend_3.csv'), index=False)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: