Objectives of the notebook

  • Learn how to create new pipelines.
  • Feature engineering
    • Extract text from different HTML tags and weight each tag differently to introduce some domain knowledge.
  • Text mining
    • Parse the raw HTML to extract text content and build features from it.

In [1]:
import numpy as np
import pandas as pd
import os, sys
import re, json

import warnings
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.metrics import roc_auc_score
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib  # older scikit-learn versions bundle joblib here

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from bs4 import BeautifulSoup
from collections import defaultdict

basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(2)

from data import load_datasets
from models import train_test_split, cross_val_scheme

In [2]:
# initialize the stemmers (Snowball and Porter)

sns = SnowballStemmer(language='english')
por = PorterStemmer()

In [3]:
# add some custom stopwords to the default English stop-word list
custom_stopwords = ['i', 'http', 'www']
ENGLISH_STOP_WORDS = set(ENGLISH_STOP_WORDS) | set(custom_stopwords)

In [4]:
# load datasets
train, test, sample_sub = load_datasets.load_dataset()

In [5]:
train['boilerplate'] = list(map(json.loads, train.boilerplate))
test['boilerplate'] = list(map(json.loads, test.boilerplate))

In [6]:
def decompose_boilerplate(boilerplate_json, key='body'):
    """Pull a single field ('body', 'title' or 'url') out of each boilerplate dict, defaulting to ''."""
    return [bp[key] if key in bp and bp[key] else u'' for bp in boilerplate_json]
    
train_body = decompose_boilerplate(train.boilerplate)
train_title = decompose_boilerplate(train.boilerplate, key='title')

test_body = decompose_boilerplate(test.boilerplate)
test_title = decompose_boilerplate(test.boilerplate, 'title')
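
A quick sanity check on the decomposed fields is worthwhile before moving on; each boilerplate dict typically carries the keys 'title', 'body' and 'url', but the exact content depends on the crawl (the output below is illustrative):

print(sorted(train.boilerplate.iloc[0].keys()))   # typically ['body', 'title', 'url']
print(train_title[0][:80])
print(train_body[0][:80])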

In [7]:
train['body'] = train_body
train['title'] = train_title

test['body'] = test_body
test['title'] = test_title

In [4]:
class Parse():
    TAGS = ['h1', 'h2', 'h3', 'h4', 'span',\
            'a', 'label_', 'meta-title', 'meta-description','li']
    
    @staticmethod
    def read_html(urlid):
        # raw pages live under data/raw/raw_content/, one file per urlid
        with open(os.path.join(basepath, 'data/raw/raw_content/' + str(urlid)), 'r', encoding='utf-8', errors='ignore') as infile:
            html = infile.read()
        return html
    
    @staticmethod
    def parse_html(html):
        return BeautifulSoup(html, 'lxml')
    
    @staticmethod
    def remove_tags(html, tags):
        for tag in tags:
            for el in html.find_all(tag):
                el.extract()

        return html
 
    @staticmethod
    def tag_content(html, tag):
        def process(s):
            s = s.lower()
            s = s.strip()
            s = re.sub(r'[^a-z0-9]+', ' ', s)
            return s

        tags_component = tag.split('-')
        attrs = {}
        
        if len(tags_component) > 1:
            tag_name = tags_component[0]
            attrs['name'] = tags_component[1]
        else:
            tag_name = tags_component[0]
        
        # only the first matching element is used; meta tags expose their text through the
        # 'content' attribute, every other tag through its visible text
        for el in html.find_all(tag_name, attrs):
            if len(attrs.keys()) > 0:
                return process(el.get('content', ''))
            else:
                return process(el.text) if el.text else ''
        return ''  # could not find the tag
        
    def __init__(self, key='urlid'):
        self.key = key
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        urlids = df[self.key]
        tags_content_dict = defaultdict(list)
        
        for urlid in urlids.values:
            html = self.read_html(urlid)
            html = self.parse_html(html)
            html = self.remove_tags(html, ['style', 'script'])
            
            for tag in self.TAGS:
                tags_content_dict[tag].append(self.tag_content(html, tag))
        
        for tag in self.TAGS:
            df[tag] = tags_content_dict[tag]
        
        return df
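
Before transforming the full frame it helps to eyeball the parser on a single page. A minimal check, assuming train carries the urlid column that Parse uses as its key:

# illustrative single-page check; the printed content depends on the crawled page
sample_id = train.urlid.iloc[0]
soup = Parse.parse_html(Parse.read_html(sample_id))
soup = Parse.remove_tags(soup, ['style', 'script'])
print(Parse.tag_content(soup, 'meta-title'))
print(Parse.tag_content(soup, 'h1')[:80])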

In [9]:
# parse all the raw content
parse_train = Parse()
train = parse_train.transform(train)

In [10]:
parse_test = Parse()
test = parse_test.transform(test)

In [11]:
# dump parsed content to the disk
joblib.dump(train, os.path.join(basepath, 'data/processed/train_raw_content.pkl'))
joblib.dump(test, os.path.join(basepath, 'data/processed/test_raw_content.pkl'))


Out[11]:
['/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl_01.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl_02.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl_03.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl_04.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl_05.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl_06.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/test_raw_content.pkl_07.npy']

In [5]:
# load pickle from the disk
train = joblib.load(os.path.join(basepath, 'data/processed/train_raw_content.pkl'))
test = joblib.load(os.path.join(basepath, 'data/processed/test_raw_content.pkl'))

In [6]:
feature_df = train[train.columns[26:]].copy()  # copy to avoid mutating a slice of train
feature_df['label'] = train.label

In [7]:
features = list(train.columns[27:])
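
The positional slices above rely on the text columns having been appended after the original raw columns, so it is worth printing the feature list once to confirm (the exact contents depend on the column order of the loaded frame):

# expected to look something like:
# ['body', 'title', 'h1', 'h2', 'h3', 'h4', 'span', 'a', 'label_', 'meta-title', 'meta-description', 'li']
print(features)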

In [15]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

X_train = feature_df.iloc[itrain][features]
X_test = feature_df.iloc[itest][features]

y_train = feature_df.iloc[itrain].label
y_test = feature_df.iloc[itest].label

Task

  • Text preprocessing
    • Lowercase all the strings, remove stopwords and stem the words.
  • Decompose the boilerplate into body, title and url.
  • Create text features for these parts and weight them differently.

In [8]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
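
A quick illustration of what the tokenizer produces (assumes the NLTK punkt and wordnet data are downloaded; output may differ slightly across NLTK versions):

LemmaTokenizer()('The cats are running across fields')
# -> ['The', 'cat', 'are', 'running', 'across', 'field']  (default noun lemmatization, so 'running' is untouched)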

In [9]:
class VarSelect(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        return df[self.keys]

In [10]:
class Weights(BaseEstimator, TransformerMixin):
    def __init__(self, weight):
        self.weight = weight
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return self.weight * X
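
Weights simply rescales a branch's output before FeatureUnion concatenates everything, which is how the domain-knowledge weighting enters the model. The same effect can be obtained with FeatureUnion's built-in transformer_weights argument; a minimal sketch with illustrative column names and weights that mirror the final pipeline:

# sketch only -- equivalent to appending Weights(weight=...) to each branch
union = FeatureUnion(
    [
        ('lsa_title', Pipeline([('var', VarSelect(keys='title')),
                                ('tfidf', TfidfVectorizer()),
                                ('svd', TruncatedSVD(n_components=50))])),
        ('lsa_body', Pipeline([('var', VarSelect(keys='body')),
                               ('tfidf', TfidfVectorizer()),
                               ('svd', TruncatedSVD(n_components=100))])),
    ],
    transformer_weights={'lsa_title': 5.0, 'lsa_body': 20.0},
)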

In [11]:
def stem_tokens(x):
    return ' '.join([sns.stem(word) for word in word_tokenize(x)])

def preprocess_string(s):
    return stem_tokens(s)
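
A quick look at what the preprocessor does to a raw sentence (assumes the NLTK punkt data is available; exact stems depend on the stemmer version):

preprocess_string('Running dogs were barking loudly near the gardens')
# -> roughly 'run dog were bark loud near the garden'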

In [16]:
def remove_non_alphanumeric(df):
    return df.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

strip_non_words = FunctionTransformer(remove_non_alphanumeric, validate=False)

def text_branch(key, n_components=50, weight=None):
    """TF-IDF -> LSA (TruncatedSVD) branch over a single text column, optionally re-weighted."""
    steps = [
        ('var', VarSelect(keys=key)),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=2, tokenizer=LemmaTokenizer(),
                                  stop_words=ENGLISH_STOP_WORDS, preprocessor=preprocess_string,
                                  strip_accents='unicode', norm='l2', sublinear_tf=True)),
        ('svd', TruncatedSVD(n_components=n_components))
    ]
    if weight is not None:
        steps.append(('weight', Weights(weight=weight)))
    return Pipeline(steps)

pipeline = Pipeline([
    ('strip', strip_non_words),
    ('union', FeatureUnion([
        ('h1_', text_branch('h1')),
        ('h2_', text_branch('h2')),
        ('h3_', text_branch('h3')),
        ('h4_', text_branch('h4')),
        ('meta_title', text_branch('meta-title', weight=5)),
        ('meta_description', text_branch('meta-description', weight=3)),
        ('span_', text_branch('span')),
        ('lsa_body', text_branch('body', n_components=100, weight=20)),
        ('lsa_title', text_branch('title', weight=5)),
    ])),
    ('scale', MinMaxScaler()),           # map the union output to [0, 1] so chi2 sees non-negative values
    ('feat', SelectKBest(chi2, k=100)),  # keep the 100 most informative components
    ('model', LogisticRegression())
])
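
Before fitting on a single split, the whole pipeline can be cross-validated as a sanity check; a minimal sketch, assuming the X_train/y_train split defined above and a scikit-learn version that ships sklearn.model_selection (older releases expose cross_val_score under sklearn.cross_validation):

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=3)  # can take a while
print('CV ROC AUC: %.4f +/- %.4f' % (cv_scores.mean(), cv_scores.std()))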

In [17]:
pipeline.fit(X_train, y_train)


Private Leaderboard Score: 0.87341


In [36]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on unseen examples: ', roc_auc_score(y_test, y_preds))


ROC AUC score on unseen examples:  0.861660971934

In [13]:
X = feature_df[features]
y = feature_df.label

In [18]:
# train on full dataset
pipeline.fit(X, y)


Out[18]:
Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function remove_non_alphanumeric at 0x7ff03e056ea0>,
          pass_y=False, validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('h1_', Pipeline(steps=[('var', VarSelect(keys='h1')), ('tfidf', TfidfVec...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [19]:
# store this model on the disk
joblib.dump(pipeline, os.path.join(basepath, 'data/processed/pipeline_raw/model_raw.pkl'))


Out[19]:
['/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_01.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_02.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_03.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_04.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_05.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_06.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_07.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_08.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_09.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_10.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_11.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_12.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_13.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_14.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_15.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_16.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_17.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_18.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_19.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_20.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_21.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_22.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_23.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_24.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_25.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_26.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_27.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_28.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_29.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_30.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_31.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_32.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_33.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_34.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_35.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_36.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_37.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_38.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_39.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_40.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_41.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_42.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_43.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_44.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_45.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_46.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_47.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_48.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_49.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_50.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_51.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_52.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_53.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_54.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_55.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_raw/model_raw.pkl_56.npy']

In [39]:
predictions = pipeline.predict_proba(test[features])[:, 1]

Submission


In [40]:
sample_sub['label'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/ml_pipeline_chi2.csv'), index=False)
