Objective of the Notebook: parse the raw HTML pages from the StumbleUpon classification challenge, extract text from a fixed set of tags, and train a logistic regression on TF-IDF + LSA features of that text.
In [1]:
import numpy as np
import pandas as pd
import os, sys
import re, json
import warnings
warnings.filterwarnings('ignore')
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.metrics import roc_auc_score
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the standalone package
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from collections import defaultdict
basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))
np.random.seed(2)
from data import load_datasets
from models import train_test_split, cross_val_scheme
In [2]:
# initialize the Snowball and Porter stemmers
sns = SnowballStemmer(language='english')
por = PorterStemmer()
In [3]:
# add some custom stopwords to the list of stopwords
custom_stopwords = ['i', 'http', 'www']
ENGLISH_STOP_WORDS = set(ENGLISH_STOP_WORDS) | set(custom_stopwords)
In [4]:
# load datasets
train, test, sample_sub = load_datasets.load_dataset()
In [5]:
# each boilerplate entry is a JSON string; decode it into a dict per row
train['boilerplate'] = list(map(json.loads, train.boilerplate))
test['boilerplate'] = list(map(json.loads, test.boilerplate))
In [6]:
def decompose_boilerplate(boilerplate_json, key='body'):
    # pull a single key ('body' or 'title') out of each boilerplate dict,
    # falling back to an empty string when the key is missing or empty
    return [bp[key] if key in bp and bp[key] else u'' for bp in boilerplate_json]

train_body = decompose_boilerplate(train.boilerplate)
train_title = decompose_boilerplate(train.boilerplate, key='title')

test_body = decompose_boilerplate(test.boilerplate)
test_title = decompose_boilerplate(test.boilerplate, key='title')
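Each boilerplate record is a small JSON object; decompose_boilerplate pulls one key out per row. A minimal sketch on made-up records (the exact keys in the real files are not shown here, so treat these dicts as illustrative):

toy = [{'title': 'a recipe', 'body': 'mix flour and water'}, {'title': None}, {}]
print(decompose_boilerplate(toy))                # ['mix flour and water', '', '']
print(decompose_boilerplate(toy, key='title'))   # ['a recipe', '', '']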
In [7]:
train['body'] = train_body
train['title'] = train_title
test['body'] = test_body
test['title'] = test_title
In [4]:
class Parse:
    # tags whose text content becomes a feature column;
    # 'meta-title' and 'meta-description' refer to <meta name="..."> tags
    TAGS = ['h1', 'h2', 'h3', 'h4', 'span',
            'a', 'label_', 'meta-title', 'meta-description', 'li']

    def __init__(self, key='urlid'):
        self.key = key

    @staticmethod
    def read_html(urlid):
        path = os.path.join(basepath, 'data/raw/raw_content/' + str(urlid))
        with open(path, 'r', encoding='utf-8', errors='ignore') as infile:
            return infile.read()

    @staticmethod
    def parse_html(html):
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def remove_tags(html, tags):
        # strip unwanted elements (e.g. <style>, <script>) from the parse tree
        for tag in tags:
            for el in html.find_all(tag):
                el.extract()
        return html

    @staticmethod
    def tag_content(html, tag):
        def process(s):
            s = s.lower().strip()
            return re.sub(r'[^a-z0-9]+', ' ', s)

        # 'meta-title' -> tag name 'meta' with attribute name='title'
        tags_component = tag.split('-')
        tag_name = tags_component[0]
        attrs = {}
        if len(tags_component) > 1:
            attrs['name'] = tags_component[1]

        # only the first matching element is used
        for el in html.find_all(tag_name, attrs):
            if attrs:
                return process(el.get('content', ''))
            else:
                return process(el.text) if el.text else ''
        return ''  # could not find the tag

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        urlids = df[self.key]
        tags_content_dict = defaultdict(list)
        for urlid in urlids.values:
            html = self.read_html(urlid)
            html = self.parse_html(html)
            html = self.remove_tags(html, ['style', 'script'])
            for tag in self.TAGS:
                tags_content_dict[tag].append(self.tag_content(html, tag))
        for tag in self.TAGS:
            df[tag] = tags_content_dict[tag]
        return df
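A quick way to sanity-check tag_content is to run it on a hand-written snippet (illustrative only; note how process() lowercases and replaces non-alphanumerics with spaces):

snippet = Parse.parse_html('<meta name="description" content="A Food Blog!"><h1>Best Pancakes</h1>')
print(Parse.tag_content(snippet, 'h1'))                # 'best pancakes'
print(Parse.tag_content(snippet, 'meta-description'))  # 'a food blog ' (trailing space from the '!')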
In [9]:
# parse all the raw content
parse_train = Parse()
train = parse_train.transform(train)
In [10]:
parse_test = Parse()
test = parse_test.transform(test)
In [11]:
# dump parsed content to the disk
joblib.dump(train, os.path.join(basepath, 'data/processed/train_raw_content.pkl'))
joblib.dump(test, os.path.join(basepath, 'data/processed/test_raw_content.pkl'))
In [5]:
# load pickle from the disk
train = joblib.load(os.path.join(basepath, 'data/processed/train_raw_content.pkl'))
test = joblib.load(os.path.join(basepath, 'data/processed/test_raw_content.pkl'))
In [6]:
# keep the text columns added above plus the label; .copy() avoids a SettingWithCopyWarning
feature_df = train[train.columns[26:]].copy()
feature_df['label'] = train.label
In [7]:
features = list(train.columns[27:])
In [15]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}
itrain, itest = train_test_split.tr_ts_split(len(train), **params)

X_train = feature_df.iloc[itrain][features]
X_test = feature_df.iloc[itest][features]
y_train = feature_df.iloc[itrain].label
y_test = feature_df.iloc[itest].label
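tr_ts_split lives in this project's models package and isn't reproduced here; judging by the parameters it receives, a rough sklearn equivalent over row indices (an assumption about its behavior, not the project's code) would be:

from sklearn.model_selection import train_test_split as sk_split
itrain, itest = sk_split(np.arange(len(train)), test_size=0.2, random_state=2, stratify=train.is_news)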
Task: build a single scikit-learn pipeline over the parsed text columns (TF-IDF → SVD → feature selection → logistic regression) and evaluate it.
In [8]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
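LemmaTokenizer assumes NLTK's punkt and wordnet resources are available (nltk.download('punkt'); nltk.download('wordnet'); plus omw-1.4 on newer NLTK). A quick check; note that the lemmatizer defaults to treating every token as a noun, so verbs pass through unchanged:

print(LemmaTokenizer()('The pancakes were tasty'))  # ['The', 'pancake', 'were', 'tasty']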
In [9]:
class VarSelect(BaseEstimator, TransformerMixin):
    # select a single column (or list of columns) from the incoming DataFrame
    def __init__(self, keys):
        self.keys = keys

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        return df[self.keys]
In [10]:
class Weights(BaseEstimator, TransformerMixin):
    # multiply a branch's output by a constant factor
    def __init__(self, weight):
        self.weight = weight

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return self.weight * X
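Weights just scales a branch's output by a constant, the usual trick for boosting one FeatureUnion branch relative to the others before a regularized model:

w = Weights(weight=5)
print(w.transform(np.array([[0.2, -0.1]])))  # [[ 1.  -0.5]]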
In [11]:
def stem_tokens(x):
    return ' '.join([sns.stem(word) for word in word_tokenize(x)])

# passed to TfidfVectorizer as the preprocessor: stems each document before tokenization
def preprocess_string(s):
    return stem_tokens(s)
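Because preprocess_string is used as the TfidfVectorizer preprocessor below, every document is Snowball-stemmed before tokenization (and the stemmed tokens are then lemmatized by LemmaTokenizer). For example:

print(preprocess_string('running quickly'))  # 'run quick'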
In [16]:
def remove_non_alphanumeric(df):
    return df.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

strip_non_words = FunctionTransformer(remove_non_alphanumeric, validate=False)

def text_branch(key, n_components=50, weight=None):
    # every branch is identical up to the column it reads, the number of SVD
    # components, and an optional constant weight on its output
    steps = [
        ('var', VarSelect(keys=key)),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                  tokenizer=LemmaTokenizer(),
                                  stop_words=ENGLISH_STOP_WORDS,
                                  preprocessor=preprocess_string,
                                  strip_accents='unicode',
                                  norm='l2', sublinear_tf=True)),
        ('svd', TruncatedSVD(n_components=n_components)),
    ]
    if weight is not None:
        steps.append(('weight', Weights(weight=weight)))
    return Pipeline(steps)

pipeline = Pipeline([
    ('strip', strip_non_words),
    ('union', FeatureUnion([
        ('h1_', text_branch('h1')),
        ('h2_', text_branch('h2')),
        ('h3_', text_branch('h3')),
        ('h4_', text_branch('h4')),
        ('meta_title', text_branch('meta-title', weight=5)),
        ('meta_description', text_branch('meta-description', weight=3)),
        ('span_', text_branch('span')),
        ('lsa_body', text_branch('body', n_components=100, weight=20)),
        ('lsa_title', text_branch('title', weight=5)),
    ])),
    ('scale', MinMaxScaler()),
    ('feat', SelectKBest(chi2, k=100)),
    ('model', LogisticRegression())
])
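Two details about the tail of the pipeline are worth spelling out. First, chi2 requires non-negative features while TruncatedSVD outputs can be negative, which is why MinMaxScaler sits between the union and SelectKBest. Second, per-column min-max scaling is invariant to multiplying a column by a positive constant, so the Weights applied inside the branches are effectively cancelled by the scaler here; they would only change anything if the scaler were removed or replaced. A small demonstration of the chi2 constraint:

X_neg = np.array([[-1.0, 2.0], [3.0, 0.5]])
try:
    chi2(X_neg, [0, 1])
except ValueError as err:
    print(err)  # chi2 rejects negative feature values
print(chi2(MinMaxScaler().fit_transform(X_neg), [0, 1]))  # fine once scaled to [0, 1]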
In [17]:
pipeline.fit(X_train, y_train)
Private Leaderboard Score: 0.87341
In [36]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on unseen examples: ', roc_auc_score(y_test, y_preds))
In [13]:
X = feature_df[features]
y = feature_df.label
In [18]:
# train on full dataset
pipeline.fit(X, y)
In [19]:
# store this model on the disk
joblib.dump(pipeline, os.path.join(basepath, 'data/processed/pipeline_raw/model_raw.pkl'))
In [39]:
predictions = pipeline.predict_proba(test[features])[:, 1]
In [40]:
sample_sub['label'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/ml_pipeline_chi2.csv'), index=False)