Objectives
Build a set of pipelines for the StumbleUpon classification challenge: an XGBoost model over the numerical features (plus a derived URL depth), TF-IDF/SVD models over the boilerplate text (body, title and url) with lemmatization and stemming tokenizers, and finally a blend of the models' out-of-fold predictions with a logistic regression to produce a submission.
In [1]:
import pandas as pd
import numpy as np
import os, sys
import re, json
from urllib.parse import urlparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.cross_validation import KFold
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))
np.random.seed(4)
from data import load_datasets
from models import train_test_split, cross_val_scheme
In [2]:
# Initialize Stemmer
sns = SnowballStemmer(language='english')
In [3]:
train, test, sample_sub = load_datasets.load_dataset()
In [4]:
train['is_news'] = train.is_news.fillna(-999)
test['is_news'] = test.is_news.fillna(-999)
In [11]:
def extract_top_level_domain(url):
    parsed_url = urlparse(url)
    top_level = parsed_url.netloc.split('.')[-1]
    return top_level

def get_tlds(urls):
    return np.array([extract_top_level_domain(url) for url in urls])

train['tlds'] = get_tlds(train.url)
test['tlds'] = get_tlds(test.url)
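The helper simply keeps the last dotted component of the network location. A quick sanity check on hypothetical URLs (not rows from the dataset):

get_tlds(['http://www.example.com/some/page', 'https://blog.example.co.uk/post'])
# -> array(['com', 'uk'], dtype='<U3')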
In [12]:
# One-hot encode the TLDs over the combined train + test vocabulary,
# realigning the indices before concatenating back onto each frame.
ohe = pd.get_dummies(list(train.tlds) + list(test.tlds))
ohe_train = ohe.iloc[:len(train)]
ohe_train.index = train.index
ohe_test = ohe.iloc[len(train):]
ohe_test.index = test.index
train = pd.concat((train, ohe_train), axis=1)
test = pd.concat((test, ohe_test), axis=1)
In [19]:
class NumericalFeatures(BaseEstimator, TransformerMixin):
    """Select the numerical columns and add a derived url_depth feature."""

    @staticmethod
    def url_depth(url):
        # number of non-empty path segments, e.g. '/a/b/' -> 2
        parsed_url = urlparse(url)
        path = parsed_url.path
        return len(list(filter(lambda x: len(x) > 0, path.split('/'))))

    @staticmethod
    def get_url_depths(urls):
        return np.array([NumericalFeatures.url_depth(url) for url in urls])

    def __init__(self, numerical_features):
        self.features = numerical_features

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        df = df.copy()
        df['url_depth'] = self.get_url_depths(df.url)
        numeric_features = self.features + ['url_depth']
        return df[numeric_features]
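The transformer can be exercised on its own; the tiny frame below is a made-up example (hypothetical columns), just to show that transform returns the selected numeric columns plus the derived url_depth.

demo = pd.DataFrame({
    'url': ['http://example.com/a/b', 'http://example.com/'],
    'numeric_1': [1.0, 2.0],
})
NumericalFeatures(['numeric_1']).transform(demo)
#    numeric_1  url_depth
# 0        1.0          2
# 1        2.0          0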
Split into training and test sets.
In [7]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}
itrain, itest = train_test_split.tr_ts_split(len(train), **params)
X_train = train.iloc[itrain]
X_test = train.iloc[itest]
y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label
In [8]:
numeric_features = list(train.select_dtypes(exclude=['object']).columns[1:])
numeric_features.remove('label')
In [9]:
pipeline = Pipeline([
    ('feature_extractor', NumericalFeatures(numeric_features)),
    ('imputer', Imputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(learning_rate=.08, max_depth=6))
])
In [10]:
pipeline.fit(X_train, y_train)
Out[10]:
In [12]:
# cross validation
params = {
    'n_folds': 5,
    'shuffle': True,
    'random_state': 3
}

scores, mean_score, std_score = cross_val_scheme.cv_scheme(pipeline, X_train, y_train, train.iloc[itrain].is_news, **params)

print('CV Scores: %s' % (scores))
print('Mean CV Score: %f' % (mean_score))
print('Std CV Score: %f' % (std_score))
In [13]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on the test set ', roc_auc_score(y_test, y_preds))
In [17]:
joblib.dump(pipeline, os.path.join(basepath, 'data/processed/pipeline_numeric/pipeline_numeric.pkl'))
Out[17]:
In [7]:
train = joblib.load(os.path.join(basepath, 'data/processed/train_raw_content.pkl'))
test = joblib.load(os.path.join(basepath, 'data/processed/test_raw_content.pkl'))
In [5]:
train_json = list(map(json.loads, train.boilerplate))
test_json = list(map(json.loads, test.boilerplate))
In [6]:
train['boilerplate'] = train_json
test['boilerplate'] = test_json
In [9]:
def get_component(boilerplate, key):
    """
    Get the value for a particular key in the parsed boilerplate JSON;
    if the key is present return its value, otherwise return an empty string.

    boilerplate: list of boilerplate documents (parsed JSON dicts)
    key: key for which we want to fetch the value, e.g. body, title or url
    """
    return np.array([bp[key] if key in bp and bp[key] else u'' for bp in boilerplate])
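A quick illustration of the helper on a hand-made list of parsed boilerplate dicts (hypothetical values, not rows from the dataset):

demo_bp = [{'title': 'A hypothetical title', 'body': None},
           {'body': 'some body text'}]
get_component(demo_bp, 'title')
# -> array(['A hypothetical title', ''], dtype='<U20')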
In [10]:
train['body_bp'] = get_component(train.boilerplate, 'body')
test['body_bp'] = get_component(test.boilerplate, 'body')
train['title_bp'] = get_component(train.boilerplate, 'title')
test['title_bp'] = get_component(test.boilerplate, 'title')
train['url_component'] = get_component(train.boilerplate, 'url')
test['url_component'] = get_component(test.boilerplate, 'url')
In [11]:
class LemmaTokenizer(object):
    """Tokenize a document and lemmatize each token (WordNet)."""
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


class VarSelect(BaseEstimator, TransformerMixin):
    """Select a column (or columns) from a DataFrame inside a Pipeline."""
    def __init__(self, keys):
        self.keys = keys

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        return df[self.keys]


class StemTokenizer(object):
    """Tokenize a document and stem each token (Snowball)."""
    def __init__(self):
        self.sns = sns

    def __call__(self, doc):
        return [self.sns.stem(t) for t in word_tokenize(doc)]
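These callables plug straight into TfidfVectorizer's tokenizer argument. A minimal standalone check on a toy sentence (not dataset text), assuming the NLTK punkt and wordnet data are installed:

LemmaTokenizer()('The cats were running across the bridges')
# e.g. ['The', 'cat', 'were', 'running', 'across', 'the', 'bridge']
StemTokenizer()('The cats were running across the bridges')
# e.g. ['the', 'cat', 'were', 'run', 'across', 'the', 'bridg']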
In [12]:
def remove_non_alphanumeric(df):
    # replace every run of non-alphanumeric characters with a single space
    return df.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

strip_non_words = FunctionTransformer(remove_non_alphanumeric, validate=False)
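FunctionTransformer wraps the stateless cleaning function so it can sit at the head of a Pipeline; on a toy frame (hypothetical value) it behaves like this:

demo = pd.DataFrame({'title_bp': ['Hello, world!! 42']})
strip_non_words.transform(demo)
# the title_bp column becomes 'Hello world 42'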
In [15]:
# Lemma Tokenizer
pipeline_lemma = Pipeline([
    ('strip', strip_non_words),
    ('union', FeatureUnion([
        ('body', Pipeline([
            ('var', VarSelect(keys='body_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=LemmaTokenizer(),
                                      ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('title', Pipeline([
            ('var', VarSelect(keys='title_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=LemmaTokenizer(),
                                      ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('url', Pipeline([
            ('var', VarSelect(keys='url_component')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=LemmaTokenizer(),
                                      ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=50))
        ]))
    ])),
    ('scaler', MinMaxScaler()),
    ('selection', SelectKBest(chi2, k=100)),
    ('model', LogisticRegression())
])
In [14]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}
itrain, itest = train_test_split.tr_ts_split(len(train), **params)
features = ['url_component', 'body_bp', 'title_bp']
X_train = train.iloc[itrain][features]
X_test = train.iloc[itest][features]
y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label
In [67]:
pipeline_lemma.fit(X_train, y_train)
Out[67]:
In [68]:
y_preds = pipeline_lemma.predict_proba(X_test)[:, 1]
print('AUC score on unseen examples: ', roc_auc_score(y_test, y_preds))
In [16]:
# train on full dataset
X = train[features]
y = train.label
pipeline_lemma.fit(X, y)
Out[16]:
In [17]:
# save this model to disk
joblib.dump(pipeline_lemma, os.path.join(basepath, 'data/processed/pipeline_boilerplate_lemma/model_lemma.pkl'))
Out[17]:
In [18]:
# Stemming Tokenizer
pipeline_stemming = Pipeline([
    ('strip', strip_non_words),
    ('union', FeatureUnion([
        ('body', Pipeline([
            ('var', VarSelect(keys='body_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=StemTokenizer(),
                                      ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('title', Pipeline([
            ('var', VarSelect(keys='title_bp')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=StemTokenizer(),
                                      ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=100))
        ])),
        ('url', Pipeline([
            ('var', VarSelect(keys='url_component')),
            ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=StemTokenizer(),
                                      ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
            ('svd', TruncatedSVD(n_components=50))
        ]))
    ])),
    ('scaler', MinMaxScaler()),
    ('selection', SelectKBest(chi2, k=100)),
    ('model', LogisticRegression())
])
In [14]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}
itrain, itest = train_test_split.tr_ts_split(len(train), **params)
features = ['url_component', 'body_bp', 'title_bp']
X_train = train.iloc[itrain][features]
X_test = train.iloc[itest][features]
y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label
In [15]:
pipeline_stemming.fit(X_train, y_train)
Out[15]:
In [16]:
y_preds = pipeline_stemming.predict_proba(X_test)[:, 1]
print('AUC score on unseen examples: ', roc_auc_score(y_test, y_preds))
In [19]:
# train on full dataset
X = train[features]
y = train.label
pipeline_stemming.fit(X, y)
Out[19]:
In [20]:
# save this model to disk
joblib.dump(pipeline_stemming, os.path.join(basepath, 'data/processed/pipeline_boilerplate_stem/model_stem.pkl'))
Out[20]:
In [21]:
class Blending(object):
    """Blend the out-of-fold predictions of several pipelines
    with a logistic regression meta-model."""

    def __init__(self, models):
        self.models = models  # dict: name -> [feature_list, pipeline]

    def predict(self, X, X_test, y=None):
        cv = KFold(len(X), n_folds=3, shuffle=True, random_state=10)

        dataset_blend_train = np.zeros((X.shape[0], len(self.models.keys())))
        dataset_blend_test = np.zeros((X_test.shape[0], len(self.models.keys())))

        for index, key in enumerate(self.models.keys()):
            dataset_blend_test_index = np.zeros((X_test.shape[0], len(cv)))

            feature_list = self.models[key][0]
            model = self.models[key][1]
            print('Training model of type: ', key)

            for i, (itrain, itest) in enumerate(cv):
                Xtr = X.iloc[itrain][feature_list]
                ytr = y.iloc[itrain]
                Xte = X.iloc[itest][feature_list]

                # fit on the training fold, predict the held-out fold
                model.fit(Xtr, ytr)
                y_preds = model.predict_proba(Xte)[:, 1]
                dataset_blend_train[itest, index] = y_preds
                dataset_blend_test_index[:, i] = model.predict_proba(X_test)[:, 1]

            dataset_blend_test[:, index] = dataset_blend_test_index.mean(1)

        print('\nBlending')
        clf = LogisticRegression()
        clf.fit(dataset_blend_train, y)

        y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
        # rescale predictions to [0, 1]
        y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
        return y_submission
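The scheme follows the classic two-level stacking recipe: each base pipeline produces out-of-fold probabilities for the training rows (one column per model) and fold-averaged probabilities for the test rows, and a logistic regression is then fit on those columns. Below is a minimal self-contained sketch of the same idea on synthetic data with two generic classifiers; the names and data are purely illustrative, and it assumes the same (older) scikit-learn API used throughout this notebook.

# illustrative only: two base models blended exactly as in Blending.predict
X_toy = pd.DataFrame(np.random.randn(200, 5), columns=list('abcde'))
y_toy = pd.Series((X_toy['a'] + X_toy['b'] > 0).astype(int))
X_toy_test = pd.DataFrame(np.random.randn(50, 5), columns=list('abcde'))

base_models = [LogisticRegression(), RandomForestClassifier(n_estimators=20)]
cv_toy = KFold(len(X_toy), n_folds=3, shuffle=True, random_state=10)

blend_train = np.zeros((len(X_toy), len(base_models)))
blend_test = np.zeros((len(X_toy_test), len(base_models)))

for index, model in enumerate(base_models):
    fold_preds = np.zeros((len(X_toy_test), len(cv_toy)))
    for i, (itr, ite) in enumerate(cv_toy):
        model.fit(X_toy.iloc[itr], y_toy.iloc[itr])
        blend_train[ite, index] = model.predict_proba(X_toy.iloc[ite])[:, 1]
        fold_preds[:, i] = model.predict_proba(X_toy_test)[:, 1]
    blend_test[:, index] = fold_preds.mean(1)

meta = LogisticRegression().fit(blend_train, y_toy)
toy_predictions = meta.predict_proba(blend_test)[:, 1]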
In [22]:
def stem_tokens(x):
    return ' '.join([sns.stem(word) for word in word_tokenize(x)])

def preprocess_string(s):
    return stem_tokens(s)

class Weights(BaseEstimator, TransformerMixin):
    """Multiply a feature block by a constant weight inside a FeatureUnion."""
    def __init__(self, weight):
        self.weight = weight

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return self.weight * X
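Weights is a small helper for scaling one branch of a FeatureUnion relative to another. A hypothetical use (the 0.5 weight and the step names are purely illustrative) might look like:

weighted_title = Pipeline([
    ('var', VarSelect(keys='title_bp')),
    ('tfidf', TfidfVectorizer(min_df=3)),
    ('svd', TruncatedSVD(n_components=50)),
    ('weight', Weights(0.5))   # down-weight the title block
])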
In [23]:
# load all the models from the disk
# pipeline_numeric = joblib.load(os.path.join(basepath, 'data/processed/pipeline_numeric/pipeline_numeric.pkl'))
# pipeline_lemma = joblib.load(os.path.join(basepath, 'data/processed/pipeline_boilerplate_lemma/model_lemma.pkl'))
# pipeline_stemming = joblib.load(os.path.join(basepath, 'data/processed/pipeline_boilerplate_stem/model_stem.pkl'))
pipeline_raw = joblib.load(os.path.join(basepath, 'data/processed/pipeline_raw/model_raw.pkl'))
In [24]:
numeric_features = list(train.select_dtypes(exclude=['object']).columns[1:]) + ['url']
numeric_features.remove('label')
boilerplate_features = ['body_bp', 'title_bp', 'url_component']
raw_features = ['body', 'title', 'h1', 'h2', 'h3', 'h4', 'span', 'a', 'label_',
                'meta-title', 'meta-description', 'li']

models = {
    # 'numeric': [numeric_features, pipeline_numeric],
    'boilerplate_lemma': [boilerplate_features, pipeline_lemma],
    'boilerplate_stem': [boilerplate_features, pipeline_stemming],
    'boilerplate_raw': [raw_features, pipeline_raw]
}
In [26]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}
itrain, itest = train_test_split.tr_ts_split(len(train), **params)
features = list(boilerplate_features) + list(raw_features)
X_train = train.iloc[itrain][features]
X_test = train.iloc[itest][features]
y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label
In [27]:
blend = Blending(models)
y_blend = blend.predict(X_train, X_test, y_train)
In [28]:
print('AUC score after blending ', roc_auc_score(y_test, y_blend))
In [33]:
X = train[features]
X_test = test[features]
y = train.label
In [34]:
assert X.shape[1] == X_test.shape[1]
In [35]:
blend = Blending(models)
predictions = blend.predict(X, X_test, y)
In [36]:
sample_sub['label'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/blend_3.csv'), index=False)