Data presented to us has both numerical as well as text based features.
Objectives of this notebook.
Whether numerical features are any significant or not.
Is boilerplate code sufficient enough to capture detailed intricacies in the data ?
Evaluation Metric - AUC ( Area Under Curve )
In [187]:
%matplotlib inline
# load libraries
import pandas as pd
import numpy as np
import os
import sys
import re, json
from urllib.parse import urlparse
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('poster')
import warnings
warnings.filterwarnings('ignore')
# set seed
np.random.seed(1)
basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))
from models import train_test_split, cross_val_scheme
In [189]:
# initialize stemmers
sn_stem = SnowballStemmer(language='english')
p_stem = PorterStemmer()
In [ ]:
train.head(2)
In [2]:
# load files
train = pd.read_csv(os.path.join(basepath, 'data/raw/train.tsv'), delimiter='\t')
test = pd.read_csv(os.path.join(basepath, 'data/raw/test.tsv'), delimiter='\t')
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))
In [ ]:
test.head(2)
In [ ]:
sample_sub.head()
In [3]:
# remove urlid from the train and test and store them in separate variable
def fetch_urlid(data):
return data['urlid']
def delete_urlid(data):
del data['urlid']
train_urlid = fetch_urlid(train)
test_urlid = fetch_urlid(test)
delete_urlid(train)
delete_urlid(test)
Helper Functions
In [4]:
def encode_variable(train, test):
"""
Convert categorical variable to numerical form
train: Values of the variable in the training set
test: Values of the variable in the test set
"""
data = pd.concat((train, test), axis=0)
lbl = LabelEncoder()
lbl.fit(data)
train_ = lbl.transform(train)
test_ = lbl.transform(test)
return train_, test_
def store(filename, data):
"""
Pickle data onto disk
filename: filename that you want to give to this dump
data: actual data that you want to dump.
"""
import pickle
with open(os.path.join(basepath, 'data/processed/') + filename, 'wb') as outfile:
pickle.dump(data, outfile, protocol=pickle.HIGHEST_PROTOCOL)
outfile.close()
def load(filename):
"""
Load data from disk
filename: filename of the pickled data that you want to load
"""
import pickle
with open(os.path.join(basepath, 'data/processed/') + filename, 'rb') as infile:
data = pickle.load(infile)
infile.close()
return data
In [ ]:
train.columns
Let's see the url variable.
Lets create a variable which counts the depth in the url.
e.g. www.guardian.co.uk/a has depth 1, whereas www.guardian.co.uk/a/b has depth 2
In [5]:
def url_depth(url):
"""
Takes in a url and calculates depth
e.g. www.guardian.co.uk/a has depth 1, whereas www.guardian.co.uk/a/b has depth 2
url - url of the webpage
"""
parsed_url = urlparse(url)
path = parsed_url.path
return len(list(filter(lambda x: len(x)> 0, path.split('/'))))
url_depths = train.url.map(url_depth)
url_depths_test = test.url.map(url_depth)
assert len(url_depths) == len(train)
assert len(url_depths_test) == len(test)
In [6]:
feature_df = pd.DataFrame({'url_depths': url_depths, 'label': train.label})
feature_df_test = pd.DataFrame({'url_depths': url_depths_test})
Validate the hypothesis that this feature is actually indicative or not.
In [7]:
train.is_news.value_counts() / train.is_news.value_counts().sum()
Out[7]:
In [8]:
test.is_news.value_counts() / test.is_news.value_counts().sum()
Out[8]:
Since the ratio of the news article to other articles is somewhat constant in training and test sets is constant, we have to make sure that this ratio is preserved in the differnt folds we create during the cross-validation so that our dataset is representative of the original set.
In [9]:
params = {
'test_size': 0.2,
'random_state': 2,
'stratify': train.is_news
}
features = ['url_depths']
itrain, itest = train_test_split.tr_ts_split(len(train), **params)
X_train = feature_df.iloc[itrain][features].values
X_test = feature_df.iloc[itest][features].values
y_train = feature_df.iloc[itrain].label.values
y_test = feature_df.iloc[itest].label.values
In [10]:
# cross validation scheme
est = LogisticRegression()
params = {
'n_folds': 3,
'shuffle': True,
'random_state': 3
}
is_news = train.iloc[itrain].is_news
scores, mean_score, std_score = cross_val_scheme.cv_scheme(est, X_train, y_train, is_news, **params)
print('CV Scores: %s'%(scores))
print('Mean CV Score: %f'%(mean_score))
print('Std Cv Scoes: %f'%(std_score))
In [11]:
# performance on the held out test set
est.fit(X_train, y_train)
y_pred = est.predict_proba(X_test)[:, 1]
print('ROC AUC score on the held out set: %f '%(roc_auc_score(y_test, y_pred)))
Private Leaderboard score - 0.54425
In [ ]:
# train on full dataset
est.fit(feature_df[['url_depths']], feature_df.label)
predictions = est.predict_proba(feature_df_test[['url_depths']])[:, 1]
In [12]:
def extract_top_level_domain(url):
"""
Extracts top level domain from a given url
url: Url of the webpage in the dataset
"""
parsed_url = urlparse(url)
top_level = parsed_url[1].split('.')[-1]
return top_level
top_level_domains_train = train.url.map(extract_top_level_domain)
top_level_domains_test = test.url.map(extract_top_level_domain)
assert len(top_level_domains_train) == len(train)
assert len(top_level_domains_test) == len(test)
In [13]:
tld_encode_train, tld_encoded_test = encode_variable(top_level_domains_train, top_level_domains_test)
In [14]:
feature_df['tld'] = tld_encode_train
feature_df_test['tld'] = tld_encoded_test
In [ ]:
params = {
'test_size': 0.2,
'random_state': 2,
'stratify': train.is_news
}
features = ['url_depths', 'tld']
X_train, X_test, y_train, y_test = train_test_split.tr_ts_split(feature_df[features], feature_df['label'], **params)
In [ ]:
# cross validation scheme
est = RandomForestClassifier(n_jobs=-1)
params = {
'n_folds': 3,
'shuffle': True,
'random_state': 3
}
scores, mean_score, std_score = cross_val_scheme.cv_scheme(est, X_train, y_train, train.is_news, **params)
print('CV Scores: %s'%(scores))
print('Mean CV Score: %f'%(mean_score))
print('Std Cv Scoes: %f'%(std_score))
Private Leaderboard Score: 0.61713
In [ ]:
# performance on the held out test set
est.fit(X_train, y_train)
y_pred = est.predict_proba(X_test)[:, 1]
print('ROC AUC score on the held out set: %f '%(roc_auc_score(y_test, y_pred)))
In [ ]:
# train on full dataset
est.fit(feature_df[features], feature_df.label)
predictions = est.predict_proba(feature_df_test[features])[:, 1]
In [15]:
train_is_news, test_is_news = encode_variable(train.is_news, test.is_news)
In [16]:
feature_df['is_news'] = train_is_news
feature_df_test['is_news'] = test_is_news
In [ ]:
params = {
'test_size': 0.2,
'random_state': 2,
'stratify': train.is_news
}
features = ['url_depths', 'tld', 'is_news']
X_train, X_test, y_train, y_test = train_test_split.tr_ts_split(feature_df[features], feature_df['label'], **params)
In [ ]:
# cross validation scheme
est = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1)
params = {
'n_folds': 3,
'shuffle': True,
'random_state': 3
}
scores, mean_score, std_score = cross_val_scheme.cv_scheme(est, X_train, y_train, train.is_news, **params)
print('CV Scores: %s'%(scores))
print('Mean CV Score: %f'%(mean_score))
print('Std Cv Scoes: %f'%(std_score))
Private Leaderboard Score: 0.60854
In [ ]:
# performance on the held out test set
est.fit(X_train, y_train)
y_pred = est.predict_proba(X_test)[:, 1]
print('ROC AUC score on the held out set: %f '%(roc_auc_score(y_test, y_pred)))
In [ ]:
# train on full dataset
est.fit(feature_df[features], feature_df.label)
predictions = est.predict_proba(feature_df_test[features])[:, 1]
In [17]:
alchemy_category_train, alchemy_category_test = encode_variable(train.alchemy_category, test.alchemy_category)
In [18]:
feature_df['alchemy_category'] = alchemy_category_train
feature_df_test['alchemy_category'] = alchemy_category_test
In [ ]:
params = {
'test_size': 0.2,
'random_state': 2,
'stratify': train.is_news
}
features = ['url_depths', 'tld', 'alchemy_category']
X_train, X_test, y_train, y_test = train_test_split.tr_ts_split(feature_df[features], feature_df['label'], **params)
In [ ]:
# cross validation scheme
est = RandomForestClassifier(n_jobs=-1)
params = {
'n_folds': 5,
'shuffle': True,
'random_state': 3
}
scores, mean_score, std_score = cross_val_scheme.cv_scheme(est, X_train, y_train, train.is_news, **params)
print('CV Scores: %s'%(scores))
print('Mean CV Score: %f'%(mean_score))
print('Std Cv Scoes: %f'%(std_score))
Private Leaderboard Score: 0.67329
In [ ]:
# performance on the held out test set
est.fit(X_train, y_train)
y_pred = est.predict_proba(X_test)[:, 1]
print('ROC AUC score on the held out set: %f '%(roc_auc_score(y_test, y_pred)))
In [ ]:
# train on full dataset
est.fit(feature_df[features], feature_df.label)
predictions = est.predict_proba(feature_df_test[features])[:, 1]
In [184]:
def convert_to_json(text):
return json.loads(text)
def extract_body(json_):
return json_['body'].lower() if json_['body'] else u''
def remove_stopwords(text):
return ' '.join([word for word in text.split(' ') if word not in ENGLISH_STOP_WORDS])
converted_json_train = train.boilerplate.map(convert_to_json)
converted_json_test = test.boilerplate.map(convert_to_json)
body_train = list(map(extract_body, converted_json_train))
body_test = list(map(extract_body, converted_json_test))
body_train = list(map(remove_stopwords, body_train))
body_test = list(map(remove_stopwords, body_test))
In [198]:
def stem_words(sentence, stemmer):
return ' '.join([stemmer.stem(word) for word in sentence.split(' ')])
def stemming(sentences, stemmer):
return [stem_words(sentence, stemmer) for sentence in sentences]
body_train = stemming(body_train, sn_stem)
body_test = stemming(body_test, sn_stem)
In [199]:
tfidf = TfidfVectorizer(strip_accents='unicode')
tfidf.fit(body_train)
body_train_tfidf = tfidf.transform(body_train)
body_test_tfidf = tfidf.transform(body_test)
This takes considerable amount of memory resource to generate these tfidf features so it is advisable to dump them once you have computed them.
In [200]:
store('train_text_features_stemmed', body_train_tfidf)
store('test_text_features_stemmed', body_test_tfidf)
In [19]:
body_train_tfidf = load('train_text_features_removed')
body_test_tfidf = load('test_text_features_removed')
In [201]:
print('Number of features captured in the body of the boilerplate: %d'%body_train_tfidf.shape[1])
In [202]:
# make sure number of features captured in both training and test dataset are same
assert(body_train_tfidf.shape[1] == body_test_tfidf.shape[1])
85131 feature are way too many features, so we have to bring down the dimensionality of the problem.
In [203]:
svd = SVD(n_components=100)
svd.fit(body_train_tfidf)
features_train = svd.transform(body_train_tfidf)
features_test = svd.transform(body_test_tfidf)
In [204]:
print('Reduced number of features are: %d'%features_train.shape[1])
In [205]:
params = {
'test_size': 0.2,
'random_state': 2,
'stratify': train.is_news
}
itrain, itest = train_test_split.tr_ts_split(len(train), **params)
X_train = features_train[itrain]
X_test = features_train[itest]
y_train = feature_df.iloc[itrain].label
y_test = feature_df.iloc[itest].label
In [206]:
# cross validation scheme
est = LogisticRegression(C=1.)
params = {
'n_folds': 5,
'shuffle': True,
'random_state': 3
}
is_news = train.iloc[itrain].is_news
scores, mean_score, std_score = cross_val_scheme.cv_scheme(est, X_train, y_train.values, is_news, **params)
print('CV Scores: %s'%(scores))
print('Mean CV Score: %f'%(mean_score))
print('Std Cv Scoes: %f'%(std_score))
Private Leaderboard Score: 0.86825 ( after stemming )
In [207]:
# performance on the held out test set
est.fit(X_train, y_train.values)
y_pred = est.predict_proba(X_test)[:, 1]
print('ROC AUC score on the held out set: %f '%(roc_auc_score(y_test, y_pred)))
In [208]:
# train on full dataset
est.fit(features_train, feature_df.label.values)
predictions = est.predict_proba(features_test)[:, 1]
How to combine textual features with numerical features?
Use numpy.hstack to concatenate textual features with numerical features
In [86]:
features_concat_train = np.hstack((features_train, feature_df[features].values))
features_concat_test = np.hstack((features_test, feature_df_test[features].values))
In [116]:
X_train = features_concat_train[itrain]
X_test = features_concat_train[itest]
y_train = feature_df.iloc[itrain].label.values
y_test = feature_df.iloc[itest].label.values
In [117]:
min_max = MinMaxScaler()
min_max.fit(X_train)
# scaler = StandardScaler()
# scaler.fit(X_train)
X_train = min_max.transform(X_train)
X_test = min_max.transform(X_test)
In [124]:
# cross validation scheme
est = LogisticRegression(C=.8)
params = {
'n_folds': 5,
'shuffle': True,
'random_state': 3
}
is_news = train.iloc[itrain].is_news
scores, mean_score, std_score = cross_val_scheme.cv_scheme(est, X_train, y_train, is_news, **params)
print('CV Scores: %s'%(scores))
print('Mean CV Score: %f'%(mean_score))
print('Std Cv Scoes: %f'%(std_score))
Private Leaderboard score: 0.78387
In [125]:
# performance on the held out test set
est.fit(X_train, y_train)
y_pred = est.predict_proba(X_test)[:, 1]
print('ROC AUC score on the held out set: %f '%(roc_auc_score(y_test, y_pred)))
In [126]:
# train on full dataset
min_max = MinMaxScaler()
min_max.fit(features_concat_train)
features_concat_train = min_max.transform(features_concat_train)
feature_concat_test = min_max.transform(features_concat_test)
est.fit(features_concat_train, feature_df.label.values)
predictions = est.predict_proba(features_concat_test)[:, 1]
In [209]:
sample_sub['label'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/text_features_100_features_stemmed.csv'), index=False)
In [ ]: