In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15,5)
In [16]:
NYT_train_raw = pd.read_csv("NYTimesBlogTrain.csv")
NYT_test_raw = pd.read_csv("NYTimesBlogTest.csv")
Join the train and test data for preprocessing:
In [17]:
print('Max train ID: %d. Max test ID: %d' % (np.max(NYT_train_raw['UniqueID']), np.max(NYT_test_raw['UniqueID'])))
# The outer merge effectively concatenates the two sets, since their UniqueID ranges don't overlap
joined = NYT_train_raw.merge(NYT_test_raw, how='outer')
Create additional features:
In [18]:
# Flag headlines containing a question mark or an exclamation mark
joined['QorE'] = joined['Headline'].str.contains(r'\!|\?').astype(int)
# Flag "Q. and A." headlines
joined['Q&A'] = joined['Headline'].str.contains(r'Q\. and A\.').astype(int)
Convert "PubDate" into two columns: Weekday and Hour:
In [19]:
joined['PubDate'] = pd.to_datetime(joined['PubDate'])
joined['Weekday'] = joined['PubDate'].dt.weekday
joined['Hour'] = joined['PubDate'].dt.hour
In [20]:
print("At the moment, we have %d entries with NewsDesk=Nan." % len(joined.loc[joined['NewsDesk'].isnull()]))
Below are the results of one day of searching for meaningful patterns in the data. There are a few easily identifiable groups of posts, most of which lead to zero popularity; they are relabelled in the cells below.
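One quick way to check which groups really are unpopular is to look at the share of popular posts per NewsDesk on the labelled training rows, for instance (a small sketch, not part of the original notebook):
In [ ]:
# Sketch: share of popular posts and group size per NewsDesk, training rows only
# ('Popular' is NaN for the test rows that came in through the outer merge)
joined.loc[joined['Popular'].notnull()].groupby('NewsDesk')['Popular'].agg(['mean', 'count'])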
As ask788 pointed out in this thread, the problem often lies in the structure of the data rather than in the models we apply to it. Ideally this feature engineering would have been done automatically, but I am a novice and had to tediously plod through the rows of data manually.
You can browse individual features that I selected by printing the head() of a subset, like so:
In [21]:
joined.loc[(joined['NewsDesk'] == 'Foreign') & (joined['SectionName'].isnull())].head()
Out[21]:
In [22]:
# Relabel a few easily identifiable groups:
# Styles posts without a section are reassigned to TStyle
joined.loc[(joined['NewsDesk'] == 'Styles') & (joined['SectionName'].isnull()), 'NewsDesk'] = 'TStyle'
# Foreign posts without a section, and headlines starting with a four-digit year (1xxx), are treated as historical pieces
joined.loc[(joined['NewsDesk'] == 'Foreign') & (joined['SectionName'].isnull()), 'NewsDesk'] = 'History'
joined.loc[(joined['NewsDesk'].isnull()) & (joined['Headline'].str.contains(r'^1[0-9]{3}')), 'NewsDesk'] = 'History'
# Recurring daily rubrics: clip report, politics digest, reading list, first draft, education, pictures of the day
joined.loc[(joined['NewsDesk'].isnull()) & (joined['Headline'] == 'Daily Clip Report'), 'NewsDesk'] = 'Daily Rubric'
joined.loc[joined['NewsDesk'] == 'Daily Rubric', 'SectionName'] = 'Clip Report'
joined.loc[(joined['NewsDesk'].isnull()) & (joined['Headline'] == 'Today in Politics'), 'SectionName'] = 'Today in Politics'
joined.loc[joined['SectionName'] == 'Today in Politics', 'NewsDesk'] = 'Daily Rubric'
joined.loc[(joined['NewsDesk'].isnull()) & (joined['Headline'].str.contains(r'what we\'re reading', case=False)), 'SectionName'] = 'What we\'re reading'
joined.loc[joined['SectionName'] == 'What we\'re reading', 'NewsDesk'] = 'Daily Rubric'
joined.loc[(joined['NewsDesk'].isnull()) & (joined['Headline'].str.contains(r'first draft', case=False)), 'SectionName'] = 'First draft'
joined.loc[joined['SectionName'] == 'First draft', 'NewsDesk'] = 'Daily Rubric'
joined.loc[(joined['NewsDesk'].isnull()) & (joined['SubsectionName'] == 'Education'), 'NewsDesk'] = 'Daily Rubric'
joined.loc[(joined['Headline'].str.contains('pictures of the day|week in pictures', case=False)), 'NewsDesk'] = 'Daily Rubric'
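To see how large each of these groups is after the reassignment, you can simply count the desks (a quick check, not part of the original notebook):
In [ ]:
# Quick check: how many posts each NewsDesk now holds (NaN = still unassigned)
joined['NewsDesk'].value_counts(dropna=False)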
Filling the gaps in NewsDesk, SectionName and SubsectionName.
In [23]:
section_to_newsdesk = {'Business Day': 'Business', 'Crosswords/Games': 'Business', 'Technology': 'Business',
'Arts': 'Culture',
'World': 'Foreign',
'Magazine': 'Magazine',
'N.Y. / Region': 'Metro',
'Opinion': 'OpEd',
'Travel': 'Travel',
'Multimedia': 'Multimedia',
'Open': 'Open'}
section_to_subsection = {'Crosswords/Games': 'Crosswords/Games',
'Technology': 'Technology'}
newsdesk_to_section = {'TStyle': 'TStyle',
'Culture': 'Arts',
'OpEd': 'Opinion',
'History': 'History'}
newsdesk_to_subsection = {'TStyle': 'TStyle',
'Culture': 'Arts',
'Daily Rubric': 'Rubric',
'Magazine': 'Magazine',
'Metro': 'Metro',
'Multimedia': 'Multimedia',
'OpEd': 'OpEd',
'Science': 'Science',
'Sports': 'Sports',
'Styles': 'Styles',
'Travel': 'Travel',
'History': 'History'}
# Fill missing NewsDesk/SubsectionName from SectionName, and missing
# SectionName/SubsectionName from NewsDesk, using the mappings above.
for sec in set(joined['SectionName']):
    if sec in section_to_newsdesk:
        joined['NewsDesk'].fillna(joined.loc[joined['SectionName'] == sec, 'NewsDesk'].fillna(section_to_newsdesk[sec]), inplace=True)
    if sec in section_to_subsection:
        joined['SubsectionName'].fillna(joined.loc[joined['SectionName'] == sec, 'SubsectionName'].fillna(section_to_subsection[sec]), inplace=True)
for nd in set(joined['NewsDesk']):
    if nd in newsdesk_to_section:
        joined['SectionName'].fillna(joined.loc[joined['NewsDesk'] == nd, 'SectionName'].fillna(newsdesk_to_section[nd]), inplace=True)
    if nd in newsdesk_to_subsection:
        joined['SubsectionName'].fillna(joined.loc[joined['NewsDesk'] == nd, 'SubsectionName'].fillna(newsdesk_to_subsection[nd]), inplace=True)
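The fillna(Series) pattern above works thanks to index alignment: only the rows picked out by the section/desk mask receive the mapped value, everything else keeps its NaN. A toy illustration of the same pattern (hypothetical data, not the NYT set):
In [ ]:
# Toy illustration of index-aligned fillna (hypothetical data)
# (mirrors the chained-fillna pattern above; with pandas copy-on-write you would assign back instead)
demo = pd.DataFrame({'desk': [None, 'OpEd', None],
                     'section': ['Arts', 'Opinion', 'World']})
# Fill the missing desk only where section == 'Arts'; the 'World' row keeps its NaN
demo['desk'].fillna(demo.loc[demo['section'] == 'Arts', 'desk'].fillna('Culture'), inplace=True)
demo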
Filling even more gaps with some clustering. I built a TF-IDF matrix from the abstracts of the remaining unlabelled entries and ran Ward (agglomerative) clustering on it. Four of the six clusters looked meaningful and fitted well into the existing NewsDesk/S(ubs)ectionName categories.
In [24]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
# Rows that still have no NewsDesk: build TF-IDF features from their abstracts
nans = joined.loc[joined['NewsDesk'].isnull()]
words = nans['Abstract'].astype(str).tolist()
tfv = TfidfVectorizer(min_df=0.005, max_features=None,
strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
stop_words = 'english')
X_tr = tfv.fit_transform(words)
ward = AgglomerativeClustering(n_clusters=6,
linkage='ward').fit(X_tr.toarray())
joined.loc[joined['NewsDesk'].isnull(), 'cluster'] = ward.labels_
cluster_to = {}
cluster_to['NewsDesk'] = {4: 'Metro', 3: 'National', 2: 'Foreign', 1: 'National'}
cluster_to['SectionName'] = {4: 'N.Y. / Region', 3: 'U.S.', 2: 'Not_Asia', 1: 'U.S.'}
cluster_to['SubsectionName'] = {4: 'NYT', 3: 'Politics', 2: 'Not_Asia', 1: 'Politics'}
# The cluster labels were only assigned to rows with a missing NewsDesk,
# so selecting on the cluster label alone picks out exactly those rows
for col in cluster_to:
    for label in cluster_to[col]:
        joined.loc[joined['cluster'] == label, col] = cluster_to[col][label]
You can see what these clusters look like by typing:
In [26]:
joined.loc[joined['cluster'] == 3].head()
Out[26]:
Finally, use a few (6) obvious keywords to categorise the data even more. After this, we are left with 950 entries where NewsDesk, SectionName and SubsectionName are NaN; I had no good idea of how to deal with those.
In [ ]:
joined.drop('cluster', axis=1, inplace=True)
In [29]:
keywords = {}
keywords['clinton|white house|obama'] = {'NewsDesk': 'National', 'SectionName': 'U.S.', 'SubsectionName': 'Politics'}
keywords['isis|iraq'] = {'NewsDesk': 'Foreign', 'SectionName': 'Not_Asia', 'SubsectionName': 'Not_Asia'}
keywords['york'] = {'NewsDesk': 'Metro', 'SectionName': 'N.Y. / Region', 'SubsectionName': 'N.Y. / Region'}
# For rows still missing a NewsDesk, assign desk/section/subsection when the abstract matches a keyword pattern
for pattern in keywords:
    indices = (joined['NewsDesk'].isnull()) & (joined['Abstract'].str.contains(pattern, case=False))
    for col in keywords[pattern]:
        joined.loc[indices, col] = keywords[pattern][col]
In [30]:
print("Now we have %d entries with NewsDesk=Nan." % len(joined.loc[joined['NewsDesk'].isnull()]))
In [31]:
from sklearn.feature_extraction import DictVectorizer
def categorizeDF(df):
    '''One-hot encode NewsDesk, SectionName and SubsectionName with DictVectorizer.'''
    old_columns = df.columns
    cat_cols = ['NewsDesk', 'SectionName', 'SubsectionName']
    temp_dict = df[cat_cols].to_dict(orient="records")
    vec = DictVectorizer()
    vec_arr = vec.fit_transform(temp_dict).toarray()
    # convert_objects is deprecated in newer pandas; pd.to_numeric would be the modern replacement
    new_df = pd.DataFrame(vec_arr).convert_objects(convert_numeric=True)
    new_df.index = df.index
    new_df.columns = vec.get_feature_names()
    columns_to_add = [col for col in old_columns if col not in cat_cols]
    new_df[columns_to_add] = df[columns_to_add]
    # Rows whose categories were still NaN produce numeric columns named after the bare
    # column ('NewsDesk', 'SectionName', 'SubsectionName'); drop those
    new_df.drop(cat_cols, inplace=True, axis=1)
    return new_df
joined_cat = categorizeDF(joined)
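DictVectorizer names the encoded columns '&lt;column&gt;=&lt;category&gt;' (e.g. 'NewsDesk=Business'), so you can inspect the result directly, for instance:
In [ ]:
# Sketch: shape of the encoded frame and the first few one-hot column names
print(joined_cat.shape)
joined_cat.columns[:10]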
In [32]:
# UniqueIDs up to 6532 belong to the training set, larger IDs to the test set (see the ID check above)
train = joined_cat[joined_cat['UniqueID'] <= 6532]
test = joined_cat[joined_cat['UniqueID'] > 6532]
In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
# Use every engineered column except the raw text, date, ID and target fields (and the Q&A flag)
Xcols = [c for c in train.columns if c not in ('Headline', 'Snippet', 'Abstract', 'PubDate', 'UniqueID', 'Popular', 'Q&A')]
y = train['Popular']
forest = RandomForestClassifier(n_estimators=7000, max_features=0.1, min_samples_split=24, random_state=33, n_jobs=3)
forest.fit(train[Xcols], y)
probsRF = forest.predict_proba(test[Xcols])[:,1]
print("10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(forest, train[Xcols], y, cv=10, scoring='roc_auc')))
In [35]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
Xcols = [c for c in train.columns if c not in ('Headline', 'Snippet', 'Abstract', 'PubDate', 'UniqueID', 'Popular', 'Q&A')]
y = train['Popular']
est = GradientBoostingClassifier(n_estimators=3000,
learning_rate=0.005,
max_depth=4,
max_features=0.3,
min_samples_leaf=9,
random_state=33)
est.fit(train[Xcols], y)
probsGBC = est.predict_proba(test[Xcols])[:,1]
print("10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(est, train[Xcols], y, cv=10, scoring='roc_auc')))
Define a function for cross-validating and plotting an ensemble of my two models:
In [50]:
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
def calculate_ensemble_score(model1, model2, Xcols, ycol, dataset, cv=10):
    '''Calculates the score for various weights of two models in an ensemble'''
    num_points = 21
    score_arr = np.zeros((cv, num_points))
    kf = cross_validation.KFold(len(dataset), cv, shuffle=True)
    for i, (train_idx, test_idx) in enumerate(kf):
        # KFold yields positional indices, so use iloc rather than label-based indexing
        fold_train, fold_test = dataset.iloc[train_idx], dataset.iloc[test_idx]
        model1.fit(fold_train[Xcols], fold_train[ycol])
        probs1 = model1.predict_proba(fold_test[Xcols])[:, 1]
        model2.fit(fold_train[Xcols], fold_train[ycol])
        probs2 = model2.predict_proba(fold_test[Xcols])[:, 1]
        for wg in range(num_points):
            w = wg / float(num_points - 1)
            probs = w * probs1 + (1 - w) * probs2
            score_arr[i][wg] = roc_auc_score(fold_test[ycol], probs)
    return np.mean(score_arr, axis=0)

def plot_ensemble_score(scores):
    import seaborn as sbs  # imported for its nicer default plot styling
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.axhline(y=scores[0], linestyle='--', color='red')
    ax.axhline(y=scores[-1], linestyle='--', color='green')
    ax.text(0.03, scores[0] + 0.00001, "Pure GBM", verticalalignment='bottom', horizontalalignment='left', color='red', size='larger')
    ax.text(0.98, scores[-1] + 0.00001, "Pure RF", verticalalignment='bottom', horizontalalignment='right', color='green', size='larger')
    ax.plot(np.linspace(0, 1, len(scores)), scores)
    ax.set_xlabel("RF model weight")
    ax.set_ylabel("AUC")
    ax.set_title("Choosing the weights for two models in an ensemble (10-fold cross-validation)")
    return fig
In [37]:
means = calculate_ensemble_score(forest, est, Xcols, 'Popular', train)
In [51]:
myplot = plot_ensemble_score(means)
myplot.savefig("AUC.png", dpi=300)
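The final submission below blends the two models with fixed weights: 0.6 for the gradient boosting probabilities and 0.4 for the random forest. You could also read the best RF weight straight off the cross-validated means (a small sketch, not part of the original notebook):
In [ ]:
# Sketch: locate the RF weight with the highest mean CV AUC
weights = np.linspace(0, 1, len(means))
print("Best RF weight: %.2f (mean CV AUC %.5f)" % (weights[np.argmax(means)], means.max()))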
In [54]:
# Final blend: 60% gradient boosting, 40% random forest
test['Popular'] = 0.6*probsGBC + 0.4*probsRF
test['UniqueID'] = test['UniqueID'].astype(int)
test.to_csv('preds.csv', columns=['UniqueID', 'Popular'], header=['UniqueID', 'Probability1'], index=False)
Kaggle: 0.93613.