In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
# import gc, mlcrate, glob
# from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor
from IPython.display import display
# from catboost import CatBoostClassifier
# from scipy.cluster import hierarchy as hc
# from collections import Counter
# from sklearn import metrics
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import roc_auc_score, log_loss
# from sklearn.model_selection import KFold, StratifiedKFold
# from sklearn.model_selection import GridSearchCV
# from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
# from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
# from sklearn.cluster import KMeans
# from sklearn.metrics import accuracy_score, log_loss
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.neural_network import MLPClassifier
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# will ignore all warning from sklearn, seaborn etc..
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass


# Monkey-patch the warnings module so later ``warnings.warn(...)`` calls are silenced.
warnings.warn = ignore_warn

# ``pd.option_context`` only builds a context manager; used outside a ``with`` block it
# changes nothing. ``pd.set_option`` actually applies the display limits.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
In [3]:
PATH = os.getcwd()
PATH
Out[3]:
In [4]:
!dir {PATH}
In [5]:
# Paths are built with os.path.join so the notebook is not tied to the Windows "\" separator.
# NOTE(review): dtype='float32' is applied to every column; float32 holds integers exactly
# only up to 2**24, so confirm that no identifier-like column exceeds that.
df_raw = pd.read_csv(os.path.join(PATH, 'train_new_agg_feats.csv'), low_memory=False, dtype='float32')
df_test = pd.read_csv(os.path.join(PATH, 'test_new_agg_feats.csv'), low_memory=False, dtype='float32')
In [6]:
def display_all(df):
    """Display ``df`` with pandas' row and column display limits temporarily set to 100."""
    limits = ("display.max_rows", 100, "display.max_columns", 100)
    with pd.option_context(*limits):
        display(df)
def make_submission(probs):
    """Return a copy of the sample submission with its ``Upvotes`` column set to ``probs``.

    ``sample_submission.csv`` is read from the notebook-level ``PATH``. The file path is
    built with ``os.path.join`` so it resolves on any operating system, not only where
    the backslash is the path separator.
    """
    sample = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
    submit = sample.copy()
    submit['Upvotes'] = probs
    return submit
In [7]:
# Show (rows, columns). The stray trailing comma wrapped the shape in a one-element tuple.
df_raw.shape
Out[7]:
In [8]:
# Number of columns per dtype. DataFrame.get_ftype_counts() was deprecated in pandas 0.23
# and later removed; dtypes.value_counts() is the replacement.
df_raw.dtypes.value_counts()
Out[8]:
The datasets are fairly clean and mostly numeric, so don't forget to try gplearn (a genetic-programming module), plus extra features built from arithmetic combinations (+, -, *, /) of the existing columns.
In [9]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))
In [10]:
# df_raw['target'] = np.exp(target) - 1
# df_raw.to_csv(f'{PATH}\\train_new_agg_feats.csv', index=False)
# df_test.to_csv(f'{PATH}\\test_new_agg_feats.csv', index=False)
In [11]:
# Distinct usernames present in each split.
man_train_list = df_raw['Username'].unique()
man_test_list = df_test['Username'].unique()
# Usernames that occur in exactly one of the two splits.
man_not_in_test = set(man_train_list).difference(man_test_list)
man_not_in_train = set(man_test_list).difference(man_train_list)
In [12]:
# Drop training rows whose Username never occurs in the test set. The rows are selected
# through the Username column itself: ``df_raw.loc[list(man_not_in_test)]`` treated the
# usernames as row *index labels*, which removes unrelated rows or raises KeyError.
rows_to_drop = df_raw.index[df_raw.Username.isin(man_not_in_test)]
df_raw.drop(index=rows_to_drop, inplace=True)
# NOTE(review): the stored `target` column is shifted back by 1 here — confirm the offset
# against how that column was written to the CSV.
target = df_raw.target.values - 1
df_raw.drop('target', axis=1, inplace=True)
In [34]:
# Primitive operations the symbolic transformer is allowed to combine.
function_set = [
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg', 'inv',
    'min', 'max',
]
gp = SymbolicTransformer(
    generations=20,
    population_size=3000,
    hall_of_fame=100,
    n_components=10,
    function_set=function_set,
    parsimony_coefficient=0.005,
    max_samples=0.9,
    verbose=1,
    random_state=123,
    n_jobs=-1,
)
In [35]:
gp.fit(df_raw, target)
Out[35]:
In [21]:
# Features generated by the fitted symbolic transformer for each split.
gp_feat_eng_train = gp.transform(df_raw)
gp_feat_eng_test = gp.transform(df_test)
# Append the generated columns to the right of the existing features.
ext_train = np.concatenate((df_raw, gp_feat_eng_train), axis=1)
ext_test = np.concatenate((df_test, gp_feat_eng_test), axis=1)
In [28]:
# Hyperparameters are passed by keyword. The first two used to be positional
# (max_depth, learning_rate), which is easy to misread and is rejected by XGBoost
# releases whose estimator constructors are keyword-only.
# NOTE(review): `silent` is kept as in the original; newer XGBoost uses `verbosity` instead.
my_xgb = xgb.XGBRegressor(max_depth=8, learning_rate=0.01, n_jobs=-1, colsample_bytree=0.9, gamma=0.5, silent=False)
In [29]:
my_xgb.fit(ext_train, target)
Out[29]:
In [30]:
xgb_preds = my_xgb.predict(ext_test)
In [31]:
xgb_preds
Out[31]:
In [32]:
submit = make_submission(xgb_preds)
In [33]:
# Portable path instead of a hard-coded "\" separator; index=False states explicitly
# that the row index is not written.
submit.to_csv(os.path.join(PATH, 'xgb_v1.csv'), index=False)
In [38]:
min(xgb_preds), max(xgb_preds)
Out[38]:
In [42]:
sns.distplot(np.log(target + 1))
Out[42]:
In [43]:
sns.distplot(np.log(xgb_preds + 1))
Out[43]:
In [51]:
# 90th..99th percentiles of the target, computed once (the original evaluated the same
# percentile vector twice) and summarised by its smallest and largest value.
upper_percentiles = np.percentile(target, [90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
upper_percentiles.min(), upper_percentiles.max()
Out[51]:
In [47]:
np.percentile(xgb_preds,[90,91,92,93,94,95,96,97,98,99])
Out[47]:
In [49]:
np.where(xgb_preds>3313,3313,xgb_preds)
Out[49]:
In [50]:
min(np.where(xgb_preds>3313,3313,xgb_preds)), max(np.where(xgb_preds>3313,3313,xgb_preds))
Out[50]:
In [52]:
# Cap every prediction at 3313; values at or below the cap are kept as they are.
xgb_preds_threshold = np.minimum(xgb_preds, 3313)
In [53]:
submit = make_submission(xgb_preds_threshold)
# NOTE(review): the file name says 3133 while the cap applied to the predictions is 3313;
# the name is left unchanged here — rename it if nothing depends on it.
# The path is built with os.path.join instead of a hard-coded "\" separator.
submit.to_csv(os.path.join(PATH, 'xgb_v2_thresholding_at_3133.csv'), index=False)
In [13]:
# temp1 = df_raw.groupby('Username').count().iloc[:,-1]
# temp2 = df_test.groupby('Username').count().iloc[:,-1]
# df_man = pd.concat([temp1,temp2], axis = 1, join = 'outer')
# df_man.columns = ['train_count','test_count']
# df_man.head(2)
In [14]:
# man_list = df_man['train_count'].sort_values(ascending = False).index
# ixes = df_raw.Username.isin(man_list)
# df10000 = df_raw[ixes][['Username','Tag']]
# tags_dummies = pd.get_dummies(df10000.Tag)
# df10000 = pd.concat([df10000,tags_dummies[['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x']]], axis = 1).drop('Tag', axis = 1)
# # print("The contributors account for {} entries\n".format(len(df10000)))
# # print(df10000.head(10))
In [15]:
# df10000.groupby('Username').count().sort_values(by = 'a', ascending = False).head()
In [21]:
# Per-user aggregates, broadcast back onto every row of that user.
# Source columns are selected by name, so the result no longer depends on the exact number
# of columns in the frame (the old positional ``iloc[:, :-5]``) or on ``mean()`` silently
# skipping non-numeric columns. ``transform`` replaces the five repeated sorts and the
# hand-built username -> value dictionaries.
by_user = df_raw.groupby('Username')
user_aggs = {
    # Number of rows (non-null IDs) per user.
    'agg_count': by_user['ID'].transform('count'),
    # Per-user means, truncated to integers exactly as before.
    'agg_answers': by_user['Answers'].transform('mean').astype('int64'),
    'agg_views': by_user['Views'].transform('mean').astype('int64'),
    'agg_repo': by_user['Reputation'].transform('mean').astype('int64'),
}
# Assign only after every aggregate has been computed from the original columns.
for agg_name, agg_values in user_aggs.items():
    df_raw[agg_name] = agg_values
In [22]:
# Per-user aggregates for the test frame, broadcast back onto every row of that user.
# Source columns are selected by name, so the result no longer depends on the exact number
# of columns in the frame (the old positional ``iloc[:, :-4]``) or on ``mean()`` silently
# skipping non-numeric columns. ``transform`` replaces the five repeated sorts and the
# hand-built username -> value dictionaries.
by_user = df_test.groupby('Username')
user_aggs = {
    # Number of rows (non-null IDs) per user.
    'agg_count': by_user['ID'].transform('count'),
    # Per-user means, truncated to integers exactly as before.
    'agg_answers': by_user['Answers'].transform('mean').astype('int64'),
    'agg_views': by_user['Views'].transform('mean').astype('int64'),
    'agg_repo': by_user['Reputation'].transform('mean').astype('int64'),
}
# Assign only after every aggregate has been computed from the original columns.
for agg_name, agg_values in user_aggs.items():
    df_test[agg_name] = agg_values
df_test.head(3)
Out[22]:
In [49]:
add_trans = ['Reputation', 'Answers', 'Username', 'Views', 'agg_count', 'agg_answers', 'agg_views', 'agg_repo']
# The same feature engineering is applied to both frames, so it is written once.
for frame in (df_raw, df_test):
    for col in add_trans:
        # log1p(x) == log(x + 1), so zeros stay finite. The f-string already builds the
        # column name; the extra ``.format(col)`` call was redundant.
        frame[f'log_trans_{col}'] = np.log1p(frame[col])
    frame['repo_per_Answers'] = frame['Reputation'] / (frame['Answers'] + 1)
    # Views has no +1 guard, so a zero would yield +/-inf. Map those to NaN so ordinary
    # missing-value handling can deal with them; finite ratios are unchanged.
    frame['repo_per_Views'] = (frame['Reputation'] / frame['Views']).replace([np.inf, -np.inf], np.nan)
df_raw.shape, df_test.shape
Out[49]:
In [24]:
# gby = pd.concat([df10000.groupby('Username').mean(),df10000.groupby('Username').count()], axis = 1).iloc[:,:-9]
# gby.columns = ['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x', 'count']
# gby.sort_values(by = 'count', ascending = False).head(3)[['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x', 'count']]
In [25]:
# gby.sort_values(by = 'count', ascending = False).drop('count', axis = 1).plot(kind = 'bar', stacked = True, figsize = (15,6))
# plt.figure()
# gby.sort_values(by = 'count', ascending = False)['count'].plot(kind = 'bar', figsize = (15,6));
In [26]:
# pd.concat([df_raw['Tag'].value_counts().sort_values(ascending=False),df_test['Tag'].value_counts().sort_values(ascending=False)],sort=False, axis =1,\
# keys=['Train_Stats', 'Test_Stats'])
In [27]:
# gby.shape
In [28]:
# gby['skill'] = gby['r']*1 + gby['o']*2 + gby['h']*3 + gby['s']*4 + gby['a']*5 + gby['i']*6 + gby['p']*7 + gby['j']*8 \
# + gby['c']*9
In [32]:
# Train on log(Upvotes + 2); predictions have to be mapped back with np.exp afterwards
# (the exact inverse of this transform is exp(pred) - 2).
df_raw['Upvotes'] = np.log(df_raw['Upvotes'] + 2)
target = df_raw['Upvotes'].values
In [34]:
# Identifier column(s) removed from both frames; the training frame also loses its target.
drop_cols = ['ID']
df_raw.drop(columns=drop_cols + ['Upvotes'], inplace=True)
df_test.drop(columns=drop_cols, inplace=True)
In [35]:
sns.distplot(target)
Out[35]:
In [37]:
# Make Tag a pandas categorical in the training frame.
df_raw.Tag = df_raw.Tag.astype('category')
# fastai helpers: presumably train_cats turns df_raw's string columns into categoricals and
# apply_cats re-uses df_raw's category mapping on df_test — verify against fastai.structured.
train_cats(df_raw);
apply_cats(df_test, df_raw);
# Replace the categorical Tag with its integer codes in both frames.
df_raw.Tag = df_raw.Tag.cat.codes
df_test.Tag = df_test.Tag.cat.codes
In [40]:
df_raw.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)
In [51]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df_raw, target)
Out[51]:
In [66]:
# print('Before -->>', df_raw.shape)
# df_raw.drop(index = df_raw.loc[list(man_not_in_test)].index, inplace=True)
# print('After -->>', df_raw.shape)
In [55]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df_raw, target)
m.score(df_raw,target)
Out[55]:
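TODO: define $R^2$ (the coefficient of determination): $R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$, i.e. how much better the model is than always predicting the mean.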
Wow, an r^2 of 0.9699 - that's great, right? Well, perhaps not...
Possibly the most important idea in machine learning is that of having separate training & validation data sets
In [58]:
from sklearn.model_selection import train_test_split
# A random train_test_split(df_raw, target, test_size=0.2, random_state=42) was executed
# here, but every name it produced (X_train, X_valid, y_train, y_valid) is reassigned by
# the ordered split_vals split later in this cell. The call only cost a shuffled copy of
# the data and suggested a random validation split that is never used, so it is dropped.
# The import is kept.
def split_vals(a, n):
    """Split ``a`` at position ``n`` and return independent copies ``(a[:n], a[n:])``."""
    head = a[:n]
    tail = a[n:]
    return head.copy(), tail.copy()
# Hold out the last n_valid rows (in frame order) as the validation set.
n_valid = 30000
n_trn = df_raw.shape[0] - n_valid
# raw_* and X_* are both produced from df_raw at the same boundary.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df_raw, n_trn)
y_train, y_valid = split_vals(target, n_trn)
tuple(part.shape for part in (X_train, y_train, X_valid))
Out[58]:
In [59]:
def rmse(x, y):
    """Root of the mean squared difference between ``x`` and ``y``, as a Python float."""
    residual = x - y
    mean_squared = (residual ** 2).mean()
    return math.sqrt(mean_squared)
def print_score(m):
    """Print error and fit metrics of the fitted model ``m`` on both data splits.

    Reads the notebook-level ``X_train``/``y_train``/``X_valid``/``y_valid`` and uses
    ``rmse``. Reports the RMSE (labelled RMSLE, as the notebook trains on a
    log-transformed target) and ``m.score`` for each split, plus the out-of-bag score
    when the model exposes ``oob_score_``. Returns ``None``.
    """
    res = [
        ('RMSLE X_train', rmse(m.predict(X_train), y_train)),
        ('RMSLE X_valid', rmse(m.predict(X_valid), y_valid)),
        ('R**2 Train', m.score(X_train, y_train)),
        ('R**2 Valid', m.score(X_valid, y_valid)),
    ]
    if hasattr(m, 'oob_score_'):
        res.append(('OOB_Score', m.oob_score_))
    # One metric per line. Printing the list itself rendered the "\n" separators as
    # literal escapes and nested the OOB entry as a list inside the list.
    for label, value in res:
        print(f'{label}: {value}')
In [60]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
It's pathetic: we are clearly overfitting. Have a look at the RM(L)SE scores and the accuracy (R²) on the training and validation sets — they are way too far apart.
In [102]:
# Refit on the full training data before predicting the test set. `df` and `y` are not
# defined anywhere in the visible notebook; `df_raw` / `target` are the feature frame and
# log-target the other cells train on.
m.fit(df_raw, target)
Out[102]:
In [61]:
# The model is trained on log(Upvotes + 2), so the inverse is exp(pred) - 2; subtracting
# only 1 left every prediction one upvote too high. The integer truncation is kept as is.
preds = np.exp(m.predict(df_test)).astype('int32') - 2
preds
Out[61]:
In [62]:
submit = make_submission(preds)
# Portable path instead of a hard-coded "\" separator.
submit.to_csv(os.path.join(PATH, 'Adi_rf_08_58_31-07-2018.csv'), index=False)
submit.head(2)
Out[62]:
In [65]:
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [66]:
draw_tree(m.estimators_[0], df_raw, precision=3)
In [67]:
m = RandomForestRegressor(n_estimators=1, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [112]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [113]:
preds = np.stack([t.predict(X_valid) for t in m.estimators_])
preds[:,0], np.mean(preds[:,0]), y_valid[0]
Out[113]:
In [114]:
preds.shape
Out[114]:
In [115]:
# Validation R**2 of the running ensemble mean after 1, 2, ... trees. The range follows the
# number of per-tree prediction rows in `preds` instead of assuming exactly 10 estimators.
plt.plot([metrics.r2_score(y_valid, np.mean(preds[:i+1], axis=0)) for i in range(len(preds))]);
The shape of this curve suggests that adding more trees isn't going to help us much
In [116]:
m = RandomForestRegressor(n_estimators=20, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [117]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [119]:
m = RandomForestRegressor(n_estimators=80, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [122]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
In [125]:
X_valid.shape, X_train.shape
Out[125]:
In [128]:
# NOTE(review): an earlier cell removes 'Upvotes' from df_raw
# (df_raw.drop(drop_cols+['Upvotes'], ...)), so on a top-to-bottom run this proc_df call
# presumably cannot find its target column — confirm the intended cell order.
df_trn, y_trn, nas = proc_df(df_raw, 'Upvotes', max_n_cat=20)
# Re-split the processed features and target at the same n_trn boundary.
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)
In [129]:
set_rf_samples(50000)
In [130]:
m = RandomForestRegressor(n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)
In [131]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
In [132]:
reset_rf_samples()
In [133]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
In [134]:
X_train.shape
Out[134]:
In [138]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
The validation RMSLE is too high; I guess we need to change the amount of randomness in the forest.
In [69]:
fi = rf_feat_importance(m, df_trn); fi[:10]
Out[69]:
In [70]:
fi.plot('cols', 'imp', figsize=(10,6), legend=False);
In [71]:
def plot_fi(fi):
    """Plot ``imp`` against ``cols`` from ``fi`` as a horizontal bar chart and return the result of ``fi.plot``."""
    plot_options = dict(figsize=(12, 7), legend=False)
    return fi.plot('cols', 'imp', 'barh', **plot_options)
In [73]:
plot_fi(fi[:]);
In [74]:
to_keep = fi[fi.imp>0.005].cols; len(to_keep)
Out[74]:
In [76]:
df_keep = df_raw[to_keep].copy()
X_train, X_valid = split_vals(df_keep, n_trn)
In [77]:
from scipy.cluster import hierarchy as hc
In [78]:
corr = np.round(scipy.stats.spearmanr(df_keep).correlation, 4)
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16,10))
dendrogram = hc.dendrogram(z, labels=df_keep.columns, orientation='left', leaf_font_size=16)
In [79]:
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, max_features=0.5,
n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
In [80]:
fi = rf_feat_importance(m, df_keep)
plot_fi(fi);
In [81]:
def get_oob(df):
    """Fit a fresh forest on the first ``n_trn`` rows of ``df`` and return its out-of-bag score.

    Uses the notebook-level ``n_trn``, ``y_train`` and ``split_vals``.
    """
    train_part = split_vals(df, n_trn)[0]
    forest = RandomForestRegressor(
        n_estimators=100,
        min_samples_leaf=5,
        max_features=0.6,
        n_jobs=-1,
        oob_score=True,
    )
    forest.fit(train_part, y_train)
    return forest.oob_score_
In [82]:
get_oob(df_keep)
Out[82]:
In [83]:
m
Out[83]:
In [85]:
# The model is trained on log(Upvotes + 2), so the inverse is exp(pred) - 2; subtracting
# only 1 left every prediction one upvote too high.
preds = np.exp(m.predict(df_test[to_keep])) - 2
preds
Out[85]:
In [86]:
submit = make_submission(preds)
# NOTE(review): this is the same file name an earlier cell writes its random-forest
# submission to, so running both overwrites the first file — confirm that is intended.
# The path is built with os.path.join instead of a hard-coded "\" separator.
submit.to_csv(os.path.join(PATH, 'Adi_rf_08_58_31-07-2018.csv'), index=False)
submit.head(2)
Out[86]: