In [1]:
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
import gc, mlcrate, glob
# from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from IPython.display import display
from catboost import CatBoostRegressor
from scipy.cluster import hierarchy as hc
from collections import Counter
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
# will ignore all warning from sklearn, seaborn etc..
def ignore_warn(*args, **kwargs):
    """No-op replacement for warnings.warn: silently swallows every warning."""
    return None
# Monkey-patch the warnings module so sklearn/seaborn etc. stay quiet.
warnings.warn = ignore_warn
# BUG FIX: pd.option_context() used as a bare statement is a no-op — it only
# takes effect inside a `with` block. Use pd.set_option so the display
# limits actually apply for the rest of the notebook.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
PATH = os.getcwd()
# NOTE(review): backslash-separated paths are Windows-only — consider
# os.path.join / pathlib for portability.
df_raw = pd.read_csv(f'{PATH}\\train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\test.csv', low_memory=False)
def display_all(df):
    """Render `df` with generous (temporarily raised) row/column limits.

    Uses a single option_context with both settings instead of two nested
    context managers; the options revert as soon as the display finishes.
    """
    with pd.option_context("display.max_rows", 100, "display.max_columns", 100):
        display(df)
def make_submission(probs):
    """Build a submission frame: the sample file with Upvotes set to `probs`.

    Parameters
    ----------
    probs : array-like of predicted upvote counts, aligned with the
        sample submission's row order.

    Returns
    -------
    pandas.DataFrame ready to be written with to_csv.
    """
    template = pd.read_csv(f'{PATH}\\sample_submission.csv')
    # .assign returns a modified copy, leaving the template untouched.
    return template.assign(Upvotes=probs)
In [2]:
df_raw.head()  # quick sanity check of the raw training frame
Out[2]:
In [3]:
# Unique user IDs present in each split.
man_train_list = df_raw.Username.unique()
man_test_list = df_test.Username.unique()
print("Train: {0}".format(len(man_train_list)))
print("Test: {0}".format(len(man_test_list)))
In [4]:
# Users that appear in only one of the two splits.
man_not_in_test = set(man_train_list) - set(man_test_list)
man_not_in_train = set(man_test_list) - set(man_train_list)
print("{} man are featured in train but not in test".format(len(man_not_in_test)))
print("{} man are featured in test but not in train".format(len(man_not_in_train)))
In [5]:
#df_raw.drop(index = df_raw.loc[list(man_not_in_test)].index, inplace=True)
# Remove rows that look like label noise:
#   - zero reputation yet non-zero upvotes
#   - zero upvotes despite more than 1000 views
df_raw.drop(index = df_raw[(df_raw['Reputation'] == 0) & (df_raw['Upvotes'] != 0)].index, inplace=True)
df_raw.drop(index = df_raw[(df_raw['Upvotes'] == 0) & (df_raw['Views']>1000)].index, inplace=True)
In [6]:
# Deterministic ordering; groups each user's rows together for inspection.
df_raw.sort_values(by=['Username', 'Reputation', 'Views'], inplace=True)
In [7]:
# Per-user row counts in each split, outer-joined into one frame so users
# missing from either split show up as NaN.
train_counts = df_raw.groupby('Username').count().iloc[:, -1]
test_counts = df_test.groupby('Username').count().iloc[:, -1]
df_man = pd.concat([train_counts, test_counts], axis=1, join='outer')
df_man.columns = ['train_count', 'test_count']
In [8]:
# Does a user's train-set frequency track their test-set frequency?
df_man.sort_values(by = 'train_count', ascending = False).plot.scatter(x = 'train_count', y = 'test_count')
Out[8]:
In [9]:
# Per-user aggregates for the training frame: mean of each numeric column
# plus the observation count. iloc[:,:-5] keeps the 5 mean columns and the
# first count column.
xyz = pd.concat([df_raw.groupby('Username').mean(),df_raw.groupby('Username').count()], axis = 1).iloc[:,:-5]
xyz.columns = ['ID', 'Reputation', 'Answers', 'Views', 'Upvotes', 'count']
############################################################################################# Mean Aggs
# Build the lookup tables once instead of re-sorting `xyz` five times —
# sort order is irrelevant for a dict-based map. The original also built
# `answers`/`views` arrays that were never used; they are dropped here.
user_stats = xyz.reset_index()
df_raw['agg_count'] = df_raw['Username'].map(dict(zip(user_stats['Username'], user_stats['count'])))
df_raw['agg_repo'] = df_raw['Username'].map(dict(zip(user_stats['Username'], user_stats['Reputation'])))
In [10]:
# Same per-user aggregates for the test frame (no Upvotes column, hence
# iloc[:,:-4] and one fewer column name).
xyz = pd.concat([df_test.groupby('Username').mean(),df_test.groupby('Username').count()], axis = 1).iloc[:,:-4]
xyz.columns = ['ID', 'Reputation', 'Answers', 'Views', 'count']
############################################################################################# Mean Aggs
# Single reset_index + dict(zip) maps replace five redundant sort passes
# and the unused `answers`/`views` intermediates.
user_stats = xyz.reset_index()
df_test['agg_count'] = df_test['Username'].map(dict(zip(user_stats['Username'], user_stats['count'])))
df_test['agg_repo'] = df_test['Username'].map(dict(zip(user_stats['Username'], user_stats['Reputation'])))
In [11]:
df_raw.shape, df_test.shape  # sanity check after adding the agg features
Out[11]:
In [12]:
df_raw[df_raw['Username'] == 98].head(10) # interesting: all rows of this user share the same Reputation — why?
Out[12]:
In [13]:
# Number of distinct Reputation values observed per user, computed ONCE per
# frame (the original ran the same groupby twice per frame — four times total).
repo_uniq_trn = df_raw[['Username','Reputation']].groupby('Username')['Reputation'].nunique().reset_index()
repo_uniq_test = df_test[['Username','Reputation']].groupby('Username')['Reputation'].nunique().reset_index()
unames_trn = repo_uniq_trn['Username'].values
unames_test = repo_uniq_test['Username'].values
repo_trn = repo_uniq_trn['Reputation'].values
repo_test = repo_uniq_test['Reputation'].values
In [14]:
# Map each user's distinct-reputation count back onto the row level;
# dict(zip(...)) replaces the manual enumerate loops.
d = dict(zip(unames_trn, repo_trn))
df_raw['unique_repo'] = df_raw['Username'].map(d)
d = dict(zip(unames_test, repo_test))
df_test['unique_repo'] = df_test['Username'].map(d)
In [15]:
# Flag "one-time" users: exactly one observation and one distinct
# reputation value in the split.
df_raw['one_time_user'] = False
df_test['one_time_user'] = False
unames_trn = df_raw[(df_raw['unique_repo'] == 1) & (df_raw['agg_count'] == 1)]['Username'].values
unames_test = df_test[(df_test['unique_repo'] == 1) & (df_test['agg_count'] == 1)]['Username'].values
d = {}
for idx,k in enumerate(unames_trn):
    d[k] = True
# .map() leaves usernames absent from `d` as NaN; the fillna below turns
# those back into False.
df_raw['one_time_user'] = df_raw['Username'].map(d)
d = {}
for idx,k in enumerate(unames_test):
    d[k] = True
df_test['one_time_user'] = df_test['Username'].map(d)
# NOTE(review): fillna(False) runs on the WHOLE frame, not just
# one_time_user — any other column holding NaN would be filled with False
# too. Confirm that is intended.
df_raw.fillna(False,inplace=True)
df_test.fillna(False,inplace=True)
In [16]:
df_raw[(df_raw['Views']>10000) & (df_raw['Answers'] == 1)] ;# reminder: consider binning Views
In [17]:
df_raw[(df_raw['Views']>75000) & (df_raw['Answers'] == 1) &(df_raw['one_time_user']==True)]; # high-view, single-answer, one-time users
In [18]:
# Range of Views in both splits — informs the bin edges chosen below.
min(df_raw['Views']), max(df_raw['Views']), min(df_test['Views']), max(df_test['Views'])
Out[18]:
In [19]:
# Reputation divided by the number of distinct reputation values per user.
df_raw['avg_repo'] = df_raw['Reputation']/ df_raw['unique_repo']
df_test['avg_repo'] = df_test['Reputation']/ df_test['unique_repo']
In [20]:
# Log / sqrt transforms of the heavy-tailed numeric columns.
# FIX: the original wrote f'log_trans_{col}'.format(col) — the f-string
# already interpolates `col`, so the trailing .format() was a no-op and has
# been removed. Column names produced are identical.
add_trans = ['Reputation', 'Answers', 'Views']
for col in add_trans:
    df_raw[f'log_trans_{col}'] = np.log(df_raw[col] + 1)    # +1 avoids log(0)
    df_test[f'log_trans_{col}'] = np.log(df_test[col] + 1)  # +1 avoids log(0)
    df_raw[f'sqrt_trans_{col}'] = np.sqrt(df_raw[col])
    df_test[f'sqrt_trans_{col}'] = np.sqrt(df_test[col])
# Ratio features; +1 on Answers guards against division by zero.
# NOTE(review): repo_per_Views assumes Views > 0 everywhere — confirm with
# the min(Views) check earlier in the notebook.
df_raw['repo_per_Answers'] = df_raw['Reputation'] / (df_raw['Answers']+1)
df_raw['repo_per_Views'] = df_raw['Reputation'] / df_raw['Views']
df_raw['log_trans_repo_per_Answers'] = np.log(df_raw['repo_per_Answers'] + 1)
df_raw['log_trans_repo_per_Views'] = np.log(df_raw['repo_per_Views'] + 1)
df_test['repo_per_Answers'] = df_test['Reputation'] / (df_test['Answers'] +1)
df_test['repo_per_Views'] = df_test['Reputation'] / df_test['Views']
df_test['log_trans_repo_per_Answers'] = np.log(df_test['repo_per_Answers'] + 1)
df_test['log_trans_repo_per_Views'] = np.log(df_test['repo_per_Views'] + 1)
df_raw.shape, df_test.shape
Out[20]:
In [21]:
# Distinct Tag count per user, computed once per frame (the original ran
# the same groupby twice per frame) and mapped back via dict(zip).
tag_uniq_trn = df_raw[['Username','Tag']].groupby('Username')['Tag'].nunique().reset_index()
tag_uniq_test = df_test[['Username','Tag']].groupby('Username')['Tag'].nunique().reset_index()
d = dict(zip(tag_uniq_trn['Username'].values, tag_uniq_trn['Tag'].values))
df_raw['unique_tag'] = df_raw['Username'].map(d)
d = dict(zip(tag_uniq_test['Username'].values, tag_uniq_test['Tag'].values))
df_test['unique_tag'] = df_test['Username'].map(d)
In [22]:
def get_score(l=None):
    """Heuristic skill score for a collection of single-character tags.

    Starts at a base of 10 and adds a fixed number of points for each
    recognised tag in `l`; unrecognised tags contribute nothing. Repeated
    tags are counted each time they occur, matching the original behavior.

    Parameters
    ----------
    l : iterable of str, optional
        Tag characters (e.g. {'c', 'j', 'p'}). Defaults to empty.
        BUG FIX: the original signature used a mutable default ``l=[]``.

    Returns
    -------
    int
        10 + the summed per-tag points.
    """
    if l is None:
        l = ()
    # Lookup table replaces the original chain of nine `if` statements.
    tag_points = {'c': 100, 'j': 90, 'p': 80, 'i': 70,
                  'a': 60, 's': 50, 'h': 40, 'o': 30, 'r': 20}
    score = 10
    for tag in l:
        score += tag_points.get(tag, 0)
    return score
# Recompute the per-split username lists used by the skill-score cells below.
unames_trn = df_raw[['Username','Tag']].groupby('Username')['Tag'].nunique().reset_index()['Username'].values
unames_test = df_test[['Username','Tag']].groupby('Username')['Tag'].nunique().reset_index()['Username'].values
In [23]:
%%time
import gc
# Per-user skill score in ONE groupby pass. The original scanned the whole
# frame once per user (O(users x rows)) to collect each user's tag set and
# then rescored it in a second loop; grouping tags by user and scoring the
# distinct set directly is equivalent and far faster.
d = df_raw.groupby('Username')['Tag'].agg(lambda tags: get_score(set(tags))).to_dict()
df_raw['skill_score'] = df_raw['Username'].map(d)
del d
gc.collect()
In [24]:
%%time
# Same one-pass skill-score computation for the test frame (replaces the
# original per-user O(users x rows) scan).
d = df_test.groupby('Username')['Tag'].agg(lambda tags: get_score(set(tags))).to_dict()
df_test['skill_score'] = df_test['Username'].map(d)
del d
gc.collect()
In [25]:
# Ratio features combining the engineered aggregates; +1 on Answers guards
# against division by zero.
df_raw['rep_per_skill'] = df_raw['Reputation']/ df_raw['skill_score']
df_raw['skill_per_tag'] = df_raw['skill_score']/ df_raw['unique_tag']
df_raw['views_per_ans'] = df_raw['Views'] / (df_raw['Answers']+ 1)
df_test['rep_per_skill'] = df_test['Reputation']/ df_test['skill_score']
df_test['skill_per_tag'] = df_test['skill_score']/ df_test['unique_tag']
df_test['views_per_ans'] = df_test['Views'] / (df_test['Answers']+ 1)
In [26]:
# Sorted log-upvote curve; the +2 offset keeps log() defined
# (presumably +1 would suffice for Upvotes >= 0 — confirm).
plt.scatter(range(df_raw.shape[0]), np.sort(np.log(df_raw.Upvotes+2)))
Out[26]:
In [223]:
# Persist the engineered feature sets for reuse.
df_raw.to_csv(f'{PATH}\\new__train.csv', index=None)
df_test.to_csv(f'{PATH}\\new__test.csv', index=None)
In [27]:
min(set(df_raw['Reputation'])), max(set(df_raw['Reputation']))  # reputation range (the set() wrapper is redundant)
Out[27]:
In [28]:
max(df_test['Answers'])  # upper bound used when choosing the Answers bins
Out[28]:
In [29]:
min(df_raw['Answers'])  # lower bound used when choosing the Answers bins
Out[29]:
In [30]:
def _add_bin(df, col, bins, labels):
    """Attach an ordinal `bin_<col>` column via pd.cut (left-open intervals).

    FIX: the original wrote f'bin_{col}'.format(col) — the f-string already
    interpolates `col`, so the trailing .format() was a no-op; the produced
    column names are identical.
    """
    df[f'bin_{col}'] = pd.cut(df[col], bins, labels=labels)

bins = [-1., 5., 10., 15., 20., 25., 30., 35., 40., 45., 50., 55., 60., 70., 80.]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['Answers']
for col in bin_cols:
    _add_bin(df_raw, col, bins, labels)
    _add_bin(df_test, col, bins, labels)
# NOTE(review): the left edge 0 below is EXCLUSIVE, so any row with
# Views == 0 would receive a NaN bin — confirm min(Views) > 0.
bins = [0, 5000, 10000, 25000, 50000, 75000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 10000**2]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['Views']
for col in bin_cols:
    _add_bin(df_raw, col, bins, labels)
    _add_bin(df_test, col, bins, labels)
bins = [-1, 5000, 10000, 25000, 50000, 75000, 100000, 150000, 200000, 250000, 500000, 750000, 1000000, 2000000, 10000**2]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['Reputation']
for col in bin_cols:
    _add_bin(df_raw, col, bins, labels)
    _add_bin(df_test, col, bins, labels)
In [32]:
# Tag is a low-cardinality code; store as pandas categorical.
df_raw['Tag'] = df_raw['Tag'].astype('category')
df_test['Tag'] = df_test['Tag'].astype('category')
In [33]:
target = df_raw.Upvotes.values  # regression target
In [34]:
# CatBoost regressor; the RMSE objective matches the evaluation metric.
model=CatBoostRegressor(iterations=300, learning_rate= 0.06, depth = 8, loss_function='RMSE')
In [35]:
# Drop identifier and target columns before training.
# NOTE: in-place — from here on df_raw no longer contains Upvotes.
df_raw.drop(['ID','Upvotes'], axis=1,inplace=True)
df_test.drop(['ID'], axis=1,inplace=True)
In [38]:
from sklearn.model_selection import train_test_split  # already imported above; harmless re-import
# Fixed seed gives a reproducible 80/20 holdout.
X_train, X_valid, y_train, y_valid = train_test_split(df_raw, target, test_size=0.2, random_state=42)
In [39]:
len(df_raw.columns)  # feature count after engineering
Out[39]:
In [ ]:
# NOTE(review): cat_features are positional column indices — fragile if the
# column order changes; confirm 0,8,20,25,26,27 are the categorical columns.
model.fit(X_train, y_train,cat_features=[0,8,20,25,26,27], eval_set=(X_valid,y_valid))
In [ ]:
# Persist the trained CatBoost model next to the data.
model.save_model(f'{PATH}\\catboost_new_feats_model_depth_8', export_parameters=dict())
In [ ]:
preds = model.predict(df_test);
preds[:10]  # spot-check the first few predictions
In [ ]:
submit = make_submission(preds)  # fill the sample template with predictions
In [ ]:
# Write the final submission file.
submit.to_csv(f'{PATH}\\Adi_catboost_with_new_feats_10092018_depth_8.csv', index=None)
In [ ]:
from sklearn.metrics import mean_squared_error as mse
def runXGB(train_X, train_y, test_X, test_y=None):
    """Train an XGBoost GBM and return (model, predictions for test_X).

    Parameters
    ----------
    train_X, train_y : training features and target.
    test_X : features to score.
    test_y : unused; kept only for signature symmetry.

    Returns
    -------
    tuple of (trained xgboost Booster, numpy array of predictions)

    NOTE(review): 'gpu:reg:linear' and the 'silent' key are legacy
    parameter spellings (modern xgboost uses 'reg:squarederror' and
    'verbosity'); 'gpu_hist' requires a CUDA GPU — confirm against the
    installed xgboost version before changing anything here.
    """
    params = {}
    params['booster'] = 'gbtree'
    params["objective"] = "gpu:reg:linear"
    params["eta"] = 0.02
    params["min_child_weight"] = 2
    params["subsample"] = 0.9
    params["colsample_bytree"] = 0.8
    params["silent"] = 0
    params["max_depth"] = 8
    params["seed"] = 1
    params['alpha'] = .05
    params['tree_method'] = 'gpu_hist'  # GPU histogram algorithm
    params['gamma'] = 3
    plst = list(params.items())
    num_rounds = 900  # no early stopping: always runs the full 900 rounds
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X)
    model = xgb.train(plst, xgtrain, num_rounds)
    pred_test_y = model.predict(xgtest)
    return model, pred_test_y
def rmse(act_y, pred_y):
    """Root-mean-squared error: square root of sklearn's MSE."""
    squared = mse(act_y, pred_y)
    return np.sqrt(squared)
In [ ]:
# One-hot encode the categorical columns and train XGBoost on the full set.
model_xgb, preds = runXGB(pd.get_dummies(df_raw,prefix='dummy'), target, pd.get_dummies(df_test,prefix='dummy'))