In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [69]:
import time
import xgboost as xgb
import lightgbm as lgb
import category_encoders as cat_ed
import gc, mlcrate, glob
from gplearn.genetic import SymbolicTransformer
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor
from IPython.display import display
from catboost import CatBoostClassifier, CatBoostRegressor
from scipy.cluster import hierarchy as hc
from collections import Counter
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# Silence warnings from sklearn, seaborn, etc.
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

# option_context is a context manager and is a no-op outside a `with` block;
# set_option is what actually raises the display limits globally.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
In [3]:
PATH = os.getcwd()
In [4]:
df_raw = pd.read_csv(f'{PATH}\\train_new_agg_feats.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\test_new_agg_feats.csv', low_memory=False)
In [5]:
def display_all(df):
    with pd.option_context("display.max_rows", 100, "display.max_columns", 100):
        display(df)

def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\sample_submission.csv')
    submit = sample.copy()
    submit['Upvotes'] = probs
    return submit
In [6]:
df_raw.shape
In [7]:
# get_ftype_counts() has been removed from pandas; dtypes.value_counts() gives the same overview.
df_raw.dtypes.value_counts()
In [8]:
# Fraction of missing values per column.
display_all(df_raw.isnull().sum().sort_index() / len(df_raw))
In [41]:
df_raw.head()
In [45]:
df_raw = pd.get_dummies(df_raw, prefix='tag', columns=['Tag'])
In [49]:
df_test = pd.get_dummies(df_test, prefix='tag', columns=['Tag'])
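A caveat worth flagging here: dummifying train and test separately can leave mismatched tag_ columns whenever a Tag value occurs in only one frame. A minimal re-alignment sketch (it assumes df_raw still carries its target column, as used further down):
# Sketch: give df_test exactly df_raw's feature columns; tag dummies absent
# from the test set are filled with 0, test-only dummies are dropped.
feat_cols = [c for c in df_raw.columns if c != 'target']
df_test = df_test.reindex(columns=feat_cols, fill_value=0)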
In [34]:
# Usernames present in only one of the two sets.
man_train_list = df_raw.Username.unique()
man_test_list = df_test.Username.unique()
man_not_in_test = set(man_train_list) - set(man_test_list)
man_not_in_train = set(man_test_list) - set(man_train_list)
In [35]:
# Drop training rows whose Username never appears in the test set.
# (.loc on the default integer index would mis-select rows here, so filter by value instead.)
df_raw = df_raw[~df_raw.Username.isin(man_not_in_test)]
In [100]:
model = CatBoostRegressor(iterations=500, learning_rate=0.06, depth=8, loss_function='RMSE')
In [101]:
model.fit(df_raw, target)
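As an aside, CatBoost can track a holdout and stop once the validation RMSE stalls; a minimal sketch (it reuses the X_train/X_valid split defined further down, and the iteration/patience numbers are illustrative, not the settings used above):
# Sketch only: early stopping on a holdout, not the run recorded above.
model_es = CatBoostRegressor(iterations=2000, learning_rate=0.06, depth=8,
                             loss_function='RMSE')
model_es.fit(X_train, y_train,
             eval_set=(X_valid, y_valid),
             early_stopping_rounds=50,  # stop once valid RMSE stops improving
             verbose=200)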
In [104]:
preds = model.predict(df_test) - 1
preds[:10]
In [105]:
submit = make_submission(preds)
In [106]:
submit.to_csv(f'{PATH}\\Adi_catboost_with_rf_feats_310818.csv', index=False)
In [54]:
def rmse(x, y): return math.sqrt(((x - y) ** 2).mean())

def print_score(m):
    # Note: rmse() computes plain RMSE, so the labels say RMSE, not RMSLE.
    res = ['RMSE X_train', rmse(m.predict(X_train), y_train),
           '\n RMSE X_valid', rmse(m.predict(X_valid), y_valid),
           '\n R**2 Train', m.score(X_train, y_train),
           '\n R**2 Valid', m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(['\n OOB_Score', m.oob_score_])
    print(res)
In [56]:
target = df_raw.target
In [57]:
df_raw.drop('target', axis=1,inplace=True)
In [78]:
df_raw.drop('Username', axis=1,inplace=True)
df_test.drop('Username', axis=1,inplace=True)
In [79]:
# Hold out the last 30,000 rows for validation. (The original cell also ran a
# random train_test_split, but split_vals immediately overwrote it, so the
# holdout actually used below is this tail split.)
def split_vals(a, n): return a[:n].copy(), a[n:].copy()

n_valid = 30000
n_trn = len(df_raw) - n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df_raw, n_trn)
y_train, y_valid = split_vals(target, n_trn)
X_train.shape, y_train.shape, X_valid.shape
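If a random 80/20 split is preferred over the tail holdout, the train_test_split call that the original cell overwrote would do it; a minimal sketch:
from sklearn.model_selection import train_test_split

# Alternative: random 80/20 split instead of holding out the last 30,000 rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_raw, target, test_size=0.2, random_state=42)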
In [87]:
# Dropping these from df_raw does not touch the X_train/X_valid copies made
# above; re-run the split cell after this drop so the frames stay consistent.
df_raw.drop(['Reputation', 'Answers', 'Views'], axis=1, inplace=True)
df_test.drop(['Reputation', 'Answers', 'Views'], axis=1, inplace=True)
In [ ]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True, max_depth=8)
m.fit(X_train, y_train)
print_score(m)
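A natural follow-up, sketched here rather than taken from the original run, is to rank the features the forest leaned on:
# Sketch: impurity-based feature importances from the fitted forest.
fi = pd.Series(m.feature_importances_, index=X_train.columns)
fi.sort_values(ascending=False).head(15)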
In [81]:
df_raw.head()
In [86]:
df_raw.columns
In [85]:
# Quick look at each feature's distribution.
for col in df_raw.columns:
    sns.distplot(df_raw[col])
    plt.show()