the boring stuff


In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [69]:
import time
import xgboost as xgb
import lightgbm as lgb
import category_encoders as cat_ed
import gc, mlcrate, glob

from gplearn.genetic import SymbolicTransformer
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor
from IPython.display import display
from catboost import CatBoostClassifier, CatBoostRegressor
from scipy.cluster import hierarchy as hc
from collections import Counter

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import  roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# will ignore all warning from sklearn, seaborn etc..
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

pd.option_context("display.max_rows", 1000);
pd.option_context("display.max_columns", 1000);

In [3]:
PATH = os.getcwd()

In [4]:
df_raw = pd.read_csv(f'{PATH}\\train_new_agg_feats.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\test_new_agg_feats.csv', low_memory=False)

In [5]:
def display_all(df):
    with pd.option_context("display.max_rows", 100): 
        with pd.option_context("display.max_columns", 100): 
            display(df)

def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\sample_submission.csv')
    submit = sample.copy()
    submit['Upvotes'] = probs
    return submit

In [6]:
df_raw.shape,


Out[6]:
((330045, 20),)

In [7]:
df_raw.get_ftype_counts()


Out[7]:
float64:dense    20
dtype: int64

In [8]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))


Answers                  0.0
Reputation               0.0
Tag                      0.0
Username                 0.0
Views                    0.0
agg_answers              0.0
agg_count                0.0
agg_repo                 0.0
agg_views                0.0
log_trans_Answers        0.0
log_trans_Reputation     0.0
log_trans_Username       0.0
log_trans_Views          0.0
log_trans_agg_answers    0.0
log_trans_agg_count      0.0
log_trans_agg_repo       0.0
log_trans_agg_views      0.0
repo_per_Answers         0.0
repo_per_Views           0.0
target                   0.0
dtype: float64

random


In [41]:
df_raw.head()


Out[41]:
Tag Reputation Answers Username Views agg_count agg_answers agg_views agg_repo log_trans_Reputation log_trans_Answers log_trans_Username log_trans_Views log_trans_agg_count log_trans_agg_answers log_trans_agg_views log_trans_agg_repo repo_per_Answers repo_per_Views target
4 1.0 4271.0 4.0 112223.0 13986.0 3.0 2.0 20598.0 4271.0 8.359838 1.609438 11.628252 9.545883 1.386294 1.098612 9.932998 8.359838 854.200012 0.305377 84.000000
7 4.0 2269.0 2.0 54623.0 312.0 7.0 2.0 4168.0 2269.0 7.727535 1.098612 10.908229 5.746203 2.079442 1.098612 8.335431 7.727535 756.333313 7.272436 4.000000
8 4.0 111.0 2.0 172926.0 53738.0 1.0 2.0 53738.0 111.0 4.718499 1.098612 12.060625 10.891894 0.693147 1.098612 10.891894 4.718499 37.000000 0.002066 80.000008
15 1.0 7952.0 2.0 62155.0 29191.0 4.0 3.0 28314.0 7952.0 8.981304 1.098612 11.037403 10.281650 1.609438 1.386294 10.251147 8.981304 2650.666748 0.272413 224.000015
16 3.0 731.0 4.0 43559.0 5622.0 2.0 2.0 6730.0 731.0 6.595780 1.609438 10.681894 8.634621 1.098612 1.098612 8.814479 6.595780 146.199997 0.130025 14.000001

In [45]:
df_raw = pd.get_dummies(df_raw, 'tag', columns=['Tag'])

In [49]:
df_test = pd.get_dummies(df_test, 'tag', columns=['Tag'])

In [ ]:


In [ ]:

Bazooka ! (anokas)


In [34]:
man_train_list = df_raw.Username.unique()
man_test_list = df_test.Username.unique()

man_not_in_test = set(man_train_list) - set(man_test_list)
man_not_in_train = set(man_test_list) - set(man_train_list)

In [35]:
df_raw.drop(index = df_raw.loc[list(man_not_in_test)].index, inplace=True)

In [100]:
model=CatBoostRegressor(iterations=500, learning_rate= 0.06, depth = 8, loss_function='RMSE')

In [101]:
model.fit(df_raw, target)


0:	learn: 3063.2371640	total: 241ms	remaining: 2m
1:	learn: 2960.8726708	total: 482ms	remaining: 1m 59s
2:	learn: 2859.8232707	total: 725ms	remaining: 2m
3:	learn: 2763.7387297	total: 962ms	remaining: 1m 59s
4:	learn: 2674.4727651	total: 1.2s	remaining: 1m 58s
5:	learn: 2593.9050737	total: 1.45s	remaining: 1m 58s
6:	learn: 2518.4998525	total: 1.68s	remaining: 1m 58s
7:	learn: 2447.5342017	total: 1.92s	remaining: 1m 57s
8:	learn: 2377.0639722	total: 2.14s	remaining: 1m 57s
9:	learn: 2309.8472468	total: 2.36s	remaining: 1m 55s
10:	learn: 2252.9853105	total: 2.59s	remaining: 1m 54s
11:	learn: 2201.9552468	total: 2.81s	remaining: 1m 54s
12:	learn: 2155.0355269	total: 3.05s	remaining: 1m 54s
13:	learn: 2109.8449006	total: 3.29s	remaining: 1m 54s
14:	learn: 2063.1868582	total: 3.51s	remaining: 1m 53s
15:	learn: 2018.2463731	total: 3.74s	remaining: 1m 53s
16:	learn: 1980.3793067	total: 3.96s	remaining: 1m 52s
17:	learn: 1941.8133946	total: 4.18s	remaining: 1m 52s
18:	learn: 1909.5766205	total: 4.4s	remaining: 1m 51s
19:	learn: 1875.9087949	total: 4.63s	remaining: 1m 51s
20:	learn: 1842.5364903	total: 4.85s	remaining: 1m 50s
21:	learn: 1810.7217731	total: 5.08s	remaining: 1m 50s
22:	learn: 1783.7335448	total: 5.3s	remaining: 1m 49s
23:	learn: 1752.4532362	total: 5.53s	remaining: 1m 49s
24:	learn: 1729.7528946	total: 5.76s	remaining: 1m 49s
25:	learn: 1706.3781921	total: 5.99s	remaining: 1m 49s
26:	learn: 1680.6953070	total: 6.21s	remaining: 1m 48s
27:	learn: 1655.2732000	total: 6.44s	remaining: 1m 48s
28:	learn: 1630.7140227	total: 6.66s	remaining: 1m 48s
29:	learn: 1614.4246819	total: 6.88s	remaining: 1m 47s
30:	learn: 1593.9704031	total: 7.14s	remaining: 1m 48s
31:	learn: 1577.7354516	total: 7.38s	remaining: 1m 47s
32:	learn: 1556.5865803	total: 7.6s	remaining: 1m 47s
33:	learn: 1545.4092596	total: 7.83s	remaining: 1m 47s
34:	learn: 1527.4291772	total: 8.04s	remaining: 1m 46s
35:	learn: 1512.1750109	total: 8.3s	remaining: 1m 47s
36:	learn: 1500.5867365	total: 8.57s	remaining: 1m 47s
37:	learn: 1490.7415966	total: 8.86s	remaining: 1m 47s
38:	learn: 1476.8908032	total: 9.14s	remaining: 1m 47s
39:	learn: 1468.1102551	total: 9.41s	remaining: 1m 48s
40:	learn: 1456.8536919	total: 9.7s	remaining: 1m 48s
41:	learn: 1446.7406451	total: 9.97s	remaining: 1m 48s
42:	learn: 1431.9397542	total: 10.2s	remaining: 1m 48s
43:	learn: 1424.4922563	total: 10.5s	remaining: 1m 49s
44:	learn: 1408.6396887	total: 10.8s	remaining: 1m 49s
45:	learn: 1401.5429193	total: 11.1s	remaining: 1m 49s
46:	learn: 1388.1175988	total: 11.3s	remaining: 1m 49s
47:	learn: 1378.1062350	total: 11.6s	remaining: 1m 49s
48:	learn: 1371.2113914	total: 11.8s	remaining: 1m 48s
49:	learn: 1365.5885524	total: 12s	remaining: 1m 48s
50:	learn: 1359.1910504	total: 12.3s	remaining: 1m 48s
51:	learn: 1350.9557096	total: 12.5s	remaining: 1m 48s
52:	learn: 1345.4958794	total: 12.8s	remaining: 1m 47s
53:	learn: 1340.4850279	total: 13s	remaining: 1m 47s
54:	learn: 1329.3792124	total: 13.2s	remaining: 1m 46s
55:	learn: 1321.5710854	total: 13.4s	remaining: 1m 46s
56:	learn: 1315.8275091	total: 13.7s	remaining: 1m 46s
57:	learn: 1312.3330157	total: 13.9s	remaining: 1m 45s
58:	learn: 1305.4889036	total: 14.1s	remaining: 1m 45s
59:	learn: 1299.8737494	total: 14.4s	remaining: 1m 45s
60:	learn: 1293.4323469	total: 14.6s	remaining: 1m 45s
61:	learn: 1289.7888619	total: 14.8s	remaining: 1m 44s
62:	learn: 1285.7002114	total: 15.1s	remaining: 1m 44s
63:	learn: 1281.5030388	total: 15.4s	remaining: 1m 44s
64:	learn: 1274.4544924	total: 15.7s	remaining: 1m 44s
65:	learn: 1270.7663378	total: 15.9s	remaining: 1m 44s
66:	learn: 1264.1172399	total: 16.2s	remaining: 1m 45s
67:	learn: 1258.6797254	total: 16.5s	remaining: 1m 44s
68:	learn: 1251.2412341	total: 16.8s	remaining: 1m 44s
69:	learn: 1246.0455960	total: 17.1s	remaining: 1m 44s
70:	learn: 1237.9917166	total: 17.3s	remaining: 1m 44s
71:	learn: 1231.8716965	total: 17.6s	remaining: 1m 44s
72:	learn: 1227.3734414	total: 17.9s	remaining: 1m 44s
73:	learn: 1222.5464278	total: 18.2s	remaining: 1m 44s
74:	learn: 1218.6713565	total: 18.4s	remaining: 1m 44s
75:	learn: 1210.5758236	total: 18.6s	remaining: 1m 43s
76:	learn: 1208.5395225	total: 18.9s	remaining: 1m 43s
77:	learn: 1204.9079939	total: 19.1s	remaining: 1m 43s
78:	learn: 1200.3180198	total: 19.3s	remaining: 1m 42s
79:	learn: 1198.9817667	total: 19.6s	remaining: 1m 42s
80:	learn: 1193.3297295	total: 19.8s	remaining: 1m 42s
81:	learn: 1188.3144440	total: 20s	remaining: 1m 42s
82:	learn: 1184.7965892	total: 20.2s	remaining: 1m 41s
83:	learn: 1179.2173251	total: 20.5s	remaining: 1m 41s
84:	learn: 1174.7655945	total: 20.7s	remaining: 1m 41s
85:	learn: 1172.4001541	total: 20.9s	remaining: 1m 40s
86:	learn: 1168.8771520	total: 21.2s	remaining: 1m 40s
87:	learn: 1167.2213346	total: 21.4s	remaining: 1m 40s
88:	learn: 1163.8483743	total: 21.7s	remaining: 1m 40s
89:	learn: 1159.9279305	total: 21.9s	remaining: 1m 39s
90:	learn: 1156.3111194	total: 22.2s	remaining: 1m 39s
91:	learn: 1148.0922752	total: 22.5s	remaining: 1m 39s
92:	learn: 1146.5745636	total: 22.7s	remaining: 1m 39s
93:	learn: 1142.6724054	total: 23s	remaining: 1m 39s
94:	learn: 1141.0029265	total: 23.3s	remaining: 1m 39s
95:	learn: 1136.9930913	total: 23.6s	remaining: 1m 39s
96:	learn: 1134.5355230	total: 23.8s	remaining: 1m 39s
97:	learn: 1131.7039787	total: 24.1s	remaining: 1m 38s
98:	learn: 1126.9852997	total: 24.4s	remaining: 1m 38s
99:	learn: 1122.4116532	total: 24.7s	remaining: 1m 38s
100:	learn: 1118.8205045	total: 24.9s	remaining: 1m 38s
101:	learn: 1116.9513704	total: 25.2s	remaining: 1m 38s
102:	learn: 1109.2849838	total: 25.5s	remaining: 1m 38s
103:	learn: 1108.1590024	total: 25.8s	remaining: 1m 38s
104:	learn: 1106.5731717	total: 26s	remaining: 1m 37s
105:	learn: 1106.0268385	total: 26.3s	remaining: 1m 37s
106:	learn: 1105.3845213	total: 26.5s	remaining: 1m 37s
107:	learn: 1103.5649036	total: 26.8s	remaining: 1m 37s
108:	learn: 1097.3003217	total: 27s	remaining: 1m 36s
109:	learn: 1094.3846386	total: 27.2s	remaining: 1m 36s
110:	learn: 1092.4441536	total: 27.4s	remaining: 1m 36s
111:	learn: 1091.1007582	total: 27.7s	remaining: 1m 35s
112:	learn: 1090.1837888	total: 27.9s	remaining: 1m 35s
113:	learn: 1089.2574779	total: 28.1s	remaining: 1m 35s
114:	learn: 1087.6931682	total: 28.4s	remaining: 1m 34s
115:	learn: 1085.9414680	total: 28.6s	remaining: 1m 34s
116:	learn: 1081.4751456	total: 28.8s	remaining: 1m 34s
117:	learn: 1074.4546997	total: 29.1s	remaining: 1m 34s
118:	learn: 1072.4769458	total: 29.3s	remaining: 1m 33s
119:	learn: 1068.6691068	total: 29.5s	remaining: 1m 33s
120:	learn: 1061.7923280	total: 29.8s	remaining: 1m 33s
121:	learn: 1060.9245520	total: 30.1s	remaining: 1m 33s
122:	learn: 1059.9465634	total: 30.3s	remaining: 1m 32s
123:	learn: 1059.9456174	total: 30.4s	remaining: 1m 32s
124:	learn: 1058.3410714	total: 30.7s	remaining: 1m 32s
125:	learn: 1051.2972779	total: 31s	remaining: 1m 32s
126:	learn: 1049.8853857	total: 31.3s	remaining: 1m 31s
127:	learn: 1045.5041186	total: 31.6s	remaining: 1m 31s
128:	learn: 1043.4805824	total: 31.8s	remaining: 1m 31s
129:	learn: 1040.6572685	total: 32.1s	remaining: 1m 31s
130:	learn: 1034.6188054	total: 32.4s	remaining: 1m 31s
131:	learn: 1031.0899417	total: 32.7s	remaining: 1m 31s
132:	learn: 1029.9232118	total: 32.9s	remaining: 1m 30s
133:	learn: 1028.4953615	total: 33.2s	remaining: 1m 30s
134:	learn: 1026.8957748	total: 33.5s	remaining: 1m 30s
135:	learn: 1026.8611494	total: 33.7s	remaining: 1m 30s
136:	learn: 1025.2347461	total: 34s	remaining: 1m 30s
137:	learn: 1022.5856273	total: 34.2s	remaining: 1m 29s
138:	learn: 1020.1729423	total: 34.5s	remaining: 1m 29s
139:	learn: 1018.8064202	total: 34.7s	remaining: 1m 29s
140:	learn: 1017.7358998	total: 34.9s	remaining: 1m 28s
141:	learn: 1015.6226926	total: 35.1s	remaining: 1m 28s
142:	learn: 1013.5023750	total: 35.4s	remaining: 1m 28s
143:	learn: 1013.4908655	total: 35.5s	remaining: 1m 27s
144:	learn: 1010.8018501	total: 35.7s	remaining: 1m 27s
145:	learn: 1007.8380884	total: 36s	remaining: 1m 27s
146:	learn: 1007.7185928	total: 36.1s	remaining: 1m 26s
147:	learn: 1005.3273478	total: 36.4s	remaining: 1m 26s
148:	learn: 1000.0062962	total: 36.6s	remaining: 1m 26s
149:	learn: 999.9606264	total: 36.8s	remaining: 1m 25s
150:	learn: 997.7357668	total: 37s	remaining: 1m 25s
151:	learn: 992.5900040	total: 37.2s	remaining: 1m 25s
152:	learn: 991.7642291	total: 37.4s	remaining: 1m 24s
153:	learn: 991.4635386	total: 37.6s	remaining: 1m 24s
154:	learn: 989.8268487	total: 37.9s	remaining: 1m 24s
155:	learn: 989.2420901	total: 38.2s	remaining: 1m 24s
156:	learn: 987.3081136	total: 38.4s	remaining: 1m 23s
157:	learn: 986.7190499	total: 38.7s	remaining: 1m 23s
158:	learn: 985.3585718	total: 39s	remaining: 1m 23s
159:	learn: 985.0191420	total: 39.3s	remaining: 1m 23s
160:	learn: 981.9221194	total: 39.5s	remaining: 1m 23s
161:	learn: 981.4420216	total: 39.8s	remaining: 1m 23s
162:	learn: 980.7993257	total: 40.1s	remaining: 1m 22s
163:	learn: 979.4011786	total: 40.3s	remaining: 1m 22s
164:	learn: 979.3289771	total: 40.6s	remaining: 1m 22s
165:	learn: 978.9966721	total: 40.9s	remaining: 1m 22s
166:	learn: 978.2825344	total: 41.2s	remaining: 1m 22s
167:	learn: 978.1990025	total: 41.4s	remaining: 1m 21s
168:	learn: 978.1123669	total: 41.6s	remaining: 1m 21s
169:	learn: 974.9072379	total: 41.9s	remaining: 1m 21s
170:	learn: 973.2546655	total: 42.2s	remaining: 1m 21s
171:	learn: 968.6653026	total: 42.5s	remaining: 1m 20s
172:	learn: 965.1675566	total: 42.7s	remaining: 1m 20s
173:	learn: 963.6626781	total: 43s	remaining: 1m 20s
174:	learn: 962.7360495	total: 43.3s	remaining: 1m 20s
175:	learn: 962.7281514	total: 43.4s	remaining: 1m 19s
176:	learn: 959.9471949	total: 43.7s	remaining: 1m 19s
177:	learn: 959.2961580	total: 44s	remaining: 1m 19s
178:	learn: 955.2740549	total: 44.4s	remaining: 1m 19s
179:	learn: 953.8410444	total: 44.7s	remaining: 1m 19s
180:	learn: 953.3603333	total: 45s	remaining: 1m 19s
181:	learn: 951.7340452	total: 45.2s	remaining: 1m 19s
182:	learn: 947.0774181	total: 45.5s	remaining: 1m 18s
183:	learn: 945.8715540	total: 45.8s	remaining: 1m 18s
184:	learn: 943.5421918	total: 46.1s	remaining: 1m 18s
185:	learn: 943.4836069	total: 46.3s	remaining: 1m 18s
186:	learn: 942.6346337	total: 46.6s	remaining: 1m 18s
187:	learn: 941.3982365	total: 46.8s	remaining: 1m 17s
188:	learn: 939.5919895	total: 47.1s	remaining: 1m 17s
189:	learn: 935.3826764	total: 47.3s	remaining: 1m 17s
190:	learn: 933.0288560	total: 47.5s	remaining: 1m 16s
191:	learn: 932.9810286	total: 47.7s	remaining: 1m 16s
192:	learn: 931.7432984	total: 47.9s	remaining: 1m 16s
193:	learn: 930.2959789	total: 48.1s	remaining: 1m 15s
194:	learn: 926.0605251	total: 48.3s	remaining: 1m 15s
195:	learn: 925.6656904	total: 48.5s	remaining: 1m 15s
196:	learn: 922.9097757	total: 48.8s	remaining: 1m 15s
197:	learn: 922.8134588	total: 48.9s	remaining: 1m 14s
198:	learn: 919.8295041	total: 49.1s	remaining: 1m 14s
199:	learn: 918.6970301	total: 49.3s	remaining: 1m 13s
200:	learn: 913.5940386	total: 49.5s	remaining: 1m 13s
201:	learn: 913.5831380	total: 49.6s	remaining: 1m 13s
202:	learn: 913.0023541	total: 49.9s	remaining: 1m 12s
203:	learn: 910.4114667	total: 50.1s	remaining: 1m 12s
204:	learn: 906.4983096	total: 50.4s	remaining: 1m 12s
205:	learn: 905.3112278	total: 50.6s	remaining: 1m 12s
206:	learn: 905.0169408	total: 50.9s	remaining: 1m 12s
207:	learn: 903.1707354	total: 51.1s	remaining: 1m 11s
208:	learn: 903.1527044	total: 51.3s	remaining: 1m 11s
209:	learn: 902.0290371	total: 51.5s	remaining: 1m 11s
210:	learn: 900.6167164	total: 51.8s	remaining: 1m 10s
211:	learn: 897.9693552	total: 52.1s	remaining: 1m 10s
212:	learn: 897.3718981	total: 52.3s	remaining: 1m 10s
213:	learn: 893.1908966	total: 52.6s	remaining: 1m 10s
214:	learn: 890.4804645	total: 52.9s	remaining: 1m 10s
215:	learn: 888.4615734	total: 53.1s	remaining: 1m 9s
216:	learn: 885.0487848	total: 53.4s	remaining: 1m 9s
217:	learn: 885.0187501	total: 53.6s	remaining: 1m 9s
218:	learn: 884.0080252	total: 53.8s	remaining: 1m 9s
219:	learn: 881.1484999	total: 54s	remaining: 1m 8s
220:	learn: 880.3085053	total: 54.2s	remaining: 1m 8s
221:	learn: 878.7649068	total: 54.4s	remaining: 1m 8s
222:	learn: 877.2014133	total: 54.7s	remaining: 1m 7s
223:	learn: 875.4985617	total: 54.9s	remaining: 1m 7s
224:	learn: 875.1659578	total: 55.1s	remaining: 1m 7s
225:	learn: 873.6639670	total: 55.3s	remaining: 1m 7s
226:	learn: 871.9225615	total: 55.5s	remaining: 1m 6s
227:	learn: 870.3859520	total: 55.8s	remaining: 1m 6s
228:	learn: 869.1560191	total: 56s	remaining: 1m 6s
229:	learn: 867.8265288	total: 56.2s	remaining: 1m 5s
230:	learn: 867.0562769	total: 56.4s	remaining: 1m 5s
231:	learn: 864.5923706	total: 56.7s	remaining: 1m 5s
232:	learn: 862.7084115	total: 57s	remaining: 1m 5s
233:	learn: 861.9328079	total: 57.2s	remaining: 1m 5s
234:	learn: 861.5034162	total: 57.4s	remaining: 1m 4s
235:	learn: 860.4756942	total: 57.6s	remaining: 1m 4s
236:	learn: 859.6641813	total: 57.8s	remaining: 1m 4s
237:	learn: 858.8796723	total: 58.1s	remaining: 1m 3s
238:	learn: 856.6945270	total: 58.2s	remaining: 1m 3s
239:	learn: 856.4444410	total: 58.5s	remaining: 1m 3s
240:	learn: 856.0229468	total: 58.7s	remaining: 1m 3s
241:	learn: 853.6472092	total: 58.9s	remaining: 1m 2s
242:	learn: 853.0159442	total: 59.1s	remaining: 1m 2s
243:	learn: 851.3207202	total: 59.4s	remaining: 1m 2s
244:	learn: 850.8282239	total: 59.7s	remaining: 1m 2s
245:	learn: 850.6857697	total: 59.9s	remaining: 1m 1s
246:	learn: 850.2031416	total: 1m	remaining: 1m 1s
247:	learn: 849.7407528	total: 1m	remaining: 1m 1s
248:	learn: 847.4252256	total: 1m	remaining: 1m 1s
249:	learn: 846.8468295	total: 1m	remaining: 1m
250:	learn: 845.0603637	total: 1m 1s	remaining: 1m
251:	learn: 843.5026304	total: 1m 1s	remaining: 1m
252:	learn: 842.5448558	total: 1m 1s	remaining: 1m
253:	learn: 841.4135580	total: 1m 2s	remaining: 1m
254:	learn: 841.1405446	total: 1m 2s	remaining: 59.9s
255:	learn: 841.0838664	total: 1m 2s	remaining: 59.6s
256:	learn: 838.6671860	total: 1m 2s	remaining: 59.4s
257:	learn: 838.0347710	total: 1m 3s	remaining: 59.2s
258:	learn: 837.1949330	total: 1m 3s	remaining: 59s
259:	learn: 836.3643697	total: 1m 3s	remaining: 58.7s
260:	learn: 835.2765737	total: 1m 3s	remaining: 58.5s
261:	learn: 835.2640811	total: 1m 4s	remaining: 58.2s
262:	learn: 834.4504443	total: 1m 4s	remaining: 58s
263:	learn: 833.6251991	total: 1m 4s	remaining: 57.7s
264:	learn: 831.1171839	total: 1m 4s	remaining: 57.5s
265:	learn: 829.8937156	total: 1m 5s	remaining: 57.3s
266:	learn: 829.6511155	total: 1m 5s	remaining: 57.1s
267:	learn: 829.1017726	total: 1m 5s	remaining: 56.8s
268:	learn: 828.3125841	total: 1m 5s	remaining: 56.6s
269:	learn: 826.8324490	total: 1m 6s	remaining: 56.4s
270:	learn: 825.0727420	total: 1m 6s	remaining: 56.2s
271:	learn: 823.8898608	total: 1m 6s	remaining: 55.9s
272:	learn: 823.3300078	total: 1m 7s	remaining: 55.7s
273:	learn: 822.3887549	total: 1m 7s	remaining: 55.5s
274:	learn: 821.0498375	total: 1m 7s	remaining: 55.2s
275:	learn: 820.4431865	total: 1m 7s	remaining: 54.9s
276:	learn: 819.1382485	total: 1m 7s	remaining: 54.7s
277:	learn: 817.0163142	total: 1m 8s	remaining: 54.4s
278:	learn: 815.8609074	total: 1m 8s	remaining: 54.1s
279:	learn: 815.1604222	total: 1m 8s	remaining: 53.9s
280:	learn: 813.0593184	total: 1m 8s	remaining: 53.6s
281:	learn: 811.6386339	total: 1m 9s	remaining: 53.3s
282:	learn: 811.2995560	total: 1m 9s	remaining: 53.1s
283:	learn: 808.7355966	total: 1m 9s	remaining: 52.8s
284:	learn: 807.5639631	total: 1m 9s	remaining: 52.6s
285:	learn: 806.8836833	total: 1m 9s	remaining: 52.3s
286:	learn: 805.7218120	total: 1m 10s	remaining: 52s
287:	learn: 805.7087099	total: 1m 10s	remaining: 51.7s
288:	learn: 804.4081444	total: 1m 10s	remaining: 51.4s
289:	learn: 803.9249634	total: 1m 10s	remaining: 51.2s
290:	learn: 803.4127689	total: 1m 10s	remaining: 51s
291:	learn: 803.1018264	total: 1m 11s	remaining: 50.7s
292:	learn: 802.2229518	total: 1m 11s	remaining: 50.5s
293:	learn: 799.6227524	total: 1m 11s	remaining: 50.3s
294:	learn: 797.6970062	total: 1m 12s	remaining: 50s
295:	learn: 797.5217486	total: 1m 12s	remaining: 49.8s
296:	learn: 796.6043327	total: 1m 12s	remaining: 49.6s
297:	learn: 795.1304315	total: 1m 12s	remaining: 49.3s
298:	learn: 795.1192720	total: 1m 12s	remaining: 49s
299:	learn: 795.0113934	total: 1m 13s	remaining: 48.8s
300:	learn: 794.6548785	total: 1m 13s	remaining: 48.6s
301:	learn: 794.4154340	total: 1m 13s	remaining: 48.4s
302:	learn: 791.8650076	total: 1m 14s	remaining: 48.1s
303:	learn: 790.4335313	total: 1m 14s	remaining: 47.9s
304:	learn: 789.3009397	total: 1m 14s	remaining: 47.7s
305:	learn: 787.7721690	total: 1m 14s	remaining: 47.5s
306:	learn: 786.9612054	total: 1m 15s	remaining: 47.3s
307:	learn: 786.4413806	total: 1m 15s	remaining: 47.1s
308:	learn: 785.6216395	total: 1m 15s	remaining: 46.9s
309:	learn: 785.4690526	total: 1m 16s	remaining: 46.7s
310:	learn: 784.7599126	total: 1m 16s	remaining: 46.4s
311:	learn: 783.8655334	total: 1m 16s	remaining: 46.2s
312:	learn: 782.7737148	total: 1m 16s	remaining: 46s
313:	learn: 780.5268743	total: 1m 17s	remaining: 45.7s
314:	learn: 779.6913249	total: 1m 17s	remaining: 45.5s
315:	learn: 779.6776446	total: 1m 17s	remaining: 45.2s
316:	learn: 776.4204172	total: 1m 17s	remaining: 44.9s
317:	learn: 776.1585568	total: 1m 18s	remaining: 44.7s
318:	learn: 774.2847859	total: 1m 18s	remaining: 44.4s
319:	learn: 774.2826136	total: 1m 18s	remaining: 44.1s
320:	learn: 773.5678847	total: 1m 18s	remaining: 43.8s
321:	learn: 773.1950142	total: 1m 18s	remaining: 43.6s
322:	learn: 772.5131238	total: 1m 19s	remaining: 43.3s
323:	learn: 772.5107229	total: 1m 19s	remaining: 43s
324:	learn: 771.8116079	total: 1m 19s	remaining: 42.7s
325:	learn: 770.5739075	total: 1m 19s	remaining: 42.5s
326:	learn: 770.4987790	total: 1m 19s	remaining: 42.2s
327:	learn: 769.9167564	total: 1m 19s	remaining: 41.9s
328:	learn: 769.1093489	total: 1m 20s	remaining: 41.7s
329:	learn: 767.8221222	total: 1m 20s	remaining: 41.4s
330:	learn: 767.0687205	total: 1m 20s	remaining: 41.2s
331:	learn: 766.4929799	total: 1m 20s	remaining: 40.9s
332:	learn: 766.2006454	total: 1m 21s	remaining: 40.7s
333:	learn: 764.6423362	total: 1m 21s	remaining: 40.5s
334:	learn: 763.4328542	total: 1m 21s	remaining: 40.2s
335:	learn: 763.4314758	total: 1m 21s	remaining: 39.9s
336:	learn: 763.3161117	total: 1m 22s	remaining: 39.7s
337:	learn: 762.4746422	total: 1m 22s	remaining: 39.5s
338:	learn: 761.3450132	total: 1m 22s	remaining: 39.2s
339:	learn: 761.3435779	total: 1m 22s	remaining: 38.9s
340:	learn: 760.3540810	total: 1m 23s	remaining: 38.7s
341:	learn: 760.3524253	total: 1m 23s	remaining: 38.4s
342:	learn: 759.3497922	total: 1m 23s	remaining: 38.2s
343:	learn: 758.6646801	total: 1m 23s	remaining: 38s
344:	learn: 757.0072613	total: 1m 23s	remaining: 37.7s
345:	learn: 755.9663399	total: 1m 24s	remaining: 37.5s
346:	learn: 755.8308477	total: 1m 24s	remaining: 37.3s
347:	learn: 755.7380565	total: 1m 24s	remaining: 37s
348:	learn: 754.6844029	total: 1m 25s	remaining: 36.8s
349:	learn: 754.6842015	total: 1m 25s	remaining: 36.5s
350:	learn: 752.3372411	total: 1m 25s	remaining: 36.2s
351:	learn: 752.3360818	total: 1m 25s	remaining: 36s
352:	learn: 751.1189834	total: 1m 25s	remaining: 35.7s
353:	learn: 750.6071991	total: 1m 25s	remaining: 35.4s
354:	learn: 750.6068761	total: 1m 26s	remaining: 35.2s
355:	learn: 748.4114932	total: 1m 26s	remaining: 34.9s
356:	learn: 748.4104858	total: 1m 26s	remaining: 34.6s
357:	learn: 747.8627869	total: 1m 26s	remaining: 34.4s
358:	learn: 747.3041333	total: 1m 26s	remaining: 34.1s
359:	learn: 747.2448037	total: 1m 27s	remaining: 33.9s
360:	learn: 745.7153666	total: 1m 27s	remaining: 33.6s
361:	learn: 744.5484235	total: 1m 27s	remaining: 33.4s
362:	learn: 744.2973117	total: 1m 27s	remaining: 33.1s
363:	learn: 743.6131041	total: 1m 27s	remaining: 32.9s
364:	learn: 743.3009723	total: 1m 28s	remaining: 32.6s
365:	learn: 742.5127480	total: 1m 28s	remaining: 32.4s
366:	learn: 741.3479193	total: 1m 28s	remaining: 32.1s
367:	learn: 740.6784681	total: 1m 28s	remaining: 31.9s
368:	learn: 740.1043301	total: 1m 29s	remaining: 31.6s
369:	learn: 738.5934180	total: 1m 29s	remaining: 31.4s
370:	learn: 737.6052952	total: 1m 29s	remaining: 31.2s
371:	learn: 736.7764167	total: 1m 29s	remaining: 30.9s
372:	learn: 736.1445414	total: 1m 30s	remaining: 30.7s
373:	learn: 735.5051499	total: 1m 30s	remaining: 30.5s
374:	learn: 735.2123058	total: 1m 30s	remaining: 30.2s
375:	learn: 732.6481962	total: 1m 30s	remaining: 30s
376:	learn: 732.4242743	total: 1m 31s	remaining: 29.8s
377:	learn: 731.8573875	total: 1m 31s	remaining: 29.5s
378:	learn: 731.8492460	total: 1m 31s	remaining: 29.3s
379:	learn: 731.6684530	total: 1m 31s	remaining: 29s
380:	learn: 730.8704621	total: 1m 32s	remaining: 28.8s
381:	learn: 730.1708005	total: 1m 32s	remaining: 28.6s
382:	learn: 728.5928838	total: 1m 32s	remaining: 28.3s
383:	learn: 728.5688475	total: 1m 33s	remaining: 28.1s
384:	learn: 727.1116774	total: 1m 33s	remaining: 27.9s
385:	learn: 725.5070297	total: 1m 33s	remaining: 27.6s
386:	learn: 723.9825808	total: 1m 33s	remaining: 27.4s
387:	learn: 721.9312717	total: 1m 33s	remaining: 27.1s
388:	learn: 720.9438560	total: 1m 34s	remaining: 26.9s
389:	learn: 718.8186671	total: 1m 34s	remaining: 26.6s
390:	learn: 716.3172774	total: 1m 34s	remaining: 26.4s
391:	learn: 714.4640373	total: 1m 34s	remaining: 26.1s
392:	learn: 713.3978220	total: 1m 35s	remaining: 25.9s
393:	learn: 712.3299140	total: 1m 35s	remaining: 25.6s
394:	learn: 712.1237737	total: 1m 35s	remaining: 25.4s
395:	learn: 711.8743598	total: 1m 35s	remaining: 25.1s
396:	learn: 711.6651248	total: 1m 35s	remaining: 24.9s
397:	learn: 711.5190483	total: 1m 36s	remaining: 24.6s
398:	learn: 710.9531075	total: 1m 36s	remaining: 24.4s
399:	learn: 709.9624974	total: 1m 36s	remaining: 24.1s
400:	learn: 709.8192664	total: 1m 36s	remaining: 23.9s
401:	learn: 709.1007244	total: 1m 37s	remaining: 23.7s
402:	learn: 708.8538420	total: 1m 37s	remaining: 23.4s
403:	learn: 708.5390036	total: 1m 37s	remaining: 23.2s
404:	learn: 707.5683963	total: 1m 37s	remaining: 23s
405:	learn: 706.8190609	total: 1m 38s	remaining: 22.7s
406:	learn: 706.2547025	total: 1m 38s	remaining: 22.5s
407:	learn: 705.3271781	total: 1m 38s	remaining: 22.3s
408:	learn: 702.7091699	total: 1m 38s	remaining: 22s
409:	learn: 701.1335801	total: 1m 39s	remaining: 21.8s
410:	learn: 700.3531136	total: 1m 39s	remaining: 21.5s
411:	learn: 699.3498539	total: 1m 39s	remaining: 21.3s
412:	learn: 698.8914831	total: 1m 40s	remaining: 21.1s
413:	learn: 698.5826648	total: 1m 40s	remaining: 20.8s
414:	learn: 698.2652115	total: 1m 40s	remaining: 20.6s
415:	learn: 697.4302264	total: 1m 40s	remaining: 20.4s
416:	learn: 696.0684946	total: 1m 41s	remaining: 20.1s
417:	learn: 695.8898608	total: 1m 41s	remaining: 19.9s
418:	learn: 695.8591374	total: 1m 41s	remaining: 19.6s
419:	learn: 695.0255113	total: 1m 41s	remaining: 19.4s
420:	learn: 693.4513403	total: 1m 42s	remaining: 19.2s
421:	learn: 692.2110455	total: 1m 42s	remaining: 18.9s
422:	learn: 692.0232016	total: 1m 42s	remaining: 18.7s
423:	learn: 690.7677299	total: 1m 42s	remaining: 18.5s
424:	learn: 689.1396679	total: 1m 43s	remaining: 18.2s
425:	learn: 688.7648203	total: 1m 43s	remaining: 18s
426:	learn: 688.1189539	total: 1m 43s	remaining: 17.7s
427:	learn: 687.5303484	total: 1m 44s	remaining: 17.5s
428:	learn: 686.9117608	total: 1m 44s	remaining: 17.3s
429:	learn: 686.5077369	total: 1m 44s	remaining: 17s
430:	learn: 686.0521128	total: 1m 44s	remaining: 16.8s
431:	learn: 684.5345503	total: 1m 45s	remaining: 16.5s
432:	learn: 683.2911165	total: 1m 45s	remaining: 16.3s
433:	learn: 682.6764248	total: 1m 45s	remaining: 16.1s
434:	learn: 682.0506552	total: 1m 45s	remaining: 15.8s
435:	learn: 681.8755665	total: 1m 46s	remaining: 15.6s
436:	learn: 680.8694859	total: 1m 46s	remaining: 15.3s
437:	learn: 680.3207150	total: 1m 46s	remaining: 15.1s
438:	learn: 679.9425685	total: 1m 46s	remaining: 14.8s
439:	learn: 679.1157861	total: 1m 46s	remaining: 14.6s
440:	learn: 678.2798787	total: 1m 47s	remaining: 14.3s
441:	learn: 677.2959838	total: 1m 47s	remaining: 14.1s
442:	learn: 675.9050066	total: 1m 47s	remaining: 13.9s
443:	learn: 674.7484282	total: 1m 47s	remaining: 13.6s
444:	learn: 673.5700114	total: 1m 48s	remaining: 13.4s
445:	learn: 672.8481826	total: 1m 48s	remaining: 13.1s
446:	learn: 672.6932383	total: 1m 48s	remaining: 12.9s
447:	learn: 672.0764158	total: 1m 48s	remaining: 12.6s
448:	learn: 670.8348349	total: 1m 49s	remaining: 12.4s
449:	learn: 669.8428709	total: 1m 49s	remaining: 12.1s
450:	learn: 668.9224196	total: 1m 49s	remaining: 11.9s
451:	learn: 668.7162450	total: 1m 49s	remaining: 11.7s
452:	learn: 667.6379633	total: 1m 50s	remaining: 11.4s
453:	learn: 667.2526721	total: 1m 50s	remaining: 11.2s
454:	learn: 666.8589603	total: 1m 50s	remaining: 10.9s
455:	learn: 665.6129627	total: 1m 50s	remaining: 10.7s
456:	learn: 665.4283978	total: 1m 51s	remaining: 10.5s
457:	learn: 664.9031034	total: 1m 51s	remaining: 10.2s
458:	learn: 664.4429432	total: 1m 51s	remaining: 9.97s
459:	learn: 662.7554730	total: 1m 51s	remaining: 9.73s
460:	learn: 661.7949838	total: 1m 52s	remaining: 9.49s
461:	learn: 661.1719274	total: 1m 52s	remaining: 9.24s
462:	learn: 661.0261829	total: 1m 52s	remaining: 9s
463:	learn: 660.4675751	total: 1m 52s	remaining: 8.76s
464:	learn: 659.2579658	total: 1m 53s	remaining: 8.52s
465:	learn: 658.8995971	total: 1m 53s	remaining: 8.28s
466:	learn: 658.3467436	total: 1m 53s	remaining: 8.04s
467:	learn: 658.0117440	total: 1m 53s	remaining: 7.79s
468:	learn: 656.8870599	total: 1m 54s	remaining: 7.55s
469:	learn: 656.5677388	total: 1m 54s	remaining: 7.31s
470:	learn: 655.9556828	total: 1m 54s	remaining: 7.07s
471:	learn: 654.5686416	total: 1m 55s	remaining: 6.82s
472:	learn: 654.4495835	total: 1m 55s	remaining: 6.58s
473:	learn: 653.8103551	total: 1m 55s	remaining: 6.34s
474:	learn: 652.6816129	total: 1m 55s	remaining: 6.09s
475:	learn: 651.3429969	total: 1m 56s	remaining: 5.85s
476:	learn: 650.8938832	total: 1m 56s	remaining: 5.61s
477:	learn: 649.9794305	total: 1m 56s	remaining: 5.37s
478:	learn: 649.6059601	total: 1m 56s	remaining: 5.12s
479:	learn: 649.4023308	total: 1m 57s	remaining: 4.88s
480:	learn: 648.4227969	total: 1m 57s	remaining: 4.64s
481:	learn: 646.8437027	total: 1m 57s	remaining: 4.39s
482:	learn: 646.4760202	total: 1m 57s	remaining: 4.15s
483:	learn: 646.2949553	total: 1m 58s	remaining: 3.9s
484:	learn: 645.9410739	total: 1m 58s	remaining: 3.66s
485:	learn: 644.3585616	total: 1m 58s	remaining: 3.41s
486:	learn: 643.7781365	total: 1m 58s	remaining: 3.17s
487:	learn: 643.4312812	total: 1m 58s	remaining: 2.92s
488:	learn: 642.9842251	total: 1m 59s	remaining: 2.68s
489:	learn: 642.7216593	total: 1m 59s	remaining: 2.44s
490:	learn: 642.5586526	total: 1m 59s	remaining: 2.19s
491:	learn: 642.4175068	total: 1m 59s	remaining: 1.95s
492:	learn: 642.1007069	total: 2m	remaining: 1.7s
493:	learn: 641.8468232	total: 2m	remaining: 1.46s
494:	learn: 641.2739622	total: 2m	remaining: 1.22s
495:	learn: 640.4798015	total: 2m	remaining: 974ms
496:	learn: 640.1637081	total: 2m	remaining: 730ms
497:	learn: 638.4174588	total: 2m 1s	remaining: 487ms
498:	learn: 637.8842430	total: 2m 1s	remaining: 244ms
499:	learn: 637.4574442	total: 2m 1s	remaining: 0us
Out[101]:
<catboost.core.CatBoostRegressor at 0x1ff001b8630>

In [104]:
preds = model.predict(df_test) - 1;
preds[:10]


Out[104]:
array([ 190.38626,  141.29875,   62.31854,    4.08713,  291.0547 ,   36.24614,   16.04935,   76.81639,
         63.33431,   24.40927])

In [105]:
submit = make_submission(preds)

In [106]:
submit.to_csv(f'{PATH}\\Adi_catboost_with rf_feats_310818.csv', index=None)

In [ ]:

RF


In [54]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = ['RMSLE X_train', rmse(m.predict(X_train), y_train), '\n RMSLE X_valid', rmse(m.predict(X_valid), y_valid),
                '\n R**2 Train',m.score(X_train, y_train), '\n R**2 Valid', m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(['\n OOB_Score', m.oob_score_])
    print(res)

In [56]:
target = df_raw.target

In [57]:
df_raw.drop('target', axis=1,inplace=True)

In [78]:
df_raw.drop('Username', axis=1,inplace=True)
df_test.drop('Username', axis=1,inplace=True)

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(df_raw, target, test_size=0.2, random_state=42)

def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 30000
n_trn = len(df_raw)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df_raw, n_trn)
y_train, y_valid = split_vals(target, n_trn)

X_train.shape, y_train.shape, X_valid.shape


Out[79]:
((203657, 27), (203657,), (30000, 27))

In [87]:
df_raw.drop(['Reputation', 'Answers', 'Views'], axis=1, inplace=True)
df_test.drop(['Reputation', 'Answers', 'Views'], axis=1, inplace=True)

In [ ]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True, max_depth= 8)
m.fit(X_train, y_train)
print_score(m)

In [81]:
df_raw.head()


Out[81]:
Reputation Answers Views agg_count agg_answers agg_views agg_repo log_trans_Reputation log_trans_Answers log_trans_Username ... tag_0.0 tag_1.0 tag_2.0 tag_3.0 tag_4.0 tag_5.0 tag_6.0 tag_7.0 tag_8.0 tag_9.0
4 4271.0 4.0 13986.0 3.0 2.0 20598.0 4271.0 8.359838 1.609438 11.628252 ... 0 1 0 0 0 0 0 0 0 0
7 2269.0 2.0 312.0 7.0 2.0 4168.0 2269.0 7.727535 1.098612 10.908229 ... 0 0 0 0 1 0 0 0 0 0
8 111.0 2.0 53738.0 1.0 2.0 53738.0 111.0 4.718499 1.098612 12.060625 ... 0 0 0 0 1 0 0 0 0 0
15 7952.0 2.0 29191.0 4.0 3.0 28314.0 7952.0 8.981304 1.098612 11.037403 ... 0 1 0 0 0 0 0 0 0 0
16 731.0 4.0 5622.0 2.0 2.0 6730.0 731.0 6.595780 1.609438 10.681894 ... 0 0 0 1 0 0 0 0 0 0

5 rows × 27 columns


In [86]:
df_raw.columns


Out[86]:
Index(['Reputation', 'Answers', 'Views', 'agg_count', 'agg_answers',
       'agg_views', 'agg_repo', 'log_trans_Reputation', 'log_trans_Answers',
       'log_trans_Username', 'log_trans_Views', 'log_trans_agg_count',
       'log_trans_agg_answers', 'log_trans_agg_views', 'log_trans_agg_repo',
       'repo_per_Answers', 'repo_per_Views', 'tag_0.0', 'tag_1.0', 'tag_2.0',
       'tag_3.0', 'tag_4.0', 'tag_5.0', 'tag_6.0', 'tag_7.0', 'tag_8.0',
       'tag_9.0'],
      dtype='object')

In [85]:
for i in df_raw.columns:
    sns.distplot(df_raw[i])
    plt.show()



In [ ]: