This is the first notebook file for the *Sberbank Data Science Journey*.
In [ ]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
In [43]:
from IPython.core.interactiveshell import InteractiveShell
# Make IPython display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
In [37]:
# Load the two input tables from the working directory: the transaction
# records and the training file with customers' gender labels.
transactions, customers_gender = (
    pd.read_csv(csv_path)
    for csv_path in ('transactions.csv', 'customers_gender_train.csv')
)
In [45]:
# Build the customer x MCC-code feature matrix: one row per customer_id, one
# column per distinct mcc_code, each cell holding how many of that customer's
# transactions carry that code (0 where the customer never used the code).
#
# Counting with groupby(...).size() replaces a per-customer
# apply(lambda ...: value_counts()): it avoids running Python code for every
# group, and it always yields a Series to unstack. groupby.apply returns a
# DataFrame instead when every group produces the same index, in which case
# the following .unstack() would give a result of the wrong shape.
X = (
    transactions
    .groupby(['customer_id', 'mcc_code'])
    .size()
    .unstack(fill_value=0)
    # Keep the float dtype and unnamed column axis that the previous
    # unstack().fillna(0) formulation produced.
    .astype(float)
    .rename_axis(columns=None)
)
In [47]:
# Index the label table by customer_id so labels can be selected by customer.
# NOTE: not re-runnable as is -- set_index consumes the 'customer_id' column,
# so executing this cell a second time raises KeyError.
customers_gender = customers_gender.set_index('customer_id')
In [ ]:
# Training labels, aligned row-for-row with the feature matrix X.
# X also holds customers that are absent from customers_gender (they are
# scored later as the test set). reindex fills those with NaN, whereas
# .loc[X.index] raises KeyError for missing labels on pandas >= 1.0.
# Requires customers_gender to be indexed by customer_id.
Y_train = customers_gender['gender'].reindex(X.index)

# Replace the customer_id index with positions 0..n-1 and keep a one-column
# DataFrame: later cells use Y_train.index as row positions into X and read
# the labels with Y_train.values[:, 0].
Y_train = Y_train.reset_index(drop=True).to_frame()

# Drop customers without a label. The axis is left at its default (rows):
# DataFrame.dropna accepts keyword arguments only on pandas 2.x, so the
# positional form dropna(0) raises TypeError there.
Y_train = Y_train.dropna()
In [ ]:
# Feature rows of the labelled customers only. Y_train carries a positional
# index, so X is temporarily given a 0..n-1 index, filtered by the positions
# that survived in Y_train, and then re-indexed by customer_id.
X_train = (
    X.reset_index()
    .loc[Y_train.index]
    .set_index('customer_id')
)
In [56]:
# Train the baseline model: a gradient-boosting classifier with a fixed seed,
# fitted on the labelled feature rows and the single label column of Y_train.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = GradientBoostingClassifier(random_state=13).fit(
    X_train,
    Y_train.values[:, 0],
)
In [54]:
# Test set: every customer in X that has no entry in the label table.
# Requires customers_gender to be indexed by customer_id.
# errors='ignore' keeps this working when some labelled customer has no row
# in X; a plain drop() raises KeyError for labels it cannot find.
X_test = X.drop(customers_gender.index, errors='ignore')

# One output row per test customer.
result = pd.DataFrame(X_test.index, columns=['customer_id'])

# predict_proba orders its columns like clf.classes_, so column 1 is the
# probability of the second class (presumably gender == 1 -- confirm against
# the label encoding).
result['gender'] = clf.predict_proba(X_test)[:, 1]
In [ ]:
# Write the predictions to baseline_a.csv, leaving out the DataFrame's row index.
result.to_csv('baseline_a.csv', index=False)
In [ ]: