notebook.community

Edit and run

This is the first notebook file for the *Sberbank Data Science Journey*.



In [ ]:

    
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier



In [43]:

    
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [37]:

    
transactions = pd.read_csv('transactions.csv')
customers_gender = pd.read_csv('customers_gender_train.csv')



In [45]:

    
X = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['mcc_code']].unstack().value_counts()) \
                    .unstack() \
                    .fillna(0)



In [47]:

    
customers_gender = customers_gender.set_index('customer_id')



In [ ]:

    
Y_train = customers_gender.loc[X.index].gender
Y_train = Y_train.reset_index()
del Y_train['customer_id']
Y_train = Y_train.dropna(0)



In [ ]:

    
X_train = X.reset_index()
X_train = X_train.loc[Y_train.index].set_index('customer_id')



In [56]:

    
clf = GradientBoostingClassifier(random_state=13)
clf.fit(X_train, Y_train.values[:, 0]);



In [54]:

    
X_test = X.drop(customers_gender.index)
result = pd.DataFrame(X_test.index, columns=['customer_id'])
result['gender'] = clf.predict_proba(X_test)[:, 1]



In [ ]:

    
result.to_csv('baseline_a.csv', index=False)



In [ ]: