In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [6]:
# scikit-learn imports.
# NOTE: sklearn.cross_validation was deprecated in 0.18 and REMOVED in 0.20;
# train_test_split and cross_val_score now live in sklearn.model_selection.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
In [5]:
# Display / plotting configuration for the whole notebook.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 100)

# Silence library warnings so the narrative output stays readable.
import warnings
warnings.simplefilter('ignore')

# Use matplotlib's public rcParams instead of the discouraged `pylab` shim
# (same underlying dict; plt is imported at the top of the notebook).
plt.rcParams['figure.figsize'] = (12, 8)
In [7]:
# Load the raw semicolon-separated train and test data sets.
train, test = (pd.read_csv(filename, sep=';')
               for filename in ('train.csv', 'test.csv'))
In [8]:
# Verify the correctness of the load: first rows of the training set
train.head()
Out[8]:
In [9]:
# First rows of the test set, to confirm the same schema was loaded
test.head()
Out[9]:
In [10]:
# Column dtypes and non-null counts for the training set (shows missing data)
train.info()
In [11]:
# Column dtypes and non-null counts for the test set
test.info()
In [12]:
# Define the function to fill the missing values
def replace_nan(data):
    """Fill missing values and parse the date columns of a raw frame, in place.

    Mutates ``data`` and returns None.
    """
    # Categorical columns where NaN has a documented meaning: replace it with
    # an explicit sentinel category instead of dropping rows.
    categorical_fill = {
        'START_PACK': 'Unknown',
        'OFFER_GROUP': 'Unknown',
        # 16034 of 28600 records have the value 'M', so impute the modal gender
        'GENDER': 'M',
        # per the task statement, NaN means the subscriber is not a member of
        # the loyalty programme
        'MLLS_STATE': 'No',
        # NaN most likely means the subscriber never bought equipment here
        'ASSET_TYPE_LAST': 'Not buying',
        'USAGE_AREA': 'Undefined',
    }
    for column, value in categorical_fill.items():
        data[column] = data[column].fillna(value)
    # Numeric columns: absence of data is interpreted as absence of activity
    # (and, for OBLIG_NUM, as never having used installments).
    numeric_columns = ['OBLIG_NUM',
                       'REFILL_OCT_16', 'REFILL_NOV_16',
                       'OUTGOING_OCT_16', 'OUTGOING_NOV_16',
                       'GPRS_OCT_16', 'GPRS_NOV_16',
                       'REVENUE_OCT_16', 'REVENUE_NOV_16']
    for column in numeric_columns:
        data[column] = data[column].fillna(0.0)
    # Parse the date columns. errors='coerce' turns unparseable entries into
    # NaT; the original errors='ignore' (also deprecated in pandas >= 2.0)
    # would silently leave the WHOLE column as strings if a single value
    # failed to parse, breaking the later .dt accessors in new_features().
    data['ACT_DATE'] = pd.to_datetime(data['ACT_DATE'], format='%Y-%m-%d', errors='coerce')
    data['BIRTHDAY'] = pd.to_datetime(data['BIRTHDAY'], format='%Y-%m-%d', errors='coerce')
In [13]:
# Convert BYR to BYN (10 000 old roubles = 1 new rouble)
def byr_to_byn(data):
    """Rescale the refill columns from BYR to BYN, in place."""
    for column in ('REFILL_OCT_16', 'REFILL_NOV_16'):
        data[column] = data[column] / 10000.0
In [14]:
# Create several new features
def new_features(data):
    """Derive tenure, age and Nov-Oct delta features; drop the raw date columns.

    Requires ACT_DATE and BIRTHDAY to already be datetime (see replace_nan).
    Mutates ``data`` and returns None.
    """
    # pd.Timestamp replaces pd.datetime, which was deprecated in pandas 1.x
    # and removed in pandas >= 2.0.
    reference_date = pd.Timestamp(2016, 12, 1)
    # Tenure: days from activation to 1 December 2016. The vectorized
    # .dt.days replaces the original per-row Python list comprehension.
    data['AGE_ACT'] = (reference_date - data['ACT_DATE']).dt.days
    # Day of the week (0 = Monday) on which the activation happened.
    data['WEEKDAY'] = data['ACT_DATE'].dt.dayofweek
    # Subscriber's birth year; missing values are replaced with the mean.
    data['BIRTH_YEAR'] = pd.DatetimeIndex(data['BIRTHDAY']).year
    data['BIRTH_YEAR'] = data['BIRTH_YEAR'].fillna(data['BIRTH_YEAR'].mean())
    # Subscriber's age (in years) at activation time.
    data['AGE_AB'] = pd.DatetimeIndex(data['ACT_DATE']).year - data['BIRTH_YEAR']
    # November-minus-October deltas of the activity indicators.
    data['REFIL_DELTA'] = data['REFILL_NOV_16'] - data['REFILL_OCT_16']
    data['OUTGOING_DELTA'] = data['OUTGOING_NOV_16'] - data['OUTGOING_OCT_16']
    data['GPRS_DELTA'] = data['GPRS_NOV_16'] - data['GPRS_OCT_16']
    data['REVENUE_DELTA'] = data['REVENUE_NOV_16'] - data['REVENUE_OCT_16']
    # The raw date columns are no longer needed.
    del data['BIRTHDAY']
    del data['ACT_DATE']
In [15]:
# Convert BYR to BYN in both data sets
byr_to_byn(train)
byr_to_byn(test)
In [16]:
# Process the training data: fill the NaNs, then derive the new features
replace_nan(train)
new_features(train)
In [17]:
# Process the test data with the same pipeline as the training data
replace_nan(test)
new_features(test)
In [18]:
# Re-inspect dtypes and non-null counts after cleaning and feature engineering
train.info()
Now we have the train and test data sets without missing values and with a few new features.
In [19]:
# Conversion of categorical data to integer codes.
# Fit each encoder on the UNION of train and test values: the original
# fit-on-train-only version raises "unseen labels" on transform if a category
# occurs only in the test set. When the test categories are a subset of the
# train ones, the resulting codes are identical to the original's.
categorical_columns = ['STATUS', 'TP_CURRENT', 'START_PACK', 'OFFER_GROUP', 'GENDER', 'MLLS_STATE',
                       'PORTED_IN', 'PORTED_OUT', 'OBLIG_ON_START', 'ASSET_TYPE_LAST', 'DEVICE_TYPE_BUS', 'USAGE_AREA']
le = LabelEncoder()
for n in categorical_columns:
    le.fit(pd.concat([train[n], test[n]]))
    train[n] = le.transform(train[n])
    test[n] = le.transform(test[n])
In [20]:
# Standardization of data.
# Exclude the identifier and the target by NAME rather than by positional
# index: the original `del features[0]; del features[22]` silently selects
# the wrong columns if the column order ever changes.
features = [col for col in train.columns if col not in ('USER_ID', 'ACTIVITY_DEC_16')]
# Fit the scaler once on the full 2-D feature matrix. Modern scikit-learn
# rejects the 1-D Series the original per-column loop passed to fit(), and
# StandardScaler standardizes each column independently anyway, so the
# resulting values are identical.
scaler = StandardScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])
In [21]:
# Break train into training and test set (80/20 split; the fixed random_state
# makes the split reproducible). The target is December 2016 activity.
X_train, X_test, y_train, y_test = train_test_split(train[features],
train.ACTIVITY_DEC_16,
test_size=0.20,
random_state=123)
In [82]:
# Ensemble of classifiers by Weighted Average Probabilities
# (soft voting with equal weights over three base learners).
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(random_state=42)
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1;
# 'log' only works on older versions — confirm the installed sklearn.
clf3 = SGDClassifier(loss='log', random_state=42)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('sgd', clf3)], voting='soft', weights=[1,1,1])
In [83]:
# Quality control of the model by 10-fold cross-validation, scored with ROC AUC.
models = [(clf1, 'Logistic Regression'),
          (clf2, 'Random Forest'),
          (clf3, 'SGD'),
          (eclf, 'Ensemble')]
for clf, label in models:
    scores2 = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    print("ROC AUC: %0.6f (+/- %0.6f) [%s]" % (scores2.mean(), scores2.std(), label))
On the training data, the best result is provided by an ensemble of three algorithms
In [46]:
# Fit an extra-trees forest and rank the features by importance
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# Feature indices sorted by decreasing importance.
indices = np.argsort(importances)[::-1]
# Print the feature ranking. The column-name list is hoisted out of the
# loop: the original rebuilt list(X_train.columns) on every iteration.
feature_names = list(X_train.columns)
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. %s (%f)" % (f + 1, feature_names[indices[f]], importances[indices[f]]))
# Plot the feature importances with their inter-tree standard deviation.
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()
As we can see, the most important features are STATUS, USAGE_AREA, DEVICE_TYPE_BUS and REVENUE_NOV_16.
In [22]:
# Feature names ordered by decreasing importance (same order as `indices`).
imp_features = [features[i] for i in indices]
In [23]:
# The best accuracy is obtained by using the 17 most important features.
best_features = imp_features[:17]
X_train2 = X_train[best_features]
# Re-run the cross-validated ROC AUC check on the reduced feature set.
models = [(clf1, 'Logistic Regression'),
          (clf2, 'Random Forest'),
          (clf3, 'SGD'),
          (eclf, 'Ensemble')]
for clf, label in models:
    scores2 = cross_val_score(estimator=clf, X=X_train2, y=y_train, cv=10, scoring='roc_auc')
    print("ROC AUC: %0.6f (+/- %0.6f) [%s]" % (scores2.mean(), scores2.std(), label))
In [24]:
# ROC curves on the held-out test split, one per classifier.
plot_specs = zip([clf1, clf2, clf3, eclf],
                 ['Logistic Regression', 'Random Forest', 'SGD', 'Ensemble'],
                 ['black', 'orange', 'blue', 'green'],
                 [':', '--', '-.', '-'])
for clf, label, clr, ls in plot_specs:
    clf.fit(X_train[best_features], y_train)
    # Probability of the positive class for every test-split subscriber.
    y_pred = clf.predict_proba(X_test[best_features])[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(fpr, tpr, color=clr, linestyle=ls, label='%s (auc = %0.2f)' % (label, roc_auc))
plt.legend(loc='lower right')
# Diagonal reference line = random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
The ROC AUC values obtained for the cross validation and for the test sample are the same, which indicates that the model is not overfitted and not underfitted.
In [25]:
# Fit the ensemble on the reduced feature set and score the real test data.
eclf.fit(X_train[best_features], y_train)
result_pred = eclf.predict_proba(test[best_features])
# Submission frame: subscriber id plus predicted December-activity probability.
result = pd.DataFrame(test['USER_ID'])
result['ACTIVITY_DEC_16_PROB'] = list(result_pred[:, 1])
result.to_csv('result.csv', encoding='utf8', index=None)
In [ ]: