In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from collections import defaultdict
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
np.random.seed(69572)
# plt.figure(figsize=(120,10))
In [2]:
def _load_indexed_csv(path):
    """Read a CSV file and use its 'id' column as the row index."""
    return pd.read_csv(path, index_col='id')


# NOTE(review): 'traning' in the first file name looks like a typo; it is kept
# byte-for-byte here -- confirm against the actual file on disk.
RAW_X = _load_indexed_csv('data/traning_set_values.csv')
y = _load_indexed_csv('data/training_set_labels.csv')
RAW_TEST_X = _load_indexed_csv('data/test_set_values.csv')
In [3]:
# Label encoding needs complete columns, so fill the gaps first.
# The two bool columns get False wherever the value is missing, in both the
# training and the test frame.
bool_cols = ['public_meeting', 'permit']
for frame in (RAW_X, RAW_TEST_X):
    frame[bool_cols] = frame[bool_cols].fillna(False)
In [4]:
# Columns whose missing values are replaced by the placeholder category 'Other'.
na_cols = ['funder', 'installer', 'subvillage', 'scheme_management', 'scheme_name']
for frame in (RAW_X, RAW_TEST_X):
    frame[na_cols] = frame[na_cols].fillna('Other')

# Stack training and test rows into one frame (built from copies of both).
COMPLETE_RAW_DATA = pd.concat([frame.copy() for frame in (RAW_X, RAW_TEST_X)])
In [5]:
# Sanity check: (rows, columns) of the training, test and combined frames, in that order.
for frame in (RAW_X, RAW_TEST_X, COMPLETE_RAW_DATA):
    print(frame.shape)
In [6]:
# One LabelEncoder per column, created lazily the first time a column name is looked up.
# Approach taken from:
# http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
d = defaultdict(preprocessing.LabelEncoder)


def _fit_column_encoder(column):
    """Fit the encoder registered under this column's name and return the fit() result."""
    return d[column.name].fit(column)


# Fit on the combined train + test frame so each encoder sees the values of both sets.
# As the last expression of the cell, the per-column result is also displayed.
COMPLETE_RAW_DATA.apply(_fit_column_encoder)
Out[6]:
In [7]:
# Encode every feature column to integer codes with the per-column encoders held in `d`.
X = RAW_X.apply(lambda column: d[column.name].transform(column))
# NOTE: RAW_TEST_X is overwritten with its encoded form, so this cell is not safe
# to run a second time without reloading the data first.
RAW_TEST_X = RAW_TEST_X.apply(lambda column: d[column.name].transform(column))

# Encode the target. `y` comes from read_csv as a DataFrame; flatten it to 1-D
# before handing it to LabelEncoder, which expects a 1-D array and otherwise
# emits a DataConversionWarning for a column vector.
# (assumes the labels file has exactly one column besides the 'id' index -- TODO confirm)
y_labels = np.ravel(y)
le = preprocessing.LabelEncoder().fit(y_labels)
# NOTE: `y` is rebound from the raw labels to their integer codes; `le` is kept
# so that predictions can be mapped back with inverse_transform later.
y = le.transform(y_labels)
In [8]:
# Hold out 33% of the labelled rows for validation (fixed random_state for a repeatable split).
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so it is only
# used as a fallback on installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [9]:
# Random forest with library-default hyper-parameters.
# RandomForestClassifier is imported in the notebook's first cell, so no import is needed here.
# No random_state is passed: repeatability relies on the global NumPy seed set in the first cell.
clf = RandomForestClassifier()
In [10]:
# Train the forest on the training split; the result of fit() is bound back to `clf`.
clf = clf.fit(X_train, y_train)
In [11]:
# Predicted class probabilities for the held-out split.
# NOTE(review): `pred` is not read by any later cell shown here (only by a commented-out len(pred)).
pred = clf.predict_proba(X_test)
In [12]:
# Scatter of the fitted forest's importance score for each feature, plotted
# against the feature's position in clf.feature_importances_.
# Uses the explicit fig/ax interface so that `ax` really is an Axes object, and
# labels the figure so it can be read on its own.
fig, ax = plt.subplots(figsize=(20, 10))
importances = clf.feature_importances_
ax.scatter(range(len(importances)), importances, alpha=0.6)
ax.set_xlabel('Feature index')
ax.set_ylabel('Feature importance')
ax.set_title('Random forest feature importances')
plt.show()
In [13]:
# Histogram of the fitted forest's feature-importance scores.
# Uses the explicit fig/ax interface so that `ax` really is an Axes object, and
# labels the figure so it can be read on its own.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(clf.feature_importances_, alpha=0.6)
ax.set_xlabel('Feature importance')
ax.set_ylabel('Number of features')
ax.set_title('Distribution of random forest feature importances')
plt.show()
In [14]:
# Score of the fitted forest on the held-out split (mean accuracy for scikit-learn classifiers).
# The trailing numbers are values recorded from earlier runs.
clf.score(X_test, y_test) # 0.79303132333435367 # 0.79206203448627688
Out[14]:
In [16]:
# Keep the row index of the test frame (the 'id' column chosen as index at load time);
# it becomes the 'id' column of the submission file.
test_ids = RAW_TEST_X.index
In [18]:
# Predict an encoded class for every row of the (already label-encoded) test frame.
predictions = clf.predict(RAW_TEST_X)
print(predictions.shape)

# Map the integer codes back to the original label values with the target encoder.
predictions_labels = le.inverse_transform(predictions)

# Single-column frame holding the predicted label per test row; preview the first rows.
sub = pd.DataFrame({'status_group': predictions_labels})
sub.head()
Out[18]:
In [19]:
# Put the test-set ids in front of the predictions as the 'id' column.
# DataFrame.insert raises ValueError when the column already exists, so the guard
# keeps this cell safe to re-run.
if 'id' not in sub.columns:
    sub.insert(0, 'id', test_ids)

# Write the submission without the positional row index: only 'id' and 'status_group'.
# (The earlier bare sub.reset_index() call returned a new frame that was discarded,
# so it had no effect and is omitted.)
sub.to_csv('submit.csv', index=False)
sub.head()
Out[19]: