In [1]:
import pandas as pd
import numpy as np
In [2]:
# Load the training set; the path is relative to the notebook's working directory.
train_data = pd.read_csv(filepath_or_buffer='../data/train_data.csv')
In [3]:
# Peek at the first two rows to check that the file parsed as expected.
train_data.head(n=2)
Out[3]:
In [4]:
# Derive 'n_records' as the length of each 'event_id_list' entry.
# NOTE(review): the column comes straight from read_csv with no converter, so its
# entries are presumably strings and len() would count characters rather than events —
# confirm the column format before relying on this feature.
train_data.loc[:, 'n_records'] = train_data.loc[:, 'event_id_list'].map(len)
In [5]:
# Keep only the identifier, the derived record count and the target label.
kept_columns = ['building_id', 'n_records', 'blighted']
train_data = train_data.loc[:, kept_columns]
In [6]:
# Show the first four rows of the reduced frame.
train_data.head(n=4)
Out[6]:
In [7]:
import matplotlib.pyplot as plt
In [10]:
%matplotlib inline
In [11]:
# Histogram of the raw 'n_records' values in 30 bins.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(
    train_data['n_records'].to_numpy(),
    bins=30,
    facecolor='blue',
    alpha=0.75,
)
ax.set_xlabel('Number of Instances')
ax.set_ylabel('Counts of Buildings')
ax.set_title('Distribution of number of instances')
plt.show()
In [12]:
# Box plot of 'n_records'; the y-axis is limited to 0-35, so anything above 35 is off-screen.
fig, ax = plt.subplots()
ax.boxplot(train_data['n_records'].to_numpy())
ax.set_ylim(0, 35)
plt.show()
The distribution of the feature 'n_records' is severely right-skewed.
In [13]:
# Apply a natural-log transform to the record counts and store it as a new column.
train_data.loc[:, 'log_n_records'] = np.log(train_data['n_records'].to_numpy())
In [14]:
# Histogram of the log-transformed record counts in 15 bins.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(
    train_data['log_n_records'].to_numpy(),
    bins=15,
    facecolor='blue',
    alpha=0.75,
)
ax.set_xlabel('log(n_records)')
ax.set_ylabel('Counts of Buildings')
ax.set_title('Distribution of number of instances')
plt.show()
In [15]:
# Box plot of the log-transformed record counts.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.boxplot(train_data['log_n_records'].values)
# Matplotlib replaced the axis-specific `nonposy` keyword with `nonpositive`; the old
# spelling was deprecated and later removed, so it raises on current releases.
# NOTE(review): the plotted values are already logarithms, so this log axis displays them
# on a log-of-log scale, and the limits below hide values under 1 — confirm that is intended.
ax.set_yscale("log", nonpositive='clip')
ax.set_ylim(1, 6)
plt.show()
The log-transformed distribution is still right-skewed.
In [16]:
# Two side-by-side panels: the label plotted against the raw and the log-transformed feature.
# The low alpha lets overlapping markers show up as darker regions.
fig, (ax1, ax2) = plt.subplots(1, 2)
labels = train_data['blighted'].to_numpy()

ax1.plot(train_data['n_records'].to_numpy(), labels, 'ro', alpha=0.1)
ax1.set_ylim(-0.2, 1.2)
ax1.set_title("n_records vs blighted")

ax2.plot(train_data['log_n_records'].to_numpy(), labels, 'bo', alpha=0.1)
ax2.set_ylim(-0.2, 1.2)
ax2.set_title("log(n_records) vs blighted")

plt.show()
In [ ]:
In [17]:
from sklearn.svm import SVC
In [24]:
# RBF-kernel SVM with per-class weights that favour class 1 (imbalanced data).
# probability=True makes fit() run an internal cross-validation whose data shuffling is
# random; fixing random_state keeps the probability estimates reproducible across runs.
model = SVC(
    kernel='rbf',
    class_weight={1: 0.95, 0: 0.05},
    probability=True,
    random_state=0,
)
In [25]:
# Single-feature design matrix reshaped to (n_samples, 1), plus the target column.
X = train_data['n_records'].to_numpy().reshape(-1, 1)
y = train_data['blighted']
In [26]:
# Fit on every row of train_data; the returned estimator is the cell's displayed output.
model.fit(X=X, y=y)
Out[26]:
In [27]:
# In-sample predictions: X is the same data the model was fitted on, so these
# do not measure how well the model generalises.
y_fit = model.predict(X=X)
In [28]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the fitted labels (rows: true classes, columns: predicted classes).
print(confusion_matrix(y_true=y, y_pred=y_fit))
In [ ]: