notebook.community

Edit and run



In [3]:

    
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap

%matplotlib inline



In [4]:

    
# Notebook-wide matplotlib defaults, applied to every figure drawn below.
plt.rcParams['font.size'] = 20            # base text size
plt.rcParams['figure.figsize'] = (12, 9)  # (width, height)



In [5]:

    
# Class names. The position of a name in this tuple is the integer label
# assigned to that class when the data is loaded.
colors = (
    'red',
    'green',
    'blue',
    'yellow',
)



In [6]:

    
# Discrete colormap with one entry per class, in the same order as `colors`.
cmap = ListedColormap(colors=colors)



In [7]:

    
# Read one CSV per class ("<color>.csv", resolved against the working
# directory) and tag every row with the class's integer label and its name.
dfs = {}

for i, color in enumerate(colors):
    frame = pd.read_csv(color + '.csv')
    frame['label'] = i
    frame['label_name'] = color
    dfs[color] = frame

# Concatenate explicitly in the order of `colors` instead of relying on dict
# ordering, and renumber the rows: every frame read above carries its own row
# index, so without ignore_index=True the combined frame would contain
# duplicate index labels.
df = pd.concat([dfs[color] for color in colors], ignore_index=True)



In [8]:

    
# Colour each point by its integer class label. The colour limits run from
# -0.5 to len(colors) - 0.5 so that label i falls in the middle of the i-th
# colormap band; deriving the upper limit from `colors` keeps this true if
# classes are added or removed.
df.plot.scatter(x='g_std', y='b_std', c='label', cmap=cmap,
                vmin=-0.5, vmax=len(colors) - 0.5)









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f78d347ceb8>



In [9]:

    
# Overlay one normalised histogram per class for a single feature.
feature = 'b_min'

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Common range for all classes so every histogram uses identical bin edges.
limits = [df[feature].min(), df[feature].max()]

for color in colors:
    df.query('label_name == @color')[feature].plot.hist(
        ax=ax,           # draw on the axes created above, not on "the current axes"
        range=limits,
        bins=25,
        histtype='step',
        color=color,
        density=True,    # replaces `normed=True`, which matplotlib has removed
    )

# Label once, after plotting, so nothing drawn in the loop can override it.
ax.set_xlabel(feature);



In [10]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib



In [11]:

    
# Every column except the two label columns is a model input.
label_columns = ['label', 'label_name']
df_X = df.drop(columns=label_columns)

# Plain NumPy arrays for scikit-learn: feature matrix and integer targets.
X, y = df_X.values, df['label'].values



In [12]:

    
# Split into train and test partitions using train_test_split's default
# test fraction. The fixed random_state makes the split, and therefore every
# score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



In [13]:

    
# Sanity check: shapes of the train/test features and targets, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))









    Out[13]:





((202, 69), (68, 69), (202,), (68,))



In [19]:

    
# 50-tree random forest. The fixed random_state makes training deterministic,
# so re-running the notebook yields the same forest and feature importances.
model = RandomForestClassifier(n_estimators=50, random_state=0)

# Extra metadata attached to the estimator object itself, so it stays with
# the model wherever the object goes: the input column names and the class
# names (index = integer label).
model.features = df_X.columns
model.labels = colors



In [20]:

    
# Train on the training partition; the trailing semicolon hides the estimator repr.
model.fit(X=X_train, y=y_train);



In [21]:

    
# Predict the held-out rows and report the fraction classified correctly.
prediction = model.predict(X_test)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, prediction)









    Out[21]:





1.0



In [23]:

    
# Serialise the estimator (including the attributes attached to it) to
# 'model.pkl' in the working directory.
joblib.dump(value=model, filename='model.pkl')









    Out[23]:





['model.pkl']



In [17]:

    
# Pair each importance reported by the forest with the name of its input column.
feature_names = df.drop(['label', 'label_name'], axis=1).columns
df_feature = pd.DataFrame({
    'importance': model.feature_importances_,
    'name': feature_names,
})



In [18]:

    
# Keep the 20 highest-importance rows, sorted ascending so the largest bar
# is drawn last (at the top of the horizontal bar chart).
df_feature = df_feature.sort_values('importance').tail(20)

# One bar position per remaining feature, shared by the bars and their tick labels.
positions = np.arange(len(df_feature))
plt.barh(positions, df_feature['importance'].values)
plt.yticks(positions, df_feature['name'].values, size=12);



In [ ]: