In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-e31e9a9bf73f> in <module>()
----> 1 from sklearn.ensemble import RandomForestClassifier
      2 import pandas as pd
      3 import numpy as np
      4 import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'sklearn'

In [4]:
forest = RandomForestClassifier (n_estimators=10000,random_state=0,n_jobs=-1)

In [ ]:
df=pd.read_csv("CBvsBreakCBClasses.csv",names=['Gain', 'Pen', 'X', 'incGain', 'incPen', 'incX', 'incGainх[-1]', 'incPen[-1]', 'incX[-1]', 'incGain[-2]', 'incPen[-2]', 'incX[-2]','class'])
df.tail()

In [12]:
feat_labels = df.columns[:12]

In [7]:
y=df.iloc[:,12]
X=df.iloc[:,0:12]

In [8]:
pd.value_counts(y)


Out[8]:
1    226
2    103
Name: class, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1, stratify=y)

In [14]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [15]:
forest.fit(X_train_std,y_train)


Out[15]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [16]:
importances=forest.feature_importances_
indices=np.argsort(importances)[::-1]

In [20]:
plt.title('Информативность признаков')
plt.bar(range(X.shape[1]), importances[indices],color='lightblue', align='center')
plt.xticks(range(X.shape[1]),feat_labels[indices],rotation=90)
plt.xlim([-1, X.shape[1]])
plt.tight_layout ()
#plt.show()
plt.savefig('fig1.png',dpi=200)



In [21]:
indices


Out[21]:
array([ 4,  5,  6,  8,  3,  1,  2,  7,  0,  9, 11, 10], dtype=int64)

In [1]:
bestfeatures=[4,5,3]

In [22]:
forest.fit(X_train_std[:,bestfeatures],y_train)


Out[22]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [23]:
forest.score(X_train_std[:,bestfeatures],y_train)


Out[23]:
1.0

In [24]:
forest.score(X_test_std[:,bestfeatures],y_test)


Out[24]:
0.7575757575757576

In [117]:
df2=df[df['class'] == 2]
y2=df2.iloc[:,12]
X2=df2.iloc[:,0:12]
X2_std = stdsc.transform(X2)

In [118]:
forest.score(X2_std[:,bestfeatures],y2)


Out[118]:
0.7961165048543689

In [1]:
pd.read_clipboard()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-8cbad928c47b> in <module>()
----> 1 pd.read_clipboard()

NameError: name 'pd' is not defined

In [ ]: