In [1]:
import sys
import imp
import yaml
import csv
import pandas as pd
import re
import numpy as np
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
In [2]:
def label_percent(v):
    """Print the fraction of positive, negative and zero entries of ``v``.

    Parameters
    ----------
    v : sized iterable of numbers
        Label values (list, tuple, 1-D numpy array, ...). Must support
        ``len()`` and iteration.

    Returns
    -------
    None
        The result is printed as ``Pos:<p>; Neg:<n>; Zero:<z>;`` with each
        fraction rounded to two decimals.

    Notes
    -----
    An empty input prints all three fractions as ``0.00`` instead of raising
    ``ZeroDivisionError``.
    """
    total = len(v)
    if total == 0:
        # Nothing to count: report zeros rather than dividing by zero.
        pos = neg = zero = 0.0
    else:
        # float() forces true division even under Python 2 integer semantics.
        denom = float(total)
        pos = sum(1 for x in v if x > 0) / denom
        neg = sum(1 for x in v if x < 0) / denom
        zero = sum(1 for x in v if x == 0) / denom
    print('Pos:{0:.2f}; Neg:{1:.2f}; Zero:{2:.2f};'.format(pos, neg, zero))
In [3]:
# Read the pre-cleaned training and test splits from disk.
# NOTE(review): the files are named .tar.gz but are read as plain gzip; the
# 'train.csv' / 'test.csv' columns dropped further down are presumably
# residue of the tar header -- confirm against the export step.
df_train = pd.read_csv(
    "data/output/model_clean_data/train1.tar.gz",
    index_col=None,
    compression='gzip'
)
df_test = pd.read_csv(
    "data/output/model_clean_data/test1.tar.gz",
    index_col=None,
    compression='gzip'
)
In [4]:
# Preview the first rows of the training frame
df_train.head()
Out[4]:
In [5]:
# List the column labels of the test frame
df_test.columns
Out[5]:
In [6]:
# Copy of the test frame without the 'labels', 'test.csv' and 'index' columns,
# kept only so the remaining columns can be inspected in the next cell.
x = df_test.drop(['labels', 'test.csv', 'index'], axis=1)
In [7]:
# Preview the first rows of the reduced test frame
x.head()
Out[7]:
In [8]:
# Build the numpy inputs for the model.
# Feature matrices: each frame minus the label column and the listed
# non-feature columns.
X_train = np.array(df_train.drop(['labels', 'train.csv', 'index', 'Time'], axis=1))
X_test = np.array(df_test.drop(['labels', 'test.csv', 'index', 'Time'], axis=1))
# Target vectors: the 'labels' column as a flat 1-D array.
Y_train = np.array(df_train['labels'])
Y_test = np.array(df_test['labels'])
In [9]:
# Print the positive / negative / zero share of the training labels,
# then display how many training labels there are.
label_percent(Y_train)
Y_train.size
Out[9]:
In [10]:
# Print the positive / negative / zero share of the test labels,
# then display how many test labels there are.
label_percent(Y_test)
Y_test.size
Out[10]:
In [12]:
# Create the random forest classifier with all the parameters for the fit.
# random_state is pinned so that re-running the notebook on a fresh kernel
# grows the same trees and therefore prints the same report.
forest = RandomForestClassifier(n_estimators=250, max_depth=15, random_state=0)
# Fit the decision trees on the training features and their labels
forest = forest.fit(X_train, Y_train)
# Run the fitted trees on the test features to get predicted labels
output = forest.predict(X_test)
# Compare the predictions with the true test labels and print the report
classification_report1 = classification_report(y_true=Y_test, y_pred=output)
print(classification_report1)
In [23]:
# Print the labels predicted for the test set
print(output)
In [24]:
# Print the positive / negative / zero share of the predicted labels
label_percent(output)
In [13]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; the splitter now lives in sklearn.model_selection.
from sklearn.model_selection import StratifiedShuffleSplit

# Toy data: four samples, two per class
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
# In the model_selection API the splitter is configured without the labels;
# they are passed later to split() / get_n_splits().
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
# The new splitter has no len(); get_n_splits() reports the number of
# re-shuffling iterations instead.
sss.get_n_splits(X, y)
Out[13]:
In [14]:
# Display the splitter object's repr as the cell output
sss
Out[14]:
In [15]:
# Print the splitter's string form
print(sss)
In [ ]: