In [1]:
import pandas as pd
# load_iris returns a Bunch exposing .data, .target, .feature_names, .target_names
from sklearn.datasets import load_iris

iris = load_iris()
# Build a DataFrame from the feature matrix and attach the integer target.
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
data['specie'] = iris.target
print(iris.target_names)
print(data.head())
print(data.shape)
In [2]:
# Convert the integer target to a pandas Categorical dtype.
print(pd.__version__)
# Without `categories`, Categorical infers the category set from the data;
# with `categories`, the order and the full set are fixed explicitly.
print(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']))
print(pd.Categorical(['a', 'a', 'b', 'c'], categories=['a', 'b', 'c']))
data['specie_cat'] = pd.Categorical(data['specie'], categories=[0, 1, 2])
print(data.head())
print(type(data))
# 'specie' stays a plain int Series; 'specie_cat' is category-dtyped.
print(type(data['specie']))
print(type(data['specie_cat']))
In [3]:
# .describe() reports count/mean/std/quantiles for numeric columns, but
# count/unique/top/freq for a categorical column.
print("Describe in numerical data:")
print(data.describe())
print("Describe in categorical data:")
print(data["specie_cat"].describe())
Reference: "Random forests in Python with scikit-learn" — http://www.agcross.com/2015/02/random-forests-in-python-with-scikit-learn/
In [62]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Define the train/test split: flag ~75% of the rows as training data.
np.random.seed(42)  # fixed seed so the split (and results) are reproducible
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# Map the integer target codes to readable species names as a categorical column.
df['specie'] = pd.Categorical.from_codes(iris.target, categories=iris.target_names)

print("Describe in numerical data:")
print(df.describe())
print("Describe in categorical data:")
print(df['specie'].describe())
print(df.specie[:3])
print(df.tail(7))
In [44]:
# STEP 1: split into training and testing sets using the boolean flag.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; the
# train_test_split helper now lives in sklearn.model_selection, e.g.:
#   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
# (random_state is the seed)
from sklearn.model_selection import train_test_split

train, test = df[df['is_train']], df[~df['is_train']]
features = df.columns[0:4]
print(features)
print(train.head())
print(test.head())
In [45]:
# Random forest: 50 trees, fit with 2 parallel jobs; fixed random_state
# so the trained model (and predictions below) are reproducible.
forest = RFC(n_jobs=2, n_estimators=50, random_state=42)
In [46]:
# pd.factorize returns (codes, uniques); `_` discards the uniques because
# only the integer labels are needed for fitting.
y, _ = pd.factorize(train['specie'])
print(y)
# Fit on the training features, then map predicted codes back to names.
forest.fit(train[features], y)
preds = iris.target_names[forest.predict(test[features])]
# Confusion matrix: actual species (rows) vs. predicted species (columns).
print(pd.crosstab(index=test['specie'], columns=preds, rownames=['actual'], colnames=['preds']))
In [47]:
# Sanity check: fit directly on the categorical labels instead of the
# factorized integer codes — scikit-learn accepts string/categorical targets.
yy = train.specie
forest = RFC(n_jobs=2, n_estimators=50, random_state=42)
forest.fit(train[features], yy)
preds = forest.predict(test[features])
print(pd.crosstab(index=test['specie'], columns=preds, rownames=['actual'], colnames=['preds']))
In [48]:
## Find importnat variables
importances = forest.feature_importances_
indices = np.argsort(importances)
# allow plots to appear within the notebook
%matplotlib inline
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
Out[48]:
Reference: scikit-learn preprocessing guide — http://scikit-learn.org/stable/modules/preprocessing.html
In [87]:
from sklearn import preprocessing

# OneHotEncoder learns the categories per column during fit() and expands
# each categorical column into one binary indicator column per category.
enc = preprocessing.OneHotEncoder()
X = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
print(pd.DataFrame(X))
enc.fit(X)
# Last expression is the cell output: the encoded row as a dense array.
enc.transform([[0, 1, 3]]).toarray()
Out[87]:
In [98]:
# Re-fit the encoder on a single binary column; transform yields one
# indicator column per category observed in fit (here: 0 and 1).
X = [[1], [0], [1]]
print(pd.DataFrame(X))
enc.fit(X)
enc.transform(X).toarray()
Out[98]:
In [ ]:
In [78]:
# Rebuild the frame, this time adding a synthetic 4-level categorical
# feature to demonstrate mixing numeric and categorical columns.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['specie'] = pd.Categorical.from_codes(iris.target, categories=iris.target_names)

# Synthetic categorical column with levels c1..c4 (drawn uniformly).
# (The earlier boolean train-flag assignment was dead code: it was
# immediately overwritten, so it has been removed.)
df['is_train'] = np.random.randint(0, 4, size=len(df))
df['is_train'] = pd.Categorical.from_codes(df['is_train'], categories=['c1', 'c2', 'c3', 'c4'])
print(df.is_train.describe())
print(df.is_train.head())

# Redefine X to include the categorical column (position 4).
features = df.columns[[0, 1, 2, 3, 4]]
print(features)

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# random_state is the seed, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df[features], df.specie, test_size=0.4, random_state=4)
print(X_train.head())
print(y_train.describe())
In [79]:
## Running random forest on the mixed numeric + categorical feature frame.
# NOTE(review): X_train includes the string-categorical 'is_train' column;
# scikit-learn estimators expect numeric input, so this fit presumably needs
# that column one-hot encoded first (see the OneHotEncoder cells) — confirm.
forest = RFC(n_jobs=2,n_estimators=50)
# Compute the models
forest.fit(X_train, y_train)
#preds = forest.predict(test[features])
## Running linear regression