In [1]:
import pandas as pd

# import load_iris function from datasets module
from sklearn.datasets import load_iris

iris = load_iris()

type(iris)

data = pd.DataFrame(data=iris.data, columns=iris.feature_names)

data['specie'] = iris.target

print(iris.target_names)

print(data.head())

print(data.shape)


['setosa' 'versicolor' 'virginica']
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   specie  
0       0  
1       0  
2       0  
3       0  
4       0  
(150, 5)
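
The integer codes in the specie column map to iris.target_names. A minimal sketch of making that mapping explicit (specie_name is a hypothetical helper column, not used later in the notebook):

In [ ]:
# Map the integer codes to the species names (a sketch;
# specie_name is an illustrative column, not part of the flow below)
data['specie_name'] = data['specie'].map(dict(enumerate(iris.target_names)))
print(data['specie_name'].head())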

In [2]:
# convert to categorical

pd.__version__

#from pandas import Categorical

print(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']))

print(pd.Categorical(['a', 'a', 'b', 'c'], categories=['a', 'b', 'c']))

data['specie_cat'] = pd.Categorical(data['specie'], categories=[0, 1, 2])

# data['specie'].astype("category")  # note: the dtype string is "category", not "categorical"

print(data.head())

print(type(data))
# print(type(data.iloc[:, :1]))
print(type(data['specie']))
print(type(data['specie_cat']))

#species = pd.Categorical(iris.target, categories=iris.target_names, ordered=False)

##data['specie_cat'] = data['specie'].astype("category")


[a, b, c, a, b, c]
Categories (3, object): [a, b, c]
[a, a, b, c]
Categories (3, object): [a, b, c]
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   specie specie_cat  
0       0          0  
1       0          0  
2       0          0  
3       0          0  
4       0          0  
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
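
As the comments above note, the dtype string for astype is "category", not "categorical". A minimal sketch of that conversion (assuming pandas >= 0.15):

In [ ]:
# Equivalent conversion via astype (a sketch; requires pandas >= 0.15)
data['specie_cat2'] = data['specie'].astype('category')
print(data['specie_cat2'].dtype)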

In [3]:
print "Describe in numerical data:"
print(data.describe())


print "Describe in categorical data:"
print(data["specie_cat"].describe())


Describe numerical data:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.054000           3.758667   
std             0.828066          0.433594           1.764420   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      specie  
count        150.000000  150.000000  
mean           1.198667    1.000000  
std            0.763161    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  
Describe categorical data:
count     150
unique      3
top         2
freq       50
Name: specie_cat, dtype: int64
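
describe() on a categorical column reports only count/unique/top/freq; for the full per-category breakdown, value_counts() works (a sketch):

In [ ]:
# Counts per category (a sketch)
print(data['specie_cat'].value_counts())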

Converting iris species to categorical: second approach

See this link: http://www.agcross.com/2015/02/random-forests-in-python-with-scikit-learn/


In [62]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.datasets import load_iris
 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

## Defining train and test sets
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# convert to categorical data
df['specie'] = pd.Categorical.from_codes(iris.target, categories=iris.target_names)

print("Describe numerical data:")
print(df.describe())

print("Describe categorical data:")
print(df['specie'].describe())


print(df.specie[:3])

print(df.tail(7))


Describe numerical data:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.054000           3.758667   
std             0.828066          0.433594           1.764420   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)   is_train  
count        150.000000        150  
mean           1.198667  0.7933333  
std            0.763161  0.4062708  
min            0.100000      False  
25%            0.300000          1  
50%            1.300000          1  
75%            1.800000          1  
max            2.500000       True  
Describe categorical data:
count           150
unique            3
top       virginica
freq             50
Name: specie, dtype: object
0    setosa
1    setosa
2    setosa
Name: specie, dtype: category
Categories (3, object): [setosa, versicolor, virginica]
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
143                6.8               3.2                5.9               2.3   
144                6.7               3.3                5.7               2.5   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

    is_train     specie  
143     True  virginica  
144     True  virginica  
145    False  virginica  
146     True  virginica  
147     True  virginica  
148     True  virginica  
149     True  virginica  

In [44]:
# You can use scikit-learn

# STEP 1: split X and y into training and testing sets
from sklearn.cross_validation import train_test_split  # in sklearn >= 0.18 this moved to sklearn.model_selection

## random_state is the seed
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

train, test = df[df['is_train'] == True], df[df['is_train'] == False]
features = df.columns[0:4]

print(features)

print(train.head())

print(test.head())


Index([u'sepal length (cm)', u'sepal width (cm)', u'petal length (cm)',
       u'petal width (cm)'],
      dtype='object')
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
2                4.7               3.2                1.3               0.2   
4                5.0               3.6                1.4               0.2   
5                5.4               3.9                1.7               0.4   
6                4.6               3.4                1.4               0.3   

  is_train  specie  
0     True  setosa  
2     True  setosa  
4     True  setosa  
5     True  setosa  
6     True  setosa  
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
1                 4.9               3.0                1.4               0.2   
3                 4.6               3.1                1.5               0.2   
8                 4.4               2.9                1.4               0.2   
20                5.4               3.4                1.7               0.2   
29                4.7               3.2                1.6               0.2   

   is_train  specie  
1     False  setosa  
3     False  setosa  
8     False  setosa  
20    False  setosa  
29    False  setosa  

In [45]:
# Random forest

forest = RFC(n_jobs=2, n_estimators=50)

In [46]:
# y, _ = foo() decomposes the returned tuple into two distinct values, y and _.
# _ means "I don't need that value".
y, _ = pd.factorize(train['specie'])

print(y)

# Fit the model
forest.fit(train[features], y)

preds = iris.target_names[forest.predict(test[features])]

print(pd.crosstab(index=test['specie'], columns=preds, rownames=['actual'], colnames=['preds']))


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2]
preds       setosa  versicolor  virginica
actual                                   
setosa          10           0          0
versicolor       0          15          1
virginica        0           1          9
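
pd.factorize returns a tuple (codes, uniques), with the integer codes assigned in order of first appearance; a minimal sketch:

In [ ]:
# codes follow order of first appearance; uniques holds the distinct values
codes, uniques = pd.factorize(['b', 'a', 'b', 'c'])
print(codes)    # [0 1 0 2]
print(uniques)  # ['b' 'a' 'c']

Here the codes happen to line up with iris.target_names only because the training rows are sorted by species.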

In [47]:
## Just to check whether it also works without factorizing the labels
yy = train.specie

forest = RFC(n_jobs=2, n_estimators=50)

# Fit the model
forest.fit(train[features], yy)

preds = forest.predict(test[features])

print(pd.crosstab(index=test['specie'], columns=preds, rownames=['actual'], colnames=['preds']))


preds       setosa  versicolor  virginica
actual                                   
setosa          10           0          0
versicolor       0          15          1
virginica        0           1          9
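
The crosstab can be collapsed into a single accuracy number; a minimal sketch using the preds from the cell above:

In [ ]:
# Fraction of correct predictions (a sketch)
print((np.asarray(test['specie']) == preds).mean())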

In [48]:
## Find important variables

importances = forest.feature_importances_
indices = np.argsort(importances)

# allow plots to appear within the notebook
%matplotlib inline

plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')


Out[48]:
<matplotlib.text.Text at 0x7f26d9c7b990>
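
The same ranking as text, most important first (a sketch reusing the importances and indices computed above):

In [ ]:
# Print features from most to least important (a sketch)
for i in indices[::-1]:
    print("%s: %.3f" % (features[i], importances[i]))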

Testing a categorical variable in X

See this link: http://scikit-learn.org/stable/modules/preprocessing.html

Understanding OneHotEncoder


In [87]:
from sklearn import preprocessing

enc = preprocessing.OneHotEncoder()
X = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]

print(pd.DataFrame(X))

# fit finds 2, 3 and 4 distinct values in the three columns,
# so the encoded output has 2 + 3 + 4 = 9 binary columns
enc.fit(X)

enc.transform([[0, 1, 3]]).toarray()


   0  1  2
0  0  0  3
1  1  1  0
2  0  2  1
3  1  0  2
Out[87]:
array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])

In [98]:
X = [[1], [0], [1]]
print(pd.DataFrame(X))

# a single column with 2 distinct values encodes to 2 binary columns
enc.fit(X)

enc.transform(X).toarray()


   0
0  1
1  0
2  1
Out[98]:
array([[ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.]])
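
An alternative that keeps readable column names is pandas' get_dummies, which builds one 0/1 indicator column per category (a sketch):

In [ ]:
# pandas alternative to OneHotEncoder (a sketch)
print(pd.get_dummies(pd.Series(['a', 'b', 'a', 'c'])))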

Try this GitHub code; the author appears to have worked with categorical date and time variables:

See https://github.com/Autodidact24/Bike-sharing-demand



In [78]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)

## Defining train and test sets
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# convert to categorical data
df['specie'] = pd.Categorical.from_codes(iris.target, categories=iris.target_names)

# replace is_train with a fake 4-level categorical feature
df['is_train'] = np.random.randint(0, 4, size=len(df))
df['is_train'] = pd.Categorical.from_codes(df['is_train'], categories=['c1', 'c2', 'c3', 'c4'])

print(df.is_train.describe())

print(df.is_train.head())

# Redefining X
features = df.columns[[0, 1, 2, 3, 4]]
print(features)

from sklearn.cross_validation import train_test_split

## random_state is the seed
X_train, X_test, y_train, y_test = train_test_split(df[features], df.specie, test_size=0.4, random_state=4)

print(X_train.head())
print(y_train.describe())


count     150
unique      4
top        c3
freq       48
Name: is_train, dtype: object
0    c1
1    c4
2    c1
3    c3
4    c1
Name: is_train, dtype: category
Categories (4, object): [c1, c2, c3, c4]
Index([u'sepal length (cm)', u'sepal width (cm)', u'petal length (cm)',
       u'petal width (cm)', u'is_train'],
      dtype='object')
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
6                  4.6               3.4                1.4               0.3   
88                 5.6               3.0                4.1               1.3   
39                 5.1               3.4                1.5               0.2   
74                 6.4               2.9                4.3               1.3   
112                6.8               3.0                5.5               2.1   

    is_train  
6         c2  
88        c2  
39        c3  
74        c4  
112       c3  
            counts     freqs
categories                  
setosa          25  0.277778
versicolor      33  0.366667
virginica       32  0.355556

In [79]:
## Running random forest

forest = RFC(n_jobs=2, n_estimators=50)

# Fit the model (this fails: the categorical column holds strings)
forest.fit(X_train, y_train)

#preds = forest.predict(test[features])


## Running linear regression


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-79-ffca1fd14c30> in <module>()
      4 
      5 # Compute the models
----> 6 forest.fit(X_train, y_train)
      7 
      8 #preds = forest.predict(test[features])

/usr/local/lib/python2.7/dist-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight)
    193         """
    194         # Validate or convert input data
--> 195         X = check_array(X, dtype=DTYPE, accept_sparse="csc")
    196         if issparse(X):
    197             # Pre-sort indices to avoid that each individual tree of the

/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features)
    342             else:
    343                 dtype = None
--> 344         array = np.array(array, dtype=dtype, order=order, copy=copy)
    345         # make sure we actually converted to numeric:
    346         if dtype_numeric and array.dtype.kind == "O":

ValueError: could not convert string to float: c4
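
The traceback shows that RandomForestClassifier requires numeric inputs, so the string categories ('c1'..'c4') must be encoded first. A minimal sketch of one fix, expanding is_train into indicator columns with pd.get_dummies and redoing the split:

In [ ]:
# Replace the string categories with 0/1 indicator columns (a sketch)
X = pd.concat([df[features[0:4]],
               pd.get_dummies(df['is_train'], prefix='is_train')], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, df.specie, test_size=0.4, random_state=4)

forest = RFC(n_jobs=2, n_estimators=50)
forest.fit(X_train, y_train)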