In [71]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree

In [72]:
book = pd.read_csv("/Users/page/data/book/DETAIL_INFO.csv")

In [73]:
book.head()


Out[73]:
NO Year CBS_NAME CBS_Class CBS_MY_Order CBS_MY_Rate CBS_XS_MY CBS_DXPZ CBS_XSCS CBS_CRTAX ... User_Area User_Get_Price FXS_Discount FXS_BK_MY_Order FXS_BK_MY_Rate FXS_BK_MY FXS_BK_DXPZ FXS_BK_XSCS FXS_BK_Average_Price Unnamed: 27
0 1 2011 中信出版集团股份有限公司 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.07 ... 一线城市,个别省会城市 9.0 0.585 2 3.4196 15289978.1 1579 426206 35.87 NaN
1 2 2011 中信出版集团股份有限公司 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.07 ... 一线城市,个别省会城市 9.0 0.585 484 0.0000 36.0 1 2 18.00 NaN
2 3 2011 中信出版集团股份有限公司 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.08 ... 一线城市,个别省会城市 9.0 0.585 235 0.0099 15346.0 7 478 32.10 NaN
3 4 2011 中信出版集团股份有限公司 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.07 ... 一线城市,个别省会城市 9.0 0.585 3 3.1915 10890168.2 405 232732 46.79 NaN
4 5 2011 中信出版集团股份有限公司 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.08 ... 一线城市,个别省会城市 9.0 0.585 181 0.0240 59391.0 21 1022 58.11 NaN

5 rows × 28 columns


In [74]:
book["CBS_Class"].value_counts()


Out[74]:
中央及军队出版社      1819
高校出版社          546
城市与地方媒体出版社     381
地方文艺出版社        321
地方少儿出版社        312
地方科技出版社        194
地方古籍出版社        179
地方人民出版社        175
地方美术出版社        173
Name: CBS_Class, dtype: int64

In [75]:
book.count()


Out[75]:
NO                      4100
Year                    4100
CBS_NAME                4100
CBS_Class               4100
CBS_MY_Order            4100
CBS_MY_Rate             4100
CBS_XS_MY               4100
CBS_DXPZ                4100
CBS_XSCS                4100
CBS_CRTAX               4100
Book_Class              4100
CBS_BK_Average_price    4100
CBS_BK_MY_Order         4100
CBS_BK_MY_Rate          4100
CBS_BK_XS_MY            4100
CBS_BK_DXPZ             4100
CBS_BK_XSCS             4100
FXS_Class               4100
User_Area               4100
User_Get_Price          4100
FXS_Discount            4100
FXS_BK_MY_Order         4100
FXS_BK_MY_Rate          4100
FXS_BK_MY               4100
FXS_BK_DXPZ             4100
FXS_BK_XSCS             4100
FXS_BK_Average_Price    4100
Unnamed: 27                0
dtype: int64

In [76]:
book.columns


Out[76]:
Index(['NO', 'Year', 'CBS_NAME', 'CBS_Class', 'CBS_MY_Order', 'CBS_MY_Rate',
       'CBS_XS_MY', 'CBS_DXPZ', 'CBS_XSCS', 'CBS_CRTAX', 'Book_Class',
       'CBS_BK_Average_price', 'CBS_BK_MY_Order', 'CBS_BK_MY_Rate',
       'CBS_BK_XS_MY', 'CBS_BK_DXPZ', 'CBS_BK_XSCS', 'FXS_Class', 'User_Area',
       'User_Get_Price', 'FXS_Discount', 'FXS_BK_MY_Order', 'FXS_BK_MY_Rate',
       'FXS_BK_MY', 'FXS_BK_DXPZ', 'FXS_BK_XSCS', 'FXS_BK_Average_Price',
       'Unnamed: 27'],
      dtype='object')

In [77]:
x = book[['CBS_Class', 'CBS_MY_Order', 'CBS_MY_Rate',
       'CBS_XS_MY', 'CBS_DXPZ', 'CBS_XSCS', 'CBS_CRTAX', 'Book_Class',
       'CBS_BK_Average_price', 'CBS_BK_MY_Order', 'CBS_BK_MY_Rate',
       'CBS_BK_XS_MY', 'CBS_BK_DXPZ', 'FXS_Class', 'User_Area',
       'User_Get_Price', 'FXS_Discount', 'FXS_BK_MY_Order', 'FXS_BK_MY_Rate',
       'FXS_BK_MY', 'FXS_BK_DXPZ', 'FXS_BK_XSCS', 'FXS_BK_Average_Price']]

In [119]:
new_book = book[['CBS_Class', 'CBS_MY_Order', 'CBS_MY_Rate',
       'CBS_XS_MY', 'CBS_DXPZ', 'CBS_XSCS', 'CBS_CRTAX', 'Book_Class',
       'CBS_BK_Average_price', 'CBS_BK_MY_Order', 'CBS_BK_MY_Rate',
       'CBS_BK_XS_MY', 'CBS_BK_DXPZ','CBS_BK_XSCS', 'FXS_Class', 'User_Area',
       'User_Get_Price', 'FXS_Discount', 'FXS_BK_MY_Order', 'FXS_BK_MY_Rate',
       'FXS_BK_MY', 'FXS_BK_DXPZ', 'FXS_BK_XSCS', 'FXS_BK_Average_Price']]
new_book["User_Area"].replace(["三线城市为主,部分在二线", "一线城市,个别省会城市","一线和二线城市", "三线城市"],[1,2,3,4],inplace=True)
new_book["FXS_Class"].replace(["中等书店", "城市店","超大书城", "大书城","小型书店"],[1,2,3,4,5],inplace=True)
new_book["Book_Class"].replace(["社科", "少儿","文艺", "生活休闲","语言","教辅教材","科技","综合图书"],[1,2,3,4,5,6,7,8],inplace=True)
new_book["CBS_Class"].replace(["中央及军队出版社", "高校出版社","城市与地方媒体出版社", "地方文艺出版社","地方少儿出版社","地方科技出版社",
                        "地方古籍出版社","地方人民出版社","地方美术出版社"],[1,2,3,4,5,6,7,8,9],inplace=True)


/usr/local/lib/python3.5/site-packages/pandas/core/generic.py:3554: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)

In [79]:
y = book[['CBS_BK_XSCS']]

In [121]:
new_book.to_csv("/Users/page/data/book/newbook.csv",header=True)

In [81]:
x.head()


Out[81]:
CBS_Class CBS_MY_Order CBS_MY_Rate CBS_XS_MY CBS_DXPZ CBS_XSCS CBS_CRTAX Book_Class CBS_BK_Average_price CBS_BK_MY_Order ... FXS_Class User_Area User_Get_Price FXS_Discount FXS_BK_MY_Order FXS_BK_MY_Rate FXS_BK_MY FXS_BK_DXPZ FXS_BK_XSCS FXS_BK_Average_Price
0 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.07 社科 34.83 3 ... 超大书城 一线城市,个别省会城市 9.0 0.585 2 3.4196 15289978.1 1579 426206 35.87
1 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.07 教辅教材 17.75 515 ... 超大书城 一线城市,个别省会城市 9.0 0.585 484 0.0000 36.0 1 2 18.00
2 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.08 语言 32.51 258 ... 超大书城 一线城市,个别省会城市 9.0 0.585 235 0.0099 15346.0 7 478 32.10
3 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.07 文艺 47.25 3 ... 超大书城 一线城市,个别省会城市 9.0 0.585 3 3.1915 10890168.2 405 232732 46.79
4 中央及军队出版社 11 1.0934 90678033.8 2921 2357742 0.08 科技 60.70 191 ... 超大书城 一线城市,个别省会城市 9.0 0.585 181 0.0240 59391.0 21 1022 58.11

5 rows × 23 columns


In [ ]:


In [82]:
y.head()


Out[82]:
CBS_BK_XSCS
0 1428330
1 16
2 1933
3 736560
4 3451

In [83]:
x["User_Area"].value_counts()


Out[83]:
三线城市为主,部分在二线    1664
一线城市,个别省会城市      826
一线和二线城市          822
三线城市             788
Name: User_Area, dtype: int64

In [84]:
x["User_Area"].replace(["三线城市为主,部分在二线", "一线城市,个别省会城市","一线和二线城市", "三线城市"],[1,2,3,4],inplace=True)


/usr/local/lib/python3.5/site-packages/pandas/core/generic.py:3554: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)

In [85]:
x["FXS_Class"].replace(["中等书店", "城市店","超大书城", "大书城","小型书店"],[1,2,3,4,5],inplace=True)


/usr/local/lib/python3.5/site-packages/pandas/core/generic.py:3554: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)

In [86]:
x["Book_Class"].replace(["社科", "少儿","文艺", "生活休闲","语言","教辅教材","科技","综合图书"],[1,2,3,4,5,6,7,8],inplace=True)


/usr/local/lib/python3.5/site-packages/pandas/core/generic.py:3554: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)

In [87]:



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-87-89b187785776> in <module>()
----> 1 xx.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [88]:
x.shape


Out[88]:
(4100, 23)

In [89]:
x["Book_Class"].value_counts()


Out[89]:
3    575
2    575
1    575
4    559
5    556
6    555
7    526
8    179
Name: Book_Class, dtype: int64

In [90]:
x["CBS_Class"].value_counts()


Out[90]:
中央及军队出版社      1819
高校出版社          546
城市与地方媒体出版社     381
地方文艺出版社        321
地方少儿出版社        312
地方科技出版社        194
地方古籍出版社        179
地方人民出版社        175
地方美术出版社        173
Name: CBS_Class, dtype: int64

In [91]:
x["CBS_Class"].replace(["中央及军队出版社", "高校出版社","城市与地方媒体出版社", "地方文艺出版社","地方少儿出版社","地方科技出版社",
                        "地方古籍出版社","地方人民出版社","地方美术出版社"],[1,2,3,4,5,6,7,8,9],inplace=True)


/usr/local/lib/python3.5/site-packages/pandas/core/generic.py:3554: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)

In [113]:
x.head()


Out[113]:
CBS_Class CBS_MY_Order CBS_MY_Rate CBS_XS_MY CBS_DXPZ CBS_XSCS CBS_CRTAX Book_Class CBS_BK_Average_price CBS_BK_MY_Order ... FXS_Class User_Area User_Get_Price FXS_Discount FXS_BK_MY_Order FXS_BK_MY_Rate FXS_BK_MY FXS_BK_DXPZ FXS_BK_XSCS FXS_BK_Average_Price
0 1 11 1.0934 90678033.8 2921 2357742 0.07 1 34.83 3 ... 3 2 9.0 0.585 2 3.4196 15289978.1 1579 426206 35.87
1 1 11 1.0934 90678033.8 2921 2357742 0.07 6 17.75 515 ... 3 2 9.0 0.585 484 0.0000 36.0 1 2 18.00
2 1 11 1.0934 90678033.8 2921 2357742 0.08 5 32.51 258 ... 3 2 9.0 0.585 235 0.0099 15346.0 7 478 32.10
3 1 11 1.0934 90678033.8 2921 2357742 0.07 3 47.25 3 ... 3 2 9.0 0.585 3 3.1915 10890168.2 405 232732 46.79
4 1 11 1.0934 90678033.8 2921 2357742 0.08 7 60.70 191 ... 3 2 9.0 0.585 181 0.0240 59391.0 21 1022 58.11

5 rows × 23 columns


In [112]:
x.columns


Out[112]:
Index(['CBS_Class', 'CBS_MY_Order', 'CBS_MY_Rate', 'CBS_XS_MY', 'CBS_DXPZ',
       'CBS_XSCS', 'CBS_CRTAX', 'Book_Class', 'CBS_BK_Average_price',
       'CBS_BK_MY_Order', 'CBS_BK_MY_Rate', 'CBS_BK_XS_MY', 'CBS_BK_DXPZ',
       'FXS_Class', 'User_Area', 'User_Get_Price', 'FXS_Discount',
       'FXS_BK_MY_Order', 'FXS_BK_MY_Rate', 'FXS_BK_MY', 'FXS_BK_DXPZ',
       'FXS_BK_XSCS', 'FXS_BK_Average_Price'],
      dtype='object')

In [94]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x, y)

In [97]:
from IPython.display import Image

In [114]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=x,
                         class_names=y,  
                         filled=True, rounded=True,
                         special_characters=True)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 11

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-114-8de1aa33087e> in <module>()
      3                          class_names=y,
      4                          filled=True, rounded=True,
----> 5                          special_characters=True)

/usr/local/lib/python3.5/site-packages/sklearn/tree/export.py in export_graphviz(decision_tree, out_file, max_depth, feature_names, class_names, label, filled, leaves_parallel, impurity, node_ids, proportion, rotate, rounded, special_characters)
    431             recurse(decision_tree, 0, criterion="impurity")
    432         else:
--> 433             recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)
    434 
    435         # If required, draw leaf nodes at same depth as each other

/usr/local/lib/python3.5/site-packages/sklearn/tree/export.py in recurse(tree, node_id, criterion, parent, depth)
    319             out_file.write('%d [label=%s'
    320                            % (node_id,
--> 321                               node_to_str(tree, node_id, criterion)))
    322 
    323             if filled:

/usr/local/lib/python3.5/site-packages/sklearn/tree/export.py in node_to_str(tree, node_id, criterion)
    217             # Always write node decision criteria, except for leaves
    218             if feature_names is not None:
--> 219                 feature = feature_names[tree.feature[node_id]]
    220             else:
    221                 feature = "X%s%s%s" % (characters[1],

/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 11

In [107]:
x.head()


Out[107]:
CBS_Class CBS_MY_Order CBS_MY_Rate CBS_XS_MY CBS_DXPZ CBS_XSCS CBS_CRTAX Book_Class CBS_BK_Average_price CBS_BK_MY_Order ... FXS_Class User_Area User_Get_Price FXS_Discount FXS_BK_MY_Order FXS_BK_MY_Rate FXS_BK_MY FXS_BK_DXPZ FXS_BK_XSCS FXS_BK_Average_Price
0 1 11 1.0934 90678033.8 2921 2357742 0.07 1 34.83 3 ... 3 2 9.0 0.585 2 3.4196 15289978.1 1579 426206 35.87
1 1 11 1.0934 90678033.8 2921 2357742 0.07 6 17.75 515 ... 3 2 9.0 0.585 484 0.0000 36.0 1 2 18.00
2 1 11 1.0934 90678033.8 2921 2357742 0.08 5 32.51 258 ... 3 2 9.0 0.585 235 0.0099 15346.0 7 478 32.10
3 1 11 1.0934 90678033.8 2921 2357742 0.07 3 47.25 3 ... 3 2 9.0 0.585 3 3.1915 10890168.2 405 232732 46.79
4 1 11 1.0934 90678033.8 2921 2357742 0.08 7 60.70 191 ... 3 2 9.0 0.585 181 0.0240 59391.0 21 1022 58.11

5 rows × 23 columns


In [111]:
clf.predict([[1,11,1.0934,90678033.8,2921, 2357742,0.07, 1, 34.83, 3, 1,1,1,1,1,1,1,1,1,1,1,1,1]])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-111-e2ae81977a86> in <module>()
----> 1 clf.predict([[1,11,1.0934,90678033.8,2921, 2357742,0.07, "Book_Class", 34.83, 3, 1,1,1,1,1,1,1,1,1,1,1,1,1]])

/usr/local/lib/python3.5/site-packages/sklearn/tree/tree.py in predict(self, X, check_input)
    402         """
    403 
--> 404         X = self._validate_X_predict(X, check_input)
    405         proba = self.tree_.predict(X)
    406         n_samples = X.shape[0]

/usr/local/lib/python3.5/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
    363 
    364         if check_input:
--> 365             X = check_array(X, dtype=DTYPE, accept_sparse="csr")
    366             if issparse(X) and (X.indices.dtype != np.intc or
    367                                 X.indptr.dtype != np.intc):

/usr/local/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: could not convert string to float: 'Book_Class'

In [110]:
clf.predict([[1,11,1.0934,90678033.8,2921, 2357742,0.07, 1, 34.83, 3, 1,1,1,1,1,1,1,1,1,1,1,1,1]])


Out[110]:
array([2])

In [116]:
print(x.dtypes)


CBS_Class                 int64
CBS_MY_Order              int64
CBS_MY_Rate             float64
CBS_XS_MY               float64
CBS_DXPZ                  int64
CBS_XSCS                  int64
CBS_CRTAX               float64
Book_Class                int64
CBS_BK_Average_price    float64
CBS_BK_MY_Order           int64
CBS_BK_MY_Rate          float64
CBS_BK_XS_MY            float64
CBS_BK_DXPZ               int64
FXS_Class                 int64
User_Area                 int64
User_Get_Price          float64
FXS_Discount            float64
FXS_BK_MY_Order           int64
FXS_BK_MY_Rate          float64
FXS_BK_MY               float64
FXS_BK_DXPZ               int64
FXS_BK_XSCS               int64
FXS_BK_Average_Price    float64
dtype: object

In [117]:
print(book.dtypes)


NO                        int64
Year                      int64
CBS_NAME                 object
CBS_Class                object
CBS_MY_Order              int64
CBS_MY_Rate             float64
CBS_XS_MY               float64
CBS_DXPZ                  int64
CBS_XSCS                  int64
CBS_CRTAX               float64
Book_Class               object
CBS_BK_Average_price    float64
CBS_BK_MY_Order           int64
CBS_BK_MY_Rate          float64
CBS_BK_XS_MY            float64
CBS_BK_DXPZ               int64
CBS_BK_XSCS               int64
FXS_Class                object
User_Area                object
User_Get_Price          float64
FXS_Discount            float64
FXS_BK_MY_Order           int64
FXS_BK_MY_Rate          float64
FXS_BK_MY               float64
FXS_BK_DXPZ               int64
FXS_BK_XSCS               int64
FXS_BK_Average_Price    float64
Unnamed: 27             float64
dtype: object

In [122]:
xm = x.as_matrix()

In [123]:
type(xm)


Out[123]:
numpy.ndarray

xm


In [125]:
ym = y.as_matrix()

In [140]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x, y)

In [141]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=x.columns.tolist(),
                         class_names=y.columns.tolist(),  
                         filled=True, rounded=True,
                         special_characters=True)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-141-e1999297aabf> in <module>()
      3                          class_names=y.columns.tolist(),
      4                          filled=True, rounded=True,
----> 5                          special_characters=True)

/usr/local/lib/python3.5/site-packages/sklearn/tree/export.py in export_graphviz(decision_tree, out_file, max_depth, feature_names, class_names, label, filled, leaves_parallel, impurity, node_ids, proportion, rotate, rounded, special_characters)
    431             recurse(decision_tree, 0, criterion="impurity")
    432         else:
--> 433             recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)
    434 
    435         # If required, draw leaf nodes at same depth as each other

/usr/local/lib/python3.5/site-packages/sklearn/tree/export.py in recurse(tree, node_id, criterion, parent, depth)
    319             out_file.write('%d [label=%s'
    320                            % (node_id,
--> 321                               node_to_str(tree, node_id, criterion)))
    322 
    323             if filled:

/usr/local/lib/python3.5/site-packages/sklearn/tree/export.py in node_to_str(tree, node_id, criterion)
    284                 node_string += 'class = '
    285             if class_names is not True:
--> 286                 class_name = class_names[np.argmax(value)]
    287             else:
    288                 class_name = "y%s%s%s" % (characters[1],

IndexError: list index out of range

In [137]:
x.columns.tolist()


Out[137]:
['CBS_Class',
 'CBS_MY_Order',
 'CBS_MY_Rate',
 'CBS_XS_MY',
 'CBS_DXPZ',
 'CBS_XSCS',
 'CBS_CRTAX',
 'Book_Class',
 'CBS_BK_Average_price',
 'CBS_BK_MY_Order',
 'CBS_BK_MY_Rate',
 'CBS_BK_XS_MY',
 'CBS_BK_DXPZ',
 'FXS_Class',
 'User_Area',
 'User_Get_Price',
 'FXS_Discount',
 'FXS_BK_MY_Order',
 'FXS_BK_MY_Rate',
 'FXS_BK_MY',
 'FXS_BK_DXPZ',
 'FXS_BK_XSCS',
 'FXS_BK_Average_Price']

In [138]:
y.columns.tolist()


Out[138]:
['CBS_BK_XSCS']

In [142]:
x.shape


Out[142]:
(4100, 23)

In [143]:
y.shape


Out[143]:
(4100, 1)

In [144]:
x.columns.shape


Out[144]:
(23,)

In [145]:
y.columns.shape


Out[145]:
(1,)

In [146]:
y.head()


Out[146]:
CBS_BK_XSCS
0 1428330
1 16
2 1933
3 736560
4 3451

In [148]:
y["CBS_BK_XSCS"].value_counts()


Out[148]:
232        10
72          9
2           9
3406        9
404         9
1           8
51          7
10          6
924362      5
88774       5
1663693     5
66225       5
19116       5
8873        5
129702      5
98977       5
31382       5
100347      5
565885      5
31350       5
1886960     5
222025      5
1061463     5
589         5
488007      5
594495      5
45619       5
21035       5
1020531     5
291591      5
           ..
14          4
20          4
30          4
225         4
140         4
510         4
1379        4
44809       4
739         4
834         4
1386        4
195         4
222         4
176         4
197         4
476         4
168         4
1209        4
8           3
12          3
16          3
40          3
36          3
48          3
54          3
108         3
7           2
3           2
21          1
11          1
Name: CBS_BK_XSCS, dtype: int64

In [150]:
import seaborn as sns
sns.kdeplot(y, shade=True)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-150-267c380977ad> in <module>()
      1 import seaborn as sns
----> 2 sns.kdeplot(y, shade=True)

/usr/local/lib/python3.5/site-packages/seaborn/distributions.py in kdeplot(data, data2, shade, vertical, kernel, bw, gridsize, cut, clip, legend, cumulative, shade_lowest, ax, **kwargs)
    586         bivariate = True
    587         x = data.iloc[:, 0].values
--> 588         y = data.iloc[:, 1].values
    589     elif data2 is not None:
    590         bivariate = True

/usr/local/lib/python3.5/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1308 
   1309         if type(key) is tuple:
-> 1310             return self._getitem_tuple(key)
   1311         else:
   1312             return self._getitem_axis(key, axis=0)

/usr/local/lib/python3.5/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   1558     def _getitem_tuple(self, tup):
   1559 
-> 1560         self._has_valid_tuple(tup)
   1561         try:
   1562             return self._getitem_lowerdim(tup)

/usr/local/lib/python3.5/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
    149             if i >= self.obj.ndim:
    150                 raise IndexingError('Too many indexers')
--> 151             if not self._has_valid_type(k, i):
    152                 raise ValueError("Location based indexing can only have [%s] "
    153                                  "types" % self._valid_types)

/usr/local/lib/python3.5/site-packages/pandas/core/indexing.py in _has_valid_type(self, key, axis)
   1526             return True
   1527         elif is_integer(key):
-> 1528             return self._is_valid_integer(key, axis)
   1529         elif is_list_like_indexer(key):
   1530             return self._is_valid_list_like(key, axis)

/usr/local/lib/python3.5/site-packages/pandas/core/indexing.py in _is_valid_integer(self, key, axis)
   1540         l = len(ax)
   1541         if key >= l or key < -l:
-> 1542             raise IndexError("single positional indexer is out-of-bounds")
   1543         return True
   1544 

IndexError: single positional indexer is out-of-bounds

In [151]:
y["CBS_BK_XSCS"]


Out[151]:
0       1428330
1            16
2          1933
3        736560
4          3451
5         97266
6         90184
7       1428330
8          1933
9        736560
10         3451
11        97266
12        90184
13      1428330
14           16
15         1933
16       736560
17         3451
18        97266
19        90184
20            2
21      1428330
22           16
23         1933
24       736560
25         3451
26        97266
27        90184
28            2
29      1428330
         ...   
4070      79572
4071          2
4072    6070987
4073     299984
4074      26793
4075     222520
4076       7381
4077      91485
4078      79572
4079    6070987
4080     299984
4081      26793
4082     222520
4083       7381
4084      91485
4085      79572
4086    6070987
4087     299984
4088      26793
4089     222520
4090       7381
4091      91485
4092      79572
4093    6070987
4094     299984
4095      26793
4096     222520
4097       7381
4098      91485
4099      79572
Name: CBS_BK_XSCS, dtype: int64

In [152]:
import matplotlib.pyplot as plt
%matplotlib inline

In [154]:
plt.hist(y["CBS_BK_XSCS"])


Out[154]:
(array([ 3800.,   170.,    55.,     5.,    35.,    10.,    15.,     0.,
            0.,    10.]),
 array([  1.00000000e+00,   1.16430980e+06,   2.32861860e+06,
          3.49292740e+06,   4.65723620e+06,   5.82154500e+06,
          6.98585380e+06,   8.15016260e+06,   9.31447140e+06,
          1.04787802e+07,   1.16430890e+07]),
 <a list of 10 Patch objects>)

In [ ]:


In [ ]:
type(x)

In [ ]: