In [71]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree
In [72]:
# Read DETAIL_INFO.csv into the `book` DataFrame that every later cell works from.
book = pd.read_csv(filepath_or_buffer="/Users/page/data/book/DETAIL_INFO.csv")
In [73]:
# Preview the first five rows of the raw table.
book.head(n=5)
Out[73]:
In [74]:
# Frequency of each distinct value in the CBS_Class column.
book.loc[:, "CBS_Class"].value_counts()
Out[74]:
In [75]:
# Non-null cell count for every column (counted down the rows).
book.count(axis=0)
Out[75]:
In [76]:
# Show the column labels of the raw table; the feature/target selections
# in the following cells pick their names from this list.
book.columns
Out[76]:
In [77]:
# Feature matrix: 23 columns of `book`, deliberately excluding the target
# column 'CBS_BK_XSCS' (which becomes `y`).
#
# .copy() makes `x` an independent frame. Later cells re-encode several of
# its text columns, and doing that on a bare selection of `book` triggers
# pandas' SettingWithCopyWarning and leaves it unclear which object is
# actually being modified.
x = book[['CBS_Class', 'CBS_MY_Order', 'CBS_MY_Rate',
'CBS_XS_MY', 'CBS_DXPZ', 'CBS_XSCS', 'CBS_CRTAX', 'Book_Class',
'CBS_BK_Average_price', 'CBS_BK_MY_Order', 'CBS_BK_MY_Rate',
'CBS_BK_XS_MY', 'CBS_BK_DXPZ', 'FXS_Class', 'User_Area',
'User_Get_Price', 'FXS_Discount', 'FXS_BK_MY_Order', 'FXS_BK_MY_Rate',
'FXS_BK_MY', 'FXS_BK_DXPZ', 'FXS_BK_XSCS', 'FXS_BK_Average_Price']].copy()
In [119]:
# Table to export: the same columns used for the feature matrix plus the
# target column 'CBS_BK_XSCS'. .copy() gives an independent frame so the
# re-encoding below neither warns nor depends on how pandas shares data
# with `book`.
new_book = book[['CBS_Class', 'CBS_MY_Order', 'CBS_MY_Rate',
'CBS_XS_MY', 'CBS_DXPZ', 'CBS_XSCS', 'CBS_CRTAX', 'Book_Class',
'CBS_BK_Average_price', 'CBS_BK_MY_Order', 'CBS_BK_MY_Rate',
'CBS_BK_XS_MY', 'CBS_BK_DXPZ','CBS_BK_XSCS', 'FXS_Class', 'User_Area',
'User_Get_Price', 'FXS_Discount', 'FXS_BK_MY_Order', 'FXS_BK_MY_Rate',
'FXS_BK_MY', 'FXS_BK_DXPZ', 'FXS_BK_XSCS', 'FXS_BK_Average_Price']].copy()

# Replace each text category with an integer code (1..k in the listed order).
# The result is assigned back to the column: calling
# new_book[col].replace(..., inplace=True) acts on a temporary Series and is
# not guaranteed to update the frame (it never does under Copy-on-Write).
new_book["User_Area"] = new_book["User_Area"].replace(
    ["三线城市为主,部分在二线", "一线城市,个别省会城市","一线和二线城市", "三线城市"],
    [1,2,3,4])
new_book["FXS_Class"] = new_book["FXS_Class"].replace(
    ["中等书店", "城市店","超大书城", "大书城","小型书店"],
    [1,2,3,4,5])
new_book["Book_Class"] = new_book["Book_Class"].replace(
    ["社科", "少儿","文艺", "生活休闲","语言","教辅教材","科技","综合图书"],
    [1,2,3,4,5,6,7,8])
new_book["CBS_Class"] = new_book["CBS_Class"].replace(
    ["中央及军队出版社", "高校出版社","城市与地方媒体出版社", "地方文艺出版社","地方少儿出版社","地方科技出版社",
     "地方古籍出版社","地方人民出版社","地方美术出版社"],
    [1,2,3,4,5,6,7,8,9])
In [79]:
# Target: the 'CBS_BK_XSCS' column, kept as a one-column DataFrame
# (list selector) rather than a Series.
y = book.loc[:, ['CBS_BK_XSCS']]
In [121]:
# Write the re-encoded table to newbook.csv, including the header row.
new_book.to_csv(path_or_buf="/Users/page/data/book/newbook.csv", header=True)
In [81]:
# Preview the first five rows of the feature frame.
x.head(n=5)
Out[81]:
In [ ]:
In [82]:
# Preview the first five rows of the target frame.
y.head(n=5)
Out[82]:
In [83]:
# Distinct User_Area values and how often each occurs.
x.loc[:, "User_Area"].value_counts()
Out[83]:
In [84]:
# Encode the User_Area text categories as the integers 1-4.
# Assign the result back to the column: x["User_Area"].replace(..., inplace=True)
# modifies a temporary Series and is not guaranteed to update `x`
# (it never does under pandas Copy-on-Write).
x["User_Area"] = x["User_Area"].replace(
    ["三线城市为主,部分在二线", "一线城市,个别省会城市","一线和二线城市", "三线城市"],
    [1,2,3,4])
In [85]:
# Encode the FXS_Class text categories as the integers 1-5.
# Assign the result back to the column: x["FXS_Class"].replace(..., inplace=True)
# modifies a temporary Series and is not guaranteed to update `x`
# (it never does under pandas Copy-on-Write).
x["FXS_Class"] = x["FXS_Class"].replace(
    ["中等书店", "城市店","超大书城", "大书城","小型书店"],
    [1,2,3,4,5])
In [86]:
# Encode the Book_Class text categories as the integers 1-8.
# Assign the result back to the column: x["Book_Class"].replace(..., inplace=True)
# modifies a temporary Series and is not guaranteed to update `x`
# (it never does under pandas Copy-on-Write).
x["Book_Class"] = x["Book_Class"].replace(
    ["社科", "少儿","文艺", "生活休闲","语言","教辅教材","科技","综合图书"],
    [1,2,3,4,5,6,7,8])
In [87]:
In [88]:
# (rows, columns) of the feature frame.
x.shape
Out[88]:
In [89]:
# Distinct Book_Class values and how often each occurs.
x.loc[:, "Book_Class"].value_counts()
Out[89]:
In [90]:
# Distinct CBS_Class values and how often each occurs.
x.loc[:, "CBS_Class"].value_counts()
Out[90]:
In [91]:
# Encode the CBS_Class text categories as the integers 1-9.
# Assign the result back to the column: x["CBS_Class"].replace(..., inplace=True)
# modifies a temporary Series and is not guaranteed to update `x`
# (it never does under pandas Copy-on-Write).
x["CBS_Class"] = x["CBS_Class"].replace(
    ["中央及军队出版社", "高校出版社","城市与地方媒体出版社", "地方文艺出版社","地方少儿出版社","地方科技出版社",
     "地方古籍出版社","地方人民出版社","地方美术出版社"],
    [1,2,3,4,5,6,7,8,9])
In [113]:
# Re-check the first five feature rows after the category columns were encoded.
x.head(n=5)
Out[113]:
In [112]:
# Column labels of the feature frame, in the order the model sees them.
x.columns
Out[112]:
In [94]:
# Fit a decision-tree classifier on the feature frame `x` and target `y`.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so equally good splits can otherwise be resolved differently
# from run to run and the fitted tree would not be reproducible.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(x, y)
In [97]:
from IPython.display import Image
In [114]:
# Export the fitted tree as Graphviz DOT text (out_file=None asks for the
# source to be returned instead of written to a file).
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    # Must be a list of feature labels; passing the DataFrame `x` itself is
    # not a valid list of names.
    feature_names=x.columns.tolist(),
    # Must hold one label per fitted class in ascending order, which is what
    # clf.classes_ stores; the DataFrame `y` is not a list of class labels.
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
In [107]:
# Look at the first five feature rows again.
x.head(n=5)
Out[107]:
In [111]:
# Predict the target class for one hand-entered sample.
# The 23 values are wrapped in a DataFrame labelled with x.columns so that
# (a) a wrong number of values fails loudly here instead of inside the model,
# and (b) the input carries the same feature names the model was fitted with.
clf.predict(pd.DataFrame(
    [[1,11,1.0934,90678033.8,2921, 2357742,0.07, 1, 34.83, 3, 1,1,1,1,1,1,1,1,1,1,1,1,1]],
    columns=x.columns,
))
In [110]:
# Predict the target class for one hand-entered sample.
# The 23 values are wrapped in a DataFrame labelled with x.columns so that
# (a) a wrong number of values fails loudly here instead of inside the model,
# and (b) the input carries the same feature names the model was fitted with.
clf.predict(pd.DataFrame(
    [[1,11,1.0934,90678033.8,2921, 2357742,0.07, 1, 34.83, 3, 1,1,1,1,1,1,1,1,1,1,1,1,1]],
    columns=x.columns,
))
Out[110]:
In [116]:
# Print the dtype of every feature column; any column still reported as
# `object` has not been fully converted to numbers.
print(x.dtypes)
In [117]:
# Print the dtype of every column of the raw table, for comparison with
# the dtypes of the feature frame.
print(book.dtypes)
In [122]:
# NumPy array view of the feature frame.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; `.values`
# is the replacement named by the deprecation notice and returns the same array.
xm = x.values
In [123]:
# Confirm what kind of object the conversion of `x` produced.
type(xm)
Out[123]:
# Display xm, the array form of `x` built above.
xm
In [125]:
# NumPy array view of the one-column target frame.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; `.values`
# is the replacement named by the deprecation notice and returns the same array.
ym = y.values
In [140]:
# Re-fit the decision-tree classifier on the feature frame `x` and target `y`.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so equally good splits can otherwise be resolved differently
# from run to run and the fitted tree would not be reproducible.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(x, y)
In [141]:
# Export the fitted tree as Graphviz DOT text (out_file=None asks for the
# source to be returned instead of written to a file).
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=x.columns.tolist(),
    # class_names needs one label per fitted class, in ascending order.
    # y.columns.tolist() is a single-element list holding the target column's
    # name, so it is indexed out of range as soon as the tree has more than
    # one class. clf.classes_ holds the actual class labels in sorted order.
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
In [137]:
# Feature column labels as a plain Python list.
list(x.columns)
Out[137]:
In [138]:
# Target column label(s) as a plain Python list.
list(y.columns)
Out[138]:
In [142]:
# (rows, columns) of the feature frame.
x.shape
Out[142]:
In [143]:
# (rows, columns) of the target frame; the row count should match x.shape[0].
y.shape
Out[143]:
In [144]:
# Shape of the feature column index, i.e. (number of feature columns,).
x.columns.shape
Out[144]:
In [145]:
# Shape of the target column index, i.e. (number of target columns,).
y.columns.shape
Out[145]:
In [146]:
# Preview the first five target rows.
y.head(n=5)
Out[146]:
In [148]:
# How many rows fall on each distinct target value.
y.loc[:, "CBS_BK_XSCS"].value_counts()
Out[148]:
In [150]:
import seaborn as sns
# Kernel-density estimate of the target distribution.
# `y` is a one-column DataFrame (it is built with a list selector), so the
# column is passed as a 1-D Series: that makes the call an unambiguous
# univariate density instead of relying on how seaborn interprets a
# DataFrame argument.
sns.kdeplot(y["CBS_BK_XSCS"], shade=True)
In [151]:
# Show the target column as a Series.
y.loc[:, "CBS_BK_XSCS"]
Out[151]:
In [152]:
import matplotlib.pyplot as plt
%matplotlib inline
In [154]:
# Histogram of the target column with matplotlib's default binning.
plt.hist(y.loc[:, "CBS_BK_XSCS"])
Out[154]:
In [ ]:
In [ ]:
# Confirm what kind of object `x` is.
type(x)
In [ ]: