In [1]:
# Contents:
# 1. Fill missing values by interpolation (depends on scipy)
# 2. Normalize the data (simple vector arithmetic)
# 3. Discretize the data and visualize the results
# 4. Reduce dimensionality with principal component analysis (scikit-learn)
In [2]:
# Fill missing values with Lagrange interpolation
import pandas as pd
import numpy as np
from scipy.interpolate import lagrange
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/catering_sale.xls')
# Treat sales outside [400, 5000] as outliers and mark them missing
data.loc[(data['销量'] < 400) | (data['销量'] > 5000), '销量'] = np.nan
# Interpolation function: by default use the k = 5 values before and after position n
def ployinterp_column(s, n, k=5):
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]  # take the surrounding window
    y = y[y.notnull()]  # drop values that are themselves missing
    return lagrange(y.index, list(y))(n)  # fit a Lagrange polynomial and evaluate it at n
for i in data.columns:
    for j in range(len(data)):
        if data[i].isnull()[j]:
            data.loc[j, i] = ployinterp_column(data[i], j)
data.describe()
Out[2]:
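One caveat with ployinterp_column as written: near the ends of the series, range(n - k, n) and range(n + 1, n + 1 + k) step outside the valid index range, which raises a KeyError on recent pandas. A minimal boundary-clipped variant (my own sketch, not from the source):

def ployinterp_column_safe(s, n, k=5):
    # clip the window to [0, len(s)) instead of letting indices run off either end
    lo, hi = max(0, n - k), min(len(s), n + 1 + k)
    y = s[list(range(lo, n)) + list(range(n + 1, hi))]
    y = y[y.notnull()]  # drop values that are themselves missing
    return lagrange(y.index, list(y))(n)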
In [3]:
import numpy as np
# Data normalization
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/normalization_data.xls')
# Min-max scaling: rescale each column to [0, 1]
(data - data.min()) / (data.max() - data.min())
# Zero-mean (z-score) standardization: mean 0, standard deviation 1 per column
(data - data.mean()) / data.std()
# Decimal scaling: divide by a power of 10 so every value falls in [-1, 1]
data / 10**np.ceil(np.log10(data.abs().max()))
Out[3]:
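A quick self-contained check of the three transforms on toy data (my own addition): min-max maps each column into [0, 1], z-score standardization gives mean 0 and standard deviation 1, and decimal scaling lands every value in [-1, 1].

toy = pd.DataFrame({'a': [1.0, 50.0, 99.0], 'b': [-300.0, 0.0, 700.0]})
print((toy - toy.min()) / (toy.max() - toy.min()))     # each column now spans [0, 1]
print((toy - toy.mean()) / toy.std())                  # mean 0, std 1 per column
print(toy / 10**np.ceil(np.log10(toy.abs().max())))    # divided by 100 and 1000 respectively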
In [4]:
# Discretization of a continuous attribute
import pandas as pd
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/discretization_data.xls')
data = data['肝气郁结证型系数'].copy()
# Equal-width binning: 4 bins spanning equal value ranges
d1 = pd.cut(data, 4, labels=range(4))
# Equal-frequency binning: use quantiles as the bin edges
k = 4
w = [1.0 * i / k for i in range(k + 1)]
w = data.describe(percentiles=w)[4:4+k+1]  # the requested percentiles sit at positions 4..8 of describe()
d2 = pd.cut(data, w, labels=range(k), include_lowest=True)  # include_lowest keeps the minimum in the first bin
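pandas also ships pd.qcut, which performs equal-frequency binning in one call; up to edge handling it should reproduce d2 (a sketch, not from the source):

d2_alt = pd.qcut(data, k, labels=range(k))  # quantile-based bins, replacing describe() + cut()
print(d2.value_counts().sort_index())
print(d2_alt.value_counts().sort_index())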
In [5]:
# Binning via k-means clustering
from sklearn.cluster import KMeans
kmodel = KMeans(n_clusters=4)  # build the model (the n_jobs argument was removed from KMeans in scikit-learn 1.0)
kmodel.fit(data.values.reshape(-1, 1))  # KMeans expects a 2-D array, so reshape the Series into one column
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)  # sort the cluster centers
w = c.rolling(window=2).mean().iloc[1:]  # midpoints of adjacent centers become the bin boundaries
w = [0] + list(w[0]) + [data.max()]  # prepend 0 and append the maximum as the outer edges
d3 = pd.cut(data, w, labels=range(4))
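As a sanity check (my own addition), compare the bin occupancies of the three schemes; equal-frequency bins should be close to uniform, while equal-width and k-means bins typically are not:

counts = pd.DataFrame({'equal_width': d1.value_counts().sort_index(),
                       'equal_freq': d2.value_counts().sort_index(),
                       'kmeans': d3.value_counts().sort_index()})
print(counts)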
In [6]:
# Visualize the binning results
def cluster_plot(d, k):
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # a font that can render the Chinese column name
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly
    plt.figure(figsize=(8, 3))
    for j in range(k):
        plt.plot(data[d == j], [j for i in d[d == j]], 'o')  # one horizontal strip of points per bin
    plt.ylim(-0.5, k - 0.5)
    return plt
cluster_plot(d1, k).show()
cluster_plot(d2, k).show()
cluster_plot(d3, k).show()
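Note that SimHei is often missing on Linux, in which case matplotlib silently drops the Chinese axis labels. If that happens, point rcParams at any installed CJK font (the font name below is an assumption; check fc-list for what your system actually has):

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK SC']  # assumed font; substitute one present on your machine
plt.rcParams['axes.unicode_minus'] = False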
In [7]:
# Principal component analysis
# sklearn.decomposition.PCA(n_components=None, copy=True, whiten=False)
# Parameters:
#   n_components: number of components to keep
#   copy: True by default; if False, PCA operates on the original data in place
#   whiten: whitening, rescales each component to unit variance
import pandas as pd
from sklearn.decomposition import PCA
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/principal_component.xls')
pca = PCA()
pca.fit(data)
pca.components_  # the principal axes, one component per row
pca.explained_variance_ratio_.cumsum()  # cumulative variance ratio; here three components are enough
pca = PCA(3)
pca.fit(data)
low_d = pca.transform(data)  # project the data onto the three components
pd.DataFrame(low_d)  # the three principal components
# pca.inverse_transform(low_d)  # map the reduced data back to the original space
Out[7]:
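Rather than reading the cumulative ratio by eye, PCA also accepts a float n_components in (0, 1) and keeps just enough components to reach that fraction of explained variance; a short sketch (the 0.95 threshold is my own choice):

pca95 = PCA(n_components=0.95)       # keep enough components for 95% of the variance
low_d95 = pca95.fit_transform(data)
print(pca95.n_components_, low_d95.shape)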