In [1]:
# 主要内容:
# 1、对缺失值进行差值补全(依赖scipy)
# 2、数据规范化处理(简单的向量运算)
# 3、对数据进行离散化处理,并可视化结果
# 4、通过主成分分析对数据进行降维(sk-learn)

In [2]:
# 对数据进行拉格朗日差值,补全缺失值

import pandas as pd
import numpy as np
from scipy.interpolate import lagrange

data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/catering_sale.xls')
data['销量'][(data['销量'] < 400) | (data['销量'] > 5000)] = np.nan

# 定义插值函数, 默认区前后5项进行差值

def ployinterp_column(s, n, k = 5):
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]
    y = y[y.notnull()]
    return lagrange(y.index, list(y))(n)

for i in data.columns:
    for j in range(len(data)):
        #if(data[i].isnull())[j]:  #竟然还可以这样写??
        if(data[i].isnull()[j]): 
            data[i][j] = ployinterp_column(data[i], j)

data.describe()


/home/jeff/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/jeff/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[2]:
销量
count 201.000000
mean 2360.242862
std 5568.724439
min -75744.000000
25% 2452.600000
50% 2655.900000
75% 3033.100000
max 6720.000000

In [3]:
import numpy as np

# 数据规范化
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/normalization_data.xls')

# 最大-最小规范化
(data - data.min()) / (data.max() - data.min())

# 正则化
(data - data.mean()) / data.std()

# 小数定标规范化(不明觉厉)
data / 10**np.ceil(np.log10(data.abs().max()))


Out[3]:
78 521 602 2863
0 0.144 -0.600 -0.521 0.2245
1 0.095 -0.457 0.468 -0.1283
2 0.069 0.596 0.695 0.1054
3 0.190 0.527 0.691 0.2051
4 0.101 0.403 0.470 0.2487
5 0.146 0.413 0.435 0.2571

In [4]:
# 连续属性离散化

import pandas as pd

data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/discretization_data.xls')
data = data['肝气郁结证型系数'].copy()

# 等宽离散化
d1 = pd.cut(data, 4, labels=range(4))

# 等频率离散化
k = 4
w = [1.0*i/k for i in range(k+1)]
w = data.describe(percentiles = w)[4:4+k+1]
d2 = pd.cut(data, w, labels = range(4))

In [5]:
# 通过k-means聚类进行分类

from sklearn.cluster import KMeans

kmodel = KMeans(n_clusters = 4, n_jobs = 4) # 建立模型, n_jobs为并行数
kmodel.fit(data.reshape((len(data), 1))) # 训练模型
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0) # 输出聚类中心并排序
w = c.rolling(window = 2, center = False).mean().iloc[1:] # 把相邻两项中点作为分界点
w = [0] + list(w[0]) + [data.max()] # 加上首末节点
d3 = pd.cut(data, w, labels = range(4))

In [6]:
# 可视化离散效果

def cluster_plot(d, k):
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    
    plt.figure(figsize = (8, 3))
    for j in range(k):
        plt.plot(data[d==j], [j for i in d[d==j]], 'o')
    plt.ylim(-0.5, k-0.5)
    return plt

cluster_plot(d1, k).show()
cluster_plot(d2, k).show()
cluster_plot(d3, k).show()


/home/jeff/anaconda3/lib/python3.5/site-packages/matplotlib/font_manager.py:1288: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [7]:
# 主成分分析

#sklearn.decomposition.PCA(n_components = None, copy = True, whiten = False)
# 参数: 
# n_components: 主成分个数
# copy: 默认True, False时会在原始数据上操作
# whiten: 白化,使每个特征有相同方差

import pandas as pd
from sklearn.decomposition import PCA

data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/principal_component.xls')
pca = PCA()
pca.fit(data)
pca.components_ # 各个特征向量
pca.explained_variance_ratio_.cumsum() # 确定使用三个主成分

pca = PCA(3)
pca.fit(data)
low_d = pca.transform(data) # 对数据进行降维
pd.DataFrame(low_d) # 得到三个主成分
#pca.inverse_transform(low_d) # pca降维的逆运算


Out[7]:
0 1 2
0 1.050012 -5.517485 -5.914412
1 -22.997229 -1.975124 -0.209006
2 -13.897677 3.372639 -0.799927
3 5.677104 10.923606 11.640817
4 25.053489 -6.973499 0.857758
5 -2.812806 -6.078801 -2.652072
6 14.148987 16.433028 -4.117091
7 41.831847 -11.329605 3.202778
8 -1.006256 -2.657807 -0.274015
9 -21.334646 -2.825551 0.170441
10 -35.913965 -5.991210 3.786294
11 3.684030 5.683312 1.426253
12 6.517108 6.936497 -7.117820