In [1]:
# Contents:
# 1. Fill missing values by interpolation (depends on scipy)
# 2. Normalize the data (simple vector arithmetic)
# 3. Discretize the data and visualize the results
# 4. Reduce dimensionality with principal component analysis (scikit-learn)
In [2]:
# Fill missing values with Lagrange interpolation
import pandas as pd
import numpy as np
from scipy.interpolate import lagrange
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/catering_sale.xls')
# Treat sales outside [400, 5000] as outliers and mark them missing
data.loc[(data['销量'] < 400) | (data['销量'] > 5000), '销量'] = np.nan
# Interpolation function: by default use the k = 5 values before and after position n
def ployinterp_column(s, n, k=5):
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]  # take the surrounding window
    y = y[y.notnull()]  # drop values that are themselves missing
    return lagrange(y.index, list(y))(n)  # fit a Lagrange polynomial and evaluate it at n
for i in data.columns:
    for j in range(len(data)):
        if data[i].isnull()[j]:
            data.loc[j, i] = ployinterp_column(data[i], j)
data.describe()
Out[2]:
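One caveat with ployinterp_column as written: near the ends of the series, range(n - k, n) and range(n + 1, n + 1 + k) step outside the valid index range, which raises a KeyError on recent pandas. A minimal boundary-clipped variant (my own sketch, not from the source):

def ployinterp_column_safe(s, n, k=5):
    # clip the window to [0, len(s)) instead of letting indices run off either end
    lo, hi = max(0, n - k), min(len(s), n + 1 + k)
    y = s[list(range(lo, n)) + list(range(n + 1, hi))]
    y = y[y.notnull()]  # drop values that are themselves missing
    return lagrange(y.index, list(y))(n)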
In [3]:
import numpy as np
# Data normalization
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/normalization_data.xls')
# Min-max scaling: rescale each column to [0, 1]
(data - data.min()) / (data.max() - data.min())
# Zero-mean (z-score) standardization: mean 0, standard deviation 1 per column
(data - data.mean()) / data.std()
# Decimal scaling: divide by a power of 10 so every value falls in [-1, 1]
data / 10**np.ceil(np.log10(data.abs().max()))
Out[3]:
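A quick self-contained check of the three transforms on toy data (my own addition): min-max maps each column into [0, 1], z-score standardization gives mean 0 and standard deviation 1, and decimal scaling lands every value in [-1, 1].

toy = pd.DataFrame({'a': [1.0, 50.0, 99.0], 'b': [-300.0, 0.0, 700.0]})
print((toy - toy.min()) / (toy.max() - toy.min()))     # each column now spans [0, 1]
print((toy - toy.mean()) / toy.std())                  # mean 0, std 1 per column
print(toy / 10**np.ceil(np.log10(toy.abs().max())))    # divided by 100 and 1000 respectively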
In [4]:
# Discretization of a continuous attribute
import pandas as pd
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/discretization_data.xls')
data = data['肝气郁结证型系数'].copy()
# Equal-width binning: 4 bins spanning equal value ranges
d1 = pd.cut(data, 4, labels=range(4))
# Equal-frequency binning: use quantiles as the bin edges
k = 4
w = [1.0 * i / k for i in range(k + 1)]
w = data.describe(percentiles=w)[4:4+k+1]  # the requested percentiles sit at positions 4..8 of describe()
d2 = pd.cut(data, w, labels=range(k), include_lowest=True)  # include_lowest keeps the minimum in the first bin
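pandas also ships pd.qcut, which performs equal-frequency binning in one call; up to edge handling it should reproduce d2 (a sketch, not from the source):

d2_alt = pd.qcut(data, k, labels=range(k))  # quantile-based bins, replacing describe() + cut()
print(d2.value_counts().sort_index())
print(d2_alt.value_counts().sort_index())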
In [5]:
# Binning via k-means clustering
from sklearn.cluster import KMeans
kmodel = KMeans(n_clusters=4)  # build the model (the n_jobs argument was removed from KMeans in scikit-learn 1.0)
kmodel.fit(data.values.reshape(-1, 1))  # KMeans expects a 2-D array, so reshape the Series into one column
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)  # sort the cluster centers
w = c.rolling(window=2).mean().iloc[1:]  # midpoints of adjacent centers become the bin boundaries
w = [0] + list(w[0]) + [data.max()]  # prepend 0 and append the maximum as the outer edges
d3 = pd.cut(data, w, labels=range(4))
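As a sanity check (my own addition), compare the bin occupancies of the three schemes; equal-frequency bins should be close to uniform, while equal-width and k-means bins typically are not:

counts = pd.DataFrame({'equal_width': d1.value_counts().sort_index(),
                       'equal_freq': d2.value_counts().sort_index(),
                       'kmeans': d3.value_counts().sort_index()})
print(counts)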
In [6]:
# Visualize the binning results
def cluster_plot(d, k):
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # a font that can render the Chinese column name
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly
    plt.figure(figsize=(8, 3))
    for j in range(k):
        plt.plot(data[d == j], [j for i in d[d == j]], 'o')  # one horizontal strip of points per bin
    plt.ylim(-0.5, k - 0.5)
    return plt
cluster_plot(d1, k).show()
cluster_plot(d2, k).show()
cluster_plot(d3, k).show()
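Note that SimHei is often missing on Linux, in which case matplotlib silently drops the Chinese axis labels. If that happens, point rcParams at any installed CJK font (the font name below is an assumption; check fc-list for what your system actually has):

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK SC']  # assumed font; substitute one present on your machine
plt.rcParams['axes.unicode_minus'] = False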
In [7]:
# Principal component analysis
# sklearn.decomposition.PCA(n_components=None, copy=True, whiten=False)
# Parameters:
#   n_components: number of components to keep
#   copy: True by default; if False, PCA operates on the original data in place
#   whiten: whitening, rescales each component to unit variance
import pandas as pd
from sklearn.decomposition import PCA
data = pd.read_excel('/home/jeff/python_data/chapter4/chapter4/demo/data/principal_component.xls')
pca = PCA()
pca.fit(data)
pca.components_  # the principal axes, one component per row
pca.explained_variance_ratio_.cumsum()  # cumulative variance ratio; here three components are enough
pca = PCA(3)
pca.fit(data)
low_d = pca.transform(data)  # project the data onto the three components
pd.DataFrame(low_d)  # the three principal components
# pca.inverse_transform(low_d)  # map the reduced data back to the original space
Out[7]:
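Rather than reading the cumulative ratio by eye, PCA also accepts a float n_components in (0, 1) and keeps just enough components to reach that fraction of explained variance; a short sketch (the 0.95 threshold is my own choice):

pca95 = PCA(n_components=0.95)       # keep enough components for 95% of the variance
low_d95 = pca95.fit_transform(data)
print(pca95.n_components_, low_d95.shape)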