In [1]:
# 加载库
from sklearn.datasets import load_iris
import numpy as np
from scipy.stats import trim_mean
# Load the iris dataset and pull out the pieces used by the cells below:
# the feature matrix (x), the target vector (y) and the feature names.
data = load_iris()
x, y = data['data'], data['target']
col_names = data['feature_names']
In [2]:
# Arithmetic mean of every feature column, printed as CSV-style rows.
# A single parenthesised argument to print behaves identically on
# Python 2 (print statement) and Python 3 (print function).
print('col name,mean value')
for i, col_name in enumerate(col_names):
    print('%s,%0.2f' % (col_name, np.mean(x[:, i])))
截尾均值是另一种有趣的指标,它有自己的特长:10%截尾均值就是将数据集中最大的10%和最小的10%排除,然后计算剩余80%数据的平均值。
In [3]:
# Trimmed mean of every feature column.
# p is the fraction passed to scipy.stats.trim_mean as proportiontocut,
# i.e. the share of observations cut from EACH end before averaging.
p = 0.1
# A single parenthesised argument to print behaves identically on
# Python 2 (print statement) and Python 3 (print function).
print('col name,trimmed mean value')
for i, col_name in enumerate(col_names):
    print('%s,%0.2f' % (col_name, trim_mean(x[:, i], p)))
In [5]:
# Dispersion: maximum, minimum and range (max - min) of every feature column.
# A single parenthesised argument to print behaves identically on
# Python 2 (print statement) and Python 3 (print function).
print('col_name,max,min,range')
for i, col_name in enumerate(col_names):
    column = x[:, i]
    # Compute each extreme once with NumPy's reductions instead of calling the
    # builtin max/min twice over the array.
    highest = np.max(column)
    lowest = np.min(column)
    print('%s,%0.2f,%0.2f,%0.2f' % (col_name, highest, lowest, highest - lowest))
In [6]:
# Variance and standard deviation of every feature column.
# np.var / np.std are called with their defaults here (no ddof argument).
# A single parenthesised argument to print behaves identically on
# Python 2 (print statement) and Python 3 (print function).
print('col_name,variance,std-dev')
for i, col_name in enumerate(col_names):
    column = x[:, i]
    print('%s,%0.2f,%0.2f' % (col_name, np.var(column), np.std(column)))
In [7]:
# Mean absolute deviation
def mad(x, axis=None):
    """Return the mean absolute deviation of ``x`` about its mean.

    Parameters
    ----------
    x : array_like
        Input values.
    axis : int or None, optional
        Axis along which to compute the statistic. ``None`` (the default)
        reduces over all elements.

    Returns
    -------
    A scalar when ``axis`` is None, otherwise an array with ``axis`` removed.
    """
    values = np.asarray(x)
    # keepdims=True keeps the reduced axis with length 1 so the subtraction
    # broadcasts correctly for any axis, not just axis=0.
    center = np.mean(values, axis=axis, keepdims=True)
    # Averaging along the same axis divides by the number of elements actually
    # reduced, rather than by len(x) (the size of the first dimension only).
    return np.mean(np.abs(values - center), axis=axis)
# Mean absolute deviation of every feature column, printed as CSV-style rows.
# A single parenthesised argument to print behaves identically on
# Python 2 (print statement) and Python 3 (print function).
print('col_name,mad')
for i, col_name in enumerate(col_names):
    print('%s,%0.2f' % (col_name, mad(x[:, i])))
In [8]:
# Median absolute deviation
def mdad(x, axis=None):
    """Return the median absolute deviation of ``x`` about its median.

    Parameters
    ----------
    x : array_like
        Input values.
    axis : int or None, optional
        Axis along which to compute the statistic. ``None`` (the default)
        reduces over all elements.

    Returns
    -------
    A scalar when ``axis`` is None, otherwise an array with ``axis`` removed.
    """
    values = np.asarray(x)
    # keepdims=True keeps the reduced axis with length 1 so the subtraction
    # broadcasts correctly for any axis.
    center = np.median(values, axis=axis, keepdims=True)
    # The outer median must honour the same axis; otherwise a per-axis request
    # would silently collapse to a single number.
    return np.median(np.abs(values - center), axis=axis)
# Robust statistics for every feature column: median, median absolute
# deviation and interquartile range (75th minus 25th percentile).
# A single parenthesised argument to print behaves identically on
# Python 2 (print statement) and Python 3 (print function).
print('col_name,median,abs dev,quartile range')
for i, col_name in enumerate(col_names):
    column = x[:, i]
    # One percentile call returns both quartiles.
    q1, q3 = np.percentile(column, [25, 75])
    iqr = q3 - q1
    print('%s,%0.2f,%0.2f,%0.2f' % (col_name, np.median(column), mdad(column), iqr))