In [1]:
# 加载库
from sklearn.datasets import load_iris
import numpy as np
from scipy.stats import trim_mean

# Load the iris dataset, then unpack the feature matrix, the target labels
# and the feature (column) names from the returned container.
data = load_iris()
x, y = data['data'], data['target']
col_names = data['feature_names']

In [2]:
# Compute the mean of each feature column.
# print() is called with one pre-formatted string, so the cell produces the
# same output under Python 2 and Python 3.
print('col name,mean value')
for i, col_name in enumerate(col_names):
    print('%s,%0.2f' % (col_name, np.mean(x[:, i])))


col name,mean value
sepal length (cm),5.84
sepal width (cm),3.05
petal length (cm),3.76
petal width (cm),1.20

截尾均值是另一种有趣的指标,它的特长在于不易受极端值影响:10%截尾均值就是将数据集中最大的10%和最小的10%排除,然后计算剩余80%数据的平均值。


In [3]:
# Compute the trimmed mean of each feature column.
# p is the fraction of observations cut from EACH end of the sorted data
# before averaging (0.1 -> the middle 80% is averaged).
# print() is called with one pre-formatted string, so the cell produces the
# same output under Python 2 and Python 3.
p = 0.1
print('col name,trimmed mean value')
for i, col_name in enumerate(col_names):
    print('%s,%0.2f' % (col_name, trim_mean(x[:, i], p)))


col name,trimmed mean value
sepal length (cm),5.81
sepal width (cm),3.04
petal length (cm),3.76
petal width (cm),1.18

In [5]:
# Dispersion: compute and display the maximum, minimum and range of each column.
# print() is called with one pre-formatted string, so the cell produces the
# same output under Python 2 and Python 3.
print('col_name,max,min,range')
for i, col_name in enumerate(col_names):
    column = x[:, i]
    # NumPy reductions are evaluated once per column and reused for the range.
    col_max, col_min = np.max(column), np.min(column)
    print('%s,%0.2f,%0.2f,%0.2f' % (col_name, col_max, col_min, col_max - col_min))


col_name,max,min,range
sepal length (cm),7.90,4.30,3.60
sepal width (cm),4.40,2.00,2.40
petal length (cm),6.90,1.00,5.90
petal width (cm),2.50,0.10,2.40

In [6]:
# Variance and standard deviation of each column.
# np.var / np.std are called with their defaults (ddof=0), i.e. the
# population variance and standard deviation.
# print() is called with one pre-formatted string, so the cell produces the
# same output under Python 2 and Python 3.
print('col_name,variance,std-dev')
for i, col_name in enumerate(col_names):
    column = x[:, i]
    print('%s,%0.2f,%0.2f' % (col_name, np.var(column), np.std(column)))


col_name,variance,std-dev
sepal length (cm),0.68,0.83
sepal width (cm),0.19,0.43
petal length (cm),3.09,1.76
petal width (cm),0.58,0.76

In [7]:
# Mean absolute deviation
def mad(x, axis=None):
    """Return the mean absolute deviation of ``x`` about its mean.

    Parameters
    ----------
    x : array_like
        Input data (any shape; nested lists are accepted).
    axis : int or None, optional
        Axis along which the deviation is computed. ``None`` (default)
        treats ``x`` as one flattened sample and returns a scalar.

    Returns
    -------
    numpy scalar or ndarray
        ``mean(|x - mean(x)|)`` taken along ``axis``.
    """
    x = np.asarray(x)
    # keepdims keeps the reduced axis with length 1 so the mean broadcasts
    # against x regardless of which axis was reduced.
    mean = np.mean(x, axis=axis, keepdims=True)
    # Average along the same axis: the divisor is the number of elements
    # actually reduced, not the length of the first dimension.
    return np.mean(np.abs(x - mean), axis=axis)

# Report the mean absolute deviation of each column.
# print() is called with one pre-formatted string, so the cell produces the
# same output under Python 2 and Python 3.
print('col_name,mad')
for i, col_name in enumerate(col_names):
    print('%s,%0.2f' % (col_name, mad(x[:, i])))


col_name,mad
sepal length (cm),0.69
sepal width (cm),0.33
petal length (cm),1.56
petal width (cm),0.66

In [8]:
# Median absolute deviation
def mdad(x, axis=None):
    """Return the median absolute deviation of ``x`` about its median.

    Parameters
    ----------
    x : array_like
        Input data (any shape; nested lists are accepted).
    axis : int or None, optional
        Axis along which the deviation is computed. ``None`` (default)
        treats ``x`` as one flattened sample and returns a scalar.

    Returns
    -------
    numpy scalar or ndarray
        ``median(|x - median(x)|)`` taken along ``axis``.
    """
    x = np.asarray(x)
    # keepdims keeps the reduced axis with length 1 so the median broadcasts
    # against x regardless of which axis was reduced.
    median = np.median(x, axis=axis, keepdims=True)
    # The outer median is taken along the same axis as the inner one.
    return np.median(np.abs(x - median), axis=axis)

# Report the median, the median absolute deviation and the interquartile
# range of each column.
# print() is called with one pre-formatted string, so the cell produces the
# same output under Python 2 and Python 3.
print('col_name,median,abs dev,quartile range')
for i, col_name in enumerate(col_names):
    column = x[:, i]
    # Interquartile range: 75th percentile minus 25th percentile.
    iqr = np.percentile(column, 75) - np.percentile(column, 25)
    print('%s,%0.2f,%0.2f,%0.2f' % (col_name, np.median(column), mdad(column), iqr))


col_name,median,abs dev,quartile range
sepal length (cm),5.80,0.70,1.30
sepal width (cm),3.00,0.25,0.50
petal length (cm),4.35,1.25,3.50
petal width (cm),1.30,0.70,1.50