In [1]:
import pandas as pd

In [2]:
import numpy as np

In [17]:
df=pd.read_csv("http://vincentarelbundock.github.io/Rdatasets/csv/MASS/Boston.csv")
df2=df.iloc[:,1:]
df2.head()


Out[17]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2

In [18]:
def my_descriptive_stats(df):
    my_stats_dict=dict()
    y=list(df.columns)
    nmiss=[]
    n=[]
    sd=[]
    mean_val=[]
    min_val=[]
    p1=[]
    p5=[]
    p10=[]
    p90=[]
    p95=[]
    p99=[]
    q1=[]
    q2=[]
    q3=[]
    max_val=[]
    UC=[]
    LC=[]
    outlier_flag=[]
    
    
    for i in range(len(df.columns)):
        col_name=y[i]
        nmiss.append(sum(df[col_name ].isnull()))
        x=df[df[col_name].notnull()][col_name]
        mean_val.append(np.mean(x))
        n.append(len(x))
        sd.append(np.std(x))
        min_val.append(min(x))
        p1.append(x.quantile(.01))
        p5.append(x.quantile(.05))
        p10.append(x.quantile(.1))
        q1.append(x.quantile(.25))
        q2.append(x.quantile(.5))
        q3.append(x.quantile(.75))
        p90.append(x.quantile(.90))
        p95.append(x.quantile(.95))
        p99.append(x.quantile(.99))
        max_val.append(max(x))
        UC.append(np.mean(x)+3*np.std(x))
        LC.append(np.mean(x)-3*np.std(x))
        outlier_flag.append(np.sum(max_val>UC or min_val<LC))
        

    my_stats_dict['nmiss']=nmiss
    my_stats_dict['mean_val']=mean_val
    my_stats_dict['n']=n
    my_stats_dict['sd']=sd
    my_stats_dict['min_val']=min_val
    my_stats_dict['p1']=p1
    my_stats_dict['p5']=p5
    my_stats_dict['p10']=p10
    my_stats_dict['q1']=q1
    my_stats_dict['q2']=q2
    my_stats_dict['q3']=q3
    my_stats_dict['p90']=p90
    my_stats_dict['p95']=p95
    my_stats_dict['p99']=p99
    my_stats_dict['max_val']=max_val
    my_stats_dict['UC']=UC
    my_stats_dict['LC']=LC
    my_stats_dict['outlier_flag']=outlier_flag
    q=pd.DataFrame(my_stats_dict,index=y)
    
    return q

my_descriptive_stats(df2)


Out[18]:
LC UC max_val mean_val min_val n nmiss outlier_flag p1 p10 p5 p90 p95 p99 q1 q2 q3 sd
crim -22.165600 29.392648 88.9762 3.613524 0.00632 506 0 1 0.01361 0.038195 0.027910 10.7530 15.78915 41.37033 0.082045 0.25651 3.677082 8.593041
zn -58.534551 81.261823 100.0000 11.363636 0.00000 506 0 1 0.00000 0.000000 0.000000 42.5000 80.00000 90.00000 0.000000 0.00000 12.500000 23.299396
indus -9.423933 31.697490 27.7400 11.136779 0.46000 506 0 1 1.25350 2.910000 2.180000 19.5800 21.89000 25.65000 5.190000 9.69000 18.100000 6.853571
chas -0.692059 0.830399 1.0000 0.069170 0.00000 506 0 1 0.00000 0.000000 0.000000 0.0000 1.00000 1.00000 0.000000 0.00000 0.000000 0.253743
nox 0.207406 0.901984 0.8710 0.554695 0.38500 506 0 1 0.39800 0.427000 0.409250 0.7130 0.74000 0.87100 0.449000 0.53800 0.624000 0.115763
rm 4.178867 8.390402 8.7800 6.284634 3.56100 506 0 1 4.52445 5.593500 5.314000 7.1515 7.58750 8.33500 5.885500 6.20850 6.623500 0.701923
age -15.788197 152.937999 100.0000 68.574901 2.90000 506 0 1 6.61000 26.950000 17.725000 98.8000 100.00000 100.00000 45.025000 77.50000 94.075000 28.121033
dis -2.515842 10.105928 12.1265 3.795043 1.12960 506 0 1 1.20654 1.628300 1.461975 6.8166 7.82780 9.22277 2.100175 3.20745 5.188425 2.103628
rad -16.546546 35.645360 24.0000 9.549407 1.00000 506 0 1 1.00000 3.000000 2.000000 24.0000 24.00000 24.00000 4.000000 5.00000 24.000000 8.698651
tax -96.874331 913.348639 711.0000 408.237154 187.00000 506 0 1 188.00000 233.000000 222.000000 666.0000 666.00000 666.00000 279.000000 330.00000 666.000000 168.370495
ptratio 11.967118 24.943949 22.0000 18.455534 12.60000 506 0 1 13.00000 14.750000 14.700000 20.9000 21.00000 21.20000 17.400000 19.05000 20.200000 2.162805
black 83.060209 630.287854 396.9000 356.674032 0.32000 506 0 1 6.73000 290.270000 84.590000 396.9000 396.90000 396.90000 375.377500 391.44000 396.225000 91.204607
lstat -8.748942 34.055068 37.9700 12.653063 1.73000 506 0 1 2.88300 4.680000 3.707500 23.0350 26.80750 33.91850 6.950000 11.36000 16.955000 7.134002
medv -5.031228 50.096841 50.0000 22.532806 5.00000 506 0 1 7.01000 12.750000 10.200000 34.8000 43.40000 50.00000 17.025000 21.20000 25.000000 9.188012

In [ ]: