In [ ]:
# -*- coding: utf-8 -*-
import numpy as np
from scipy.optimize import leastsq
import matplotlib.pyplot as plt

def func(x, p):
    """
    Evaluate the sinusoid A*sin(2*pi*k*x + theta) used to generate raw data.

    x is the sample position (the code below passes a NumPy array) and p is
    a three-element sequence (A, k, theta) that is unpacked into amplitude,
    frequency and phase offset.
    """
    amplitude, frequency, phase = p
    angle = 2*np.pi*frequency*x + phase
    return amplitude*np.sin(angle)

def residuals(p, y, x):
    """
    Return the difference between the observed values y and func(x, p).

    The parameter vector comes first and the data afterwards, matching the
    way leastsq is invoked below with args=(y1, x).
    """
    model = func(x, p)
    return y - model

# Fix the global NumPy seed so the noise, and therefore the fitted parameters
# and the figure, are the same on every Restart & Run All.
np.random.seed(0)

# Sample positions; note that the grid runs from 0 down to -2*pi.
x = np.linspace(0, -2*np.pi, 100)
# True parameters of the signal, in the order func() unpacks them.
A, k, theta = 10, 0.34, np.pi/6
y0 = func(x, [A, k, theta])             # noise-free signal
y1 = y0 + 2 * np.random.randn(len(x))   # signal plus Gaussian noise scaled by 2

# Initial guess for (A, k, theta) handed to the optimizer.
p0 = [7, 0.2, 0]

# Fit the data with leastsq:
#   residuals - function that computes the error
#   p0        - initial values of the parameters being fitted
#   args      - the experimental data to fit
plsq = leastsq(residuals, p0, args=(y1, x))

# leastsq returns (solution, ier); per the SciPy documentation a solution was
# found only when ier is 1, 2, 3 or 4. Surface a failure instead of ignoring it.
if plsq[1] not in (1, 2, 3, 4):
    print("Warning: leastsq did not report success (ier={})".format(plsq[1]))

print("Real Parameters:", [A, k, theta])
print("Regression Parameters", plsq[0])  # parameters obtained from the fit

plt.plot(x, y0, label="Real Data")
plt.plot(x, y1, label="Real Data with Noise")
plt.plot(x, func(x, plsq[0]), label="Regression Data")
plt.legend()
plt.show()

In [ ]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Parameters of the normal distribution being plotted.
mu, sigma = 0, 1
x = np.arange(-5, 5, 0.1)
# Density evaluated on the grid; mu and sigma are norm's loc and scale.
y = stats.norm.pdf(x, loc=mu, scale=sigma)

# Single-axes figure holding one labelled curve.
fig, ax = plt.subplots()
ax.set_title('Probability Density')
ax.plot(x, y, label='mu={}, sigma={}'.format(mu, sigma))
ax.legend()
plt.show()

In [ ]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Parameters of the normal distribution being plotted.
mu, sigma = 0, 1
x = np.arange(-5, 5, 0.1)
# Cumulative distribution on the grid; mu and sigma are norm's loc and scale.
y = stats.norm.cdf(x, loc=mu, scale=sigma)

# Single-axes figure holding one labelled curve.
fig, ax = plt.subplots()
ax.set_title('Cumulative Distribution')
ax.plot(x, y, label='mu={}, sigma={}'.format(mu, sigma))
ax.legend()
plt.show()

In [ ]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt


# Compare normal densities that share one mean but differ in spread.
mu = 0  # common mean, used for both the curve and its legend label
x = np.arange(-5, 5, 0.1)
plt.figure()
# The title does not depend on sigma, so it is set once instead of per curve.
plt.title('Probability Density')
for sigma in [0.5, 1, 1.5]:
    plt.plot(x, stats.norm.pdf(x, mu, sigma), label='mu={}, sigma={}'.format(mu, sigma))
# Build the legend a single time, after every curve has been drawn.
plt.legend()
plt.show()

In [ ]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt


# Compare normal cumulative distributions that share one mean but differ in spread.
mu = 0  # common mean, used for both the curve and its legend label
x = np.arange(-5, 5, 0.1)
plt.figure()
# The title does not depend on sigma, so it is set once instead of per curve.
plt.title('Cumulative Distribution')
for sigma in [0.5, 1, 1.5]:
    plt.plot(x, stats.norm.cdf(x, mu, sigma), label='mu={}, sigma={}'.format(mu, sigma))
# Build the legend a single time, after every curve has been drawn.
plt.legend()
plt.show()

In [ ]:
from scipy import stats
import matplotlib.pyplot as plt


# Two stacked probability plots: the raw sample on top, the Box-Cox
# transformed sample underneath.
fig = plt.figure()
ax1 = fig.add_subplot(211)
# random_state pins the draw so the figure is identical on every run.
# The sample is shifted by +5 before being handed to boxcox below.
x = stats.loggamma.rvs(5, size=500, random_state=0) + 5
prob = stats.probplot(x, dist=stats.norm, plot=ax1)
# Blank the x label of the upper panel (presumably so it does not collide
# with the title of the lower panel -- confirm visually).
ax1.set_xlabel('')
ax1.set_title('Probplot against normal distribution')

print('Use boxcox to transform the data to make it closest to normal')
ax2 = fig.add_subplot(212)
# boxcox returns a pair here; only the transformed data (first item) is used.
xt, _ = stats.boxcox(x)
prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
ax2.set_title('Probplot after Box-Cox transformation')
plt.show()

In [ ]:
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Fix the global NumPy seed so the sampled data, and the resulting figure,
# are the same on every Restart & Run All.
np.random.seed(0)

plt.figure(figsize=(16, 8))

# Normal samples centred at 0 with two different spreads.
A1 = np.random.normal(loc=0.0, scale=1.0, size=5000)
A2 = np.random.normal(loc=0.0, scale=5.0, size=5000)
# Normal samples centred at 10 with the same two spreads.
B1 = np.random.normal(loc=10.0, scale=1.0, size=5000)
B2 = np.random.normal(loc=10.0, scale=5.0, size=5000)
# Exponential sample and its Box-Cox transform (fitted lambda discarded).
C = np.random.exponential(2, 5000)
D, _ = stats.boxcox(C)

# Each entry is (subplot position, [(legend label, sample), ...]); a list of
# pairs keeps the drawing order explicit.
panels = [
    (221, [('A1', A1), ('A2', A2)]),
    (222, [('B1', B1), ('B2', B2)]),
    (223, [('C', C)]),
    (224, [('D', D)]),
]

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the cell still runs on the seaborn version the notebook was
# written against -- consider migrating once the minimum version is known.
for position, samples in panels:
    plt.subplot(position)
    for label, sample in samples:
        sns.distplot(sample, kde=True, rug=False, norm_hist=True, label=label)
    plt.legend()

plt.show()

In [ ]:
from scipy import stats
import numpy as np


def normality_test(arr):
    """
    Print a short normality report for the sample arr.

    After a separator line the report lists, in order: the sample skewness,
    the skewtest p-value, the sample kurtosis, the kurtosistest p-value and
    the normaltest p-value (element [1] of each test result). Nothing is
    returned.
    """
    # (line template, statistic) pairs; each statistic is evaluated right
    # before its line is printed, so output appears row by row.
    report = (
        ('Skewness of dataset   {}', lambda data: stats.skew(data)),
        ('Skewness test p-value {}', lambda data: stats.skewtest(data)[1]),
        ('Kurtosis of dataset   {}', lambda data: stats.kurtosis(data)),
        ('Kurtosis test p-value {}', lambda data: stats.kurtosistest(data)[1]),
        ('Normal test p-value   {}', lambda data: stats.normaltest(data)[1]),
    )
    print('--------')
    for template, statistic in report:
        print(template.format(statistic(arr)))

# Fix the global NumPy seed so the printed statistics are the same on every
# Restart & Run All.
np.random.seed(0)

# Normal samples: two centres (0 and 10), two spreads (1 and 5).
A1 = np.random.normal(loc=0.0, scale=1.0, size=50000)
A2 = np.random.normal(loc=0.0, scale=5.0, size=50000)
B1 = np.random.normal(loc=10.0, scale=1.0, size=50000)
B2 = np.random.normal(loc=10.0, scale=5.0, size=50000)
# Exponential sample and its Box-Cox transform (fitted lambda discarded).
C = np.random.exponential(2, 50000)
D, _ = stats.boxcox(C)


# Report on every sample in the same order as before: A1, A2, B1, B2, C, D.
for dataset in (A1, A2, B1, B2, C, D):
    normality_test(dataset)

In [ ]:
from sklearn import preprocessing
from scipy import stats
import numpy as np
import pandas as pd


def normality_test(arr):
    """
    Print a short normality report for the sample arr.

    After a separator line the report lists, in order: the sample skewness,
    the skewtest p-value, the sample kurtosis, the kurtosistest p-value and
    the normaltest p-value (element [1] of each test result). Nothing is
    returned.

    NOTE(review): this repeats the definition from the previous cell so that
    this cell can run on its own; consider keeping a single shared copy.
    """
    # (line template, statistic) pairs; each statistic is evaluated right
    # before its line is printed, so output appears row by row.
    report = (
        ('Skewness of dataset   {}', lambda data: stats.skew(data)),
        ('Skewness test p-value {}', lambda data: stats.skewtest(data)[1]),
        ('Kurtosis of dataset   {}', lambda data: stats.kurtosis(data)),
        ('Kurtosis test p-value {}', lambda data: stats.kurtosistest(data)[1]),
        ('Normal test p-value   {}', lambda data: stats.normaltest(data)[1]),
    )
    print('--------')
    for template, statistic in report:
        print(template.format(statistic(arr)))

# Fix the global NumPy seed so the printed statistics and the summary table
# are the same on every Restart & Run All.
np.random.seed(0)

# Each "*2" series is its "*1" counterpart passed through preprocessing.scale
# (standardization per the scikit-learn docs: zero mean, unit variance).
A1 = np.random.normal(loc=10.0, scale=5.0, size=5000)
A2 = preprocessing.scale(A1)
B1 = np.random.exponential(2, 5000)
B2 = preprocessing.scale(B1)
# Box-Cox transform of the exponential sample (fitted lambda discarded).
C1, _ = stats.boxcox(B1)
C2 = preprocessing.scale(C1)

# A single ordered list drives both the reports and the summary table, so the
# two can never disagree about which series are included.
series = [('A1', A1), ('A2', A2), ('B1', B1), ('B2', B2), ('C1', C1), ('C2', C2)]

for _name, values in series:
    normality_test(values)

# Bare last expression so the notebook renders the summary table.
pd.DataFrame(dict(series)).describe()