Fitting Distributions to a Dataset



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import gridspec
import scipy
import scipy.stats as stats
import pandas as pd
import numpy as np
from ipywidgets import interact, interact_manual
import os

Import Data



In [2]:

    
continuous_distributions = "C:\\Users\\jdorvinen\\Documents\\ipynbs\\Hoboken\\continuous_distributions_.csv"
dists_unindexed = pd.read_csv(continuous_distributions)
dist_list = dists_unindexed.distribution.tolist()
dists = dists_unindexed.set_index(dists_unindexed.distribution)



In [3]:

    
file_name = 'montauk_combined_data.csv'
file_path = 'C:/Users/jdorvinen/Documents/Jared/Projects/East Hampton/met_data'   #Path to your data here
#If your data is in an excel spreadsheet save it as a delimited text file (.csv formatted)
filename = os.path.join(file_path,file_name)
df = pd.read_fwf(filename,usecols=['length','inter','swel','hsig','tps','a_hsig','a_tps']).dropna()           
df = df[df.hsig>3.1]
title = str(file_name)+"\n"



In [22]:

    
pd.tools.plotting.scatter_matrix(df.drop(labels=['a_hsig','a_tps'],axis=1))









    Out[22]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000000108B9978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000010940470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001091E278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000109B42E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000109FA400>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000010A3A400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000010A84DD8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000010AC0748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000010B08978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000010B49588>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000010B918D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000010BDA908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011BEAF98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011C399B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011C76320>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000011CBE550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011CFF160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011D464A8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011D924E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011DD0B70>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000011E1E588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011E54EB8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011EA5128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000011EDCCF8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000000FFC3438>]], dtype=object)



In [27]:

    
plt.scatter(df.length,df.tps)









    Out[27]:





<matplotlib.collections.PathCollection at 0x1229cb70>



In [28]:

    
import sklearn as sk



In [ ]:



In [4]:

    
def save_figure(path,name):
    plt.savefig(path+name+".png",
                dpi=200,
                facecolor='none',
                edgecolor='none'
               )



In [5]:

    
def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return idx



In [ ]:



In [6]:

    
def dist_fit(name,dist_name,bins,parameter):
    global df
    #Initialize figure and set dimensions
    fig = plt.figure(figsize = (18,6))
    gs = gridspec.GridSpec(2,2)
    ax1 = fig.add_subplot(gs[:,0])
    ax3 = fig.add_subplot(gs[:,1])
    ax1.set_title(title,fontsize=20)
    
    #Remove the plot frame lines. They are unnecessary chartjunk.
    ax1.spines["top"].set_visible(False)  
    ax1.spines["right"].set_visible(False) 
    ax3.spines["top"].set_visible(False)  
    ax3.spines["right"].set_visible(False) 
    
    # Ensure that the axis ticks only show up on the bottom and left of the plot.  
    # Ticks on the right and top of the plot are generally unnecessary chartjunk.  
    ax1.get_xaxis().tick_bottom()  
    ax1.get_yaxis().tick_left()  
    ax3.get_xaxis().tick_bottom()  
    ax3.get_yaxis().tick_left()  
    
    # Make sure your axis ticks are large enough to be easily read.  
    # You don't want your viewers squinting to read your plot.  
    ax1.tick_params(axis="both", which="both", bottom="off", top="off",  
                    labelbottom="on", left="on", right="off", labelleft="on",labelsize=14)
    ax3.tick_params(axis="both", which="both", bottom="off", top="off",  
                    labelbottom="on", left="on", right="off", labelleft="on",labelsize=14)

    # Along the same vein, make sure your axis labels are large  
    # enough to be easily read as well. Make them slightly larger  
    # than your axis tick labels so they stand out.  
    ax1.set_xlabel(parameter, fontsize=16)  
    ax1.set_ylabel("Frequency of occurence", fontsize=16)  
    ax3.set_xlabel(parameter, fontsize=16) 
    ax3.set_ylabel("Exceedance Probability", fontsize=16)
    #Setting .... variables
    size    = len(df[parameter])
    max_val = 1.1*max(df[parameter])
    min_val = min(df[parameter])
    range_val = max_val-min_val
    binsize = range_val/bins
    x0 = np.arange(min_val,max_val,range_val*0.0001)
    x1 = np.arange(min_val,max_val,binsize)
    y1 = df[parameter]
    #set x-axis limits
    ax1.set_xlim(min_val,max_val)
    ax3.set_xlim(min_val,max_val)
    ax3.set_ylim(0,1.1)
    #Plot histograms
    EPDF = ax1.hist(y1, bins=x1, color='w')
    ECDF = ax3.hist(y1, bins=x1, color='w', normed=1, cumulative=True)
    #Fitting distribution
    dist  = getattr(scipy.stats, dist_name)
    param = dist.fit(y1)
    pdf_fitted = dist.pdf(x0, *param[:-2], loc=param[-2], scale=param[-1])*size*binsize
    cdf_fitted = dist.cdf(x0, *param[:-2], loc=param[-2], scale=param[-1])
    
    #Checking goodness of fit
    #ks_fit = stats.kstest(pdf_fitted,dist_name) # Kolmogorov-Smirnov test: returns [KS stat (D,D+,orD-),pvalue]
    #print(ks_fit)
    
    #Finding location of 0.002 and 0.01 exceedence probability events 
    FiveHundInd = find_nearest(cdf_fitted,0.998)
    OneHundInd  = find_nearest(cdf_fitted,0.990)
    #Plotting pdf and cdf    
    ax1.plot(x0,pdf_fitted,linewidth=2,label=dist_name)
    ax3.plot(x0,cdf_fitted,linewidth=2,label=dist_name)
    #update figure spacing
    gs.update(wspace=0.1, hspace=0.2)
    #adding a text box
    ax3.text(min_val+0.1*range_val,1.1,
             dist_name.upper()+" distribution\n"
             +"\n"
             +"0.2% - value: " + str("%.2f" %x0[FiveHundInd])+ " meters\n"
             +"1.0% - value: " + str("%.2f" %x0[OneHundInd]) + " meters",
             fontsize=14
            )

    print(dists.loc[dist_name,'description']+"\n")
    
    param_names = (dist.shapes + ', loc, scale').split(', ') if dist.shapes else ['loc', 'scale']
    param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, param)])
    dist_str = '{}({})'.format(dist_name, param_str)
    print(dist_str)
    plt.show()
    print(stats.kstest(y1,dist_name,param,alternative='less'))
    print(stats.kstest(y1,dist_name,param,alternative='greater'))
    print(stats.kstest(y1,dist_name,param,alternative='two-sided'))
    return name



In [17]:

    
interact_manual(dist_fit, name=filename, dist_name=dist_list,bins=[25,100,5],parameter=['length','inter','swel','hsig','tps','a_hsig','a_tps'])









    



A generalized Pareto continuous random variable.

genpareto(c=-0.94, loc=3.11, scale=4.03)






    












    



KstestResult(statistic=0.0044752279883455426, pvalue=0.98815411589101232)
KstestResult(statistic=0.38169408613001987, pvalue=0.0)
KstestResult(statistic=0.38169408613001987, pvalue=0.0)






    





'C:/Users/jdorvinen/Documents/Jared/Projects/East Hampton/met_data\\montauk_combined_data.csv'

References:

Python http://stackoverflow.com/questions/6615489/fitting-distributions-goodness-of-fit-p-value-is-it-possible-to-do-this-with/16651524#16651524 http://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-ones-with-scipy-python Extreme wave statistics http://drs.nio.org/drs/bitstream/handle/2264/4165/Nat_Hazards_64_223a.pdf;jsessionid=55AAEDE5A2BF3AA06C6CCB5CE3CBEBAD?sequence=1 List of available distributions can be found here http://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions Goodness of fit tests http://statsmodels.sourceforge.net/stable/stats.html#goodness-of-fit-tests-and-measures http://docs.scipy.org/doc/scipy/reference/stats.html#statistical-functions



In [ ]:

    
dist  = getattr(scipy.stats, 'genextreme')

#param = dist.fit(y1)

Length
genexpon(a=1.38, b=0.20, c=0.03, loc=3.00, scale=19.39)
KstestResult(statistic=0.090090090087017877, pvalue=0.051121878132879051)
genpareto(c=-0.02, loc=3.00, scale=14.16)
KstestResult(statistic=0.08997375153682749, pvalue=0.051604871224521176)

Interim
foldcauchy(c=0.63, loc=25.94, scale=137.16) KstestResult(statistic=0.026999638083165967, pvalue=0.99705005373423183)

SWEL
johnsonsb(a=-0.95, b=1.41, loc=-1.12, scale=1.90)
KstestResult(statistic=0.029574404259407161, pvalue=0.9901000531575973)

HSIG
genexpon(a=0.05, b=0.32, c=1.83, loc=3.05, scale=0.36)
KstestResult(statistic=0.041869945686266341, pvalue=0.83122014943398015)
exponnorm(K=12.76, loc=3.16, scale=0.08)
KstestResult(statistic=0.045765702677044495, pvalue=0.74112752318380581)

TPS
recipinvgauss(mu=0.24, loc=5.60, scale=0.74)
KstestResult(statistic=0.10435298093410683, pvalue=0.014720986714411577)
genextreme(c=-0.08, loc=8.58, scale=1.31)
KstestResult(statistic=0.11307745876143599, pvalue=0.0062809593607968672)

a_hsig
genexpon(a=0.22, b=2.34, c=7.13, loc=3.03, scale=1.09)
KstestResult(statistic=0.047691926585930133, pvalue=0.70607193161667459)
exponnorm(K=8.97, loc=3.11, scale=0.05)
KstestResult(statistic=0.050605085052604193, pvalue=0.62055905806032152)

a_tps
recipinvgauss(mu=0.22, loc=5.90, scale=0.61)
KstestResult(statistic=0.038004922394594909, pvalue=0.90556531830166975)
genextreme(c=-0.08, loc=8.55, scale=1.12)
KstestResult(statistic=0.045956458061729433, pvalue=0.73647282485448418)



In [ ]: