In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import gridspec
import scipy
import scipy.stats as stats
import pandas as pd
import numpy as np
from ipywidgets import interact, interact_manual
import os
In [2]:
continuous_distributions = "C:\\Users\\jdorvinen\\Documents\\ipynbs\\Hoboken\\continuous_distributions_.csv"
dists_unindexed = pd.read_csv(continuous_distributions)
dist_list = dists_unindexed.distribution.tolist()
dists = dists_unindexed.set_index(dists_unindexed.distribution)
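If continuous_distributions_.csv is not at hand, an equivalent lookup table can be built inline. This is only a sketch: it assumes the two columns the rest of the notebook actually uses ('distribution' and 'description'), and the rows shown are just illustrative.
In [ ]:
# Hypothetical stand-in for continuous_distributions_.csv; only the
# 'distribution' and 'description' columns are used later in the notebook.
dists_unindexed = pd.DataFrame({
    'distribution': ['genextreme', 'genpareto', 'genexpon'],
    'description': ['Generalized extreme value distribution',
                    'Generalized Pareto distribution',
                    'Generalized exponential distribution'],
})
dist_list = dists_unindexed.distribution.tolist()
dists = dists_unindexed.set_index(dists_unindexed.distribution)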
In [3]:
file_name = 'montauk_combined_data.csv'
file_path = 'C:/Users/jdorvinen/Documents/Jared/Projects/East Hampton/met_data' #Path to your data here
#If your data is in an excel spreadsheet save it as a delimited text file (.csv formatted)
filename = os.path.join(file_path,file_name)
df = pd.read_fwf(filename,usecols=['length','inter','swel','hsig','tps','a_hsig','a_tps']).dropna()
df = df[df.hsig>3.1]
title = str(file_name)+"\n"
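The comment above asks for a delimited .csv, but the cell reads the file with pd.read_fwf (fixed-width text). If the file really is comma-separated, the same columns can be loaded with pd.read_csv instead; a sketch, assuming the same column names and the same hsig > 3.1 storm filter:
In [ ]:
# Alternative load, assuming montauk_combined_data.csv is comma-delimited
# rather than fixed-width; same columns and the same hsig > 3.1 filter.
df = pd.read_csv(filename,
                 usecols=['length', 'inter', 'swel', 'hsig', 'tps', 'a_hsig', 'a_tps']).dropna()
df = df[df.hsig > 3.1]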
In [22]:
pd.plotting.scatter_matrix(df.drop(labels=['a_hsig','a_tps'],axis=1))
Out[22]:
In [27]:
plt.scatter(df.length,df.tps)
Out[27]:
In [28]:
import sklearn as sk
In [4]:
def save_figure(path, name):
    # Save the current figure as a 200 dpi PNG with transparent face and edges.
    plt.savefig(os.path.join(path, name + ".png"),
                dpi=200,
                facecolor='none',
                edgecolor='none')
In [5]:
def find_nearest(array, value):
    # Return the index of the array element closest to the given value.
    idx = (np.abs(array - value)).argmin()
    return idx
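find_nearest returns the index of the array element closest to a target value; dist_fit uses it below to pick the 0.2% and 1% exceedance points off the fitted CDF. A quick check on a toy array (the values here are made up):
In [ ]:
# Toy check: 0.7 is closest to 0.75, so index 3 is returned.
probs = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
find_nearest(probs, 0.7)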
In [6]:
def dist_fit(name, dist_name, bins, parameter):
    global df
    # Initialize figure and set dimensions
    fig = plt.figure(figsize=(18, 6))
    gs = gridspec.GridSpec(2, 2)
    ax1 = fig.add_subplot(gs[:, 0])
    ax3 = fig.add_subplot(gs[:, 1])
    ax1.set_title(title, fontsize=20)
    # Remove the plot frame lines. They are unnecessary chartjunk.
    ax1.spines["top"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    ax3.spines["top"].set_visible(False)
    ax3.spines["right"].set_visible(False)
    # Ensure that the axis ticks only show up on the bottom and left of the plot.
    # Ticks on the right and top of the plot are generally unnecessary chartjunk.
    ax1.get_xaxis().tick_bottom()
    ax1.get_yaxis().tick_left()
    ax3.get_xaxis().tick_bottom()
    ax3.get_yaxis().tick_left()
    # Make sure your axis ticks are large enough to be easily read.
    # You don't want your viewers squinting to read your plot.
    ax1.tick_params(axis="both", which="both", bottom=False, top=False,
                    labelbottom=True, left=True, right=False, labelleft=True, labelsize=14)
    ax3.tick_params(axis="both", which="both", bottom=False, top=False,
                    labelbottom=True, left=True, right=False, labelleft=True, labelsize=14)
    # Along the same vein, make sure your axis labels are large
    # enough to be easily read as well. Make them slightly larger
    # than your axis tick labels so they stand out.
    ax1.set_xlabel(parameter, fontsize=16)
    ax1.set_ylabel("Frequency of occurrence", fontsize=16)
    ax3.set_xlabel(parameter, fontsize=16)
    ax3.set_ylabel("Exceedance Probability", fontsize=16)
    # Set up histogram bins and plotting ranges
    size = len(df[parameter])
    max_val = 1.1 * max(df[parameter])
    min_val = min(df[parameter])
    range_val = max_val - min_val
    binsize = range_val / bins
    x0 = np.arange(min_val, max_val, range_val * 0.0001)
    x1 = np.arange(min_val, max_val, binsize)
    y1 = df[parameter]
    # Set axis limits
    ax1.set_xlim(min_val, max_val)
    ax3.set_xlim(min_val, max_val)
    ax3.set_ylim(0, 1.1)
    # Plot empirical histograms (PDF as counts, CDF normalized)
    EPDF = ax1.hist(y1, bins=x1, color='w')
    ECDF = ax3.hist(y1, bins=x1, color='w', density=True, cumulative=True)
    # Fit the selected distribution
    dist = getattr(scipy.stats, dist_name)
    param = dist.fit(y1)
    pdf_fitted = dist.pdf(x0, *param[:-2], loc=param[-2], scale=param[-1]) * size * binsize
    cdf_fitted = dist.cdf(x0, *param[:-2], loc=param[-2], scale=param[-1])
    # Checking goodness of fit
    #ks_fit = stats.kstest(pdf_fitted,dist_name) # Kolmogorov-Smirnov test: returns [KS stat (D,D+,orD-),pvalue]
    #print(ks_fit)
    # Find the locations of the 0.002 and 0.01 exceedance probability events
    FiveHundInd = find_nearest(cdf_fitted, 0.998)
    OneHundInd = find_nearest(cdf_fitted, 0.990)
    # Plot the fitted pdf and cdf
    ax1.plot(x0, pdf_fitted, linewidth=2, label=dist_name)
    ax3.plot(x0, cdf_fitted, linewidth=2, label=dist_name)
    # Update figure spacing
    gs.update(wspace=0.1, hspace=0.2)
    # Add a text box with the return values
    ax3.text(min_val + 0.1 * range_val, 1.1,
             dist_name.upper() + " distribution\n"
             + "\n"
             + "0.2% - value: " + str("%.2f" % x0[FiveHundInd]) + " meters\n"
             + "1.0% - value: " + str("%.2f" % x0[OneHundInd]) + " meters",
             fontsize=14
             )
    print(dists.loc[dist_name, 'description'] + "\n")
    param_names = (dist.shapes + ', loc, scale').split(', ') if dist.shapes else ['loc', 'scale']
    param_str = ', '.join(['{}={:0.2f}'.format(k, v) for k, v in zip(param_names, param)])
    dist_str = '{}({})'.format(dist_name, param_str)
    print(dist_str)
    plt.show()
    # Kolmogorov-Smirnov tests of the data against the fitted distribution
    print(stats.kstest(y1, dist_name, param, alternative='less'))
    print(stats.kstest(y1, dist_name, param, alternative='greater'))
    print(stats.kstest(y1, dist_name, param, alternative='two-sided'))
    return name
In [17]:
interact_manual(dist_fit, name=filename, dist_name=dist_list,bins=[25,100,5],parameter=['length','inter','swel','hsig','tps','a_hsig','a_tps'])
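If the ipywidgets controls are not available, dist_fit can be called directly. A minimal sketch: 'genextreme', 50 bins, and the 'hsig' column are just illustrative choices, and the distribution name must appear in the continuous_distributions_.csv index for the description lookup to succeed.
In [ ]:
# Direct call without the widget; distribution, bin count, and column are illustrative.
dist_fit(filename, 'genextreme', 50, 'hsig')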
Python
http://stackoverflow.com/questions/6615489/fitting-distributions-goodness-of-fit-p-value-is-it-possible-to-do-this-with/16651524#16651524
http://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-ones-with-scipy-python

Extreme wave statistics
http://drs.nio.org/drs/bitstream/handle/2264/4165/Nat_Hazards_64_223a.pdf;jsessionid=55AAEDE5A2BF3AA06C6CCB5CE3CBEBAD?sequence=1

List of available distributions
http://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions

Goodness of fit tests
http://statsmodels.sourceforge.net/stable/stats.html#goodness-of-fit-tests-and-measures
http://docs.scipy.org/doc/scipy/reference/stats.html#statistical-functions
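The kstest calls inside dist_fit pass the distribution name plus the full parameter tuple; an equivalent and slightly more explicit form tests the data against the frozen fitted distribution's cdf. A sketch, assuming df is loaded as above and using 'hsig' and genextreme only as examples:
In [ ]:
# KS goodness-of-fit test against a frozen fitted distribution (sketch).
y = df['hsig']
dist = scipy.stats.genextreme
params = dist.fit(y)      # (shape c, loc, scale)
frozen = dist(*params)
print(stats.kstest(y, frozen.cdf))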
In [ ]:
dist = getattr(scipy.stats, 'genextreme')
#param = dist.fit(y1)
Best-fitting distributions and their Kolmogorov-Smirnov results for each parameter:

| Parameter | Fitted distribution | KS statistic | p-value |
|-----------|---------------------|--------------|---------|
| Length  | genexpon(a=1.38, b=0.20, c=0.03, loc=3.00, scale=19.39) | 0.0901 | 0.0511 |
| Length  | genpareto(c=-0.02, loc=3.00, scale=14.16)               | 0.0900 | 0.0516 |
| Interim | foldcauchy(c=0.63, loc=25.94, scale=137.16)             | 0.0270 | 0.9971 |
| SWEL    | johnsonsb(a=-0.95, b=1.41, loc=-1.12, scale=1.90)       | 0.0296 | 0.9901 |
| HSIG    | genexpon(a=0.05, b=0.32, c=1.83, loc=3.05, scale=0.36)  | 0.0419 | 0.8312 |
| HSIG    | exponnorm(K=12.76, loc=3.16, scale=0.08)                | 0.0458 | 0.7411 |
| TPS     | recipinvgauss(mu=0.24, loc=5.60, scale=0.74)            | 0.1044 | 0.0147 |
| TPS     | genextreme(c=-0.08, loc=8.58, scale=1.31)               | 0.1131 | 0.0063 |
| a_hsig  | genexpon(a=0.22, b=2.34, c=7.13, loc=3.03, scale=1.09)  | 0.0477 | 0.7061 |
| a_hsig  | exponnorm(K=8.97, loc=3.11, scale=0.05)                 | 0.0506 | 0.6206 |
| a_tps   | recipinvgauss(mu=0.22, loc=5.90, scale=0.61)            | 0.0380 | 0.9056 |
| a_tps   | genextreme(c=-0.08, loc=8.55, scale=1.12)               | 0.0460 | 0.7365 |
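The 0.2% and 1% values that dist_fit reads off the fitted CDF can also be taken directly from a fitted distribution's percent-point function (inverse CDF). A sketch using the rounded genextreme parameters listed above for TPS, so the numbers are only indicative:
In [ ]:
# Return values via the inverse CDF, using the (rounded) genextreme fit for TPS above.
tps_fit = scipy.stats.genextreme(-0.08, loc=8.58, scale=1.31)  # shape c = -0.08
print("0.2% value: %.2f" % tps_fit.ppf(0.998))
print("1.0% value: %.2f" % tps_fit.ppf(0.990))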