In [1]:

    
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

Getting the observations for the selected stations



In [2]:

    
p = Path('../../data/ncdc')
observationsCSV = p.joinpath('observations_vlc.csv')
print ('Reading observations CSV')
dfObs = pd.read_csv(str(observationsCSV),index_col=0)
dfObs = dfObs[(dfObs.stn != 'stn')]
print ("{:,} observations".format(len(dfObs)))









    



Reading observations CSV
13,331 observations



In [3]:

    
dfObs.head()









    Out[3]:






  
    
      
      stn
      wban
      year
      monthday
      temp
      temp_count
      dewp
      dewp_count
      slp
      slp_count
      ...
      gust
      max
      max_flag
      min
      min_flag
      prcp
      prc_flag
      sndp
      frshtt
      koppen
    
  
  
    
      0
      82840
      99999
      1991
      101
      52.5
      24
      46.5
      24
      9999.9
      0
      ...
      999.9
      61.5
      NaN
      44.6
      *
      0.0
      D
      999.9
      0
      BSk
    
    
      1
      82840
      99999
      1991
      102
      47.7
      24
      40.9
      24
      9999.9
      0
      ...
      999.9
      64.4
      *
      35.6
      *
      0.0
      D
      999.9
      0
      BSk
    
    
      2
      82840
      99999
      1991
      103
      47.0
      24
      40.0
      24
      9999.9
      0
      ...
      999.9
      63.3
      NaN
      34.7
      NaN
      0.0
      D
      999.9
      0
      BSk
    
    
      3
      82840
      99999
      1991
      104
      48.5
      24
      37.0
      24
      9999.9
      0
      ...
      20.0
      57.2
      NaN
      39.2
      NaN
      0.0
      D
      999.9
      0
      BSk
    
    
      4
      82840
      99999
      1991
      105
      47.0
      24
      34.2
      24
      9999.9
      0
      ...
      999.9
      59.0
      *
      34.2
      NaN
      0.0
      D
      999.9
      0
      BSk
    
  

5 rows × 27 columns

Data management operations

Generate an index using the id station and the date



In [4]:

    
def getId(stn,wban):
    try:
        istn = int(stn)
        iwban = int(wban)
        return "{:0>6}-{:0>5}".format(istn,iwban)
    except ValueError:
        print("{}/{}".format(stn,wban))
        
def getStationByStnWban(stn,wban):
    try:
        koppen = scdfc.loc[getId(stn,wban)].koppen
    except KeyError:
        koppen = None
    return koppen   
def getDateTimeFromRow(row):
    try:
        iyear = int(row.year)
        imonth = int("{:0>4}".format(row.monthday)[0:2])
        iday = int("{:0>4}".format(row.monthday)[2:4])
        return  datetime(iyear,imonth,iday)
    except ValueError:
        return np.nan

dfObs['date'] = dfObs.apply(lambda row : getDateTimeFromRow(row),axis=1)
dfObs.set_index(['date'],inplace=True)

The frshtt column needs to be padded with zeros to get all the flags in the correct place. Then is possible to get the occurrence of different weather conditions



In [5]:

    
dfObs['frshtt']  = dfObs.apply(lambda row: "{:0>6}".format(row.frshtt),axis=1)
dfObs['fog']     = dfObs['frshtt'].apply(lambda row: row[0:1]=='1')
dfObs['rain']    = dfObs['frshtt'].apply(lambda row: row[1:2]=='1')
dfObs['snow']    = dfObs['frshtt'].apply(lambda row: row[2:3]=='1')
dfObs['hail']    = dfObs['frshtt'].apply(lambda row: row[3:4]=='1')
dfObs['thunder'] = dfObs['frshtt'].apply(lambda row: row[4:5]=='1')
dfObs['tornado'] = dfObs['frshtt'].apply(lambda row: row[5:6]=='1')

Recode the temperatures columns, replacing the NaN values and afterwards as numerics



In [6]:

    
dfObs['tempC'] = dfObs['temp'].replace('99.9', np.nan)
dfObs['maxC']  = dfObs['max'].replace('99.9', np.nan)
dfObs['minC']  = dfObs['min'].replace('99.9', np.nan)
dfObs['prcp']  = dfObs['min'].replace('99.99', np.nan)
dfObs['mxspd'] = dfObs['mxspd'].replace('999.9',np.nan)
dfObs['slp']   = dfObs['slp'].replace('9999.9',np.nan)

dfObs['tempC'] = pd.to_numeric(dfObs['tempC'])
dfObs['maxC']  = pd.to_numeric(dfObs['maxC'])
dfObs['minC']  = pd.to_numeric(dfObs['minC']) 
dfObs['prcp']  = pd.to_numeric(dfObs['prcp']) 
dfObs['mxspd'] = pd.to_numeric(dfObs['mxspd'])
dfObs['slp']   = pd.to_numeric(dfObs['slp'])
dfObs['visib'] = pd.to_numeric(dfObs['visib'])

def FtoC(f):
    return (f-32)*5/9

dfObs['tempC']= dfObs['tempC'].apply(lambda temp: FtoC(temp))
dfObs['maxC'] = dfObs['maxC'].apply(lambda temp: FtoC(temp))
dfObs['minC'] = dfObs['minC'].apply(lambda temp: FtoC(temp))



In [8]:

    
dfObs[['tempC','maxC','minC','prcp','mxspd','slp','visib']].describe()









    Out[8]:






  
    
      
      tempC
      maxC
      minC
      prcp
      mxspd
      slp
      visib
    
  
  
    
      count
      13331.000000
      13330.000000
      13331.000000
      13331.000000
      13317.000000
      4779.000000
      13331.000000
    
    
      mean
      17.114712
      22.761665
      12.062036
      53.711665
      13.966757
      1017.453338
      6.317035
    
    
      std
      6.023689
      6.110861
      6.213491
      11.184284
      6.173005
      6.455067
      1.689609
    
    
      min
      0.611111
      5.000000
      -6.000000
      21.200000
      1.900000
      991.800000
      0.100000
    
    
      25%
      12.166667
      18.000000
      7.000000
      44.600000
      9.900000
      1013.200000
      5.400000
    
    
      50%
      16.722222
      22.611111
      12.000000
      53.600000
      12.000000
      1017.300000
      6.400000
    
    
      75%
      22.555556
      28.000000
      17.611111
      63.700000
      15.900000
      1021.500000
      7.000000
    
    
      max
      31.277778
      43.388889
      26.388889
      79.500000
      55.900000
      1038.800000
      18.400000



In [9]:

    
print("Observations from {} to {}".format(dfObs.index.min().strftime("%d/%m/%y"),dfObs.index.max().strftime("%d/%m/%y")))









    



Observations from 01/01/73 to 08/07/09

Univariate visualization



In [10]:

    
%matplotlib inline
df = dfObs.copy()

Quantitative variables



In [11]:

    
sns.distplot(df["tempC"].dropna(), kde=False);
plt.xlabel('Temperature (ºC)')
plt.title('Mean temperature')









    Out[11]:





<matplotlib.text.Text at 0x7f530159ba20>



In [12]:

    
sns.distplot(df["maxC"].dropna(), kde=False);
plt.xlabel('Temperature (ºC)')
plt.title('Max temperature')









    Out[12]:





<matplotlib.text.Text at 0x7f53014c49e8>



In [13]:

    
sns.distplot(df["minC"].dropna(), kde=False);
plt.xlabel('Temperature (ºC)')
plt.title('Min temperature')









    Out[13]:





<matplotlib.text.Text at 0x7f5301097be0>

Plotting the three variables together



In [14]:

    
sns.kdeplot(df.tempC, label="Mean")
sns.kdeplot(df.maxC, label="Max")
sns.kdeplot(df.minC, label="Min")
plt.legend();
plt.xlabel('Temperature (ºC)')
plt.title('Valencia station temperatures')









    Out[14]:





<matplotlib.text.Text at 0x7f530100a240>



In [15]:

    
sns.distplot(dfObs.mxspd.dropna(),kde=False)
plt.xlabel('Speed (knots)')
plt.title(
    'Wind max speed')









    Out[15]:





<matplotlib.text.Text at 0x7f5300f1e0b8>

Qualitative variables

Our qualitative variables are all True/False so they are categorical by definition



In [16]:

    
sns.countplot(x="rain", data=df);
plt.xlabel('It rained?')
plt.title('Raining days')









    Out[16]:





<matplotlib.text.Text at 0x7f5300e2fa58>



In [17]:

    
sns.countplot(x="fog", data=df);
plt.xlabel('Fog recorded?')
plt.title('Foggy days')









    Out[17]:





<matplotlib.text.Text at 0x7f5300e0a080>

Bivariate visualizations

Let's compare temperature and rainy days (quantitative to qualitative)



In [18]:

    
sns.factorplot(x="rain", y="tempC", data=df, kind="bar", ci=None)
plt.xlabel('It rained?')
plt.ylabel('Mean temperature')
plt.title('Rainy days accross temperatures')









    Out[18]:





<matplotlib.text.Text at 0x7f5300d68e80>

What about pressure and rainy days?



In [19]:

    
sns.factorplot(x="rain", y="slp", data=df, kind="bar", ci=None)
plt.xlabel('It rained?')
plt.ylabel('Mean pressure')
plt.title('Rainy days against sea level pressure')









    Out[19]:





<matplotlib.text.Text at 0x7f52fcf9a208>



In [20]:

    
sns.factorplot(x="rain", y="visib", data=df, kind="bar", ci=None)
plt.xlabel('It rained?')
plt.ylabel('Visibility (miles)')
plt.title('Rainy days against visibility (in miles)')









    Out[20]:





<matplotlib.text.Text at 0x7f52fcf03710>

Let's compare sea level presure and temperatures using a scatter plot



In [21]:

    
sns.regplot(x="tempC", y="slp", data=df)
plt.xlabel('Mean temperature')
plt.ylabel('Sea Level Pressure')
plt.title('Scatterplot for temperatures aganist sea level pressure')









    Out[21]:





<matplotlib.text.Text at 0x7f5300cfc6d8>

What about using the measured precipitations?



In [22]:

    
sns.regplot(x="prcp", y="slp", data=df)
plt.xlabel('Precipitation')
plt.ylabel('Sea Level Pressure')
plt.title('Scatterplot for precipitation aganist sea level pressure')









    Out[22]:





<matplotlib.text.Text at 0x7f5300c9b3c8>



In [23]:

    
sns.regplot(x="prcp", y="visib", data=df)
plt.xlabel('Precipitation')
plt.ylabel('Visibility (miles)')
plt.title('Scatterplot for precipitation aganist visibility')









    Out[23]:





<matplotlib.text.Text at 0x7f52fd251780>



In [24]:

    
sns.jointplot(df.tempC, df.slp, kind="kde", size=7, space=0)
plt.xlabel('Mean temperature')
plt.ylabel('Sea Level Pressure')









    Out[24]:





<matplotlib.text.Text at 0x7f52fd3accf8>



In [ ]:

	stn	wban	year	monthday	temp	temp_count	dewp	dewp_count	slp	...	gust	max	max_flag	min	min_flag	prc_flag	sndp	koppen
0	82840	99999	1991	101	52.5	24	46.5	24	9999.9	...	999.9	61.5	NaN	44.6	*	D	999.9	BSk
1	82840	99999	1991	102	47.7	24	40.9	24	9999.9	...	999.9	64.4	*	35.6	*	D	999.9	BSk
2	82840	99999	1991	103	47.0	24	40.0	24	9999.9	...	999.9	63.3	NaN	34.7	NaN	D	999.9	BSk
3	82840	99999	1991	104	48.5	24	37.0	24	9999.9	...	20.0	57.2	NaN	39.2	NaN	D	999.9	BSk
4	82840	99999	1991	105	47.0	24	34.2	24	9999.9	...	999.9	59.0	*	34.2	NaN	D	999.9	BSk

	tempC	maxC	minC	prcp	mxspd	slp	visib
count	13331.000000	13330.000000	13331.000000	13331.000000	13317.000000	4779.000000	13331.000000
mean	17.114712	22.761665	12.062036	53.711665	13.966757	1017.453338	6.317035
std	6.023689	6.110861	6.213491	11.184284	6.173005	6.455067	1.689609
min	0.611111	5.000000	-6.000000	21.200000	1.900000	991.800000	0.100000
25%	12.166667	18.000000	7.000000	44.600000	9.900000	1013.200000	5.400000
50%	16.722222	22.611111	12.000000	53.600000	12.000000	1017.300000	6.400000
75%	22.555556	28.000000	17.611111	63.700000	15.900000	1021.500000	7.000000
max	31.277778	43.388889	26.388889	79.500000	55.900000	1038.800000	18.400000