In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Locate the Valencia observations file inside the NCDC data folder.
p = Path('../../data/ncdc')
observationsCSV = p / 'observations_vlc.csv'

print('Reading observations CSV')
dfObs = pd.read_csv(str(observationsCSV), index_col=0)

# Keep only rows whose stn field is not the literal text 'stn'
# (presumably header lines repeated inside the file — TODO confirm).
is_data_row = dfObs['stn'] != 'stn'
dfObs = dfObs.loc[is_data_row]

print("{:,} observations".format(len(dfObs)))
In [3]:
# Preview the first rows of the loaded observations.
dfObs.head()
Out[3]:
Generate an index using the station id and the date
In [4]:
def getId(stn, wban):
    """Build the zero-padded ``<stn>-<wban>`` station identifier.

    Both parts are converted with ``int``; the station number is left-padded
    with zeros to 6 characters and the WBAN number to 5.

    Returns the identifier string, or ``None`` (after printing the offending
    ``stn/wban`` pair) when either part raises ``ValueError`` on conversion.
    """
    try:
        station_number = int(stn)
        wban_number = int(wban)
    except ValueError:
        # Report the pair that could not be parsed and fall through to None.
        print("{}/{}".format(stn, wban))
        return None
    return "{:0>6}-{:0>5}".format(station_number, wban_number)
def getStationByStnWban(stn, wban):
    """Look up the ``koppen`` attribute of a station in ``scdfc``.

    The row is selected with ``scdfc.loc`` using the identifier produced by
    ``getId(stn, wban)``. Returns ``None`` when that lookup raises ``KeyError``.

    NOTE(review): ``scdfc`` is not defined in any cell visible here; it must
    exist in the kernel before this function is called — confirm its origin.
    """
    try:
        return scdfc.loc[getId(stn, wban)].koppen
    except KeyError:
        return None
def getDateTimeFromRow(row):
    """Turn a row's ``year`` and ``monthday`` fields into a ``datetime``.

    ``monthday`` is left-padded with zeros to four characters and read as
    ``MMDD`` (so 315 becomes "0315": month 03, day 15).

    Returns ``np.nan`` when any part cannot be parsed as an integer or the
    resulting date is invalid (both surface as ``ValueError``).
    """
    try:
        year = int(row.year)
        padded = "{:0>4}".format(row.monthday)
        month = int(padded[:2])
        day = int(padded[2:4])
        return datetime(year, month, day)
    except ValueError:
        return np.nan
# Derive one date per observation row, then make that column the index.
dfObs['date'] = dfObs.apply(getDateTimeFromRow, axis=1)
dfObs = dfObs.set_index(['date'])
The frshtt column needs to be padded with zeros so that every flag sits in the correct position. Then it is possible to get the occurrence of the different weather conditions.
In [5]:
# Left-pad every frshtt code with zeros to six characters so that each flag
# sits at a fixed position in the string.
frshtt_padded = dfObs['frshtt'].map("{:0>6}".format)
dfObs['frshtt'] = frshtt_padded

# One boolean column per flag: True when the character at that position is '1'.
flag_names = ['fog', 'rain', 'snow', 'hail', 'thunder', 'tornado']
for position, flag in enumerate(flag_names):
    dfObs[flag] = frshtt_padded.str.slice(position, position + 1) == '1'
Recode the temperature columns, replacing the missing-value sentinels with NaN and then converting the columns to numeric types
In [6]:
# Mask the missing-value sentinel strings with NaN, then convert each
# measurement column to a numeric dtype.
# NOTE(review): the sentinels are compared as strings, so they only match
# while the raw columns still hold text — confirm against the CSV.
# NOTE(review): the temperature sentinel used here is '99.9'; verify it
# against the dataset documentation.
dfObs['tempC'] = pd.to_numeric(dfObs['temp'].replace('99.9', np.nan))
dfObs['maxC'] = pd.to_numeric(dfObs['max'].replace('99.9', np.nan))
dfObs['minC'] = pd.to_numeric(dfObs['min'].replace('99.9', np.nan))

# Precipitation is cleaned from its own column (like mxspd and slp), not from
# 'min', so the precipitation plots really show precipitation.
# NOTE(review): if raw prcp values carry a trailing quality-flag letter,
# strip it before the numeric conversion — confirm against the CSV.
dfObs['prcp'] = pd.to_numeric(dfObs['prcp'].replace('99.99', np.nan))
dfObs['mxspd'] = pd.to_numeric(dfObs['mxspd'].replace('999.9', np.nan))
dfObs['slp'] = pd.to_numeric(dfObs['slp'].replace('9999.9', np.nan))

# NOTE(review): visib is converted without masking any sentinel value —
# confirm whether missing visibility is encoded in the data.
dfObs['visib'] = pd.to_numeric(dfObs['visib'])
def FtoC(f):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius."""
    above_freezing = f - 32
    return above_freezing * 5 / 9
# The three temperature columns are converted from Fahrenheit to Celsius
# element by element.
for column in ('tempC', 'maxC', 'minC'):
    dfObs[column] = dfObs[column].apply(FtoC)
In [8]:
# Summary statistics for the recoded numeric measurements.
summary_columns = ['tempC', 'maxC', 'minC', 'prcp', 'mxspd', 'slp', 'visib']
dfObs[summary_columns].describe()
Out[8]:
In [9]:
# Report the first and last observation dates found in the index.
date_format = "%d/%m/%y"
first_day = dfObs.index.min().strftime(date_format)
last_day = dfObs.index.max().strftime(date_format)
print("Observations from {} to {}".format(first_day, last_day))
In [10]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Work on a copy so the plotting cells below do not touch dfObs.
df = dfObs.copy()
In [11]:
# Histogram (no KDE curve) of the mean temperature, missing values removed.
mean_temps = df["tempC"].dropna()
ax = sns.distplot(mean_temps, kde=False)
ax.set_xlabel('Temperature (ºC)')
ax.set_title('Mean temperature')
Out[11]:
In [12]:
# Histogram (no KDE curve) of the maximum temperature, missing values removed.
max_temps = df["maxC"].dropna()
ax = sns.distplot(max_temps, kde=False)
ax.set_xlabel('Temperature (ºC)')
ax.set_title('Max temperature')
Out[12]:
In [13]:
# Histogram (no KDE curve) of the minimum temperature, missing values removed.
min_temps = df["minC"].dropna()
ax = sns.distplot(min_temps, kde=False)
ax.set_xlabel('Temperature (ºC)')
ax.set_title('Min temperature')
Out[13]:
Plotting the three variables together
In [14]:
# Overlay the density estimates of the mean, max and min temperatures.
# Missing values are dropped first, as in the histogram cells, so the
# estimate does not depend on how the installed seaborn treats NaN.
for column, label in (('tempC', 'Mean'), ('maxC', 'Max'), ('minC', 'Min')):
    sns.kdeplot(df[column].dropna(), label=label)
plt.legend();
plt.xlabel('Temperature (ºC)')
plt.title('Valencia station temperatures')
Out[14]:
In [15]:
# Histogram (no KDE curve) of the maximum wind speed, missing values removed.
# NOTE(review): this cell reads dfObs directly, whereas the neighbouring
# plotting cells read the df copy.
max_wind = dfObs.mxspd.dropna()
ax = sns.distplot(max_wind, kde=False)
ax.set_xlabel('Speed (knots)')
ax.set_title('Wind max speed')
Out[15]:
Our qualitative variables are all True/False so they are categorical by definition
In [16]:
# Number of observations with and without the rain flag.
ax = sns.countplot(x="rain", data=df)
ax.set_xlabel('It rained?')
ax.set_title('Raining days')
Out[16]:
In [17]:
# Number of observations with and without the fog flag.
ax = sns.countplot(x="fog", data=df)
ax.set_xlabel('Fog recorded?')
ax.set_title('Foggy days')
Out[17]:
Let's compare temperature and rainy days (quantitative to qualitative)
In [18]:
# Bar chart of tempC grouped by the rain flag, without confidence intervals.
sns.factorplot(x="rain", y="tempC", data=df, kind="bar", ci=None)
# Label the axes that the figure-level call left current.
ax = plt.gca()
ax.set_xlabel('It rained?')
ax.set_ylabel('Mean temperature')
ax.set_title('Rainy days accross temperatures')
Out[18]:
What about pressure and rainy days?
In [19]:
# Bar chart of slp grouped by the rain flag, without confidence intervals.
sns.factorplot(x="rain", y="slp", data=df, kind="bar", ci=None)
# Label the axes that the figure-level call left current.
ax = plt.gca()
ax.set_xlabel('It rained?')
ax.set_ylabel('Mean pressure')
ax.set_title('Rainy days against sea level pressure')
Out[19]:
In [20]:
# Bar chart of visib grouped by the rain flag, without confidence intervals.
sns.factorplot(x="rain", y="visib", data=df, kind="bar", ci=None)
# Label the axes that the figure-level call left current.
ax = plt.gca()
ax.set_xlabel('It rained?')
ax.set_ylabel('Visibility (miles)')
ax.set_title('Rainy days against visibility (in miles)')
Out[20]:
Let's compare sea level pressure and temperatures using a scatter plot
In [21]:
# Scatter plot with a fitted regression line: tempC on x, slp on y.
ax = sns.regplot(x="tempC", y="slp", data=df)
ax.set_xlabel('Mean temperature')
ax.set_ylabel('Sea Level Pressure')
ax.set_title('Scatterplot for temperatures aganist sea level pressure')
Out[21]:
What about using the measured precipitation?
In [22]:
# Scatter plot with a fitted regression line: prcp on x, slp on y.
ax = sns.regplot(x="prcp", y="slp", data=df)
ax.set_xlabel('Precipitation')
ax.set_ylabel('Sea Level Pressure')
ax.set_title('Scatterplot for precipitation aganist sea level pressure')
Out[22]:
In [23]:
# Scatter plot with a fitted regression line: prcp on x, visib on y.
ax = sns.regplot(x="prcp", y="visib", data=df)
ax.set_xlabel('Precipitation')
ax.set_ylabel('Visibility (miles)')
ax.set_title('Scatterplot for precipitation aganist visibility')
Out[23]:
In [24]:
# Joint KDE of mean temperature against sea level pressure.
# NOTE(review): `size` is the figure-size keyword of the seaborn API this
# notebook was written for; newer releases name it `height`.
grid = sns.jointplot(x=df.tempC, y=df.slp, kind="kde", size=7, space=0)
# Label the joint axes through the grid itself: plt.xlabel/plt.ylabel act on
# whichever axes is current, which may be one of the marginal axes.
grid.set_axis_labels('Mean temperature', 'Sea Level Pressure');
Out[24]:
In [ ]: