In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
from IPython.display import HTML
import requests
# plotting options
%matplotlib inline
#pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
My first step is to find interesting stations. One of the requirements is good time-range coverage, so I will evaluate the downloaded files.
Reading the station coordinates CSV
In [2]:
stations = pd.read_csv(
    '../../data/ncdc/ish-history.csv',
    skiprows=1,
    names=['usaf', 'wban', 'stname', 'ctry', 'fips', 'state', 'call', 'lat', 'lon', 'elev'],
    dtype=object)
print(len(stations))
stations[:3]
Out[2]:
Get correct coordinates by dividing lat/lon by 1000 and the elevation by 10. Then generate a unique index that combines the USAF and WBAN identifiers of each station. Finally, drop any station that doesn't have a usable location.
In [3]:
stations['lat'] = stations.lat.astype(float) / 1000
stations['lon'] = stations.lon.astype(float) / 1000
stations['elev'] = stations.elev.astype(float) / 10
In [4]:
stations['id'] = stations.apply(lambda row: "{}-{}".format(row.usaf, row.wban), axis=1)
stations.set_index(['id'], inplace=True)
stations.dropna(how='any', subset=['lat', 'lon'], inplace=True)
stations.head()
Out[4]:
In [5]:
stations.tail()
Out[5]:
Instantiate a Path pointing to the folder where the NCDC data has been downloaded
In [6]:
p = Path('../../data/ncdc')
Check if we already have the stations CSV, or generate it from the files. To generate the CSV, each file name will be inspected, and a full read of every file will also be performed to count the number of observations for each station.
In [7]:
gsodCSV = p.joinpath('gsod.csv')
if not gsodCSV.exists():
    ops = p.joinpath('raw').joinpath('gsod').glob('**/*.op')
    agsod = []
    for op in ops:
        try:
            # file names follow the USAF-WBAN-YEAR.op pattern
            data = op.name.replace('.op', '').split('-')
            # full read of the file to count observations, minus the header line
            data.append(sum(1 for line in op.open(encoding='utf8')) - 1)
            agsod.append(data)
        except UnicodeDecodeError:
            print(op.absolute())
    dfGsod = pd.DataFrame(data=agsod, columns=['usaf', 'wban', 'year', 'obs'])
    dfGsod['id'] = dfGsod.apply(lambda row: "{}-{}".format(row.usaf, row.wban), axis=1)
    dfGsod = dfGsod.set_index(['id'])
    dfGsod[['year', 'obs']].to_csv(str(gsodCSV))
In [8]:
print('Reading existing stations per year CSV')
dfGsod = pd.read_csv(str(gsodCSV), index_col=['id'])
print("{:,} station files".format(len(dfGsod)))
print("{:,} observations".format(dfGsod.obs.sum()))
dfGsod.head()
Out[8]:
Now study this dataframe, grouping by id to see the total number of years recorded per station, plus the maximum and minimum year
In [9]:
year_groups = dfGsod[['year']].groupby(level=0)
year_count = year_groups.count()
year_max = year_groups.max()
year_min = year_groups.min()
years = pd.concat([year_count, year_max, year_min], axis=1)
years.columns = ['count', 'max', 'min']
years.head()
Out[9]:
In [10]:
obs_groups = dfGsod[['obs']].groupby(level=0)
obs_count = obs_groups.sum()
obs_count.head()
Out[10]:
Now we can check that the indexes of these data frames are unique and then join them to retrieve only the stations with observations
In [11]:
stations.index.is_unique and years.index.is_unique and obs_count.index.is_unique
Out[11]:
In [12]:
scdf = pd.concat([stations, years, obs_count], axis=1, join='inner')
scdf.head()
Out[12]:
Finally, we can export this dataset, study it, and filter the appropriate stations for this first iteration
In [13]:
scdf.to_csv('stations.csv')
The next step is done at CartoDB, an analysis and mapping service. From the exported stations.csv, I've created a map of the stations that have at least 40 years of data, are below 500 meters elevation, and intersect the warm regions according to the Köppen-Geiger classification. The map is interactive: you can zoom in and click on stations to check their attributes.
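As a quick sanity check, the non-spatial part of that filter (at least 40 years of data, elevation below 500 meters) can also be reproduced locally in pandas. This is just a sketch; the intersection with the Köppen-Geiger regions still needs the spatial join done at CartoDB:

# non-spatial part of the CartoDB filter, applied to the local dataframe
warm_candidates = scdf[(scdf['count'] >= 40) & (scdf['elev'] < 500)]
len(warm_candidates)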
In [14]:
HTML('<iframe width="100%" height="520" frameborder="0" src="https://team.cartodb.com/u/jsanz/viz/bd9456f8-6530-11e5-b18a-0e0c41326911/embed_map" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>')
Out[14]:
To get those stations back into this notebook I will use the CartoDB SQL API, executing the following SQL with the CSV output format so the result can be read directly into a pandas DataFrame.
WITH regions AS (
  SELECT *,
    CASE gridcode
      WHEN 31 THEN 'Cfa'
      WHEN 32 THEN 'Cfb'
      WHEN 33 THEN 'Cfc'
      WHEN 34 THEN 'Csa'
      WHEN 35 THEN 'Csb'
      WHEN 36 THEN 'Csc'
      ELSE 'NA'
    END cat
  FROM climate_regions
  WHERE gridcode >= 31 AND gridcode <= 36
), wstations AS (
  SELECT s.id, r.cat
  FROM jsanz.stations s
  JOIN regions r ON ST_Intersects(s.the_geom, r.the_geom)
  WHERE count >= 40 AND elev < 500
)
SELECT id, string_agg(cat, ', ') cat FROM wstations GROUP BY id
In [15]:
query = 'https://jsanz.cartodb.com/api/v1/sql?format=csv&q=WITH+regions+AS+(%0A++SELECT+*,%0A++CASE+GRIDCODE+%0A++++WHEN+31+THEN+%27Cfa%27%0A++++WHEN+32+THEN+%27Cfb%27%0A++++WHEN+33+THEN+%27Cfc%27%0A++++WHEN+34+THEN+%27Csa%27%0A++++WHEN+35+THEN+%27Csb%27%0A++++WHEN+36+THEN+%27Csc%27%0A++++ELSE+%27NA%27+%0A++END+cat%0A++FROM+climate_regions+%0A++WHERE+gridcode+>%3D+31+and+gridcode+<%3D+36%0A),+wstations+AS+(+%0A++SELECT+s.id,+r.cat%0A++FROM+jsanz.stations+s+%0A++JOIN+regions+r+ON+ST_Intersects(s.the_geom,r.the_geom)%0A++WHERE+count+>%3D+40+AND+elev+<+500%0A)%0A++select+id,+string_agg(cat,%27,+%27)+cat+from+wstations+group+by+id'
selections = pd.read_csv(query,index_col=['id'])
selections.head()
Out[15]:
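By the way, hand-encoding the query string as above is error-prone. A minimal sketch of the same request using the requests library already imported at the top, whose params argument handles all the percent-encoding (same endpoint, same query):

import io

# the plain SQL from above, kept readable in a triple-quoted string
sql = """
WITH regions AS (
  SELECT *,
    CASE gridcode
      WHEN 31 THEN 'Cfa'
      WHEN 32 THEN 'Cfb'
      WHEN 33 THEN 'Cfc'
      WHEN 34 THEN 'Csa'
      WHEN 35 THEN 'Csb'
      WHEN 36 THEN 'Csc'
      ELSE 'NA'
    END cat
  FROM climate_regions
  WHERE gridcode >= 31 AND gridcode <= 36
), wstations AS (
  SELECT s.id, r.cat
  FROM jsanz.stations s
  JOIN regions r ON ST_Intersects(s.the_geom, r.the_geom)
  WHERE count >= 40 AND elev < 500
)
SELECT id, string_agg(cat, ', ') cat FROM wstations GROUP BY id
"""

# requests encodes the query for us; the response body is the CSV text
r = requests.get('https://jsanz.cartodb.com/api/v1/sql',
                 params={'format': 'csv', 'q': sql})
selections = pd.read_csv(io.StringIO(r.text), index_col=['id'])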
Check if the index of the imported dataframe is unique and then join it with our stations dataset. This join will keep only the rows present in both data frames, using the parameter join='inner'.
In [16]:
selections.index.is_unique
Out[16]:
In [17]:
scdfc = pd.concat([scdf,selections],axis=1,join='inner')
scdfc.head()
Out[17]:
Note: this is the formal answer for this week's assignment; all the previous work was meant to get this dataset ready to work with. In the coming weeks I will load the actual observation data and do more coding with it, but for now this is enough for the tasks asked.
Frequency table of the selected stations by country, keeping only countries with more than 20 stations.
In [18]:
scdfc_group_ctry = scdfc.groupby(['ctry']).size()
scdfc_group_ctry[scdfc_group_ctry > 20]
Out[18]:
Frequency table for the climate categories. Some climate areas overlap, which is why there are three records with two assigned categories.
In [19]:
scdfc_group_cat = scdfc.groupby(['cat']).size()
scdfc_group_cat
Out[19]:
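If those combined values get in the way later on, the cat column can be split back into one category per row with pandas string methods; just a sketch:

# split 'Cfa, Cfb' style values and count each category on its own
single_cats = scdfc['cat'].str.split(', ', expand=True).stack()
single_cats.value_counts()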
Frequency table for the number of years recorded.
In [20]:
scdfc_group_count = scdfc.groupby(['count']).size()
scdfc_group_count
Out[20]:
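To close, that last distribution is easier to grasp as a chart. Since %matplotlib inline is already active, a one-line histogram sketch renders right in the notebook:

# distribution of years of coverage among the selected stations
scdfc['count'].hist(bins=20)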