Read CSV files into pandas dataframes
In [1]:
import pandas as pd
# paths to the raw CSVs: the 50 busiest airports and the full ICAO registry
top_airport_csv = 'hw_5_data/top_airports.csv'
ICAO_airport_csv = 'hw_5_data/ICAO_airports.csv'
top50_df = pd.read_csv(top_airport_csv)
icao_df = pd.read_csv(ICAO_airport_csv)
# merge two data frames to obtain info for the top 50 airports
# (inner join: top50_df.ICAO matched against icao_df.ident)
merged_df = pd.merge(top50_df, icao_df, how='inner', left_on='ICAO', right_on='ident')
Select columns for city, airport, latitude and longitude info
In [2]:
# keep only the columns used downstream: names, ICAO code, and coordinates
call_df = merged_df[['City', 'Airport', 'ICAO', 'latitude_deg', 'longitude_deg']]
call_df.head(3)
Out[2]:
In [3]:
import sqlite3, os.path
# Start from a clean database: remove any leftover file first.
# A guarded os.remove replaces `!rm`, which is shell-specific and
# prints an error when the file does not exist.
if os.path.isfile("hw_5_data/call.db"):
    os.remove("hw_5_data/call.db")
connection = sqlite3.connect("hw_5_data/call.db")
cursor = connection.cursor()
# one row per airport: name, city, ICAO code, and coordinates
sql_cmd = """CREATE TABLE airports (id INTEGER PRIMARY KEY AUTOINCREMENT,
airport TEXT, city TEXT, icao TEXT, latitude FLOAT, longitude FLOAT)"""
cursor.execute(sql_cmd)
Out[3]:
In [4]:
# Insert every top-50 airport into the airports table.
# Parameterized placeholders (?) let sqlite handle quoting and typing;
# the previous string-formatted SQL broke on names containing quotes
# and was injection-prone.
insert_sql = """INSERT INTO airports (airport, city, icao,
latitude, longitude) VALUES (?, ?, ?, ?, ?)"""
for row in call_df.values:
    city, airport, icao, lat, lon = row
    cursor.execute(insert_sql, (airport, city, icao, float(lat), float(lon)))
connection.commit()
connection.close()
In [5]:
# Recreate the weather database from scratch; guard the removal so a
# missing file is not an error (`!rm` printed one and is shell-specific).
if os.path.isfile("hw_5_data/weather.db"):
    os.remove("hw_5_data/weather.db")
connection = sqlite3.connect("hw_5_data/weather.db")
cursor = connection.cursor()
# one row per (date, airport): temperature/humidity extremes and precipitation
sql_cmd = """CREATE TABLE weather (id INTEGER PRIMARY KEY AUTOINCREMENT, date DATE,
icao TEXT, min_temp INT, max_temp INT, min_hum INT, max_hum INT, prec FLOAT)"""
cursor.execute(sql_cmd)
connection.commit()
connection.close()
In [6]:
from util import write2csv
In [7]:
# ICAO codes of the top 50 airports, used as city identifiers throughout
icao_list = call_df['ICAO'].values
In [8]:
#-------------------------------------
# this is just a demo of write2csv
#
# expect it to be slow (one web query per city per day)
# a faster, parallel version is mentioned in the
# next cell
#-------------------------------------
# time range of just 1 day
tr_2010_mar = pd.date_range('20100301', '20100301')
fn = 'temp/2010_3.csv'
# only look at the top 10 cities to keep the demo short
# %time write2csv(tr_2010_mar, icao_list[:10], fn) # uncomment to see demo
In [9]:
import os.path
def check_files():
    """Sanity-check the downloaded weather data.

    Walks every date from 2008-01-01 to 2016-10-06 and verifies that
    hw_5_data/weather_data/<YYYY>/<YYYYMMDD>.csv exists and contains
    exactly 50 lines (one per top-50 airport).  Prints a diagnostic
    for each missing or suspicious file, or a single all-clear line.
    """
    tr = pd.date_range('20080101', '20161006')
    ok = True
    for date in tr:
        day = date.strftime('%Y%m%d')
        filename = 'hw_5_data/weather_data/' + date.strftime('%Y') + '/' + \
            day + '.csv'
        if not os.path.isfile(filename):
            print(day + ' is missing.')
            ok = False
            continue
        # `with` guarantees the handle is closed even if reading raises
        with open(filename) as f:
            num_lines = sum(1 for line in f)
        if num_lines != 50:
            ok = False
            print(day + ' may be corrupted, number of cities =/= 50.')
    if ok: print('no file corruption/missing')
In [10]:
# this takes about 15s
check_files()
First load all the data into a pandas DataFrame
In [11]:
from util import fetch_df
# Load every daily CSV into one DataFrame.  Collect the per-day frames in
# a list and concatenate once: appending inside the loop is quadratic and
# DataFrame.append was removed in pandas 2.0.
tr = pd.date_range('20080101', '20161006')
daily_frames = [fetch_df(date) for date in tr]
all_df = pd.concat(daily_frames, ignore_index=True)
# remove NaN: forward-fill from the previous day, then back-fill any
# leading gaps (fillna(method=...) is deprecated in modern pandas)
all_df = all_df.ffill().bfill()
In [12]:
connection = sqlite3.connect("hw_5_data/weather.db")
cursor = connection.cursor()
# insert data into the weather table; parameterized placeholders (?) let
# sqlite handle quoting and typing instead of fragile string formatting
insert_sql = """INSERT INTO weather (date, icao, min_temp, max_temp,
min_hum, max_hum, prec) VALUES (?, ?, ?, ?, ?, ?, ?)"""
for row in all_df.values:
    date, icao, min_temp, max_temp, min_hum, max_hum, prec = row
    date = pd.to_datetime(date).strftime('%m/%d/%Y')
    cursor.execute(insert_sql, (date, icao, int(min_temp), int(max_temp),
                                int(min_hum), int(max_hum), float(prec)))
connection.commit()
connection.close()
In [13]:
import numpy as np
import pandas as pd
import sqlite3
In [67]:
connection = sqlite3.connect("hw_5_data/weather.db")
cursor = connection.cursor()

def _fetch_city_series(icao):
    """Return (daily temp change, precipitation) arrays for one airport.

    temp change is mean daily temperature ((max+min)/2) minus the next
    day's value, computed via np.roll as in the original analysis.
    """
    # parameterized query: sqlite handles quoting of the ICAO code
    cursor.execute("SELECT max_temp, min_temp, prec FROM weather WHERE icao = ?", (icao,))
    info = np.array(cursor.fetchall())
    high_temp = np.array([int(temp) for temp in info[:, 0]])
    low_temp = np.array([int(temp) for temp in info[:, 1]])
    # average temperature
    temp = (high_temp + low_temp)/2
    prec = np.array([float(p) for p in info[:, 2]])
    # daily change in temperature
    temp_change = np.roll(temp, 1) - temp
    return temp_change, prec

def temp_prec_corr(n):
    """
    Find the correlation in temperature change (and precipitation) between
    every ordered pair of the top-50 cities with an N-day shift.

    Results are cached to hw_5_data/corr_<n>.npy; returns None when the
    cache already exists, otherwise [temp_corr_list, prec_corr_list].
    """
    import os.path
    # this is the file where we store the correlations
    filename = 'hw_5_data/corr_{}.npy'.format(n)
    # skip if .npy file already exists
    if os.path.isfile(filename):
        return
    # fetch each city's series exactly once -- the original re-ran the
    # city_1 query inside the inner loop, issuing O(n_cities^2) identical
    # SQL queries for the same results
    series = [_fetch_city_series(icao) for icao in icao_list]
    temp_corr_arr = []
    prec_corr_arr = []
    for temp_change_1, prec_1 in series:
        for temp_change_2, prec_2 in series:
            temp_corr_arr.append(np.corrcoef(temp_change_1, np.roll(temp_change_2, n))[1, 0])
            prec_corr_arr.append(np.corrcoef(prec_1, np.roll(prec_2, n))[1, 0])
    corr_n = [temp_corr_arr, prec_corr_arr]
    # save to .npy file for future use
    np.save('hw_5_data/corr_{}'.format(n), corr_n)
    return corr_n
In [69]:
# !rm -rf hw_5_data/corr_*.npy
# import multiprocessing
# pool = multiprocessing.Pool(processes=3)
# %time result = pool.map(temp_prec_corr, [1, 3, 7])
# pool.close()
# pool.join()
In [70]:
# load the cached correlation arrays; each file holds a (2, n_pairs) array
# with temperature correlations in row 0 and precipitation in row 1
temp_corr_1, prec_corr_1 = np.load('hw_5_data/corr_1.npy')
temp_corr_3, prec_corr_3 = np.load('hw_5_data/corr_3.npy')
temp_corr_7, prec_corr_7 = np.load('hw_5_data/corr_7.npy')
In [71]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib as mpl
%matplotlib inline
# mpl.style.use('ggplot')
# larger default title font for the correlation figures below
mpl.rcParams['axes.titlesize'] = 20
In [72]:
fig1, (ax1, ax2) = plt.subplots(1, 2, figsize=[10, 5])
# matrices are nc x nc: one row/column per city
nc = 50
panels = [(temp_corr_1, 'temp corr 1 day'), (prec_corr_1, 'precip corr 1 day')]
images = []
for ax, (corr, title) in zip((ax1, ax2), panels):
    images.append(ax.imshow(corr.reshape(nc, nc), interpolation='nearest', cmap='rainbow'))
    ax.set_title(title)
im1, im2 = images
# shared color bar along the right-hand edge
fig1.subplots_adjust(right=0.8)
cbar_ax = fig1.add_axes([0.85, 0.15, 0.05, 0.7])
fig1.colorbar(im1, cax=cbar_ax)
Out[72]:
In [73]:
fig3, (ax1, ax2) = plt.subplots(1, 2, figsize=[10, 5])
# matrices are nc x nc: one row/column per city
nc = 50
panels = [(temp_corr_3, 'temp corr 3 day'), (prec_corr_3, 'precip corr 3 day')]
images = []
for ax, (corr, title) in zip((ax1, ax2), panels):
    images.append(ax.imshow(corr.reshape(nc, nc), interpolation='nearest', cmap='rainbow'))
    ax.set_title(title)
im1, im2 = images
# shared color bar along the right-hand edge
fig3.subplots_adjust(right=0.8)
cbar_ax = fig3.add_axes([0.85, 0.15, 0.05, 0.7])
fig3.colorbar(im1, cax=cbar_ax)
Out[73]:
In [74]:
fig7, (ax1, ax2) = plt.subplots(1, 2, figsize=[10, 5])
# matrices are nc x nc: one row/column per city
nc = 50
panels = [(temp_corr_7, 'temp corr 7 day'), (prec_corr_7, 'precip corr 7 day')]
images = []
for ax, (corr, title) in zip((ax1, ax2), panels):
    images.append(ax.imshow(corr.reshape(nc, nc), interpolation='nearest', cmap='rainbow'))
    ax.set_title(title)
im1, im2 = images
# shared color bar along the right-hand edge
fig7.subplots_adjust(right=0.8)
cbar_ax = fig7.add_axes([0.85, 0.15, 0.05, 0.7])
fig7.colorbar(im1, cax=cbar_ax)
Out[74]:
In [75]:
def get_ntop_corr(corr_pairs, ntop = 10, nc = 50):
    """
    Summarize the NTOP most correlated city pairs.

    parameter
    ---------
    corr_pairs: flat array (length nc*nc) of correlations between every
        ordered pair of the NC cities for some weather variable.
    ntop: number of most correlated city pairs that we want to study
    nc: number of cities --> number of pairs is nc^2

    return
    ------
    pandas DataFrame with one row per top pair:
        city 1 / icao 1: first city in the pair and its airport ICAO code
        city 2 / icao 2: second city (whose weather 'is' predicted)
        distance: distance in km between the two cities
        diff_lon: absolute longitude difference in degrees
        corr: correlation coefficient
    """
    from util import lat_lon_2_distance
    from itertools import starmap

    # positions of the ntop largest correlations, highest first
    # (note: self-pairs and any NaN correlations are not filtered out)
    top_idx = np.argsort(corr_pairs)[::-1][:ntop]
    top_corr = corr_pairs[top_idx]
    # the flat index encodes (city1, city2) as city1 * nc + city2
    idx1 = top_idx // nc
    idx2 = top_idx % nc

    def pick(column, rows):
        # pull the requested call_df column for the given row positions
        return call_df[column][rows].values

    city1, icao1 = pick('City', idx1), pick('ICAO', idx1)
    city2, icao2 = pick('City', idx2), pick('ICAO', idx2)
    lat1, lon1 = pick('latitude_deg', idx1), pick('longitude_deg', idx1)
    lat2, lon2 = pick('latitude_deg', idx2), pick('longitude_deg', idx2)

    distances = np.array(list(starmap(lat_lon_2_distance,
                                      zip(lat1, lon1, lat2, lon2))))

    # build a new dataframe
    return pd.DataFrame({'city 1': city1,
                         'icao 1': icao1,
                         'city 2': city2,
                         'icao 2': icao2,
                         'distance': distances,
                         'diff_lon': np.abs(lon1 - lon2),
                         'corr': top_corr
                         })
In [76]:
# summary tables for the 1-day-shift correlations
temp_1_df = get_ntop_corr(temp_corr_1)
prec_1_df = get_ntop_corr(prec_corr_1)
In [77]:
# most correlated city pairs in temperature variation (1-day shift)
temp_1_df
Out[77]:
In [78]:
# most correlated city pairs in precipitation variation (1-day shift)
prec_1_df
Out[78]:
In [79]:
fig1, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=[12, 10])
# one scatter per panel: (axis, table, x-column, x-label, optional title)
for ax, df, xcol, xlabel, title in [
        (ax1, temp_1_df, 'distance', 'distance', 'temp corr with 1 day diff'),
        (ax2, prec_1_df, 'distance', 'distance', 'precipitation corr with 1 day diff'),
        (ax3, temp_1_df, 'diff_lon', 'longitude difference', None)]:
    ax.scatter(df[xcol], df['corr'], s=50)
    if title is not None:
        ax.set_title(title)
    ax.set_ylabel('corr')
    ax.set_xlabel(xlabel)
ax4.scatter(prec_1_df['diff_lon'], prec_1_df['corr'], s=50)
ax4.set_ylabel('corr')
ax4.set_xlabel('longitude difference')
Out[79]:
In [80]:
# summary tables for the 3-day-shift correlations
temp_3_df = get_ntop_corr(temp_corr_3)
prec_3_df = get_ntop_corr(prec_corr_3)
In [81]:
# most correlated city pairs in temperature variation (3-day shift)
temp_3_df
Out[81]:
In [82]:
# most correlated city pairs in precipitation variation (3-day shift)
prec_3_df
Out[82]:
In [85]:
fig3, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=[12, 10])
# one scatter per panel: (axis, table, x-column, x-label, optional title)
for ax, df, xcol, xlabel, title in [
        (ax1, temp_3_df, 'distance', 'distance', 'temp corr with 3 day diff'),
        (ax2, prec_3_df, 'distance', 'distance', 'precipitation corr with 3 day diff'),
        (ax3, temp_3_df, 'diff_lon', 'longitude difference', None)]:
    ax.scatter(df[xcol], df['corr'], s=50)
    if title is not None:
        ax.set_title(title)
    ax.set_ylabel('corr')
    ax.set_xlabel(xlabel)
ax4.scatter(prec_3_df['diff_lon'], prec_3_df['corr'], s=50)
ax4.set_ylabel('corr')
ax4.set_xlabel('longitude difference')
Out[85]:
In [86]:
# summary tables for the 7-day-shift correlations
temp_7_df = get_ntop_corr(temp_corr_7)
prec_7_df = get_ntop_corr(prec_corr_7)
# most correlated city pairs in temperature variation (7-day shift)
temp_7_df
Out[86]:
In [87]:
# most correlated city pairs in precipitation variation (7-day shift)
prec_7_df
Out[87]:
In [88]:
fig7, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=[12, 10])
# one scatter per panel: (axis, table, x-column, x-label, optional title)
for ax, df, xcol, xlabel, title in [
        (ax1, temp_7_df, 'distance', 'distance', 'temp corr with 7 day diff'),
        (ax2, prec_7_df, 'distance', 'distance', 'precipitation corr with 7 day diff'),
        (ax3, temp_7_df, 'diff_lon', 'longitude difference', None)]:
    ax.scatter(df[xcol], df['corr'], s=50)
    if title is not None:
        ax.set_title(title)
    ax.set_ylabel('corr')
    ax.set_xlabel(xlabel)
ax4.scatter(prec_7_df['diff_lon'], prec_7_df['corr'], s=50)
ax4.set_ylabel('corr')
ax4.set_xlabel('longitude difference')
Out[88]:
Trends: the scatter plots above relate the strength of the top pairwise correlations to the distance and longitude difference between cities, for 1-, 3- and 7-day shifts. The most correlated pairs appear to be geographically close, and the relationship weakens as the day shift grows.
Further interpretation of these results (e.g. filtering out self-pairs, which trivially have correlation 1) is left as follow-up work.