In [1]:
import numpy as np
import pandas as pd
import os, re
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the NEXRAD sample and turn the 'timestamp' strings into datetimes.
nexrad_csv = 'data/nexrad_testing_10k.csv'
nex_df = pd.read_csv(nexrad_csv)
nex_df['timestamp'] = pd.to_datetime(nex_df['timestamp'])

# Index the rows by time (the 'timestamp' column itself is kept as a column),
# then discard every row that contains at least one missing value.
time_index = pd.DatetimeIndex(nex_df['timestamp'])
nex_df = nex_df.set_index(time_index).dropna()

# Show the column dtypes on stdout and preview the first rows as the cell output.
print(nex_df.dtypes)
nex_df.head()
Out[2]:
In [3]:
# Select every column except the first and the last one.
# NOTE(review): presumably these are the per-zip-code precipitation columns and
# the two excluded columns are non-zip fields -- confirm against the CSV layout.
all_cols = nex_df.columns.values
zip_cols = all_cols[1:-1]

# Total each selected column over all rows, largest total first.
zip_precip = nex_df[zip_cols].sum().sort_values(ascending=False)

# One bar per selected column, in descending order of its total.
zip_precip.plot.bar()
Out[3]:
In [4]:
# Load the zip code area table.
zip_area_csv = 'data/zip_code_area.csv'
zip_area = pd.read_csv(zip_area_csv)

# Show the column dtypes on stdout and preview the first rows as the cell output.
print(zip_area.dtypes)
zip_area.head()
Out[4]:
In [8]:
# Preview the five largest totals (zip_precip is sorted in descending order).
zip_precip.head(n=5)
Out[8]:
The differences in total precipitation among the top zip codes seem pretty extreme, and could possibly be a result of the top zip code (60605) being much smaller than the others. To check whether this makes sense, total precipitation is plotted against zip code area (areas obtained from the city data portal). There doesn't seem to be a relationship between the two, though.
In [7]:
# Turn the precipitation totals Series into a two-column frame: the former
# index (column labels of nex_df) becomes 'zip', the totals become 'precip'.
column_names = {'index': 'zip', 0: 'precip'}
zip_precip_sum = (
    zip_precip
    .to_frame()
    .reset_index()
    .rename(columns=column_names)
)
zip_precip_sum.head()
Out[7]:
In [8]:
# Cast the 'zip' labels to integers before joining on them.
# NOTE(review): presumably zip_area['zip'] is an integer column -- confirm
# against the dtypes printed when zip_area was loaded.
zip_as_int = zip_precip_sum['zip'].astype(int)
zip_precip_sum['zip'] = zip_as_int

# Inner join (the pandas default): only zip codes present in both frames are kept.
zip_precip_area = zip_precip_sum.merge(zip_area, on='zip', how='inner')
zip_precip_area.head()
Out[8]:
In [9]:
# Compare total precipitation with zip code area, one point per zip code.
# A scatter plot is used instead of the default line plot: a line would join
# the zip codes in row order, which suggests a sequence that does not exist and
# hides whether the two quantities are actually related.
zip_precip_area.plot(
    kind='scatter',
    x='precip',
    y='shape_area',
    title='Zip code area vs. total precipitation',
)
Out[9]:
In [ ]: