ETL Weather DFs


In [247]:
import pandas as pd
import os
import numpy as np
import re

Aggregate station data by distance


In [ ]:
pd.options.display.max_rows = 10
cities = ["ATL_stations","CH_stations","LA_stations","NYC_stations","SD_stations","SF_stations"]
#Aggregate Station Location tuples 
stations_data = pd.DataFrame()
for city in cities:
    path = 'station_locations/%s.txt' % city
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True,header=None,error_bad_lines=False)
        frame['city'] = city
        stations_data = stations_data.append(frame,ignore_index=True)
stations_data = stations_data.rename(columns={0:"distance",1:"station_name"})

In [249]:
ATLshortlist = stations_data[(stations_data.city == "ATL_stations") & (stations_data.distance <= 100)][['station_name']]
CHshortlist = stations_data[(stations_data.city == "CH_stations") & (stations_data.distance <= 100)][['station_name']]
LAshortlist = stations_data[(stations_data.city == "LA_stations") & (stations_data.distance <= 100)][['station_name']]
NYCshortlist = stations_data[(stations_data.city == "NYC_stations") & (stations_data.distance <= 100)][['station_name']]
SDshortlist = stations_data[(stations_data.city == "SD_stations") & (stations_data.distance <= 100)][['station_name']]
SFshortlist = stations_data[(stations_data.city == "SF_stations") & (stations_data.distance <= 100)][['station_name']]

Transform and Load Data

Merge datasets from each station with PARAM_STATION-NAME as default column header
Process results in 10 stations per city
UPDATE: Frame.query removes scrappy data (missing data will still exist for some!)

In [250]:
######ATL Weather######
ATL_stations = [];
ATL_weather = pd.DataFrame()
for station in ATLshortlist['station_name']:
    path = 'ATL/%s.csv' % station
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True)
        frame = frame.query('TMIN != TMAX')
        frame.columns = ['DATE', 'TMAX_'+station,'TMIN_'+station,'SNOW_'+station,'SNWD_'+station,'PRCP_'+station]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if frame.shape[0] >= 12650:
            ATL_stations.append(station);
            if ATL_weather.empty:
                ATL_weather = ATL_weather.append(frame,ignore_index=True)
            else:
                ATL_weather = ATL_weather.merge(frame, on='DATE', how='inner', suffixes=('',''))
ATL_weather = ATL_weather.groupby('DATE').mean()
ATL_weather.reset_index(level=0, inplace=True)

In [251]:
######CH Weather######
CH_stations = [];
CH_weather = pd.DataFrame()
for station in CHshortlist['station_name']:
    path = 'CH/%s.csv' % station
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True)
        frame = frame.query('TMIN != TMAX')
        frame.columns = ['DATE', 'TMAX_'+station,'TMIN_'+station,'SNOW_'+station,'SNWD_'+station,'PRCP_'+station]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if frame.shape[0] >= 12764:
            CH_stations.append(station)
            if CH_weather.empty:
                CH_weather = CH_weather.append(frame,ignore_index=True)
            else:
                CH_weather = CH_weather.merge(frame, on='DATE', how='inner', suffixes=('',''))
CH_weather = CH_weather.groupby('DATE').mean()
CH_weather.reset_index(level=0, inplace=True)

In [252]:
######NYC Weather######
NYC_stations = [];
NYC_weather = pd.DataFrame()
for station in NYCshortlist['station_name']:
    path = 'NYC/%s.csv' % station
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True)
        frame = frame.query('TMIN != TMAX')
        frame.columns = ['DATE', 'TMAX_'+station,'TMIN_'+station,'SNOW_'+station,'SNWD_'+station,'PRCP_'+station]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if frame.shape[0] >= 12700:
            NYC_stations.append(station)
            if NYC_weather.empty:
                NYC_weather = NYC_weather.append(frame,ignore_index=True)
            else:
                NYC_weather = NYC_weather.merge(frame, on='DATE', how='inner', suffixes=('',''))
NYC_weather = NYC_weather.groupby('DATE').mean()
NYC_weather.reset_index(level=0, inplace=True)

In [253]:
######LA Weather######
LA_stations = [];
LA_weather = pd.DataFrame()
for station in LAshortlist['station_name']:
    path = 'LA/%s.csv' % station
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True)
        frame = frame.query('TMIN != TMAX')
        frame.columns = ['DATE', 'TMAX_'+station,'TMIN_'+station,'SNOW_'+station,'SNWD_'+station,'PRCP_'+station]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if frame.shape[0] >= 12600: 
            LA_stations.append(station)
            if LA_weather.empty:
                LA_weather = LA_weather.append(frame,ignore_index=True)
            else:
                LA_weather = LA_weather.merge(frame, on='DATE', how='inner', suffixes=('',''))
LA_weather = LA_weather.groupby('DATE').mean()
LA_weather.reset_index(level=0, inplace=True)

In [254]:
######SF Weather######
SF_stations = [];
SF_weather = pd.DataFrame()
for station in SFshortlist['station_name']:
    path = 'SF/%s.csv' % station
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True)
        frame = frame.query('TMIN != TMAX')
        frame.columns = ['DATE', 'TMAX_'+station,'TMIN_'+station,'SNOW_'+station,'SNWD_'+station,'PRCP_'+station]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if frame.shape[0] >= 12784: 
            SF_stations.append(station)
            if SF_weather.empty:
                SF_weather = SF_weather.append(frame,ignore_index=True)
            else:
                SF_weather = SF_weather.merge(frame, on='DATE', how='inner', suffixes=('',''))
SF_weather = SF_weather.groupby('DATE').mean()
SF_weather.reset_index(level=0, inplace=True)

In [255]:
######SD Weather######
SD_stations = [];
SD_weather = pd.DataFrame()
for station in SDshortlist['station_name']:
    path = 'SD/%s.csv' % station
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True)
        frame = frame.query('TMIN != TMAX')
        frame.columns = ['DATE', 'TMAX_'+station,'TMIN_'+station,'SNOW_'+station,'SNWD_'+station,'PRCP_'+station]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if frame.shape[0] >= 12600: 
            SD_stations.append(station)
            if SD_weather.empty:
                SD_weather = SD_weather.append(frame,ignore_index=True)
            else:
                SD_weather = SD_weather.merge(frame, on='DATE', how='inner', suffixes=('',''))
SD_weather = SD_weather.groupby('DATE').mean()
SD_weather.reset_index(level=0, inplace=True)

Missing Data Logger

True indicates a missing data row

In [256]:
print("New York City stations missing data")
for station in NYC_stations:
    print("Station %s" % station)
    print(pd.isnull(NYC_weather)['TMIN_'+station].value_counts())
    print()

print("Atlanta stations missing data")
for station in ATL_stations:
    print("Station %s" % station)
    print(pd.isnull(ATL_weather)['TMIN_'+station].value_counts())
    print()
    
print("San Francisco stations missing data")
for station in SF_stations:
    print("Station %s" % station)
    print(pd.isnull(SF_weather)['TMIN_'+station].value_counts())
    print()

print("San Diego stations missing data")
for station in SD_stations:
    print("Station %s" % station)
    print(pd.isnull(SD_weather)['TMIN_'+station].value_counts())
    print()

print("Los Angeles stations missing data")
for station in LA_stations:
    print("Station %s" % station)
    print(pd.isnull(LA_weather)['TMIN_'+station].value_counts())
    print()

print("Chicago stations missing data")
for station in CH_stations:
    print("Station %s" % station)
    print(pd.isnull(CH_weather)['TMIN_'+station].value_counts())
    print()


New York City stations missing data
Station USC00300889
False    12631
dtype: int64
()
Station USC00305426
False    12631
dtype: int64
()
Station USW00094745
False    12631
dtype: int64
()
Station USC00283951
False    12631
dtype: int64
()
Station USW00014734
False    12631
dtype: int64
()
Atlanta stations missing data
Station USC00092283
False    12593
dtype: int64
()
Station USC00092485
False    12593
dtype: int64
()
Station USC00093060
False    12593
dtype: int64
()
Station USC00093621
False    12593
dtype: int64
()
Station USW00093842
False    12593
dtype: int64
()
Station USC00018469
False    12593
dtype: int64
()
Station USW00013871
False    12593
dtype: int64
()
San Francisco stations missing data
Station USC00043578
False    12773
dtype: int64
()
Station USC00045795
False    12773
dtype: int64
()
Station USC00047916
False    12773
dtype: int64
()
Station USC00049742
False    12773
dtype: int64
()
Station USW00023271
False    12773
dtype: int64
()
Station USW00023272
False    12773
dtype: int64
()
San Diego stations missing data
Station USC00040983
False    12109
dtype: int64
()
Station USC00042239
False    12109
dtype: int64
()
Station USC00043914
False    12109
dtype: int64
()
Station USC00044223
False    12109
dtype: int64
()
Station USW00003164
False    12109
dtype: int64
()
Los Angeles stations missing data
Station USC00041194
False    12106
dtype: int64
()
Station USC00046624
False    12106
dtype: int64
()
Station USC00046719
False    12106
dtype: int64
()
Station USC00047888
False    12106
dtype: int64
()
Station USC00049152
False    12106
dtype: int64
()
Station USC00049785
False    12106
dtype: int64
()
Chicago stations missing data
Station USC00472869
False    12652
dtype: int64
()
Station USC00473058
False    12652
dtype: int64
()
Station USC00473453
False    12652
dtype: int64
()
Station USC00476200
False    12652
dtype: int64
()
Station USW00014839
False    12652
dtype: int64
()
Station USC00125174
False    12652
dtype: int64
()
Station USC00113262
False    12652
dtype: int64
()

In [297]:
#Merge Datasets from each stock with PARAM_STOCK-NAME as default columns           
######Stock Data######
stocks = ['Apple, Inc Stock','International Business Machines Stock','Wal-Mart Stores, Inc Common St Stock','FedEx Corporation','The Boeing Company']
stock_data = pd.DataFrame()
for stock in stocks:
    path = 'Stock Data/%s.csv' % stock
    if os.path.exists(path):
        frame = pd.read_csv(path)
        frame.columns = ['DATE','OPEN_'+stock,'HIGH_'+stock,'LOW_'+stock,'CLOSE_'+stock,'VOLUME_'+stock,'ADJ CLOSE_'+stock]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        for column in frame.columns:
            if column != 'DATE':
                frame['PRE'+column] = frame[column].shift(-1)
        if stock_data.empty:
            stock_data = stock_data.append(frame,ignore_index=True)
        else:
            stock_data = stock_data.merge(frame, on='DATE', how='inner')
stock_data = stock_data.groupby('DATE').mean()
stock_data.reset_index(level=0, inplace=True)

In [280]:



Out[280]:
Index([u'DATE', u'OPEN_Apple, Inc Stock', u'HIGH_Apple, Inc Stock',
       u'LOW_Apple, Inc Stock', u'CLOSE_Apple, Inc Stock',
       u'VOLUME_Apple, Inc Stock', u'ADJ CLOSE_Apple, Inc Stock',
       u'OPEN_International Business Machines Stock',
       u'HIGH_International Business Machines Stock',
       u'LOW_International Business Machines Stock',
       u'CLOSE_International Business Machines Stock',
       u'VOLUME_International Business Machines Stock',
       u'ADJ CLOSE_International Business Machines Stock',
       u'OPEN_Wal-Mart Stores, Inc Common St Stock',
       u'HIGH_Wal-Mart Stores, Inc Common St Stock',
       u'LOW_Wal-Mart Stores, Inc Common St Stock',
       u'CLOSE_Wal-Mart Stores, Inc Common St Stock',
       u'VOLUME_Wal-Mart Stores, Inc Common St Stock',
       u'ADJ CLOSE_Wal-Mart Stores, Inc Common St Stock',
       u'OPEN_FedEx Corporation', u'HIGH_FedEx Corporation',
       u'LOW_FedEx Corporation', u'CLOSE_FedEx Corporation',
       u'VOLUME_FedEx Corporation', u'ADJ CLOSE_FedEx Corporation',
       u'OPEN_The Boeing Company', u'HIGH_The Boeing Company',
       u'LOW_The Boeing Company', u'CLOSE_The Boeing Company',
       u'VOLUME_The Boeing Company', u'ADJ CLOSE_The Boeing Company'],
      dtype='object')

In [298]:
#Merge Datasets from each stock with CITY-NAME_sunlight as default columns
#####Sunlight Data######
cities = ['ATL_sunlight','NYC_sunlight','LA_sunlight','SF_sunlight','SD_sunlight','CH_sunlight']
sunlight_data = pd.DataFrame()
for city in cities:
    path = 'Sunlight_data/%s.csv' % city
    if os.path.exists(path):
        frame = pd.read_csv(path,delim_whitespace=True,keep_date_col=True)
        frame = frame.iloc[:,0:2]
        frame.columns = ['DATE',city]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if sunlight_data.empty:
            sunlight_data = sunlight_data.append(frame,ignore_index=True)
        else:
            sunlight_data = sunlight_data.merge(frame, on='DATE', how='inner')
sunlight_data = sunlight_data.groupby('DATE').mean()
sunlight_data.reset_index(level=0, inplace=True)

In [295]:
#pd.DataFrame.head(sunlight_data)
#pd.DataFrame.head(stock_data)
#pd.DataFrame.head(NYC_weather)
#pd.DataFrame.head(df_final)
#stock_data.dtypes


Out[295]:
DATE OPEN_Apple, Inc Stock HIGH_Apple, Inc Stock LOW_Apple, Inc Stock CLOSE_Apple, Inc Stock VOLUME_Apple, Inc Stock ADJ CLOSE_Apple, Inc Stock PREOPEN_Apple, Inc Stock PREHIGH_Apple, Inc Stock PRELOW_Apple, Inc Stock ... LOW_The Boeing Company CLOSE_The Boeing Company VOLUME_The Boeing Company ADJ CLOSE_The Boeing Company PREOPEN_The Boeing Company PREHIGH_The Boeing Company PRELOW_The Boeing Company PRECLOSE_The Boeing Company PREVOLUME_The Boeing Company PREADJ CLOSE_The Boeing Company
0 1980-12-12 28.749840 28.874720 28.749840 28.749840 117258400 0.438205 NaN NaN NaN ... 36.625028 36.999991 816600 2.457737 36.999991 36.999991 36.124988 36.874978 1828200 2.449433
1 1980-12-15 27.375041 27.375041 27.250160 27.250160 43971200 0.415346 28.749840 28.874720 28.749840 ... 37.125000 37.250009 1157400 2.474345 36.874978 37.250009 36.625028 36.999991 816600 2.457737
2 1980-12-16 25.374720 25.374720 25.249840 25.249840 26432000 0.384858 27.375041 27.375041 27.250160 ... 36.749969 37.125000 1181000 2.466041 37.125000 37.624972 37.125000 37.250009 1157400 2.474345
3 1980-12-17 25.874799 26.000240 25.874799 25.874799 21610400 0.394383 25.374720 25.374720 25.249840 ... 37.125000 38.375034 1158200 2.549075 37.250009 37.375022 36.749969 37.125000 1181000 2.466041
4 1980-12-18 26.625201 26.750080 26.625201 26.625201 18362400 0.405821 25.874799 26.000240 25.874799 ... 38.125012 38.499975 1458200 2.557374 37.125000 38.375034 37.125000 38.375034 1158200 2.549075

5 rows × 61 columns


In [340]:
data_frames = [ATL_weather,CH_weather,NYC_weather,LA_weather,SD_weather,SF_weather,sunlight_data,stock_data]
df_final = reduce(lambda left,right: pd.merge(left,right,how='inner',on='DATE'), data_frames)
df_final = df_final.ix[3:]

In [263]:
SD_TMAX = pd.DataFrame()
for column in list(SD_weather.columns.values):
    if (re.match('TMAX',column)):
        SD_TMAX[column] = SD_weather[column]
print(SD_TMAX.corr())


                  TMAX_USC00040983  TMAX_USC00042239  TMAX_USC00043914  \
TMAX_USC00040983          1.000000          0.909123          0.890611   
TMAX_USC00042239          0.909123          1.000000          0.937859   
TMAX_USC00043914          0.890611          0.937859          1.000000   
TMAX_USC00044223          0.963431          0.899407          0.881659   
TMAX_USW00003164          0.900046          0.885791          0.880609   

                  TMAX_USC00044223  TMAX_USW00003164  
TMAX_USC00040983          0.963431          0.900046  
TMAX_USC00042239          0.899407          0.885791  
TMAX_USC00043914          0.881659          0.880609  
TMAX_USC00044223          1.000000          0.906180  
TMAX_USW00003164          0.906180          1.000000  

In [264]:
SF_TMIN = pd.DataFrame()
for column in list(SF_weather.columns.values):
    if (re.match('TMIN',column)):
        SF_TMIN[column] = SF_weather[column]
print(SF_TMIN.corr())


                  TMIN_USC00043578  TMIN_USC00045795  TMIN_USC00047916  \
TMIN_USC00043578          1.000000          0.773245          0.875948   
TMIN_USC00045795          0.773245          1.000000          0.863843   
TMIN_USC00047916          0.875948          0.863843          1.000000   
TMIN_USC00049742          0.809673          0.725458          0.824087   
TMIN_USW00023271          0.822961          0.772047          0.849067   
TMIN_USW00023272          0.774539          0.855064          0.808545   

                  TMIN_USC00049742  TMIN_USW00023271  TMIN_USW00023272  
TMIN_USC00043578          0.809673          0.822961          0.774539  
TMIN_USC00045795          0.725458          0.772047          0.855064  
TMIN_USC00047916          0.824087          0.849067          0.808545  
TMIN_USC00049742          1.000000          0.933714          0.772974  
TMIN_USW00023271          0.933714          1.000000          0.808318  
TMIN_USW00023272          0.772974          0.808318          1.000000  

In [265]:
SF_SNOW = pd.DataFrame()
for column in list(SF_weather.columns.values):
    if (re.match('SNOW',column)):
        SF_SNOW[column] = SF_weather[column]
print(SF_SNOW.corr())


                  SNOW_USC00043578  SNOW_USC00045795  SNOW_USC00047916  \
SNOW_USC00043578               NaN               NaN               NaN   
SNOW_USC00045795               NaN          1.000000               NaN   
SNOW_USC00047916               NaN               NaN               NaN   
SNOW_USC00049742               NaN          0.059015               NaN   
SNOW_USW00023271               NaN         -0.000194               NaN   
SNOW_USW00023272               NaN               NaN               NaN   

                  SNOW_USC00049742  SNOW_USW00023271  SNOW_USW00023272  
SNOW_USC00043578               NaN               NaN               NaN  
SNOW_USC00045795          0.059015         -0.000194               NaN  
SNOW_USC00047916               NaN               NaN               NaN  
SNOW_USC00049742          1.000000         -0.000129               NaN  
SNOW_USW00023271         -0.000129          1.000000               NaN  
SNOW_USW00023272               NaN               NaN               NaN  

In [266]:
SF_SNWD = pd.DataFrame()
for column in list(SF_weather.columns.values):
    if (re.match('SNWD',column)):
        SF_SNWD[column] = SF_weather[column]
print(SF_SNWD.corr())


                  SNWD_USC00043578  SNWD_USC00045795  SNWD_USC00047916  \
SNWD_USC00043578                 1               NaN               NaN   
SNWD_USC00045795               NaN               NaN               NaN   
SNWD_USC00047916               NaN               NaN               NaN   
SNWD_USC00049742               NaN               NaN               NaN   
SNWD_USW00023271               NaN               NaN               NaN   
SNWD_USW00023272               NaN               NaN               NaN   

                  SNWD_USC00049742  SNWD_USW00023271  SNWD_USW00023272  
SNWD_USC00043578               NaN               NaN               NaN  
SNWD_USC00045795               NaN               NaN               NaN  
SNWD_USC00047916               NaN               NaN               NaN  
SNWD_USC00049742               NaN               NaN               NaN  
SNWD_USW00023271               NaN               NaN               NaN  
SNWD_USW00023272               NaN               NaN               NaN  

In [267]:
SF_PRCP = pd.DataFrame()
for column in list(SF_weather.columns.values):
    if (re.match('PRCP',column)):
        SF_PRCP[column] = SF_weather[column]
print(SF_PRCP.corr())


                  PRCP_USC00043578  PRCP_USC00045795  PRCP_USC00047916  \
PRCP_USC00043578          1.000000          0.618600          0.720685   
PRCP_USC00045795          0.618600          1.000000          0.750712   
PRCP_USC00047916          0.720685          0.750712          1.000000   
PRCP_USC00049742          0.812961          0.645877          0.719130   
PRCP_USW00023271          0.633869          0.616065          0.607253   
PRCP_USW00023272          0.623863          0.612386          0.646948   

                  PRCP_USC00049742  PRCP_USW00023271  PRCP_USW00023272  
PRCP_USC00043578          0.812961          0.633869          0.623863  
PRCP_USC00045795          0.645877          0.616065          0.612386  
PRCP_USC00047916          0.719130          0.607253          0.646948  
PRCP_USC00049742          1.000000          0.595812          0.546373  
PRCP_USW00023271          0.595812          1.000000          0.786289  
PRCP_USW00023272          0.546373          0.786289          1.000000  

In [ ]:


In [268]:
cd Documents/Weather Data


[Errno 2] No such file or directory: 'Documents/Weather Data'
/home/athena/Documents/Weather Data

In [456]:
TMAX_AVG = [0 for x in xrange(len(df_final.index))]
zipped = zip(list(df_final.columns.values),TMAX_AVG)
count=0
for i in list(df_final.columns.values):
    if(re.match('TMAX',i)):
        count=count+1
        TMAX_AVG=TMAX_AVG+df_final.ix[:,i]


################################################333
TMAX_ATL = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(ATL_weather.columns.values):
    if(re.match('TMAX',i)):
        count=count+1
        TMAX_ATL=TMAX_ATL+df_final.ix[:,i]

TMAX_AVG_ATL= TMAX_ATL/count 
###################################################3
################################################333
TMAX_CH = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(CH_weather.columns.values):
    if(re.match('TMAX',i)):
        count=count+1
        TMAX_CH=TMAX_CH+df_final.ix[:,i]

TMAX_AVG_CH= TMAX_CH/count 
###################################################3
################################################333

TMAX_LA = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(LA_weather.columns.values):
    if(re.match('TMAX',i)):
        count=count+1
        TMAX_LA=TMAX_LA+df_final.ix[:,i]

TMAX_AVG_LA= TMAX_LA/count
print TMAX_AVG_LA

###################################################3
################################################333
TMAX_NYC = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(NYC_weather.columns.values):
    if(re.match('TMAX',i)):
        count=count+1
        TMAX_NYC=TMAX_NYC+df_final.ix[:,i]

TMAX_AVG_NYC= TMAX_NYC/count 
###################################################3
################################################333

TMAX_SD = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SD_weather.columns.values):
    if(re.match('TMAX',i)):
        count=count+1
        TMAX_SD=TMAX_SD+df_final.ix[:,i]

TMAX_AVG_SD= TMAX_SD/count
###################################################3
################################################333
TMAX_SF = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SF_weather.columns.values):
    if(re.match('TMAX',i)):
        count=count+1
        TMAX_SF=TMAX_SF+df_final.ix[:,i]

TMAX_AVG_SF= TMAX_SF/count 
###################################################3
################################################333
TMIN_ATL = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(ATL_weather.columns.values):
    if(re.match('TMIN',i)):
        count=count+1
        TMIN_ATL=TMIN_ATL+df_final.ix[:,i]

TMIN_AVG_ATL= TMIN_ATL/count 
###################################################3
################################################333
TMIN_CH = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(CH_weather.columns.values):
    if(re.match('TMIN',i)):
        count=count+1
        TMIN_CH=TMIN_CH+df_final.ix[:,i]

TMIN_AVG_CH= TMIN_CH/count 
###################################################3
################################################333
TMIN_LA = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(LA_weather.columns.values):
    if(re.match('TMIN',i)):
        count=count+1
        TMIN_LA=TMIN_LA+df_final.ix[:,i]

TMIN_AVG_LA= TMIN_LA/count
###################################################3
################################################333
TMIN_NYC = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(NYC_weather.columns.values):
    if(re.match('TMIN',i)):
        count=count+1
        TMIN_NYC=TMIN_NYC+df_final.ix[:,i]

TMIN_AVG_NYC= TMIN_NYC/count 
###################################################3
################################################333

TMIN_SD = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SD_weather.columns.values):
    if(re.match('TMIN',i)):
        count=count+1
        TMIN_SD=TMIN_SD+df_final.ix[:,i]

TMIN_AVG_SD= TMIN_SD/count 
###################################################3
################################################333
TMIN_SF = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SF_weather.columns.values):
    if(re.match('TMIN',i)):
        count=count+1
        TMIN_SF=TMIN_SF+df_final.ix[:,i]

TMIN_AVG_SF= TMIN_SF/count 
###################################################3
##################################################

SNOW_ATL = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(ATL_weather.columns.values):
    if(re.match('SNOW',i)):
        count=count+1
        SNOW_ATL=SNOW_ATL+df_final.ix[:,i]

SNOW_AVG_ATL= SNOW_ATL/count 

####################################################
####################################################

SNOW_NYC = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(NYC_weather.columns.values):
    if(re.match('SNOW',i)):
        count=count+1
        SNOW_NYC=SNOW_NYC+df_final.ix[:,i]

SNOW_AVG_NYC= SNOW_NYC/count 

###################################################
###################################################

SNOW_CH = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(CH_weather.columns.values):
    if(re.match('SNOW',i)):
        count=count+1
        SNOW_CH=SNOW_CH+df_final.ix[:,i]

SNOW_AVG_CH= SNOW_CH/count

###################################################
###################################################

SNOW_LA = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(LA_weather.columns.values):
    if(re.match('SNOW',i)):
        count=count+1
        SNOW_LA=SNOW_LA+df_final.ix[:,i]

SNOW_AVG_LA= SNOW_LA/count

###################################################
###################################################

SNOW_SD = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SD_weather.columns.values):
    if(re.match('SNOW',i)):
        count=count+1
        SNOW_SD=SNOW_SD+df_final.ix[:,i]

SNOW_AVG_SD= SNOW_SD/count

###################################################
###################################################

SNOW_SF = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SF_weather.columns.values):
    if(re.match('SNOW',i)):
        count=count+1
        SNOW_SF=SNOW_SF+df_final.ix[:,i]

SNOW_AVG_SF= SNOW_SF/count 

###################################################
###################################################

SNWD_ATL = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(ATL_weather.columns.values):
    if(re.match('SNWD',i)):
        count=count+1
        SNWD_ATL=SNWD_ATL+df_final.ix[:,i]

SNWD_AVG_ATL= SNWD_ATL/count 

####################################################
####################################################

SNWD_NYC = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(NYC_weather.columns.values):
    if(re.match('SNWD',i)):
        count=count+1
        SNWD_NYC=SNWD_NYC+df_final.ix[:,i]

SNWD_AVG_NYC= SNWD_NYC/count

####################################################
####################################################

SNWD_CH = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(CH_weather.columns.values):
    if(re.match('SNWD',i)):
        count=count+1
        SNWD_CH=SNWD_CH+df_final.ix[:,i]

SNWD_AVG_CH= SNWD_CH/count 

####################################################
####################################################

SNWD_LA = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(LA_weather.columns.values):
    if(re.match('SNWD',i)):
        count=count+1
        SNWD_LA=SNWD_LA+df_final.ix[:,i]

SNWD_AVG_LA= SNWD_LA/count 


####################################################
####################################################

SNWD_SD = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SD_weather.columns.values):
    if(re.match('SNWD',i)):
        count=count+1
        SNWD_SD=SNWD_SD+df_final.ix[:,i]

SNWD_AVG_SD= SNWD_SD/count 
#####################################################
#####################################################

SNWD_SF = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SF_weather.columns.values):
    if(re.match('SNWD',i)):
        count=count+1
        SNWD_SF=SNWD_SF+df_final.ix[:,i]

SNWD_AVG_SF= SNWD_SF/count 

#####################################################
#####################################################
PRCP_NYC = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(NYC_weather.columns.values):
    if(re.match('PRCP',i)):
        count=count+1
        PRCP_NYC=PRCP_NYC+df_final.ix[:,i]

PRCP_AVG_NYC= PRCP_NYC/count

#####################################################
#####################################################

PRCP_ATL = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(ATL_weather.columns.values):
    if(re.match('PRCP',i)):
        count=count+1
        PRCP_ATL=PRCP_ATL+df_final.ix[:,i]

PRCP_AVG_ATL= PRCP_ATL/count 

#####################################################
#####################################################

PRCP_CH = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(CH_weather.columns.values):
    if(re.match('PRCP',i)):
        count=count+1
        PRCP_CH=PRCP_CH+df_final.ix[:,i]

PRCP_AVG_CH= PRCP_CH/count 

#####################################################
#####################################################

PRCP_LA = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(LA_weather.columns.values):
    if(re.match('PRCP',i)):
        count=count+1
        PRCP_LA=PRCP_LA+df_final.ix[:,i]

PRCP_AVG_LA= PRCP_LA/count 

#####################################################
#####################################################

PRCP_SD = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SD_weather.columns.values):
    if(re.match('PRCP',i)):
        count=count+1
        PRCP_SD=PRCP_SD+df_final.ix[:,i]

PRCP_AVG_SD= PRCP_SD/count 

#####################################################
#####################################################

PRCP_SF = [0 for x in xrange(len(df_final.index))]
count=0
for i in list(SF_weather.columns.values):
    if(re.match('PRCP',i)):
        count=count+1
        PRCP_SF=PRCP_SF+df_final.ix[:,i]

PRCP_AVG_SF= PRCP_SF/count 

df = pd.DataFrame()
df["TMAX_ATL"]=TMAX_AVG_ATL
df["TMAX_NYC"]=TMAX_AVG_NYC
df["TMAX_CH"]=TMAX_AVG_CH
df["TMAX_LA"]=TMAX_AVG_LA
df["TMAX_SD"]=TMAX_AVG_SD
df["TMAX_SF"]=TMAX_AVG_SF


df["TMIN_ATL"]=TMIN_AVG_ATL
df["TMIN_NYC"]=TMIN_AVG_NYC
df["TMIN_CH"]=TMIN_AVG_CH
df["TMIN_LA"]=TMIN_AVG_LA
df["TMIN_SD"]=TMIN_AVG_SD
df["TMIN_SF"]=TMIN_AVG_SF 

df["SNOW_ATL"]=SNOW_AVG_ATL
df["SNOW_NYC"]=SNOW_AVG_NYC
df["SNOW_CH"]=SNOW_AVG_CH
df["SNOW_LA"]=SNOW_AVG_LA
df["SNOW_SD"]=SNOW_AVG_SD
df["SNOW_SF"]=SNOW_AVG_SF 

df["SNWD_ATL"]=SNWD_AVG_ATL
df["SNWD_NYC"]=SNWD_AVG_NYC
df["SNWD_CH"]=SNWD_AVG_CH
df["SNWD_LA"]=SNWD_AVG_LA
df["SNWD_SD"]=SNWD_AVG_SD
df["SNWD_SF"]=SNWD_AVG_SF 

df["PRCP_ATL"]=PRCP_AVG_ATL
df["PRCP_NYC"]=PRCP_AVG_NYC
df["PRCP_CH"]=PRCP_AVG_CH
df["PRCP_LA"]=PRCP_AVG_LA
df["PRCP_SD"]=PRCP_AVG_SD
df["PRCP_SF"]=PRCP_AVG_SF

df["SUN_ATL"]=df_final["ATL_sunlight"]
df["SUN_NYC"]=df_final["NYC_sunlight"]
df["SUN_LA"]=df_final["LA_sunlight"]
df["SUN_CH"]=df_final["CH_sunlight"]
df["SUN_SD"]=df_final["SD_sunlight"]
df["SUN_SF"]=df_final["SF_sunlight"]

##Adding day column##
from datetime import date
import calendar
date.today().strftime("%A")
dx = {}
for date in df_final["DATE"]:
        dx[date] = date.strftime("%A")
import collections
dx = collections.OrderedDict(sorted(dx.items()))
df_final["DAY"] = dx.values()
df_final["DAY"] = df_final["DAY"].astype('category')
df = pd.concat([df, pd.get_dummies(df_final["DAY"])],axis =1) 
####

df["PreWalMart"]= (-np.log(df_final["PREOPEN_Wal-Mart Stores, Inc Common St Stock"])+ np.log(df_final["PRECLOSE_Wal-Mart Stores, Inc Common St Stock"]))*100*df_final["PREVOLUME_Wal-Mart Stores, Inc Common St Stock"]  

df["PreApple"]= (-np.log(df_final["PREOPEN_Apple, Inc Stock"])+ np.log(df_final["PRECLOSE_Apple, Inc Stock"]))*100*df_final["PREVOLUME_Apple, Inc Stock"]
                                                               
df["PreBoeing"]= (-np.log(df_final["PREOPEN_The Boeing Company"])+ np.log(df_final["PRECLOSE_The Boeing Company"]))*100*df_final["PREVOLUME_The Boeing Company"]

df["PreFedEx"]= (-np.log(df_final["PREOPEN_FedEx Corporation"])+ np.log(df_final["PRECLOSE_FedEx Corporation"]))*100*df_final["PREVOLUME_FedEx Corporation"]

df["PreIBM"]= (-np.log(df_final["PREOPEN_International Business Machines Stock"])+ np.log(df_final["PRECLOSE_International Business Machines Stock"]))*100*df_final["PREVOLUME_International Business Machines Stock"]

df["WalMart"]= (-np.log(df_final["OPEN_Wal-Mart Stores, Inc Common St Stock"])+ np.log(df_final["CLOSE_Wal-Mart Stores, Inc Common St Stock"]))*100*df_final["VOLUME_Wal-Mart Stores, Inc Common St Stock"]  

df["Apple"]= (-np.log(df_final["OPEN_Apple, Inc Stock"])+ np.log(df_final["CLOSE_Apple, Inc Stock"]))*100*df_final["VOLUME_Apple, Inc Stock"]
                                                               
df["Boeing"]= (-np.log(df_final["OPEN_The Boeing Company"])+ np.log(df_final["CLOSE_The Boeing Company"]))*100*df_final["VOLUME_The Boeing Company"]

df["FedEx"]= (-np.log(df_final["OPEN_FedEx Corporation"])+ np.log(df_final["CLOSE_FedEx Corporation"]))*100*df_final["VOLUME_FedEx Corporation"]

df["IBM"]= (-np.log(df_final["OPEN_International Business Machines Stock"])+ np.log(df_final["CLOSE_International Business Machines Stock"]))*100*df_final["VOLUME_International Business Machines Stock"]


3       263.833333
4       213.000000
5       188.833333
6       222.000000
7       248.166667
           ...    
6830    192.500000
6831    202.666667
6832    180.500000
6833    158.333333
6834    156.666667
dtype: float64

In [459]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
import time
from collections import defaultdict
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import ElasticNet
import matplotlib.pyplot as plt

res = defaultdict(dict)

X = pd.DataFrame(df.ix[:,0:42])
Y = pd.DataFrame(df.ix[:,42:47])

X_train, X_test, y_train, y_test = train_test_split(X, Y.ix[:,0], test_size=0.2, random_state=3)

In [463]:
param_grid1 = {'learning_rate': [0.1,0.01],
              'max_depth': [1,2],
              'max_features': [1]
              }
param_grid2 = {'max_features' : [0.3,0.5],
              'min_samples_split' : [4,10],
               }
param_grid3 = {'alpha' : [0.01,0.1,1,100,1000],
              'l1_ratio' : [0.2,0.4,0.6,0.8,1]}

est1 = GradientBoostingRegressor(n_estimators=50)
est2 = RandomForestRegressor(n_estimators=200)
est3 = ElasticNet()

# this may take some minutes
gs_cv = GridSearchCV(est2, param_grid2, scoring='mean_squared_error', n_jobs=4).fit(X_train, y_train)

# best hyperparameter setting
print('Best hyperparameters: %r' % gs_cv.best_params_)
#print "abc"
#regressor = DecisionTreeRegressor(max_depth = 1,random_state=0)
#print cross_val_score(regressor, X, Y.ix[:,0], cv=10)

# predict class labels
#pred = est.predict(X_test)

# score on test data (accuracy)
#testacc = est.score(X_test, y_test)
#trainacc = est.score(X_train,y_train)

#print('TestACC: %.4f' % testacc)
#print('TrainACC:%.4f' % trainacc)


Best hyperparameters: {'max_features': 0.3, 'min_samples_split': 10}

In [464]:
# refit model on best parameters
est2.set_params(**gs_cv.best_params_)
est2.fit(X_train, y_train)
pred = est2.predict(X_test)
acc = est2.score(X_test, y_test)
print('ACC: %.4f' % acc)


ACC: 0.0578

In [ ]: