In [247]:
import pandas as pd
import os
import numpy as np
import re
In [ ]:
pd.options.display.max_rows = 10
cities = ["ATL_stations","CH_stations","LA_stations","NYC_stations","SD_stations","SF_stations"]


def load_station_locations(city_files, directory='station_locations'):
    """Aggregate the per-city station location files into one DataFrame.

    Every ``<directory>/<name>.txt`` file is read as a header-less,
    whitespace-delimited table.  Files that do not exist are skipped and
    malformed lines are dropped.  A ``city`` column holding the file's base
    name is added to each row.

    Parameters
    ----------
    city_files : iterable of str
        Base names (without ``.txt``) of the files to read.
    directory : str, optional
        Folder that contains the files.

    Returns
    -------
    pandas.DataFrame
        Rows of all files that were found, in the order of ``city_files``,
        with a fresh 0..n-1 index.  The first two positional columns are
        renamed to ``distance`` and ``station_name``.  An empty frame is
        returned when no file was found.
    """
    frames = []
    for name in city_files:
        path = os.path.join(directory, '%s.txt' % name)
        if not os.path.exists(path):
            continue
        try:
            frame = pd.read_csv(path, sep=r'\s+', header=None, on_bad_lines='skip')
        except TypeError:
            # Older pandas releases do not know `on_bad_lines`; fall back to
            # the keyword they provide for skipping malformed lines.
            frame = pd.read_csv(path, sep=r'\s+', header=None, error_bad_lines=False)
        frame['city'] = name
        frames.append(frame)
    # One concat instead of growing the frame inside the loop;
    # pd.concat([]) raises, hence the explicit empty fallback.
    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return combined.rename(columns={0: "distance", 1: "station_name"})


#Aggregate Station Location tuples
stations_data = load_station_locations(cities)
In [249]:
def _stations_within(city_label, max_distance=100):
    """Return the ``station_name`` column (as a one-column frame) of the rows
    of ``stations_data`` that belong to ``city_label`` and whose ``distance``
    does not exceed ``max_distance``."""
    belongs_to_city = stations_data.city == city_label
    close_enough = stations_data.distance <= max_distance
    return stations_data[belongs_to_city & close_enough][['station_name']]


# One shortlist of nearby stations per city.
ATLshortlist = _stations_within("ATL_stations")
CHshortlist = _stations_within("CH_stations")
LAshortlist = _stations_within("LA_stations")
NYCshortlist = _stations_within("NYC_stations")
SDshortlist = _stations_within("SD_stations")
SFshortlist = _stations_within("SF_stations")
In [250]:
######City Weather (ATL, CH, NYC, LA, SF, SD)######
def load_city_weather(city, station_names, min_rows, directory=None):
    """Load and inner-join the weather tables of one city's stations.

    For every station a whitespace-delimited ``<directory>/<station>.csv``
    file is read (missing files are skipped).  Rows whose ``TMIN`` equals
    ``TMAX`` are dropped, the six columns are renamed to ``DATE`` plus
    ``TMAX_/TMIN_/SNOW_/SNWD_/PRCP_<station>`` and ``DATE`` is parsed.
    Stations left with fewer than ``min_rows`` rows are ignored; the others
    are inner-joined on ``DATE`` and finally averaged per date.

    NOTE(review): the rename assumes the files hold exactly six columns in
    the order DATE, TMAX, TMIN, SNOW, SNWD, PRCP -- confirm against the data.

    Parameters
    ----------
    city : str
        City code; also the default folder name.
    station_names : iterable of str
        Candidate station identifiers.
    min_rows : int
        Minimum number of rows (after filtering) a station needs to be kept.
    directory : str, optional
        Folder with the station files; defaults to ``city``.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The joined table (one row per date, ``DATE`` as a regular column)
        and the names of the stations it contains.  When no station
        qualifies a warning is issued and an empty table that only has a
        ``DATE`` column is returned, so later joins on ``DATE`` still work.
    """
    import warnings

    if directory is None:
        directory = city
    kept_stations = []
    joined = None
    for station in station_names:
        path = os.path.join(directory, '%s.csv' % station)
        if not os.path.exists(path):
            continue
        # .copy() so the column assignments below act on an independent
        # frame rather than on a filtered view of the parsed table.
        frame = pd.read_csv(path, sep=r'\s+').query('TMIN != TMAX').copy()
        frame.columns = ['DATE', 'TMAX_'+station, 'TMIN_'+station,
                         'SNOW_'+station, 'SNWD_'+station, 'PRCP_'+station]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if frame.shape[0] < min_rows:
            continue
        kept_stations.append(station)
        if joined is None:
            # Explicit "first frame" marker: an inner join that ends up
            # empty must stay empty instead of restarting from the next
            # station.
            joined = frame
        else:
            joined = joined.merge(frame, on='DATE', how='inner', suffixes=('', ''))
    if joined is None:
        warnings.warn("no usable station data found for city %r" % city)
        return pd.DataFrame({'DATE': pd.to_datetime([])}), kept_stations
    return joined.groupby('DATE').mean().reset_index(), kept_stations


# Minimum row counts are city specific.
ATL_weather, ATL_stations = load_city_weather('ATL', ATLshortlist['station_name'], 12650)
CH_weather, CH_stations = load_city_weather('CH', CHshortlist['station_name'], 12764)
NYC_weather, NYC_stations = load_city_weather('NYC', NYCshortlist['station_name'], 12700)
LA_weather, LA_stations = load_city_weather('LA', LAshortlist['station_name'], 12600)
SF_weather, SF_stations = load_city_weather('SF', SFshortlist['station_name'], 12784)
SD_weather, SD_stations = load_city_weather('SD', SDshortlist['station_name'], 12600)
In [256]:
# For every city, report per station how many TMIN values are missing
# (True) and present (False) in the joined weather table.
missing_data_reports = [
    ("New York City", NYC_stations, NYC_weather),
    ("Atlanta", ATL_stations, ATL_weather),
    ("San Francisco", SF_stations, SF_weather),
    ("San Diego", SD_stations, SD_weather),
    ("Los Angeles", LA_stations, LA_weather),
    ("Chicago", CH_stations, CH_weather),
]
for city_label, station_names, weather in missing_data_reports:
    print("%s stations missing data" % city_label)
    for station in station_names:
        print("Station %s" % station)
        # Test only the column of interest instead of the whole frame.
        print(weather['TMIN_'+station].isnull().value_counts())
        # print("") gives an empty line on Python 2 and 3 alike; a bare
        # print() would show "()" under Python 2.
        print("")
In [297]:
#Merge Datasets from each stock with PARAM_STOCK-NAME as default columns
######Stock Data######
stocks = ['Apple, Inc Stock','International Business Machines Stock','Wal-Mart Stores, Inc Common St Stock','FedEx Corporation','The Boeing Company']
stock_data = pd.DataFrame()
for stock in stocks:
path = 'Stock Data/%s.csv' % stock
if os.path.exists(path):
frame = pd.read_csv(path)
frame.columns = ['DATE','OPEN_'+stock,'HIGH_'+stock,'LOW_'+stock,'CLOSE_'+stock,'VOLUME_'+stock,'ADJ CLOSE_'+stock]
frame['DATE'] = pd.to_datetime(frame['DATE'])
for column in frame.columns:
if column != 'DATE':
frame['PRE'+column] = frame[column].shift(-1)
if stock_data.empty:
stock_data = stock_data.append(frame,ignore_index=True)
else:
stock_data = stock_data.merge(frame, on='DATE', how='inner')
stock_data = stock_data.groupby('DATE').mean()
stock_data.reset_index(level=0, inplace=True)
In [280]:
Out[280]:
In [298]:
#Merge Datasets from each city with CITY-NAME_sunlight as default columns
#####Sunlight Data######
def load_sunlight_data(city_files, directory='Sunlight_data'):
    """Load the per-city sunlight files and inner-join them on ``DATE``.

    Each whitespace-delimited ``<directory>/<name>.csv`` file (missing files
    are skipped) is reduced to its first two columns, which are renamed to
    ``DATE`` and ``<name>``.

    Parameters
    ----------
    city_files : iterable of str
        Base names (without ``.csv``) of the files; they also become the
        value column names.
    directory : str, optional
        Folder that contains the files.

    Returns
    -------
    pandas.DataFrame
        One row per date present in every loaded file, averaged per date,
        with ``DATE`` as a regular column.  When no file is found a warning
        is issued and an empty frame that only has a ``DATE`` column is
        returned.
    """
    import warnings

    joined = None
    for name in city_files:
        path = os.path.join(directory, '%s.csv' % name)
        if not os.path.exists(path):
            continue
        # .copy() so the assignments below act on an independent frame
        # rather than on a slice of the parsed table.
        frame = pd.read_csv(path, sep=r'\s+').iloc[:, 0:2].copy()
        frame.columns = ['DATE', name]
        frame['DATE'] = pd.to_datetime(frame['DATE'])
        if joined is None:
            joined = frame
        else:
            joined = joined.merge(frame, on='DATE', how='inner')
    if joined is None:
        warnings.warn("no sunlight file found in %r" % directory)
        return pd.DataFrame({'DATE': pd.to_datetime([])})
    return joined.groupby('DATE').mean().reset_index()


cities = ['ATL_sunlight','NYC_sunlight','LA_sunlight','SF_sunlight','SD_sunlight','CH_sunlight']
sunlight_data = load_sunlight_data(cities)
In [295]:
#pd.DataFrame.head(sunlight_data)
#pd.DataFrame.head(stock_data)
#pd.DataFrame.head(NYC_weather)
#pd.DataFrame.head(df_final)
#stock_data.dtypes
Out[295]:
In [340]:
# Inner-join all weather tables, the sunlight table and the stock table on
# DATE, left to right.  A plain loop is used so that no `reduce` import is
# needed (it is not a builtin on Python 3).
data_frames = [ATL_weather,CH_weather,NYC_weather,LA_weather,SD_weather,SF_weather,sunlight_data,stock_data]
df_final = data_frames[0]
for other_frame in data_frames[1:]:
    df_final = pd.merge(df_final, other_frame, how='inner', on='DATE')
# Drop the first three rows.  The merge result carries a fresh 0..n-1 index,
# so this positional slice selects the same rows as a label slice from 3.
# .copy() lets later cells add columns without touching a view.
# NOTE(review): the reason for skipping three rows is not visible here -- confirm.
df_final = df_final.iloc[3:].copy()
In [263]:
# Pairwise correlation of the TMAX series of the San Diego stations.
sd_tmax_columns = [name for name in SD_weather.columns if re.match('TMAX', name)]
SD_TMAX = SD_weather[sd_tmax_columns].copy()
print(SD_TMAX.corr())
In [264]:
# Pairwise correlation of the TMIN series of the San Francisco stations.
sf_tmin_columns = [name for name in SF_weather.columns if re.match('TMIN', name)]
SF_TMIN = SF_weather[sf_tmin_columns].copy()
print(SF_TMIN.corr())
In [265]:
# Pairwise correlation of the SNOW series of the San Francisco stations.
sf_snow_columns = [name for name in SF_weather.columns if re.match('SNOW', name)]
SF_SNOW = SF_weather[sf_snow_columns].copy()
print(SF_SNOW.corr())
In [266]:
# Pairwise correlation of the SNWD series of the San Francisco stations.
sf_snwd_columns = [name for name in SF_weather.columns if re.match('SNWD', name)]
SF_SNWD = SF_weather[sf_snwd_columns].copy()
print(SF_SNWD.corr())
In [267]:
# Pairwise correlation of the PRCP series of the San Francisco stations.
sf_prcp_columns = [name for name in SF_weather.columns if re.match('PRCP', name)]
SF_PRCP = SF_weather[sf_prcp_columns].copy()
print(SF_PRCP.corr())
In [ ]:
In [268]:
# Switch the working directory to "Documents/Weather Data" (relative to the
# current directory).
# NOTE(review): the loading cells use relative paths ('station_locations/...',
# 'ATL/...', 'Stock Data/...', 'Sunlight_data/...'), presumably relative to
# this folder.  This cell sits after them, so on a fresh top-to-bottom run the
# change would come too late -- confirm and consider moving it to the top.
cd Documents/Weather Data
In [456]:
# Weather variables and cities, in the column order of the feature table.
WEATHER_VARIABLES = ['TMAX', 'TMIN', 'SNOW', 'SNWD', 'PRCP']
WEATHER_CITY_ORDER = ['ATL', 'NYC', 'CH', 'LA', 'SD', 'SF']
# The sunlight columns use a slightly different city order (LA before CH).
SUNLIGHT_CITY_ORDER = ['ATL', 'NYC', 'LA', 'CH', 'SD', 'SF']
city_weather = {
    'ATL': ATL_weather,
    'NYC': NYC_weather,
    'CH': CH_weather,
    'LA': LA_weather,
    'SD': SD_weather,
    'SF': SF_weather,
}


def station_average(merged, city_frame, variable):
    """Average one weather variable over all stations of one city.

    Parameters
    ----------
    merged : pandas.DataFrame
        Table that holds the station columns to average (here ``df_final``).
    city_frame : pandas.DataFrame
        The city's weather table; its column names that start with
        ``variable`` determine which columns of ``merged`` are used.
    variable : str
        Column prefix such as ``'TMAX'``.

    Returns
    -------
    pandas.Series
        Row-wise mean of the selected columns, indexed like ``merged``.  A
        row is NaN as soon as one station value is missing.

    Raises
    ------
    ValueError
        If ``city_frame`` has no column starting with ``variable``.
    """
    columns = [name for name in city_frame.columns if name.startswith(variable)]
    if not columns:
        raise ValueError("no %s columns available to average" % variable)
    # skipna=False: a missing station value makes the average missing too.
    return merged[columns].mean(axis=1, skipna=False)


# Row-wise total of every TMAX column in the merged table.
# NOTE(review): despite its name this is a sum, not an average, and nothing
# in this notebook reads it -- confirm whether it is still needed.
TMAX_AVG = df_final.filter(regex='^TMAX').sum(axis=1, skipna=False)

# Feature table: per-city station averages first (grouped by variable), then
# the sunlight columns.  Later cells slice `df` by position, so the column
# order matters.
df = pd.DataFrame(index=df_final.index)
for variable in WEATHER_VARIABLES:
    for city_code in WEATHER_CITY_ORDER:
        df['%s_%s' % (variable, city_code)] = station_average(df_final, city_weather[city_code], variable)
for city_code in SUNLIGHT_CITY_ORDER:
    df['SUN_%s' % city_code] = df_final['%s_sunlight' % city_code]
##Adding day column##
# These imports are not needed by this step; they are kept so that any other
# cell relying on the names keeps working.
from datetime import date
import calendar
import collections

# Weekday name of every row's DATE, derived row by row, so the result does
# not depend on the dates being unique or sorted.
df_final["DAY"] = df_final["DATE"].dt.strftime("%A").astype('category')
# One indicator column per weekday that occurs in the data, appended to the
# feature table.
df = pd.concat([df, pd.get_dummies(df_final["DAY"])],axis =1)
#### Per-stock feature: (log(close) - log(open)) * 100 * volume ####
# Built first from the PRE-prefixed stock columns ("Pre<Label>") and then
# from the plain ones ("<Label>"); the column order of `df` is significant
# because later cells slice it by position.
stock_labels = [
    ("WalMart", "Wal-Mart Stores, Inc Common St Stock"),
    ("Apple", "Apple, Inc Stock"),
    ("Boeing", "The Boeing Company"),
    ("FedEx", "FedEx Corporation"),
    ("IBM", "International Business Machines Stock"),
]
for source_prefix, label_prefix in (("PRE", "Pre"), ("", "")):
    for short_label, stock_name in stock_labels:
        open_price = df_final[source_prefix + "OPEN_" + stock_name]
        close_price = df_final[source_prefix + "CLOSE_" + stock_name]
        traded_volume = df_final[source_prefix + "VOLUME_" + stock_name]
        df[label_prefix + short_label] = (-np.log(open_price) + np.log(close_price))*100*traded_volume
In [459]:
import time
from collections import defaultdict

import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

try:
    # Current home of the model selection helpers.
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases ship them in these modules instead.
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

res = defaultdict(dict)
# Positional split of the feature table: the first 42 columns are the model
# inputs, the next five are candidate targets.
# NOTE(review): the split depends on how many weekday indicator columns were
# generated; verify that the intended columns end up in X and Y.
X = df.iloc[:, 0:42].copy()
Y = df.iloc[:, 42:47].copy()
# The first column of Y is the regression target; 20% of the rows are held out.
X_train, X_test, y_train, y_test = train_test_split(X, Y.iloc[:, 0], test_size=0.2, random_state=3)
In [463]:
# Hyperparameter grids for the three candidate estimators; only the random
# forest (est2 / param_grid2) is searched below.
param_grid1 = {'learning_rate': [0.1, 0.01],
               'max_depth': [1, 2],
               'max_features': [1]
               }
param_grid2 = {'max_features': [0.3, 0.5],
               'min_samples_split': [4, 10],
               }
param_grid3 = {'alpha': [0.01, 0.1, 1, 100, 1000],
               'l1_ratio': [0.2, 0.4, 0.6, 0.8, 1]}
est1 = GradientBoostingRegressor(n_estimators=50)
# NOTE(review): no random_state is set, so the forest differs from run to run.
est2 = RandomForestRegressor(n_estimators=200)
est3 = ElasticNet()
# this may take some minutes
# 'neg_mean_squared_error' is the scorer name current scikit-learn expects
# (the plain 'mean_squared_error' name is no longer accepted); higher, i.e.
# closer to zero, is better.
gs_cv = GridSearchCV(est2, param_grid2, scoring='neg_mean_squared_error', n_jobs=4).fit(X_train, y_train)
# best hyperparameter setting
print('Best hyperparameters: %r' % gs_cv.best_params_)
In [464]:
# Apply the hyperparameters chosen by the grid search to est2 and train it
# again on the training split (set_params returns the estimator itself).
est2.set_params(**gs_cv.best_params_).fit(X_train, y_train)
# Predictions and score on the held-out split.
# NOTE(review): est2 is a regressor in this notebook, so `score` is
# presumably R^2 rather than an accuracy -- confirm the 'ACC' label.
pred = est2.predict(X_test)
acc = est2.score(X_test, y_test)
print('ACC: %.4f' % acc)
In [ ]: