In [55]:
dfMain2 = pd.DataFrame(index=dates)
# dfMain = dfMain.join(dfSPY)
dfMain2 = dfMain2.join(dfJPM)
dfMain2.dropna(inplace=True)

print("Inspect missing values:")
display(dfMain2.isnull().sum())
print(len(dfMain2))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-55-dd114a2d5465> in <module>()
----> 1 dfMain2 = pd.DataFrame(index=dates)
      2 # dfMain = dfMain.join(dfSPY)
      3 dfMain2 = dfMain2.join(dfJPM)
      4 dfMain2.dropna(inplace=True)
      5 

NameError: name 'dates' is not defined

In [56]:
# Adjust Open, High, Low, Volume
dfMain2['Adj Factor'] = dfMain2['Adj Close'] / dfMain2['Close']

dfMain2['Open'] = dfMain2['Open'] * dfMain2['Adj Factor']
dfMain2['High'] = dfMain2['High'] * dfMain2['Adj Factor']
dfMain2['Low'] = dfMain2['Low'] * dfMain2['Adj Factor']

dfMain2['Volume'] = dfMain2['Volume'] / dfMain2['Adj Factor']
dfMain2.drop(['Close', 'Adj Factor'], axis=1, inplace=True)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-56-ab90611dd45a> in <module>()
      1 # Adjust Open, High, Low, Volume
----> 2 dfMain2['Adj Factor'] = dfMain2['Adj Close'] / dfMain2['Close']
      3 
      4 dfMain2['Open'] = dfMain2['Open'] * dfMain2['Adj Factor']
      5 dfMain2['High'] = dfMain2['High'] * dfMain2['Adj Factor']

NameError: name 'dfMain2' is not defined

In [57]:
display(dfMain2.head())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-57-92f840fc1c27> in <module>()
----> 1 display(dfMain2.head())

NameError: name 'dfMain2' is not defined

In [58]:
feature_days = 21 * 6

# Price Engineering
for i in range(feature_days):
    # Get opens
    dfMain2['-' + str(i + 1) + 'd_Open'] = dfMain2['Open'].shift(i + 1)
    # Get adjCloses
    dfMain2['-' + str(i + 1) + 'd_adjClose'] = dfMain2['Adj Close'].shift(i + 1)
    # Get Highs
    dfMain2['-' + str(i + 1) + 'd_High'] = dfMain2['High'].shift(i + 1)
    # Get Lows
    dfMain2['-' + str(i + 1) + 'd_Low'] = dfMain2['Low'].shift(i + 1)

# TODO: remove -xd_Open, -xd_adjClose, -xd_High, -xd_Low, x = range(1, feature_days + 1)
    
period_list = [21*x for x in range(1, 13)] # Create relative bases
period_list.extend([5, 10]) # Add 1, 2 week comparison basese
print(period_list)

for x in period_list:
    # Get Max volumes
    dfMain2[str(x) + 'd_Max_Vol'] = dfMain2['Volume'].rolling(window=x).max()
    # Get Avg volumes
    dfMain2[str(x) + 'd_Avg_Vol'] = dfMain2['Volume'].ewm(span=x).mean()
    # Get Min volumes
    dfMain2[str(x) + 'd_Min_Vol'] = dfMain2['Volume'].rolling(window=x).min()

# TODO: remove xd_Max_Vol, xd_Avg_Vol, xd_Min_Vol, for x in period_list

dfMain2['Abs_Spread'] = np.abs(dfMain2['Adj Close'] - dfMain2['Open'])
# dfMain2['Abs_Spread_Shift1'] = dfMain2['Abs_Spread'].shift()

for x in period_list:
    # Get Max spreads
    dfMain2[str(x) + 'd_Max_Spread'] = dfMain2['Abs_Spread'].rolling(window=x).max()
    # Get Avg spreads
    dfMain2[str(x) + 'd_Avg_Spread'] = dfMain2['Abs_Spread'].ewm(span=x).mean()

# TODO: remove xd_Max_Spread, xd_Avg_Spread, for x in period_list

dfMain2.drop(['Abs_Spread'], axis=1, inplace=True)

display(dfMain2.head())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-39507b3d1853> in <module>()
      4 for i in range(feature_days):
      5     # Get opens
----> 6     dfMain2['-' + str(i + 1) + 'd_Open'] = dfMain2['Open'].shift(i + 1)
      7     # Get adjCloses
      8     dfMain2['-' + str(i + 1) + 'd_adjClose'] = dfMain2['Adj Close'].shift(i + 1)

NameError: name 'dfMain2' is not defined

In [59]:
# Volume Engineering
start_time = time.time()
for i in range(feature_days):
    # Get volumes
    dfMain2['-' + str(i + 1) + 'd_Vol'] = dfMain2['Volume'].shift(i + 1)
    # Get relative volumes
    for x in period_list:
        dfMain2['-' + str(i + 1) + 'd_Vol_' + str(x) + 'Max'] = dfMain2['-' + str(i + 1) + 'd_Vol'] / dfMain2[str(x) + 'd_Max_Vol']
        dfMain2['-' + str(i + 1) + 'd_Vol_' + str(x) + 'Avg'] = dfMain2['-' + str(i + 1) + 'd_Vol'] / dfMain2[str(x) + 'd_Avg_Vol']
        dfMain2['-' + str(i + 1) + 'd_Vol_' + str(x) + 'Min'] = dfMain2['-' + str(i + 1) + 'd_Vol'] / dfMain2[str(x) + 'd_Min_Vol']
        
print("Generating volume features took {} seconds.".format(time.time() - start_time))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-59-9b953dc989f3> in <module>()
      3 for i in range(feature_days):
      4     # Get volumes
----> 5     dfMain2['-' + str(i + 1) + 'd_Vol'] = dfMain2['Volume'].shift(i + 1)
      6     # Get relative volumes
      7     for x in period_list:

NameError: name 'dfMain2' is not defined

In [60]:
# Spread Engineering
start_time = time.time()
for i in range(feature_days):
    # Get spread
    dfMain2['-' + str(i + 1) + 'd_Spread'] = dfMain2['-' + str(i + 1) + 'd_adjClose'] - dfMain2['-' + str(i + 1) + 'd_Open']
    # Get relative spread
    for x in period_list:
        dfMain2['-' + str(i + 1) + 'd_Spread_' + str(x) + 'Max'] = dfMain2['-' + str(i + 1) + 'd_Spread'] / dfMain2[str(x) + 'd_Max_Spread']
        dfMain2['-' + str(i + 1) + 'd_Spread_' + str(x) + 'Vol'] = dfMain2['-' + str(i + 1) + 'd_Spread'] / dfMain2[str(x) + 'd_Avg_Spread']
#         dfMain2['-' + str(i + 1) + 'd_Spread_' + str(x) + 'Min'] = dfMain2['-' + str(i + 1) + 'd_Spread'] / dfMain2[str(x) + 'd_Min_Spread']

print("Generating spread features took {} seconds.".format(time.time() - start_time))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-60-777e58571021> in <module>()
      3 for i in range(feature_days):
      4     # Get spread
----> 5     dfMain2['-' + str(i + 1) + 'd_Spread'] = dfMain2['-' + str(i + 1) + 'd_adjClose'] - dfMain2['-' + str(i + 1) + 'd_Open']
      6     # Get relative spread
      7     for x in period_list:

NameError: name 'dfMain2' is not defined

In [61]:
# Level Engineering
start_time = time.time()
for x in period_list:
    # Get Max adjClose
    dfMain2[str(x) + 'd_Max_Price'] = dfMain2['Adj Close'].rolling(window=x).max()
    # Get Avg adjClose
    dfMain2[str(x) + 'd_Avg_Price'] = dfMain2['Adj Close'].ewm(span=x).mean()
    # Get Min adjClose
    dfMain2[str(x) + 'd_Min_Price'] = dfMain2['Adj Close'].rolling(window=x).min()
    # Get Std adjClose
    dfMain2[str(x) + 'd_Std_Price'] = dfMain2['Adj Close'].ewm(span=x).std()

# TODO: remove xd_Max_Price, xd_Avg_Price, xd_Min_Price. Retain xd_Std_Price for x in period_list

for i in range(feature_days):
    # Get relative price
    for x in period_list:
        dfMain2['-' + str(i + 1) + 'd_Price_' + str(x) + 'Max'] = dfMain2['-' + str(i + 1) + 'd_adjClose'] / dfMain2[str(x) + 'd_Max_Price']
        dfMain2['-' + str(i + 1) + 'd_Price_' + str(x) + 'Vol'] = dfMain2['-' + str(i + 1) + 'd_adjClose'] / dfMain2[str(x) + 'd_Avg_Price']
        dfMain2['-' + str(i + 1) + 'd_Price_' + str(x) + 'Min'] = dfMain2['-' + str(i + 1) + 'd_adjClose'] / dfMain2[str(x) + 'd_Min_Price']
        
print("Generating level features took {} seconds.".format(time.time() - start_time))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-61-8328b3c6b7c4> in <module>()
      1 # Level Engineering
      2 start_time = time.time()
----> 3 for x in period_list:
      4     # Get Max adjClose
      5     dfMain2[str(x) + 'd_Max_Price'] = dfMain2['Adj Close'].rolling(window=x).max()

NameError: name 'period_list' is not defined

In [62]:
def upperwick(open, adj_close, high):
    if high > open and high > adj_close:
        return True
    else:
        return False
def lowerwick(open, adj_close, low):
    if low < open and low < adj_close:
        return True
    else:
        return False
    
# Get wicks - new code has 10X speed!
start_time = time.time()

for i in range(feature_days):
    dfMain2.ix[:, '-' + str(i + 1) + 'd_upperwick_bool'] = dfMain2.apply(lambda row: upperwick(row['-' + str(i + 1) + 'd_Open'], row['-' + str(i + 1) + 'd_adjClose'], row['-' + str(i + 1) + 'd_High']), axis=1)
    dfMain2.ix[:, '-' + str(i + 1) + 'd_lowerwick_bool'] = dfMain2.apply(lambda row: lowerwick(row['-' + str(i + 1) + 'd_Open'], row['-' + str(i + 1) + 'd_adjClose'], row['-' + str(i + 1) + 'd_Low']), axis=1)        

# TODO: remove -xd_upperwick_bool, -xd_lowerwick_bool, x in range(1, feature_days + 1)
print("Getting wicks took {} seconds.".format(time.time() - start_time))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-62-50f095cb486c> in <module>()
     14 
     15 for i in range(feature_days):
---> 16     dfMain2.ix[:, '-' + str(i + 1) + 'd_upperwick_bool'] = dfMain2.apply(lambda row: upperwick(row['-' + str(i + 1) + 'd_Open'], row['-' + str(i + 1) + 'd_adjClose'], row['-' + str(i + 1) + 'd_High']), axis=1)
     17     dfMain2.ix[:, '-' + str(i + 1) + 'd_lowerwick_bool'] = dfMain2.apply(lambda row: lowerwick(row['-' + str(i + 1) + 'd_Open'], row['-' + str(i + 1) + 'd_adjClose'], row['-' + str(i + 1) + 'd_Low']), axis=1)
     18 

NameError: name 'dfMain2' is not defined

In [63]:
def get_upperwick_length(open, adj_close, high):
    return high - max(open, adj_close)

def get_lowerwick_length(open, adj_close, low):
    return min(open, adj_close) - low
    
    
start_time = time.time()

# Transform upper wicks
for i in range(feature_days):
    has_upperwicks = dfMain2['-' + str(i + 1) + 'd_upperwick_bool']
    has_lowerwicks = dfMain2['-' + str(i + 1) + 'd_lowerwick_bool']
    
    dfMain2.loc[has_upperwicks, '-' + str(i + 1) + 'd_upperwick'] = dfMain2.loc[has_upperwicks, :].apply(lambda row: get_upperwick_length(row['-' + str(i + 1) + 'd_Open'], row['-' + str(i + 1) + 'd_adjClose'], row['-' + str(i + 1) + 'd_High']), axis=1)
    dfMain2.loc[has_lowerwicks, '-' + str(i + 1) + 'd_lowerwick'] = dfMain2.loc[has_lowerwicks, :].apply(lambda row: get_lowerwick_length(row['-' + str(i + 1) + 'd_Open'], row['-' + str(i + 1) + 'd_adjClose'], row['-' + str(i + 1) + 'd_Low']), axis=1)
    
    # Get relative upperwick length
    dfMain2.loc[dfMain2['-' + str(i + 1) + 'd_upperwick_bool'], '-' + str(i + 1) + 'd_upperwick'] = dfMain2.loc[dfMain2['-' + str(i + 1) + 'd_upperwick_bool'], '-' + str(i + 1) + 'd_upperwick'] / dfMain2.loc[dfMain2['-' + str(i + 1) + 'd_upperwick_bool'], '126d_Avg_Spread']
    # Get relative lowerwick length
    dfMain2.loc[dfMain2['-' + str(i + 1) + 'd_lowerwick_bool'], '-' + str(i + 1) + 'd_lowerwick'] = dfMain2.loc[dfMain2['-' + str(i + 1) + 'd_lowerwick_bool'], '-' + str(i + 1) + 'd_lowerwick'] / dfMain2.loc[dfMain2['-' + str(i + 1) + 'd_lowerwick_bool'], '126d_Avg_Spread']

    # Assign 0 to no-upperwick days
    dfMain2.loc[np.logical_not(dfMain2['-' + str(i + 1) + 'd_upperwick_bool']), '-' + str(i + 1) + 'd_upperwick'] = 0
    # Assign 0 to no-lowerwick days
    dfMain2.loc[np.logical_not(dfMain2['-' + str(i + 1) + 'd_lowerwick_bool']), '-' + str(i + 1) + 'd_lowerwick'] = 0

print("Transforming wicks took {} seconds.".format(time.time() - start_time))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-63-15eac6f64008> in <module>()
     10 # Transform upper wicks
     11 for i in range(feature_days):
---> 12     has_upperwicks = dfMain2['-' + str(i + 1) + 'd_upperwick_bool']
     13     has_lowerwicks = dfMain2['-' + str(i + 1) + 'd_lowerwick_bool']
     14 

NameError: name 'dfMain2' is not defined

In [64]:
dfMain2['Trade Price'] = dfMain2['Adj Close']
print(dfMain2[['Trade Price', 'Open', 'Adj Close']].head())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-64-4fe6a387c865> in <module>()
----> 1 dfMain2['Trade Price'] = dfMain2['Adj Close']
      2 print(dfMain2[['Trade Price', 'Open', 'Adj Close']].head())

NameError: name 'dfMain2' is not defined

In [65]:
print(dfMain2.shape)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-65-c539d9ce8f92> in <module>()
----> 1 print(dfMain2.shape)

NameError: name 'dfMain2' is not defined

In [66]:
# Remove raw features
# raw_features = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
raw_features = []

# Remove vol comparison base
# vol_compare_type = ['Max', 'Avg', 'Min']
vol_compare_features = []

# Remove vol meta features
vol_meta = []
# for d in range(1, feature_days + 1):
#     vol_meta.append('-' + str(d) + 'd_Vol')

# Remove spread comparison base
# sprd_compare_type = ['Max', 'Avg']
sprd_compare_features = []

# for d in period_list:
#     for t in vol_compare_type:
#         vol_compare_features.append(str(d) + 'd_' + t + '_Vol')
#     for u in sprd_compare_type:
#         sprd_compare_features.append(str(d) + 'd_' + u + '_Spread')

# Remove spread meta features
# price_raw = ['Open', 'adjClose', 'High', 'Low']
spread_meta = []

# for d in range(1, feature_days + 1):
#     for t in price_raw:
#         spread_meta.append('-' + str(d) + 'd_' + t)

# Remove price comparison base
# price_compare_type = vol_compare_type
price_compare_features = []

# for d in period_list:
#     for t in price_compare_type:
#         price_compare_features.append(str(d) + 'd_' + t + '_Price')

# Remove wick bools
wick_type = ['upperwick', 'lowerwick']
wick_bools = []

for d in range(1, feature_days + 1):
    for t in wick_type:
        wick_bools.append('-' + str(d) + 'd_' + t + '_bool')

drop_list = []
drop_list = drop_list + raw_features + vol_compare_features + vol_meta + sprd_compare_features + spread_meta + price_compare_features + wick_bools

dfMain2.drop(drop_list, axis=1, inplace=True)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-66-d6fc4ec92682> in <module>()
     49 drop_list = drop_list + raw_features + vol_compare_features + vol_meta + sprd_compare_features + spread_meta + price_compare_features + wick_bools
     50 
---> 51 dfMain2.drop(drop_list, axis=1, inplace=True)

NameError: name 'dfMain2' is not defined

In [67]:
from copy import deepcopy

new_data_full = deepcopy(dfMain2)
new_data_full.dropna(inplace=True)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-67-6e2905e2a20c> in <module>()
      1 from copy import deepcopy
      2 
----> 3 new_data_full = deepcopy(dfMain2)
      4 new_data_full.dropna(inplace=True)

NameError: name 'dfMain2' is not defined

In [68]:
display(new_data_full.head())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-68-4de37fbf34a0> in <module>()
----> 1 display(new_data_full.head())

NameError: name 'new_data_full' is not defined

In [69]:
# Add derivatives
# Add 1st derivatives
diff1_temp = new_data_full.ix[:, :-1] - new_data_full.ix[:, :-1].shift()
diff1 = diff1_temp.add_suffix('_diff1')

# Add 2nd derivatives
diff2 = diff1_temp - diff1_temp.shift()
diff2 = diff2.add_suffix('_diff2')

# Concatenate all dataframes
trade_price = new_data_full['Trade Price']
new_data_full = pd.concat([new_data_full, diff1, diff2], axis=1)
new_data_full['Trade Price'] = trade_price
new_data_full.dropna(inplace=True)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-69-232a43cd8b37> in <module>()
      1 # Add derivatives
      2 # Add 1st derivatives
----> 3 diff1_temp = new_data_full.ix[:, :-1] - new_data_full.ix[:, :-1].shift()
      4 diff1 = diff1_temp.add_suffix('_diff1')
      5 

NameError: name 'new_data_full' is not defined

In [70]:
trade_price_idx = new_data_full.columns.get_loc("Trade Price")
cols = new_data_full.columns.tolist()
cols = cols[:trade_price_idx] + cols[trade_price_idx + 1:] + [cols[trade_price_idx]]
new_data_full = new_data_full.reindex(columns=cols)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-70-74fadd79cf12> in <module>()
----> 1 trade_price_idx = new_data_full.columns.get_loc("Trade Price")
      2 cols = new_data_full.columns.tolist()
      3 cols = cols[:trade_price_idx] + cols[trade_price_idx + 1:] + [cols[trade_price_idx]]
      4 new_data_full = new_data_full.reindex(columns=cols)

NameError: name 'new_data_full' is not defined

In [71]:
def split_data(df):
    df_features = df.ix[:, :-1]
    df_labels = df.ix[:, -1]
    return df_features, df_labels

df_features, df_labels = split_data(new_data_full)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-71-df1f9353fea5> in <module>()
      4     return df_features, df_labels
      5 
----> 6 df_features, df_labels = split_data(new_data_full)

NameError: name 'new_data_full' is not defined

In [72]:
# Normalization
def normalization(X_train, X_test):
    X_test_norm = (X_test - X_train.mean()) / (X_train.max() - X_train.min())
    return X_test_norm

df_features_norm = normalization(df_features, df_features)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-72-c6d6122e6287> in <module>()
      4     return X_test_norm
      5 
----> 6 df_features_norm = normalization(df_features, df_features)

NameError: name 'df_features' is not defined

In [73]:
from sklearn.decomposition import PCA

start_time = time.time()

# Choose n in PCA
pca_dim = 1000
pca = PCA(n_components=pca_dim)
pca.fit(df_features_norm) # all data

cp_imp = pca.explained_variance_ratio_
cp_imp = pd.Series(cp_imp)
cp_imp_cum = cp_imp.cumsum()
cp_imp_cum.plot()

print("Running PCA took {} seconds".format(time.time() - start_time))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-73-b6a4e67bcecf> in <module>()
      6 pca_dim = 1000
      7 pca = PCA(n_components=pca_dim)
----> 8 pca.fit(df_features_norm) # all data
      9 
     10 cp_imp = pca.explained_variance_ratio_

NameError: name 'df_features_norm' is not defined

In [75]:
# PCA Fit and Transformation and DF Reconstruction
def pca_transform_reconstruct(fit_data, trans_data, pca_dim):            
    pca = PCA(n_components=pca_dim)
    pca.fit(fit_data)

    data_pca = pca.transform(trans_data)
    data_pca = pd.DataFrame(data=data_pca)
    data_pca['Date'] = trans_data.index
    data_pca.set_index(data_pca['Date'], inplace=True)
    del data_pca.index.name
    del data_pca['Date']
    
    return data_pca

df_features_pca = pca_transform_reconstruct(df_features_norm, df_features_norm, 300)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-75-71775ba60806> in <module>()
     13     return data_pca
     14 
---> 15 df_features_pca = pca_transform_reconstruct(df_features_norm, df_features_norm, 300)

NameError: name 'df_features_norm' is not defined

In [76]:
df_features = normalization(df_features_pca, df_features_pca)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-76-57a2ffee6bfd> in <module>()
----> 1 df_features = normalization(df_features_pca, df_features_pca)

NameError: name 'df_features_pca' is not defined

In [77]:
new_data_full_pca = df_features
new_data_full_pca['Trade Price'] = df_labels


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-77-078e969d2445> in <module>()
----> 1 new_data_full_pca = df_features
      2 new_data_full_pca['Trade Price'] = df_labels

NameError: name 'df_features' is not defined

In [78]:
validation_start_date = datetime(2006, 9, 25)
validation_end_date = datetime(2011, 9, 27)
test_start_date = datetime(2011, 9, 26)
test_end_date = datetime(2016, 9, 27)

print("Validation phase")
print("{0} Trade Price: {1}".format(validation_start_date, new_data_full_pca.ix[validation_start_date, 'Trade Price']))
print("{0} Trade Price: {1}".format(validation_end_date, new_data_full_pca.ix[validation_end_date, 'Trade Price']))
validation_phase_data = new_data_full_pca.ix[validation_start_date:validation_end_date, :]
print("Number of dates in validation dataset: {}\n".format(len(validation_phase_data)))

print("Test phase")
print("{0} Trade Price: {1}".format(test_start_date, new_data_full_pca.ix[test_start_date, 'Trade Price']))
print("{0} Trade Price: {1}".format(test_end_date, new_data_full_pca.ix[test_end_date, 'Trade Price']))
test_phase_data = new_data_full_pca.ix[test_start_date:test_end_date, :]
print("Number of dates in test dataset: {}".format(len(test_phase_data)))


Validation phase
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-78-aa0cc33e4d4c> in <module>()
      5 
      6 print("Validation phase")
----> 7 print("{0} Trade Price: {1}".format(validation_start_date, new_data_full_pca.ix[validation_start_date, 'Trade Price']))
      8 print("{0} Trade Price: {1}".format(validation_end_date, new_data_full_pca.ix[validation_end_date, 'Trade Price']))
      9 validation_phase_data = new_data_full_pca.ix[validation_start_date:validation_end_date, :]

NameError: name 'new_data_full_pca' is not defined

In [80]:
class MonkeyBot(object):
    def __init__(self, dfEnv, cash=1000, share=0, pv=0, random_state=0):
        random.seed(random_state)
        np.random.seed(random_state)
        
        self.cash = cash
        self.share = share
        self.pv = pv
        self.asset_history_list = []
        self.action_list = []
        
        self.env = deepcopy(dfEnv)

    def buy(self, stock_price, cost, fold=1):
        if self.cash < stock_price:
            self.hold(stock_price)
            
        else:
            num_affordable = int(self.cash // stock_price)
            buy_amount = int(num_affordable // fold)
            self.cash = self.cash - stock_price * buy_amount
            self.share = self.share + buy_amount
            self.pv = stock_price * self.share

             # Adding transaction cost
            self.trading_cost(buy_amount, cost)
            
        # Append action to action list
        self.action_list.append('Buy')

    def sell(self, stock_price, cost, fold=1):
        if self.share == 0:
            self.hold(stock_price)
            
        else:
            sell_amount = int(self.share // fold)
            self.cash = self.cash + stock_price * sell_amount
            self.pv = 0
            self.share = 0

            # Adding transaction cost
            self.trading_cost(sell_amount, cost)
            
        self.action_list.append('Sell')

    def hold(self, stock_price):
        self.pv = stock_price * self.share

    def trading_cost(self, trading_amount, cost):
            if cost is None:
                pass                
            elif cost == 'low':
                if trading_amount * 0.01 < 1.99:
                    self.cash = self.cash - 1.99
                else:
                    self.cash = self.cash - trading_amount * 0.01
            elif cost == 'medium':
                if trading_amount * 0.01 < 5:
                    self.cash = self.cash - 5
                else:
                    self.cash = self.cash - trading_amount * 0.01
            elif cost == 'high':
                if trading_amount * 0.01 < 7:
                    self.cash = self.cash - 7
                else:
                    self.cash = self.cash - trading_amount * 0.01
            else:
                raise ValueError("Invalid cost parameter!")
        
    def reset(self):
        self.cash = 1000
        self.share = 0
        self.pv = 0

    def make_decision(self, x, cost):
        random_choice = random.choice([1, 2])

        if random_choice == 0:
            self.hold(x)
        elif random_choice == 1:
            self.buy(x, cost)
        elif random_choice == 2:
            self.sell(x, cost)
        else:
            raise ValueError("Invalid choice!")

        return self.pv # for frame-wise operation

    def simulate(self, iters, cost=None):
        start_time = time.time()
        for i in range(iters):
            for index, row in self.env.iterrows():
                self.make_decision(row['Trade Price'], cost)
            self.asset_history_list.append(self.pv + self.cash)
            self.reset()
        print("{0} iterations took {1} seconds".format(iters, time.time() - start_time))
        
        return self.asset_history_list, self.action_list

In [81]:
pd.Series(monkey_full.asset_history_list).describe()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-81-1020589dd8a9> in <module>()
----> 1 pd.Series(monkey_full.asset_history_list).describe()

NameError: name 'monkey_full' is not defined

In [82]:
import sys

class ChimpBot(MonkeyBot):
    """An agent that learns to drive in the smartcab world."""
    
    def __init__(self, dfEnv, iter_random_rounds, gamma, random_state=0, test_mode=False, cash=1000, share=0, pv=0):
        super(ChimpBot, self).__init__(dfEnv, iter_random_rounds, cash, share, pv)
        # From MonkeyBot:
        # sets self.cash = 1000
        # sets self.share = 0
        # sets self.pv = 0
        # sets self.pv_history_list = []
        # sets self.env = dfEnv
        # implements buy(self, stock_price)
        # implements sell(self, stock_price)
        # implements hold(self)

        # Set random state
        self.random_state = random_state
        random.seed(self.random_state)
        np.random.seed(self.random_state)
        
        # Chimp parameters
        self.valid_actions = ['Buy', 'Sell']
        self.gamma = gamma # Discount factor
        self.epsilon = 1 # Exploration-exploitation
        self.test_mode = test_mode
        self.random_rounds = iter_random_rounds # Number of rounds where the bot chooses to go monkey
        self.num_features = len(dfEnv.columns) # Use every columns from the input data
        
        # Turn input data into index, row
        self.iter_env = self.env.iterrows()
        self.now_env_index, self.now_row = self.iter_env.next()

        # Numpy alternative
#         self.env_arr = self.env.values
#         self.now_row = 0

        # May need to put back later
#         self.prev_cash = self.cash
#         self.prev_share = self.share
#         self.prev_pv = self.pv

        # Q-table and q_df
        self.q_df_columns = list(self.env.columns)
        self.q_df_columns.extend(['Action', 'Q Value'])
        self.q_df = pd.DataFrame(columns=self.q_df_columns)
        
        self.q_dict = defaultdict(lambda: (0, 0)) # element of q_dict is (state, act): (q_value, t)
        self.q_dict_analysis = defaultdict(lambda: (0, 0))

        # Misc
        self.reset_counter = 0

    def make_q_df(self):
        """Make a q_df out of the q_dict."""
        print("Making q_df...")
        result_dict = defaultdict(list)
        for index, row in self.q_dict.iteritems():
            for i in range(len(self.q_dict.keys()[0])):
                column_name = 'col' + str(i + 1)
                result_dict[column_name].append(index[i])
            result_dict['Q'].append(self.q_dict[index][0])

        self.q_df = pd.DataFrame(result_dict)
        q_df_column_list = ['col' + str(x) for x in range(1, self.num_features - 1 + 1 + 1)] # features + action
        q_df_column_list.append('Q')
        self.q_df = self.q_df[q_df_column_list]

        def transfer_action(x):
            if x == 'Buy':
                return 1
            elif x == 'Sell':
                return 2
            elif x == 'Hold':
                return 0
            else:
                raise ValueError("Wrong action!")

        def str_float_int(x):
            return int(float(x))

        arr_int = np.vectorize(str_float_int)

        print(self.q_df.head())
        self.q_df.ix[:, -2] = self.q_df.ix[:, -2].apply(transfer_action)
        self.q_df.ix[:, :-1] = self.q_df.ix[:, :-1].apply(arr_int) # Maybe useless

    def split_q_df(self):
        """Splitting q_df into features and labels."""
        
        self.q_df_X = self.q_df.ix[:, :-1]
        self.q_df_y = self.q_df.ix[:, -1]

    def train_on_q_df(self):
        """Model the q_df."""
        print("Training on q_df...")
        self.q_reg = RandomForestRegressor(n_estimators=2000, max_features='sqrt', n_jobs=-1, random_state=self.random_state)
        self.q_reg = self.q_reg.fit(self.q_df_X, self.q_df_y)

    def update_q_model(self):
        """1. Make q_df
           2. Split q_df
           3. Train on q_df
        """
#         print("Updating Q model...")
#         start_time = time.time()
        self.make_q_df()
        self.split_q_df()
        self.train_on_q_df()
#         print("Update took {} seconds".format(time.time() - start_time))

    def from_state_action_predict_q(self, state_action):
        """Make prediction using self.reg"""
        state_action = [state_action]
        pred_q = self.q_reg.predict(state_action)

        return pred_q

    def max_q(self):
#         print("Calculating Max Q")
        def transfer_action(x):
            if x == 'Buy':
                return 1
            elif x == 'Sell':
                return 2
            elif x == 'Hold':
                return 0
            else:
                raise ValueError("Invalid action!")

#         def str_float_int(x):
#             return int(float(x))

        max_q = None
        q_compare_dict = {}

        if len(self.now_states) != self.num_features - 1:
            raise ValueError("Got ya bastard! @ MaxQ")

        # Populate the q_dict
        for act in set(self.valid_actions):
            # added 1 more additional features to the feature set
            self.now_states.append(act)
            now_row_key = tuple(self.now_states)

            _ = self.q_dict[now_row_key]

            try:
                self.q_reg
            except AttributeError:
                pass
                # print('No q_reg yet...going with default.')
            else:
                if _[1] == 0:
                    # print("Dreaming mode...")                    
                    single_X = np.array(now_row_key)
                    # print(single_X)
#                     arr_int = np.vectorize(str_float_int)
                    single_X[-1] = transfer_action(single_X[-1])
#                     single_X = arr_int(single_X)
                    single_X = single_X.reshape(1, -1)
                    pred_q = self.q_reg.predict(single_X)
                    dreamed_q = (1 - (1 / (self.q_dict[now_row_key][1] + 1))) * self.q_dict[now_row_key][0] + (1 / (self.q_dict[now_row_key][1] + 1)) * pred_q[0]
                    self.q_dict[now_row_key] = (dreamed_q, self.q_dict[now_row_key][1] + 1)

            q_compare_dict[now_row_key] = self.q_dict[now_row_key]
            self.now_states.pop()

        try:
            key, qAndT = max(q_compare_dict.iteritems(), key=lambda x:x[1])
        except ValueError:
            print("Wrong Q Value in Q Compare Dict!")
            sys.exit(1)
        else:
            return key[-1], qAndT[0], qAndT[1]

    def q_update(self):
#         print("Updating Q table...")
        # prev_states.append(self.prev_yes_share)
        self.prev_states.append(self.prev_action)
        prev_states_key = tuple(self.prev_states)

        if len(prev_states_key) != self.num_features - 1 + 1:
            raise ValueError("Got ya bastard! @ Q_Update")

        q_temp = self.q_dict[prev_states_key]
        q_temp0 = (1 - (1 / (q_temp[1] + 1))) * q_temp[0] + (1 / (q_temp[1] + 1)) * (self.reward + self.gamma * self.max_q()[1])

        self.q_dict[prev_states_key] = (q_temp0, q_temp[1] + 1)
        # For analysis purpose
        self.q_dict_analysis[prev_states_key] = (q_temp0, self.prev_env_index)

    def reset(self):
#         print("Resetting...")
        # Portfolio change over iterations
        self.asset_history_list.append(self.pv + self.cash)

        self.iter_env = self.env.iterrows()
        self.now_env_index, self.now_row = self.iter_env.next()
#         self.now_row = 0 # Numpy option

        self.cash = 1000
        self.share = 0
        self.pv = 0

        # Delete all prevs
        del self.prev_states
        del self.prev_env_index        
        del self.prev_cash
        del self.prev_share
        del self.prev_pv
        del self.prev_action

        if self.test_mode is True:
            self.epsilon = 0
        
        else:
            if self.epsilon - 1/self.random_rounds > 0.00001: # Epislon threshold: 0.01
                self.epsilon = self.epsilon - 1/self.random_rounds
            else:
                self.epsilon = 0.00001 # Epislon threshold: 0.1
                
        self.reset_counter += 1

        if self.reset_counter % self.random_rounds == 0:
            self.update_q_model()

        if np.abs(self.epsilon - 0.00001) > 0.000001:
            self.action_list = []

    def make_decision(self):
        return self.max_q()[0]

    def update(self, cost):
        # Update state
        self.now_states = list(self.now_row)
        self.now_states.pop() # Remove Trade Price

### Numpy option
#         try:
#             self.now_states = list(self.env_arr[self.now_row])
#         except IndexError:
#             print("End of data.")
#             sys.exit(1)
#         self.now_states.pop() # Remove Trade Price

        if len(self.now_states) != self.num_features - 1:
            raise ValueError("Got ya bastard! @ Q_Update...something wrong with the self.now_row!!!")

        # Update Q-table using prevs
        try:
            self.prev_states
        except AttributeError:
            pass
#             print("Running the first time...no prevs exist.")
        else:
            self.hold(self.now_row[-1])
            self.reward = ((self.cash - self.prev_cash) + (self.pv - self.prev_pv)) / (self.prev_cash + self.prev_pv)
            self.q_update()

        # All the prev stuff!
        self.prev_states = copy(self.now_states)
        self.prev_env_index = deepcopy(self.now_env_index)
#         self.prev_env_index = self.env.index[self.now_row] # Numpy option
        self.prev_cash = self.cash
        self.prev_share = self.share
        self.prev_pv = self.pv
        
        # Exploitation-exploration decisioning
        self.decision = np.random.choice(2, p = [self.epsilon, 1 - self.epsilon]) # decide to go random or with the policy
        # self.decision = 0 # Force random mode

        # print("Random decision: {0}, Epislon: {1}".format(self.decision, self.epsilon))
        if self.decision == 0: # if zero, go random
            action = random.choice(self.valid_actions)
        else: # else go with the policy
            action = self.make_decision()

        # Execute action and get reward
        if action == 'Buy':
            # print(self.now_row)
            self.buy(self.now_row[-1], cost)
#             self.buy(self.env_arr[self.now_row][-1], cost) # Numpy option
        elif action == 'Sell':
            # print(self.now_row)
            self.sell(self.now_row[-1], cost)
#             self.sell(self.env_arr[self.now_row][-1], cost) # Numpy option
        elif action == 'Hold':
            # print(self.now_row)
            self.hold(self.now_row[-1])
#             self.hold(self.env_arr[self.now_row][-1]) # Numpy option
        else:
            raise ValueError("Invalid action man!")
        
        self.prev_action = action
        
#         self.now_row += 1 # Numpy option

        try:
            self.now_env_index, self.now_row = self.iter_env.next()
        except StopIteration:
            pass

    def simulate(self, cost=None):
        start_time = time.time()

        for i in range(self.random_rounds):
            for l in range(len(self.env)):
#             for l in range(len(self.env_arr)): # Numpy option
                self.update(cost)
            self.reset()
            if (i + 1) % 500 == 0:
                print(self.asset_history_list[-1])
                print("Round {} finished".format(i + 1))
#             print(self.asset_history_list[-1])
#             print("Round {} finished".format(i + 1))
        print("{0} rounds of simulation with cost = {1}, took {2} seconds".format(self.random_rounds, cost, time.time() - start_time))
        return self.asset_history_list, self.action_list

In [83]:
god_chimp = ChimpBot(new_data_full_pca, iter_random_rounds=20000, gamma=0.9, random_state=0)
asset_history_list, action_list = god_chimp.simulate(cost='high')

print(pd.Series(action_list).describe())
print(asset_history_list[-1])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-83-666a92229d04> in <module>()
----> 1 god_chimp = ChimpBot(new_data_full_pca, iter_random_rounds=20000, gamma=0.9, random_state=0)
      2 asset_history_list, action_list = god_chimp.simulate(cost='high')
      3 
      4 print(pd.Series(action_list).describe())
      5 print(asset_history_list[-1])

NameError: name 'new_data_full_pca' is not defined

In [84]:
pd.Series(asset_history_list).plot()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-eb8094e9f01a> in <module>()
----> 1 pd.Series(asset_history_list).plot()

NameError: name 'asset_history_list' is not defined

In [85]:
(np.sign(pd.Series(asset_history_list)) * np.log(np.abs(pd.Series(asset_history_list)) + 1)).plot()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-85-af8d3d54665d> in <module>()
----> 1 (np.sign(pd.Series(asset_history_list)) * np.log(np.abs(pd.Series(asset_history_list)) + 1)).plot()

NameError: name 'asset_history_list' is not defined

In [86]:
# Convert Q-Table to Dataframe from the God Chimp (full dataset)
iter_random_rounds=20000
result_dict = defaultdict(list)
for index, row in god_chimp.q_dict_analysis.iteritems():
    for i in range(len(god_chimp.q_dict_analysis.keys()[0])):
        column_name = 'col' + str(i + 1)
        result_dict[column_name].append(index[i])
    result_dict['Q'].append(god_chimp.q_dict_analysis[index][0])
    result_dict['Date'].append(god_chimp.q_dict_analysis[index][1])

god_chimp_q_df = pd.DataFrame(result_dict)

# Yes share column removed
column_list = ['col' + str(x) for x in range(1, 301 + 1)]
column_list.extend(['Date', 'Q'])
god_chimp_q_df = god_chimp_q_df[column_list]
god_chimp_q_df.sort_values('Date', inplace=True)
god_chimp_q_df.reset_index(inplace=True)
del god_chimp_q_df['index']

god_chimp_q_df.reset_index(inplace=True)
del god_chimp_q_df['index']

god_chimp_q_df.set_index(god_chimp_q_df['Date'], inplace=True)
del god_chimp_q_df.index.name
del god_chimp_q_df['Date']

print(len(god_chimp_q_df))
display(god_chimp_q_df.head())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-86-15d0087f5d18> in <module>()
      1 # Convert Q-Table to Dataframe from the God Chimp (full dataset)
      2 iter_random_rounds=20000
----> 3 result_dict = defaultdict(list)
      4 for index, row in god_chimp.q_dict_analysis.iteritems():
      5     for i in range(len(god_chimp.q_dict_analysis.keys()[0])):

NameError: name 'defaultdict' is not defined

In [87]:
god_chimp = ChimpBot(new_data_full_pca, iter_random_rounds=5000, gamma=0.75, random_state=0)
asset_history_list, action_list = god_chimp.simulate(cost='high')

print(pd.Series(action_list).describe())
print(asset_history_list[-1])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-87-fd0dbf2c65f2> in <module>()
----> 1 god_chimp = ChimpBot(new_data_full_pca, iter_random_rounds=5000, gamma=0.75, random_state=0)
      2 asset_history_list, action_list = god_chimp.simulate(cost='high')
      3 
      4 print(pd.Series(action_list).describe())
      5 print(asset_history_list[-1])

NameError: name 'new_data_full_pca' is not defined

In [88]:
# Convert Q-Table to Dataframe from the God Chimp (full dataset)
iter_random_rounds=5000
result_dict = defaultdict(list)
for index, row in god_chimp.q_dict_analysis.iteritems():
    for i in range(len(god_chimp.q_dict_analysis.keys()[0])):
        column_name = 'col' + str(i + 1)
        result_dict[column_name].append(index[i])
    result_dict['Q'].append(god_chimp.q_dict_analysis[index][0])
    result_dict['Date'].append(god_chimp.q_dict_analysis[index][1])

god_chimp_q_df = pd.DataFrame(result_dict)

# Yes share column removed
column_list = ['col' + str(x) for x in range(1, 301 + 1)]
column_list.extend(['Date', 'Q'])
god_chimp_q_df = god_chimp_q_df[column_list]
god_chimp_q_df.sort_values('Date', inplace=True)
god_chimp_q_df.reset_index(inplace=True)
del god_chimp_q_df['index']

god_chimp_q_df.reset_index(inplace=True)
del god_chimp_q_df['index']

god_chimp_q_df.set_index(god_chimp_q_df['Date'], inplace=True)
del god_chimp_q_df.index.name
del god_chimp_q_df['Date']

print(len(god_chimp_q_df))
display(god_chimp_q_df.head())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-88-ac1ccf9bc746> in <module>()
      1 # Convert Q-Table to Dataframe from the God Chimp (full dataset)
      2 iter_random_rounds=5000
----> 3 result_dict = defaultdict(list)
      4 for index, row in god_chimp.q_dict_analysis.iteritems():
      5     for i in range(len(god_chimp.q_dict_analysis.keys()[0])):

NameError: name 'defaultdict' is not defined

In [89]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(god_chimp_q_df.ix[:, -2])
print(le.classes_)
god_chimp_q_df.ix[:, -2] = le.transform(god_chimp_q_df.ix[:, -2])

# = god_chimp_q_df.ix[:, -2].apply(action_to_int)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-89-9f2b6adc08c9> in <module>()
      1 from sklearn import preprocessing
      2 le = preprocessing.LabelEncoder()
----> 3 le.fit(god_chimp_q_df.ix[:, -2])
      4 print(le.classes_)
      5 god_chimp_q_df.ix[:, -2] = le.transform(god_chimp_q_df.ix[:, -2])

NameError: name 'god_chimp_q_df' is not defined

In [90]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

fs_data = god_chimp_q_df
fs_X = fs_data.ix[:, :-1]
fs_y = fs_data.ix[:, -1]
names = list(fs_X.columns)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-90-97289dbba162> in <module>()
      2 from sklearn.svm import SVR
      3 
----> 4 fs_data = god_chimp_q_df
      5 fs_X = fs_data.ix[:, :-1]
      6 fs_y = fs_data.ix[:, -1]

NameError: name 'god_chimp_q_df' is not defined

In [91]:
estimator = SVR(kernel="linear")
rfe = RFE(estimator, 37, step=1)
rfe = rfe.fit(fs_X, fs_y)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-91-dea849c1bbab> in <module>()
      1 estimator = SVR(kernel="linear")
      2 rfe = RFE(estimator, 37, step=1)
----> 3 rfe = rfe.fit(fs_X, fs_y)

NameError: name 'fs_X' is not defined

In [92]:
rfe_ranking = sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))
print("Features sorted by their rank:")
print(rfe_ranking)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-92-b22f7b70966a> in <module>()
----> 1 rfe_ranking = sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))
      2 print("Features sorted by their rank:")
      3 print(rfe_ranking)

AttributeError: 'RFE' object has no attribute 'ranking_'

In [93]:
rfe_cols = [x[1] for i, x in enumerate(rfe_ranking)]
rfe_cols = rfe_cols[:37]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-93-e2671c5e3315> in <module>()
----> 1 rfe_cols = [x[1] for i, x in enumerate(rfe_ranking)]
      2 rfe_cols = rfe_cols[:37]

NameError: name 'rfe_ranking' is not defined

In [94]:
from sklearn.linear_model import RandomizedLasso

rlasso = RandomizedLasso(alpha='aic')
rlasso.fit(fs_X, fs_y)
rlasso_ranking = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True)
print "Features sorted by their score:"
print(rlasso_ranking)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-94-282ab000677f> in <module>()
      2 
      3 rlasso = RandomizedLasso(alpha='aic')
----> 4 rlasso.fit(fs_X, fs_y)
      5 rlasso_ranking = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True)
      6 print "Features sorted by their score:"

NameError: name 'fs_X' is not defined

In [95]:
# 24 features before the drop
plt.plot([x[0] for i, x in enumerate(rlasso_ranking)])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-95-c93b1ea1d5fe> in <module>()
      1 # 24 features before the drop
----> 2 plt.plot([x[0] for i, x in enumerate(rlasso_ranking)])

NameError: name 'rlasso_ranking' is not defined

In [96]:
rlasso_cols = [x[1] for i, x in enumerate(rlasso_ranking) if x[0] >= 0.95]
print(len(rlasso_cols))
rlasso_cols = rlasso_cols[:96]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-96-9fe7c5139be7> in <module>()
----> 1 rlasso_cols = [x[1] for i, x in enumerate(rlasso_ranking) if x[0] >= 0.95]
      2 print(len(rlasso_cols))
      3 rlasso_cols = rlasso_cols[:96]

NameError: name 'rlasso_ranking' is not defined

In [97]:
# reduced_columns = ['col' + str(x) for x in range(1, 11)]
reduced_columns = []
# reduced_columns.extend(rlasso_cols)
reduced_columns.extend(rfe_cols)
reduced_columns.extend(['col301', 'Q'])
reduced_columns = list(set(reduced_columns))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-97-5807f3ac1af9> in <module>()
      2 reduced_columns = []
      3 # reduced_columns.extend(rlasso_cols)
----> 4 reduced_columns.extend(rfe_cols)
      5 reduced_columns.extend(['col301', 'Q'])
      6 reduced_columns = list(set(reduced_columns))

NameError: name 'rfe_cols' is not defined

In [98]:
print(len(reduced_columns))


0

In [99]:
god_chimp_q_df = god_chimp_q_df[reduced_columns]

action_idx = god_chimp_q_df.columns.get_loc('col301')
cols = god_chimp_q_df.columns.tolist()
cols = cols[:action_idx] + cols[action_idx + 1:] + [cols[action_idx]]
god_chimp_q_df = god_chimp_q_df.reindex(columns=cols)

q_idx = god_chimp_q_df.columns.get_loc('Q')
cols = god_chimp_q_df.columns.tolist()
cols = cols[:q_idx] + cols[q_idx + 1:] + [cols[q_idx]]
god_chimp_q_df = god_chimp_q_df.reindex(columns=cols)

# god_chimp_q_df.ix[:, -2] = god_chimp_q_df.ix[:, -2].apply(action_to_int)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-99-11919e6c34a2> in <module>()
----> 1 god_chimp_q_df = god_chimp_q_df[reduced_columns]
      2 
      3 action_idx = god_chimp_q_df.columns.get_loc('col301')
      4 cols = god_chimp_q_df.columns.tolist()
      5 cols = cols[:action_idx] + cols[action_idx + 1:] + [cols[action_idx]]

NameError: name 'god_chimp_q_df' is not defined

In [100]:
from sklearn.metrics import accuracy_score
# from sklearn.linear_model import LinearRegression
# import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

def find_best_training_size(data_full, full_q_df, training_sizes, testing_size, target_data, random_state=0):
    start_time = time.time()
    accs = []
    d_counter = 0

    # Loop through all batches in validation dataset
    (u, ) = data_full.index.get_indexer_for([target_data.index[0]])
    for d in range(u, u + testing_size * (len(target_data) // testing_size), testing_size):
        acc_num_train_months = []
        d_counter += 1

        # Dates in the batch
        date_range = data_full.iloc[d:d + testing_size].index
        
        # Loop through all sizes of training sets
        for num_train_month in range(1, training_sizes + 1):  
            # Prepare Training/Testing Datasets
            X_train = full_q_df.iloc[d - (int(21 * num_train_month)):d, :-1]
            y_train = full_q_df.iloc[d - (int(21 * num_train_month)):d, -1]
            X_test = full_q_df.ix[date_range, :-1]
            y_test = full_q_df.ix[date_range, -1]

            # Fit data and make predictions
            reg = GradientBoostingRegressor()
#             reg = KNeighborsRegressor(n_neighbors=5, weights='distance', n_jobs=-1)
#             reg = LinearRegression(n_jobs=-1)
#             reg = SVR(kernel='rbf')
#             reg = XGBRegressor()
            reg = RandomForestRegressor(n_estimators=1500, max_features='auto', oob_score=True, n_jobs=-1, random_state=random_state)
            reg.fit(X_train, y_train)

            y_pred = reg.predict(X_test)
            y_fit = reg.predict(X_train)

            pred_q = y_pred
            actions = X_test.ix[:, -1]
            data = {'Action': actions, 'Q': pred_q}
            df_pred = pd.DataFrame(data=data, index=y_test.index)

            pred_actions = []

            for date in date_range:  
                max_q = [0, -1]
                for i, r in df_pred.ix[date].iterrows():
                    if r['Q'] > max_q[1]:
                        max_q = [r['Action'], r['Q']]
                pred_actions.append(max_q[0])

            best_actions = []

            for date in date_range:
                max_q = [0, -1]
                for i, r in full_q_df.ix[date].iterrows():
                    if r['Q'] > max_q[1]:
                        max_q = [r[-2], r['Q']]
                best_actions.append(max_q[0])

            acc_num_train_months.append(accuracy_score(best_actions, pred_actions))
        accs.append(np.array(acc_num_train_months))
        print("Batch {0} completed....{1:.2f}%".format(d_counter, 100 * (d_counter / len(range(u, u + testing_size * (len(target_data) // testing_size), testing_size)))))
        geo_means = np.power(reduce(lambda x,y: x*y, accs), (1/len(accs)))
        arithmetic_means = reduce(lambda x,y: x+y, accs) / len(accs)
        print("Geometric Means Max: {}".format((np.argmax(geo_means) + 1, np.max(geo_means))))
        print("Arithemtic Means Max: {}".format((np.argmax(arithmetic_means) + 1, np.max(arithmetic_means))))
    
    print("Grid search best num_train_year took {} seconds:".format(time.time() - start_time))
    
    return (geo_means, arithmetic_means)

In [101]:
means = find_best_training_size(data_full=new_data_full_pca, full_q_df=god_chimp_q_df, training_sizes=120, testing_size=5, target_data=validation_phase_data, random_state=0)
geo_means = means[0]
arithmetic_means = means[1]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-101-f39bd43f9739> in <module>()
----> 1 means = find_best_training_size(data_full=new_data_full_pca, full_q_df=god_chimp_q_df, training_sizes=120, testing_size=5, target_data=validation_phase_data, random_state=0)
      2 geo_means = means[0]
      3 arithmetic_means = means[1]

NameError: name 'new_data_full_pca' is not defined

In [102]:
print(geo_means)
print(sorted(range(len(geo_means)), key=lambda k: geo_means[k], reverse=True))

print(arithmetic_means)
print(sorted(range(len(arithmetic_means)), key=lambda k: arithmetic_means[k], reverse=True))

validation_phase_data['Trade Price'].plot()
plt.figure()
plt.plot(geo_means)
plt.figure()
plt.plot(arithmetic_means)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-102-7439754c0bb9> in <module>()
----> 1 print(geo_means)
      2 print(sorted(range(len(geo_means)), key=lambda k: geo_means[k], reverse=True))
      3 
      4 print(arithmetic_means)
      5 print(sorted(range(len(arithmetic_means)), key=lambda k: arithmetic_means[k], reverse=True))

NameError: name 'geo_means' is not defined

In [103]:
from sklearn.model_selection import GridSearchCV

def grid_search(data_full, full_q_df, training_size, testing_size, target_data, random_state=0):
    start_time = time.time()
    accs = []
    d_counter = 0
#     feature_importance_list = []
    best_param_list = []

    # Loop through all batches in validation dataset
    (u, ) = data_full.index.get_indexer_for([target_data.index[0]])
    for d in range(u, u + testing_size * (len(target_data) // testing_size), testing_size):
        acc_num_train_months = []
        d_counter += 1

        # Dates in the batch
        date_range = data_full.iloc[d:d + testing_size].index
        
        # Loop through all sizes of training sets
        num_train_month = training_size
        # Prepare Training/Testing Datasets
        X_train = full_q_df.iloc[d - (int(21 * num_train_month)):d, :-1]
        y_train = full_q_df.iloc[d - (int(21 * num_train_month)):d, -1]
        X_test = full_q_df.ix[date_range, :-1]
        y_test = full_q_df.ix[date_range, -1]

        # Fit data and make predictions
#         reg = GradientBoostingRegressor()
#         param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3], 'min_samples_leaf': [1, 5, 10, 20, 50]}
#         reg = KNeighborsRegressor(n_jobs=-1)
#         param_grid = {'n_neighbors': [5, 10, 15, 20, 40, 80], 'weights': ['uniform', 'distance']}
        reg = RandomForestRegressor(n_estimators=128, max_features='sqrt', n_jobs=-1, random_state=random_state)
#         param_grid = {'n_estimators': [128, 1500], 'max_features': ['auto'], 'min_samples_leaf': [1, 10, 50]}

#         reg_gs = GridSearchCV(reg, param_grid, scoring='neg_mean_squared_error')
        reg.fit(X_train, y_train)
#         best_param_list.append(reg_gs.best_params_)

#         # Create feature importance histogram
#         feature_importance_list.append(reg.feature_importances_)
    
#         vif = [int(np.argmax(x)) for x in feature_importance_list]
#         vif = pd.DataFrame(vif)

        y_pred = reg.predict(X_test)
        y_fit = reg.predict(X_train)

        pred_q = y_pred
        actions = X_test.ix[:, -1]
        data = {'Action': actions, 'Q': pred_q}
        df_pred = pd.DataFrame(data=data, index=y_test.index)

        pred_actions = []

        for date in date_range:  
            max_q = [0, -1]
            for i, r in df_pred.ix[date].iterrows():
                if r['Q'] > max_q[1]:
                    max_q = [r['Action'], r['Q']]
            pred_actions.append(max_q[0])

        best_actions = []

        for date in date_range:
            max_q = [0, -1]
            for i, r in full_q_df.ix[date].iterrows():
                if r['Q'] > max_q[1]:
                    max_q = [r[-2], r['Q']]
            best_actions.append(max_q[0])

        acc_num_train_months.append(accuracy_score(best_actions, pred_actions))

        accs.append(np.array(acc_num_train_months))
        print("Batch {0} completed....{1:.2f}%".format(d_counter, 100 * (d_counter / len(range(u, u + testing_size * (len(target_data) // testing_size), testing_size)))))
        geo_means = np.power(reduce(lambda x,y: x*y, accs), (1/len(accs)))
        arithmetic_means = reduce(lambda x,y: x+y, accs) / len(accs)
        print("Geometric Means Max: {}".format((np.argmax(geo_means) + 1, np.max(geo_means))))
        print("Arithemtic Means Max: {}".format((np.argmax(arithmetic_means) + 1, np.max(arithmetic_means))))
    
    print("Grid search best num_train_year took {} seconds:".format(time.time() - start_time))
    
    return (geo_means, arithmetic_means, best_param_list)

In [104]:
results = grid_search(data_full=new_data_full_pca, full_q_df=god_chimp_q_df, training_size=35, testing_size=7, target_data=validation_phase_data, random_state=0)
geo_means = results[0]
arithmetic_means = results[1]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-104-0ff3520fc67f> in <module>()
----> 1 results = grid_search(data_full=new_data_full_pca, full_q_df=god_chimp_q_df, training_size=35, testing_size=7, target_data=validation_phase_data, random_state=0)
      2 geo_means = results[0]
      3 arithmetic_means = results[1]

NameError: name 'new_data_full_pca' is not defined

In [105]:
# Start simulation for the chimp ------ get new full q_df
start_time = time.time()

num_iter = 1500
day_count = 0
pv_history_list = []

new_data_features = new_data_full.ix[:, :-1]
new_data_features_norm = normalization(new_data_features, new_data_features)
new_data_features_pca = pca_transform_reconstruct(new_data_features_norm, new_data_features_norm)
new_data_full_norm2 = normalization(new_data_feature_pca, new_data_features_pca)

new_data_full_norm2['Trade Price'] = new_data_full.ix[:, 'Trade Price']
new_data_full = new_data_full_norm2

# display(new_data_full.isnull().sum())
# display(new_data_full.describe())
display(new_data_full.head())


chimp_analytics = EnhancedChimpBot(new_data_full)

start_time = time.time()

for i in range(num_iter):
    for l in range(len(chimp_analytics.env)):
        chimp_analytics.update()
    pv_history_list.append(chimp_analytics.cash + chimp_analytics.pv)
    chimp_analytics.reset()

print("{0} rounds of training took {1} seconds".format(num_iter, time.time() - start_time))

print(pv_history_list[-1])

# Convert Q-Table to Dataframe from trained chimp (full)
result_dict = defaultdict(list)
for index, row in chimp_analytics.q_dict_analysis.iteritems():
    for i in range(len(chimp_analytics.q_dict_analysis.keys()[0])):
        column_name = 'col' + str(i + 1)
        result_dict[column_name].append(index[i])
    result_dict['Q'].append(chimp_analytics.q_dict_analysis[index][0])
    result_dict['Date'].append(chimp_analytics.q_dict_analysis[index][1])

new_full_q_df = pd.DataFrame(result_dict)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-105-160098eda582> in <module>()
      8 pv_history_list = []
      9 
---> 10 new_data_features = new_data_full.ix[:, :-1]
     11 new_data_features_norm = normalization(new_data_features, new_data_features)
     12 new_data_features_pca = pca_transform_reconstruct(new_data_features_norm, new_data_features_norm)

NameError: name 'new_data_full' is not defined

In [106]:
new_column_list = ['col' + str(x) for x in range(1, 247 + 1)]
new_column_list.extend(['Q'])
new_full_q_df = new_full_q_df[new_column_list]
new_full_q_df = new_full_q_df.sort_index()
del new_full_q_df.index.name

display(new_full_q_df.head())
print(type(new_full_q_df.index[0]))

new_full_q_df['col247'] = new_full_q_df['col247'].apply(action_to_int)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-106-6f31fd9fb53e> in <module>()
      1 new_column_list = ['col' + str(x) for x in range(1, 247 + 1)]
      2 new_column_list.extend(['Q'])
----> 3 new_full_q_df = new_full_q_df[new_column_list]
      4 new_full_q_df = new_full_q_df.sort_index()
      5 del new_full_q_df.index.name

NameError: name 'new_full_q_df' is not defined

In [107]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

start_time = time.time()

accs = []

training_sizes = 48
testing_size = 7

# Loop through all batches in validation dataset
(u, ) = new_data_full.index.get_indexer_for([validation_phase_data.index[0]])
for d in range(u, u + testing_size * (252 // testing_size), testing_size):
    acc_num_train_months = []
    
#     Dates in the batch
    date_range = new_data_full.iloc[d:d + testing_size].index
    
    # Loop through all sizes of training sets
    for num_train_month in range(1, training_sizes + 1):  
#     for num_train_month in range(1, 240 + 1):
        # Prepare Training/Testing Datasets
        X_train = new_full_q_df.iloc[d - (int(21 * num_train_month)):d, :-1]
        y_train = new_full_q_df.iloc[d - (int(21 * num_train_month)):d, -1]
        X_test = new_full_q_df.ix[date_range, :-1]
        y_test = new_full_q_df.ix[date_range, -1]
        
        # Fit data and make predictions
        reg = RandomForestRegressor(n_estimators=1500, max_features='sqrt', oob_score=True, n_jobs=-1, random_state=0)
        reg.fit(X_train, y_train)

        y_pred = reg.predict(X_test)
        y_fit = reg.predict(X_train)

        pred_q = y_pred
        actions = X_test['col247']
        data = {'Action': actions, 'Q': pred_q}
        df_pred = pd.DataFrame(data=data, index=y_test.index)

        pred_actions = []

        for date in date_range:  
            max_q = [0, -1]
            for i, r in df_pred.ix[date].iterrows():
                if r['Q'] > max_q[1]:
                    max_q = [r['Action'], r['Q']]
            pred_actions.append(max_q[0])
            
        best_actions = []

        for date in date_range:
            max_q = [0, -1]
            for i, r in new_full_q_df.ix[date].iterrows():
                if r['Q'] > max_q[1]:
                    max_q = [r['col247'], r['Q']]
            best_actions.append(max_q[0])

        acc_num_train_months.append(accuracy_score(best_actions, pred_actions))
    accs.append(np.array(acc_num_train_months))
    print("Batch {0} completed. Total progress {1}%".format(d + 1 - u, d / (u + testing_size * (252 // testing_size))))
    harmonic_means = np.power(reduce(lambda x,y: x*y, accs), (1/len(accs)))
    arithmetic_means = reduce(lambda x,y: x+y, accs) / len(accs)
    print("Harmonic Means Max: {}".format((np.argmax(harmonic_means) + 1, np.max(harmonic_means))))
    print("Arithemtic Means Max: {}".format((np.argmax(arithmetic_means) + 1, np.max(arithmetic_means))))
    
print("Grid search best num_train_year took {} seconds:".format(time.time() - start_time))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-107-f032054f43b3> in <module>()
     11 
     12 # Loop through all batches in validation dataset
---> 13 (u, ) = new_data_full.index.get_indexer_for([validation_phase_data.index[0]])
     14 for d in range(u, u + testing_size * (252 // testing_size), testing_size):
     15     acc_num_train_months = []

NameError: name 'new_data_full' is not defined

In [108]:
from collections import defaultdict
from datetime import datetime, timedelta
from copy import deepcopy

from sklearn import cross_validation
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import grid_search

class ChimpBot(MonkeyBot):
    """An agent that learns to drive in the smartcab world."""
    num_features = 246
    valid_actions = ['Buy', 'Sell']

    num_trial = 500
    trial_counter = 0 # For getting the trial number

    random_rounds = 1500 # Number of rounds where the bot chooses to go monkey

    trial_meta_info = {} # For monitoring what happens in each trial

    epsilon = 1
    gamma = 0.75
    random_reward = [0]

    random_counter = 0
    policy_counter = 0

    track_key1 = {'Sell': 0, 'Buy': 0, 'Hold': 0}
    track_key2 = {'Sell': 0, 'Buy': 0, 'Hold': 0}

    track_random_decision = {'Sell': 0, 'Buy': 0, 'Hold': 0}

    reset_counter = 0

    def __init__(self, dfEnv, cash=1000, share=0, pv=0):
        super(ChimpBot, self).__init__(dfEnv, cash, share, pv)
        # sets self.cash = 1000
        # sets self.share = 0
        # sets self.pv = 0
        # sets self.pv_history_list = []
        # sets self.env = dfEnv
        # implements buy(self, stock_price)
        # implements sell(self, stock_price)
        # implements hold(self)

        self.iter_env = self.env.iterrows()
        self.now_env_index, self.now_row = self.iter_env.next()

        # self.now_yes_share = 0
        self.now_action = ''
        # self.now_q = 0

        self.prev_cash = self.cash
        self.prev_share = self.share
        self.prev_pv = self.pv

        self.q_df_columns = list(self.env.columns)
        self.q_df_columns.pop()
        self.q_df_columns.extend(['Action', 'Q Value'])
        self.q_df = pd.DataFrame(columns=self.q_df_columns)
        self.q_dict = defaultdict(lambda: (0, 0)) # element of q_dict is (state, act): (q_value, t)
        self.q_dict_analysis = defaultdict(lambda: (0, 0))

        self.negative_reward = 0
        self.n_reward_hisotry = []
        self.net_reward = 0

        self.reset_counter = 0

        # Smartcab use only
        # self.penalty = False
        # self.num_step = 0 # Number of steps for each trial; get reset each time a new trial begins

    def make_q_df(self):
        result_dict = defaultdict(list)

        for index, row in self.q_dict.iteritems():
            for i in range(len(self.q_dict.keys()[0])):
                column_name = 'col' + str(i + 1)
                result_dict[column_name].append(index[i])
            result_dict['Q'].append(self.q_dict[index][0])

        self.q_df = pd.DataFrame(result_dict)
        q_df_column_list = ['col' + str(x) for x in range(1, self.num_features + 1 + 1)]
        q_df_column_list.append('Q')
        # q_df_column_list = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27', 'col28', 'col29', 'col30', 'col31', 'col32', 'col33', 'col34', 'col35', 'col36', 'col37', 'col38', 'col39', 'Q']
        self.q_df = self.q_df[q_df_column_list]

        def transfer_action(x):
            if x == 'Buy':
                return 1
            elif x == 'Sell':
                return 2
            elif x == 'Hold':
                return 0
            else:
                raise ValueError("Wrong action!")

        def str_float_int(x):
            return int(float(x))

        arr_int = np.vectorize(str_float_int)

        self.q_df['col' + str(self.num_features + 1)] = self.q_df['col' + str(self.num_features + 1)].apply(transfer_action)
        self.q_df.ix[:, :-1] = self.q_df.ix[:, :-1].apply(arr_int)

    def split_q_df(self):
        self.q_df_X = self.q_df.ix[:, :-1]
        self.q_df_y = self.q_df.ix[:, -1]
        # self.X_train, self.X_test, self.y_train, self.y_test = cross_validation.train_test_split(self.q_df_X, self.q_df_y, test_size=0.1, random_state=0)

    def train_on_q_df(self):
        reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=25), n_estimators=50, random_state=0)
        self.q_reg = reg
        self.q_reg = self.q_reg.fit(self.q_df_X, self.q_df_y)

    def update_q_model(self):
#         print("Updating Q model...")
        start_time = time.time()
        self.make_q_df()
        self.split_q_df()
        self.train_on_q_df()
#         print("Update took {} seconds".format(time.time() - start_time))

    def from_state_action_predict_q(self, state_action):
        state_action = [state_action]

        pred_q = self.q_reg.predict(state_action)

        return pred_q

    # def yes_share(self):
    #     # Represent chimp asset in state_action
    #     if self.share > 0:
    #         return 1
    #     else:
    #         return 0

    def max_q(self, now_row):
        def transfer_action(x):
            if x == 'Buy':
                return 1
            elif x == 'Sell':
                return 2
            elif x == 'Hold':
                return 0
            else:
                raise ValueError("Wrong action!")

        def str_float_int(x):
            return int(float(x))

        now_row2 = list(now_row)
        # now_row2.append(self.now_yes_share)
        max_q = ''
        q_compare_dict = {}

        if len(now_row2) > self.num_features:
            raise ValueError("Got ya bastard! @ MaxQ")

        # Populate the q_dict
        for act in set(self.valid_actions):
            now_row2.append(act)
            now_row_key = tuple(now_row2)

            _ = self.q_dict[now_row_key]

            # # K-Q Algorithm
            # if np.random.choice(2, p = [0.9, 0.1]) == 1 and len(self.q_dict) > 30000:
            # if _[1] == 0 and np.random.choice(2, p = [0.7, 0.3]) == 1 and len(self.q_dict) > 30000:
            try:
                self.q_reg
            except AttributeError:
                pass
                # print('No q_reg yet...going with default.')
            else:
                if _[1] == 0:
                    # print("Dreaming mode...")
                    # start_time = time.time()
                    # self.update_q_model()

                    single_X = np.array(now_row_key)
                    # print(single_X)
                    arr_int = np.vectorize(str_float_int)
                    single_X[-1] = transfer_action(single_X[-1])
                    single_X = arr_int(single_X)
                    single_X = single_X.reshape(1, -1)
                    pred_q = self.q_reg.predict(single_X)
                    dreamed_q = (1 - (1 / (self.q_dict[now_row_key][1] + 1))) * self.q_dict[now_row_key][0] + (1 / (self.q_dict[now_row_key][1] + 1)) * pred_q[0]
                    self.q_dict[now_row_key] = (dreamed_q, self.q_dict[now_row_key][1] + 1)
                    # print("Q-dreamed: {0} for Act: {1}, taking {2} seconds.".format(self.q_dict[now_row_key], act, time.time() - start_time))

            # print(act, self.q_dict[now_row_key])

            q_compare_dict[now_row_key] = self.q_dict[now_row_key]
            now_row2.pop()

        try:
            max(q_compare_dict.iteritems(), key=lambda x:x[1])
        except ValueError:
            print("Wrong Q Value in Q Compare Dict!")
        else:
            key, qAndT = max(q_compare_dict.iteritems(), key=lambda x:x[1])
            # print("Action: {0}, with Q-value: {1}".format(key[-1], qAndT[0]))
            return key[-1], qAndT[0], qAndT[1]

    def q_update(self):
        # print("Data Index: {}".format(self.now_env_index))
        now_states = list(self.now_row)
        # now_states = list(now_states)
        now_states.pop() # disregard the Trade Price

        prev_states = list(self.prev_states)

        if len(prev_states) > self.num_features:
            raise ValueError("Got ya bastard! @ Q_Update...something wrong with the self.prev_states!!!")

        # prev_states.append(self.prev_yes_share)
        prev_states.append(self.prev_action)
        prev_states_key = tuple(prev_states)

        if len(prev_states_key) > self.num_features + 2:
            raise ValueError("Got ya bastard! @ Q_Update")

        q_temp = self.q_dict[prev_states_key]

        q_temp0 = (1 - (1 / (q_temp[1] + 1))) * q_temp[0] + (1 / (q_temp[1] + 1)) * (self.reward + self.gamma * self.max_q(now_states)[1])

        if prev_states_key[:-1] == ('Low', 'Low', 'Average', 'Average', 'Low', 'Average', 'Average', 'Average', 'Low', 'Low', 'Low', 'Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'N-Very Low', 'Low', 'Average', 'N-Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'High', 'Yes'):
            self.track_key1[prev_states_key[-1]] += 1
        elif prev_states_key[:-1] == ('Low', 'Low', 'Average', 'Average', 'Low', 'Average', 'Average', 'Average', 'Low', 'Low', 'Low', 'Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'N-Very Low', 'Low', 'Average', 'N-Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'High', 'No'):
            self.track_key2[prev_states_key[-1]] += 1
        # elif prev_states_key[:-1] == ('Very High', 'Very High', 'Very High', 'Very High', 'Very High', 'Very High', 'Average', 'High', 'Average', 'Average', 'Average', 'Low', 'Average', 'Very Low', 'Low', 'N-Very Low', 'N-Very Low', 'N-Very Low', 'N-Very Low', 'Very Low', 'Very Low', 'Average', 'Very Low', 'Low', 'Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Low', 'Very Low', 'Low', 'Very Low', 'Average', 'No'):
        #     self.track_key2[prev_states_key[-1]] += 1

        self.q_dict[prev_states_key] = (q_temp0, q_temp[1] + 1)
        # For analysis purpose
        self.q_dict_analysis[prev_states_key] = (q_temp0, self.prev_env_index)
        # print("Now Action: {}".format())
        # print(prev_states_key)
        return (self.q_dict[prev_states_key])

    def policy(self, now_row):
        return self.max_q(now_row)[0]

    def reset(self):
        # Portfolio change over iterations
        self.pv_history_list.append(self.pv + self.cash)

        self.iter_env = self.env.iterrows()
        self.now_env_index, self.now_row = self.iter_env.next()

        self.cash = 1000
        self.share = 0
        self.pv = 0

        self.prev_cash = self.cash
        self.prev_share = self.share
        self.prev_pv = self.pv

        if self.epsilon - 1/self.random_rounds > 0.001: # Epislon threshold: 0.01
            self.random_counter += 1
            self.epsilon = self.epsilon - 1/self.random_rounds
        else:
            self.epsilon = 0.001 # Epislon threshold: 0.1
            self.policy_counter += 1

        self.net_reward = 0

        self.reset_counter += 1

        if self.reset_counter % random_rounds == 0:
            self.update_q_model()

        # self.num_step = 0 # Recalculate the steps for the new trial
        # self.penalty = False
        # self.fail = False

    def make_decision(self, now_row):
        return self.policy(now_row)

    def update(self):
        # Update state
        now_states = list(self.now_row)

        if len(now_states) > self.num_features + 1:
            print(len(now_states))
            print(self.num_features)
            raise ValueError("Got ya bastard! @ Q_Update...something wrong with the self.now_row!!!")

        # now_states = list(now_states)
        # print(type(self.now_row))
        now_states.pop() # disregard the Trade Price

        if len(now_states) > self.num_features:
            print(now_states)
            raise ValueError("Got ya bastard! @ Q_Update...something wrong with now_states after pop!!!")

        # Exploitation-exploration decisioning
        random.seed(datetime.now())
        self.decision = np.random.choice(2, p = [self.epsilon, 1 - self.epsilon]) # decide to go random or with the policy
        # self.decision = 0 # Force random mode

        # print("Random decision: {0}, Epislon: {1}".format(self.decision, self.epsilon))
        # print("What the FUCK?!")
        if self.decision == 0: # if zero, go random
            random.seed(datetime.now())
            action = random.choice(self.valid_actions)
            # if tuple(now_states) == ('Low', 'Low', 'Average', 'Average', 'Low', 'Average', 'Average', 'Average', 'Low', 'Low', 'Low', 'Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'N-Very Low', 'Low', 'Average', 'N-Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'Very Low', 'High'):
            #     self.track_random_decision[action] += 1
        else: # else go with the policy
            # print("now_states: {}".format(now_states))
            # self.now_yes_share = self.yes_share()
            action = self.make_decision(now_states)

        if len(now_states) > self.num_features:
            print(now_states)
            raise ValueError("Got ya bastard! @ Q_Update...something wrong with now_states after make_decision!!!")

        # print("Now Action Real: {}".format(action))
        # Execute action and get reward
        if action == 'Buy':
            # print(self.now_row)
            self.buy(self.now_row[-1])
        elif action == 'Sell':
            # print(self.now_row)
            self.sell(self.now_row[-1])
        elif action == 'Hold':
            # print(self.now_row)
            self.hold(self.now_row[-1])
        else:
            raise ValueError("Wrong action man!")

        try:
            self.prev_states
        except AttributeError:
            print("Running the first time...no prevs exist.")
        else:
            self.reward = ((self.cash - self.prev_cash) + (self.pv - self.prev_pv)) / (self.prev_cash + self.prev_pv)
            self.q_update()

        self.prev_states = now_states

        if len(now_states) > self.num_features:
            raise ValueError("Got ya bastard! @ Q_Update...something wrong with the now_states!!!")

        self.now_action = action
        self.prev_action = action
        # self.prev_yes_share = self.now_yes_share
        self.prev_env_index = deepcopy(self.now_env_index)
        self.prev_cash = self.cash
        self.prev_share = self.share
        self.prev_pv = self.pv

        # if len(self.q_dict) > 20000:
        #     self.update_q_model()

        try:
            self.now_env_index, self.now_row = self.iter_env.next()
        except StopIteration:
            pass
            # print("End of data.")
        else:
            pass

        # if reward < 0:
        #     self.penalty = True

        try:
            _ = self.reward
        except AttributeError:
            print("No reward yet...0 assigned.")
            self.reward = 0
        # print "ChimpBot.update(): Action: {0} at Price: {1}, Cash: {2}, Num_Share: {3}, Cash + PV = {4}, Reward = {5}".format(action, self.now_row[-1], self.cash, self.share, self.cash + self.pv, self.reward)  # [debug]
        # print('Portfolio + Cash: {}'.format(self.cash + self.pv))
        # print("================================")


/home/calvinjku/anaconda2/envs/python2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/calvinjku/anaconda2/envs/python2/lib/python2.7/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [109]:
def main_simulate():
# Initiating data and the chimp
    start_date = test_phase_data.index[0]
    end_date = test_phase_data.index[-1]
    
    global data_full
    dfFull = data_full
    train_size = 21 * 8
    batch_size = 7
    date_range = test_phase_data.index[:] # Using 7 months of data to predict one month
    print(date_range)
    
    batch_count = 0
        
    cash = 1000
    share = 0
    pv = 0
    now_yes_share = 0

    for batch in range(len(test_phase_data) // batch_size):
        
#     for date in date_range:
        batch_count += 1
        print("Batch {}".format(batch_count))

        try:
            dfTest = dfFull.ix[test_phase_data.index[batch * batch_size]:test_phase_data.index[batch * batch_size + batch_size - 1]]
        except IndexError:
            dfTest = dfFull.ix[test_phase_data.index[batch * batch_size]:test_phase_data.index[-1]]
        
        (u,) = dfFull.index.get_indexer_for([test_phase_data.index[batch * batch_size]])

        dfTrain = dfFull.iloc[u - (train_size):u]
        
        # Normalization
        dfTrain.ix[:, :-1] = (dfTrain.ix[:, :-1] - dfTrain.ix[:, :-1].mean()) / dfTrain.ix[:, :-1].max() - dfTrain.ix[:, :-1].min())
        dfTest.ix[:, :-1] = (dfTest.ix[:, :-1] - dfTrain.ix[:, :-1].mean()) / dfTrain.ix[:, :-1].max() - dfTrain.ix[:, :-1].min())

        pca_dim = 246
        pca = PCA(n_components=pca_dim)
        pca.fit(dfTrain.ix[:, :-1])

        chimp_train = ChimpBot(dfTrain)

        for i in range(1500):
            for l in range(len(chimp_train.env)):
                # print("Train Round {0}-{1}".format(i + 1, l + 1))
                chimp_train.update()
            chimp_train.reset()

        # Test the Chimp!
        q_df = deepcopy(chimp_train.q_df)
        q_dict = deepcopy(chimp_train.q_dict)
        q_reg = deepcopy(chimp_train.q_reg)

        try:
            _ = chimp_test
        except NameError:
            print("First time running...")
        else:
            cash = chimp_test.cash
            share = chimp_test.share
            pv = chimp_test.pv
            now_yes_share = chimp_test.now_yes_share

        chimp_test = ChimpBot(dfTest, cash=cash, share=share, pv=pv, now_yes_share=now_yes_share)

        chimp_test.q_df = deepcopy(q_df)
        chimp_test.q_dict = deepcopy(q_dict)
        chimp_test.q_reg = deepcopy(q_reg)
        chimp_test.epsilon = 0

        # Pass the cheatsheet to the next chimp
        try:
            chimp_test.prev_states = prev_states
            chimp_test.now_action = now_action
            chimp_test.prev_action = prev_action
            chimp_test.prev_yes_share = prev_yes_share
            chimp_test.reward = reward
            chimp_test.prev_cash = prev_cash
            chimp_test.prev_share = prev_share
            chimp_test.prev_pv = prev_pv
            chimp_test.prev_env_index = prev_env_index

        except UnboundLocalError:
            print("No cheatsheet to pass over yet...no worries!")

        for l in range(len(chimp_test.env)):
            # print("Train Round {0}-{1}".format(i + 1, l + 1))
            chimp_test.update()
                
        # Create cheatsheet for the next chimp
        prev_states = chimp_test.prev_states
        now_action = chimp_test.now_action
        prev_action = chimp_test.prev_action
        prev_yes_share = chimp_test.prev_yes_share
        prev_env_index = chimp_test.prev_env_index
        reward = chimp_test.reward
        prev_cash = chimp_test.prev_cash
        prev_share = chimp_test.prev_share
        prev_pv = chimp_test.prev_pv

        global action_lists
        action_lists.append(chimp_test.action_list)
        
        global pv_history_list
        pv_history_list.append(chimp_test.cash + chimp_test.pv)
        
        if (batch + 1) % 3 == 0:
            print(pv_history_list)
        
    print(pv_history_list)


  File "<ipython-input-109-7f0a073d3d3d>", line 38
    dfTrain.ix[:, :-1] = (dfTrain.ix[:, :-1] - dfTrain.ix[:, :-1].mean()) / dfTrain.ix[:, :-1].max() - dfTrain.ix[:, :-1].min())
                                                                                                                               ^
SyntaxError: invalid syntax

In [110]:
# PCA Definition
# feature_days = 21 * 6 = 126
# n = 271: 0.96 <-- goldilock 1
# n = 130: 0.895 <-- goldilock 2
# Going with n = 246
pca_dim = 246
pca = PCA(n_components=pca_dim)
pca.fit(df_features[:last_training_day]) # all data

print(np.sum(pca.explained_variance_ratio_))
# print(pca.explained_variance_ratio_)

print("Running PCA took {} seconds".format(time.time() - start_time))


# PCA transformation
cp_imp = pca.explained_variance_ratio_
cp_imp = pd.Series(cp_imp)
cp_imp_cum = cp_imp.cumsum()
cp_imp_cum.plot()

df_features_pca = pca.transform(df_features)
df_features_pca = pd.DataFrame(data=df_features_pca)
df_features_pca['Date'] = df_features.index
df_features_pca.set_index(df_features_pca['Date'], inplace=True)
del df_features_pca.index.name
del df_features_pca['Date']

# Normalization post PCA
df_features_pca = (df_features_pca - df_features_pca[:last_training_day].mean()) / (df_features_pca[:last_training_day].max() - df_features_pca[:last_training_day].min())

# Reconstruct dataset
df_full = df_features_pca
df_full = df_features_pca['Trade Price'] = df_labels
df_full = df_features_pca
display(df_full.head())



# Start simulation for the chimp
num_iter = 1500
day_count = 0
pv_history_list = []

chimp = ChimpBot(df_full)

start_time = time.time()

for i in range(num_iter):
    for l in range(len(chimp.env)):
        chimp.update()
    pv_history_list.append(chimp.cash + chimp.pv)
    print(pv_history_list[-1])
    chimp.reset()
    
print("{0} rounds of training took {1} seconds".format(num_iter, time.time() - start_time))


# Convert Q-Table to Dataframe from trained chimp (full)
result_dict = defaultdict(list)
for index, row in chimp.q_dict_analysis.iteritems():
    for i in range(len(chimp.q_dict_analysis.keys()[0])):
        column_name = 'col' + str(i + 1)
        result_dict[column_name].append(index[i])
    result_dict['Q'].append(chimp.q_dict_analysis[index][0])
    result_dict['Date'].append(chimp.q_dict_analysis[index][1])

q_df = pd.DataFrame(result_dict)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-110-faacc422734f> in <module>()
      7 pca_dim = 246
      8 pca = PCA(n_components=pca_dim)
----> 9 pca.fit(df_features[:last_training_day]) # all data
     10 
     11 print(np.sum(pca.explained_variance_ratio_))

NameError: name 'df_features' is not defined

In [ ]: