In [1]:
%matplotlib inline
import random
from operator import add
from functools import reduce
import matplotlib.pyplot as plt
import math
import numpy as np
import numpy.random as nrand
import pandas as pd
import itertools
import scipy.stats as stats
import powerlaw
from stockmarket import baselinemodel
from tqdm import tqdm
from pandas_datareader import data
from pylab import plot, show
from math import isclose
from stockmarket.stylizedfacts import *
import itertools
import quandl
from SALib.sample import latin
from statistics import mean
import bisect
In [2]:
start_date = '2010-01-01'
end_date = '2016-12-31'
spy = data.DataReader("SPY",
start=start_date,
end=end_date,
data_source='google')['Close']
spy_returns = spy.pct_change()[1:]
spy_volume = data.DataReader("SPY",
start=start_date,
end=end_date,
data_source='google')['Volume']
In [3]:
# calculate stylized facts for SP500
spy_autocorrelation = autocorrelation_returns(spy_returns, 25)
spy_kurtosis = kurtosis(spy_returns)
spy_autocorrelation_abs = autocorrelation_abs_returns(spy_returns, 25)
spy_hurst = hurst(spy, lag1=2 , lag2=20)
spy_cor_volu_vola = correlation_volume_volatility(spy_volume, spy_returns, window=10)
stylized_facts_spy = [spy_autocorrelation, spy_kurtosis, spy_autocorrelation_abs, spy_hurst, spy_cor_volu_vola]
print('spy_autocorrelation ',spy_autocorrelation)
print('spy_kurtosis ', spy_kurtosis)
print('spy_autocorrelation_abs ', spy_autocorrelation_abs)
print('spy_hurst ', spy_hurst)
print('spy_cor_volu_vola ',spy_cor_volu_vola)
In [4]:
SIMTIME = 200
NRUNS = 5
backward_simulated_time = 400
initial_total_money = 26000
init_profit = 1000
init_discount_rate = 0.17
We define the parameter bounds as follows.
| Parameter | Values (start, stop, step) |
|---|---|
| share_chartists | 0 - 1, 0.1 |
| share_mean_reversion | 0 - 1, 0.1 |
| order_expiration_time | 1000 - 10000, 1000 |
| agent_order_price_variability | 1 - 10, 1 |
| agent_order_variability | 0.1 - 5 |
| agent_ma_short | 5 - 100, 5 |
| agent_ma_long | 50 - 400, 50 |
| agents_hold_thresholds | 0.0005 |
| Agent_volume_risk_aversion | 0.1 - 1, 0.1 |
| Agent_propensity_to_switch | 0.1 - 2.2, 0.1 |
| profit_announcement_working_days | 5 - 50, 5 |
| price_to_earnings_spread | 5 - 50, 5 |
| price_to_earnings_heterogeneity | 5 - 50, 5 |
In [5]:
parameter_space = {'share_chartists':[0.0, 1.0], 'share_mean_reversion':[0.0, 1.0], 'order_expiration_time':[1000, 10000],
'agent_order_price_variability':[1, 10], 'agent_order_variability':[0.1, 5.0],
'agent_ma_short':[5, 100], 'agent_ma_long':[50, 400], 'agents_hold_thresholds':[0.00005,0.01],
'agent_volume_risk_aversion':[0.1, 1.0], 'agent_propensity_to_switch':[0.1, 2.2],
'profit_announcement_working_days':[5, 50], 'price_to_earnings_base':[10,20],
'price_to_earnings_heterogeneity':[1.1,2.5], 'price_to_earnings_gap':[4,20],
'longMA_heterogeneity':[1.1,1.8], 'shortMA_heterogeneity':[1.1,1.8], 'shortMA_memory_divider':[1, 10]}
Then, we determine the amount of starting points we want for the genetic algorithm and sample the parameter space using a Latin hypercube sample.
In [6]:
population_size = 8
In [7]:
problem = {
'num_vars': 17,
'names': ['share_chartists', 'share_mean_reversion', 'order_expiration_time', 'agent_order_price_variability',
'agent_order_variability', 'agent_ma_short', 'agent_ma_long', 'agents_hold_thresholds',
'agent_volume_risk_aversion', 'agent_propensity_to_switch', 'profit_announcement_working_days',
'price_to_earnings_base', 'price_to_earnings_heterogeneity', 'price_to_earnings_gap',
'longMA_heterogeneity', 'shortMA_heterogeneity', 'shortMA_memory_divider'],
'bounds': [[0.0, 1.0], [0.0, 1.0], [1000, 10000], [1, 10],
[0.1, 5.0], [5, 100], [50, 400], [0.00005,0.01],
[0.1, 1], [0.1, 2.2], [5, 50],
[10,20], [1.1,2.5], [4,20],
[1.1,1.8], [1.1,1.8], [1, 10]]
}
In [8]:
latin_hyper_cube = latin.sample(problem=problem, N=population_size)
latin_hyper_cube = latin_hyper_cube.tolist()
In [9]:
# transform some of the parameters to integer
for idx, parameters in enumerate(latin_hyper_cube):
latin_hyper_cube[idx][2] = int(latin_hyper_cube[idx][2])
latin_hyper_cube[idx][3] = int(latin_hyper_cube[idx][3])
latin_hyper_cube[idx][4] = int(latin_hyper_cube[idx][4])
latin_hyper_cube[idx][5] = int(latin_hyper_cube[idx][5])
latin_hyper_cube[idx][6] = int(latin_hyper_cube[idx][6])
latin_hyper_cube[idx][10] = int(latin_hyper_cube[idx][10])
latin_hyper_cube[idx][11] = int(latin_hyper_cube[idx][11])
latin_hyper_cube[idx][13] = int(latin_hyper_cube[idx][13])
latin_hyper_cube[idx][16] = int(latin_hyper_cube[idx][16])
We try to match average simulation stylized facts as closely as possible to observed stylized facts.
For that, we use an evolutionary algorithm to minimize a cost function.
In our algorithm, an individual is a set a of parameters and its average associated values for stylized facts over several simulation runs.
In [10]:
class Individual:
"""The order class can represent both bid or ask type orders"""
def __init__(self, parameters, stylized_facts, cost):
self.parameters = parameters
self.stylized_facts = stylized_facts
self.cost = cost
def __lt__(self, other):
"""Allows comparison to other individuals based on its cost (negative fitness)"""
return self.cost < other.cost
In [11]:
# create initial population
population = []
for parameters in latin_hyper_cube:
# add an individual to the population
population.append(Individual(parameters, [], np.inf))
# create populations_over_time
populations_over_time = [population]
We measure the relative difference between the simulated and actual data using
$c(s)= \frac{spy(s) - a}{spy(s)}^2$.
Then, for each simulaten, we measure total costs as:
$t(w,v,x,y,z)= c(w) + c(v) + c(x) + c(y) + c(z) $ where, w represents autocorrelation, v fat tails, x is clustered volatility, y is long memory, and z is the correlation between price and volume.
In [12]:
def cost_function(observed_values, average_simulated_values):
"""cost function"""
score = 0
for obs, sim in zip(observed_values, average_simulated_values):
score += ((obs - sim) / obs)**2
return score
In [13]:
def average_fitness(population):
total_cost = 0
for individual in population:
total_cost += individual.cost
return total_cost / (float(len(population)))
In [16]:
#av_pop_fitness = []
In [15]:
def simulate_population(population, number_of_runs, simulation_time, number_of_agents):
"""
Simulate a population of parameter spaces for the stock market model
:param population: population of parameter spaces used to simulate model
:param number_of_runs: number of times the simulation should be run
:param simulation_time: amount of days which will be simulated for each run
:return: simulated population, average population fitness
"""
simulated_population = []
for idx, individual in tqdm(enumerate(population)):
parameters = individual.parameters
stylized_facts = [[],[],[],[],[]]
# identify parameters
share_chartists= parameters[0]
share_mean_reversion = parameters[1]
order_expiration_time = parameters[2]
agent_order_price_variability = parameters[3]
agent_order_variability = parameters[4]
agent_ma_short = parameters[5]
agent_ma_long = parameters[6]
agents_hold_thresholds = parameters[7]
agent_volume_risk_aversion = parameters[8]
agent_propensity_to_switch = parameters[9]
profit_announcement_working_days = parameters[10]
price_to_earnings_base = parameters[11]
price_to_earnings_heterogeneity = parameters[12]
price_to_earnings_gap = parameters[13]
longMA_heterogeneity = parameters[14]
shortMA_heterogeneity = parameters[15]
shortMA_memory_divider = parameters[16]
PE_low_low = price_to_earnings_base
PE_low_high = int(price_to_earnings_heterogeneity*price_to_earnings_base)
PE_high_low = PE_low_high + price_to_earnings_gap
PE_high_high = int(price_to_earnings_heterogeneity*PE_high_low)
# simulate the model
for seed in range(NRUNS):
agents, firms, stocks, order_books = baselinemodel.stockMarketSimulation(seed=seed,
simulation_time=SIMTIME,
init_backward_simulated_time=int(agent_ma_long*longMA_heterogeneity),
number_of_agents=number_of_agents,
share_chartists=share_chartists,
share_mean_reversion=share_mean_reversion,
amount_of_firms=1,
initial_total_money=(initial_total_money,int(initial_total_money*1.1)),
initial_profit=(init_profit, init_profit),
discount_rate=init_discount_rate,
init_price_to_earnings_window=((PE_low_low,
PE_low_high),
(PE_high_low,
PE_high_high)),
order_expiration_time=order_expiration_time,
agent_order_price_variability=(agent_order_price_variability,agent_order_price_variability),
agent_order_variability=agent_order_variability,
agent_ma_short=(agent_ma_short, int(agent_ma_short*shortMA_heterogeneity)),
agent_ma_long=(agent_ma_long, int(agent_ma_long*longMA_heterogeneity)),
agents_hold_thresholds=(1-agents_hold_thresholds, 1+agents_hold_thresholds),
agent_volume_risk_aversion=agent_volume_risk_aversion,
agent_propensity_to_switch=agent_propensity_to_switch,
firm_profit_mu=0.058,
firm_profit_delta=0.00396825396,
firm_profit_sigma=0.125,
profit_announcement_working_days=profit_announcement_working_days,
mean_reversion_memory_divider=4,
printProgress=False,
)
# store simulated stylized facts
sim_returns = calculate_returns(order_books[0].transaction_prices_history)
sim_volume = []
for day in order_books[0].transaction_volumes_history[1:]:
sim_volume.append(sum(day))
stylized_facts[0].append(autocorrelation_returns(sim_returns, 25))
stylized_facts[1].append(kurtosis(sim_returns))
stylized_facts[2].append(autocorrelation_abs_returns(sim_returns, 25))
stylized_facts[3].append(hurst(spy, lag1=2 , lag2=20))
stylized_facts[4].append(correlation_volume_volatility(sim_volume, sim_returns, window=10))
# create next generation individual
next_gen_individual = Individual(parameters, [], np.inf)
# add average stylized facts to individual
for s_fact in stylized_facts:
next_gen_individual.stylized_facts.append(mean(s_fact))
# add average fitness to individual
next_gen_individual.cost = cost_function(stylized_facts_spy, next_gen_individual.stylized_facts)
# set any non_volume simulation cost to infinity
if np.isnan(next_gen_individual.cost):
next_gen_individual.cost = np.inf
# insert into next generation population, lowest score to the left
bisect.insort_left(simulated_population, next_gen_individual)
average_population_fitness = average_fitness(simulated_population)
return simulated_population, average_population_fitness
In [15]:
# next_population = []
# for idx, individual in tqdm(enumerate(population)):
# parameters = individual.parameters
# stylized_facts = [[],[],[],[],[]]
# # name the parameters
# share_chartists= parameters[0]
# share_mean_reversion = parameters[1]
# order_expiration_time = parameters[2]
# agent_order_price_variability = parameters[3]
# agent_order_variability = parameters[4]
# agent_ma_short = parameters[5]
# agent_ma_long = parameters[6]
# agents_hold_thresholds = parameters[7]
# agent_volume_risk_aversion = parameters[8]
# agent_propensity_to_switch = parameters[9]
# profit_announcement_working_days = parameters[10]
# price_to_earnings_base = parameters[11]
# price_to_earnings_heterogeneity = parameters[12]
# price_to_earnings_gap = parameters[13]
# longMA_heterogeneity = parameters[14]
# shortMA_heterogeneity = parameters[15]
# shortMA_memory_divider = parameters[16]
# PE_low_low = price_to_earnings_base
# PE_low_high = int(price_to_earnings_heterogeneity*price_to_earnings_base)
# PE_high_low = PE_low_high + price_to_earnings_gap
# PE_high_high = int(price_to_earnings_heterogeneity*PE_high_low)
# # simulate the model
# for seed in range(NRUNS):
# agents, firms, stocks, order_books = baselinemodel.stockMarketSimulation(seed=seed,
# simulation_time=SIMTIME,
# init_backward_simulated_time=int(agent_ma_long*longMA_heterogeneity),
# number_of_agents=500,
# share_chartists=share_chartists,
# share_mean_reversion=share_mean_reversion,
# amount_of_firms=1,
# initial_total_money=(initial_total_money,int(initial_total_money*1.1)),
# initial_profit=(init_profit, init_profit),
# discount_rate=init_discount_rate,
# init_price_to_earnings_window=((PE_low_low,
# PE_low_high),
# (PE_high_low,
# PE_high_high)),
# order_expiration_time=order_expiration_time,
# agent_order_price_variability=(agent_order_price_variability,agent_order_price_variability),
# agent_order_variability=agent_order_variability,
# agent_ma_short=(agent_ma_short, int(agent_ma_short*shortMA_heterogeneity)),
# agent_ma_long=(agent_ma_long, int(agent_ma_long*longMA_heterogeneity)),
# agents_hold_thresholds=(1-agents_hold_thresholds, 1+agents_hold_thresholds),
# agent_volume_risk_aversion=agent_volume_risk_aversion,
# agent_propensity_to_switch=agent_propensity_to_switch,
# firm_profit_mu=0.058,
# firm_profit_delta=0.00396825396,
# firm_profit_sigma=0.125,
# profit_announcement_working_days=profit_announcement_working_days,
# mean_reversion_memory_divider=4,
# printProgress=False,
# )
# # store simulated stylized facts
# sim_returns = calculate_returns(order_books[0].transaction_prices_history)
# sim_volume = []
# for day in order_books[0].transaction_volumes_history[1:]:
# sim_volume.append(sum(day))
# stylized_facts[0].append(autocorrelation_returns(sim_returns, 25))
# stylized_facts[1].append(kurtosis(sim_returns))
# stylized_facts[2].append(autocorrelation_abs_returns(sim_returns, 25))
# stylized_facts[3].append(hurst(spy, lag1=2 , lag2=20))
# stylized_facts[4].append(correlation_volume_volatility(sim_volume, sim_returns, window=10))
# # create next generation individual
# next_gen_individual = Individual(parameters, [], np.inf)
# # add average stylized facts to individual
# for s_fact in stylized_facts:
# next_gen_individual.stylized_facts.append(mean(s_fact))
# # add average fitness to individual
# next_gen_individual.cost = cost_function(stylized_facts_spy, next_gen_individual.stylized_facts)
# # set any non_volume simulation cost to infinity
# if np.isnan(next_gen_individual.cost):
# next_gen_individual.cost = np.inf
# # insert into next generation population, lowest score to the left
# bisect.insort_left(next_population, next_gen_individual)
# # add this generation to the overview of generations and its fitness to the fitness over time tracker
# populations_over_time.append(next_population)
# av_pop_fitness.append(average_fitness(next_population))
In [28]:
def evolve_population(population, fittest_to_retain, random_to_retain, parents_to_mutate, parameters_to_mutate):
"""
Evolves a population. First, the fittest members of the population plus some random individuals become parents.
Then, some random mutations take place in the parents. Finally, the parents breed to create children.
:param population: population individuals sorted by cost (cheapest left) which contain parameter values
:param fittest_to_retain: percentage of fittest individuals which should be maintained as parents
:param random_to_retain: percentage of other random individuals which should be maintained as parents
:param individuals_to_mutate: percentage of parents in which mutations will take place
:param parameters_to_mutate: percentage of parameters in chosen individuals which will mutate
:return:
"""
# 1 retain parents
retain_lenght = int(len(population) * fittest_to_retain)
parents = population[:retain_lenght]
# 2 retain random individuals
amount_random_indiv = int(len(population) * random_to_retain)
parents.extend(random.sample(population[retain_lenght:], amount_random_indiv))
# 3 mutate random parameters of random individuals
amount_of_individuals_to_mutate = int(parents_to_mutate * len(parents))
amount_of_params_to_mutate = int(parameters_to_mutate * len(parents[0].parameters))
for parent in random.sample(parents, amount_of_params_to_mutate):
indexes_of_mutable_params = random.sample(range(len(parent.parameters)), amount_of_params_to_mutate)
for idx in indexes_of_mutable_params:
min_value, max_value = problem['bounds'][idx][0], problem['bounds'][idx][1]
if type(min_value) == float:
parent.parameters[idx] = random.uniform(min_value, max_value)
else:
parent.parameters[idx] = random.randint(min_value, max_value)
# 4 parents breed to create a new population
parents_lenght = len(parents)
desired_lenght = len(population) - parents_lenght
children = []
while len(children) < desired_lenght:
male = random.randint(0, parents_lenght - 1)
female = random.randint(0, parents_lenght - 1)
if male != female:
male = parents[male]
female = parents[female]
half = int(len(male.parameters) / 2)
child_parameters = male.parameters[:half] + female.parameters[half:]
child = Individual(child_parameters, [], np.inf)
children.append(child)
parents.extend(children)
# the parents list now contains a full new population with the parents and their offspring
return parents
for i in range(iterations): pop = evolve(pop, target) fitness_history.append(average_fitness(pop, target))
In [26]:
iterations = 2
av_pop_fitness = []
all_populations = [population]
In [29]:
for i in range(iterations):
simulated_population, fitness = simulate_population(all_populations[i], number_of_runs=3, simulation_time=10, number_of_agents=200)
av_pop_fitness.append(fitness)
all_populations.append(evolve_population(simulated_population, fittest_to_retain=0.2, random_to_retain=0.1,
parents_to_mutate=0.5, parameters_to_mutate=0.1))
In [ ]:
In [90]:
percentage_parameters_to_mutate = 0.1
percentage_individuals_to_mutate = 0.5
retain=0.3
random_select=0.1
In [17]:
retain_lenght = int(len(next_population) * retain)
parents = next_population[:retain_lenght]
In [26]:
# randomly add other individuals to promote genetic diversity
for individual in next_population[retain_lenght:]:
if random_select > random.random():
parents.append(individual)
In [101]:
amount_of_individuals_to_mutate = int(percentage_individuals_to_mutate * len(parents))
# determine if a mutation will take place in this parent
for parent in random.sample(parents, amount_of_params_to_mutate):
# determine how many parameters should be mutated
amount_of_params_to_mutate = int(percentage_parameter_to_mutate * len(parents[0].parameters))
# sample the indexes of the parameters to mutate
print('I will mutate ', amount_of_params_to_mutate, ' parameters from', parent)
indexes_of_mutable_params = random.sample(range(len(parents[0].parameters)), amount_of_params_to_mutate)
for idx in indexes_of_mutable_params:
# identify the range for this parameter to mutate
min_value, max_value = problem['bounds'][idx][0], problem['bounds'][idx][1]
print('I mutate ', problem['names'][idx], ' which has min, max ', min_value, max_value, 'and current val= ', parent.parameters[idx])
if type(min_value) == float:
parent.parameters[idx] = random.uniform(min_value, max_value)
else:
parent.parameters[idx] = random.randint(min_value, max_value)
print('new variable value is ', parent.parameters[idx])
In [124]:
# keep in mind if it is a float or an integer
# let parents breed to create children
parents_lenght = len(parents)
desired_lenght = len(next_population) - parents_lenght
children = []
while len(children) < desired_lenght:
male = random.randint(0, parents_lenght - 1)
female = random.randint(0, parents_lenght - 1)
print('parents are ', male, female)
if male != female:
male = parents[male]
female = parents[female]
half = int(len(male.parameters) / 2)
# here I should create a new child
child_parameters = male.parameters[:half] + female.parameters[half:]
print('male params are ', male.parameters)
print('female params are ', female.parameters)
print('child params are', child_parameters)
child = Individual(child_parameters, [], np.inf)
children.append(child)
parents.extend(children)
In [ ]:
In [87]:
mean([1,2,3])
Out[87]: