In [2]:
#import libraries
from __future__ import division
import pandas as pd
import os
import sqlite3
import math
import seaborn as sns
import matplotlib.pyplot as plt
import random
In [3]:
#read data into the dataframe
%matplotlib inline
random.seed(666)
data_path = os.path.join('data', 'square.db')
con = sqlite3.connect(data_path)
df = pd.read_sql("SELECT * FROM DQMC", con)
con.close()
In [6]:
#Now I need to filter the data to remove nonphysical quantities
#Structure factors are always positive.
df = df[df["xx_af_structure_factor_value"] >= 0]
df = df[df["zz_af_structure_factor_value"] >= 0]
#I will consider the data with sign < 0.1 as not reliable
df = df[df["avg_sign_value"] >= 0.1]
#Shape of the lattice is related to the measurables because of the finite size effects,
#so I will deal only with shapes: 4x4, 6x6, 8x8, 10x10, 12x12, 16x16, 20x20
df = df[(df["number_of_sites"] == 16) |
(df["number_of_sites"] == 36) |
(df["number_of_sites"] == 64) |
(df["number_of_sites"] == 100)|
(df["number_of_sites"] == 144)|
(df["number_of_sites"] == 256)|
(df["number_of_sites"] == 400)]
#u > 0 and u < 0 show different phsyics, so for now I will keep only u >= 0 cases (repulsive)
df = df[df["u"] >=0]
#Later on I will make plots with respect to the electron density, so I will sort dataframe with respect to the
#electron density. It will make easier later
df = df.sort("density_value")
In [7]:
grid = sns.FacetGrid(df[((df["beta"]== 8) & (df["number_of_sites"] == 100))], col = "u", hue="u", col_wrap=3)
grid.map(plt.plot, "density_value", "xx_af_structure_factor_value", marker="o")
grid.fig.tight_layout(w_pad=1)
As we can see on the plot above, At small u plots are smooth, at large $u$ and small electron density there are missing values due to the sign problem. At $u = 5$ we have values only for the half filled case $(\rho = 1)$ in which sign is protected by the symmetry of the lattice.
In [8]:
grid = sns.FacetGrid(df[((df["u"]== 4) & (df["number_of_sites"] == 100))], col = "beta", hue="beta", col_wrap=5)
grid.map(plt.plot, "density_value", "xx_af_structure_factor_value", marker="o")
grid.fig.tight_layout(w_pad=1)
As we can see from the plot above at small values of $\beta$ we get nice smooth plots, but when we increase $\beta$ (lower temperature) sign problem will appear, so at $\beta = 7$ we will have missing values close to the half-filled case $(\rho = 1)$. And when $\beta$ increases, only values at half filling will be presented.
In [119]:
grid = sns.FacetGrid(df[((df["u"]== 6) & (df["beta"] == 4))], col = "number_of_sites", hue="number_of_sites", col_wrap=3)
grid.map(plt.plot, "density_value", "xx_af_structure_factor_value", marker="o")
grid.fig.tight_layout(w_pad=1)
As we increase lattice size sign problem comes back into play and we will have missing values, except for the cases where average sign is protected by the particle-hole symmetry.
Sign problem arises when $u, \beta, N$ increases, and on the plots above it was clear in what regiemes it becomes important. From physical point of view we are interested in the ground state properties $(\beta \to \infty)$ and thermodynamic limit $(N \to \infty)$, thus we need try to predict, how xx antiferromagnetic structure factor behaves there.
Data that we are using for this project was obtained from the DQMC simulations. Every simulation has number of input parameters, but important are:
Output parameters - number of them, but here I will keep only xx antyferromagnetic structure factor
In [11]:
df_filtered = df[["mu_up", "number_of_sites", "u", "beta", "xx_af_structure_factor_value"]]
df_filtered = df_filtered.rename(columns={'mu_up': 'mu',
'number_of_sites': 'N',
'xx_af_structure_factor_value': 'xx_af_structure_factor'})
In [12]:
print df_filtered.info()
We have 4 predictors and 11927 values for each.
In [13]:
#Import machine learning libraries
from sklearn import cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
In [14]:
target = df_filtered['xx_af_structure_factor'].values
train = df_filtered.drop('xx_af_structure_factor', axis=1).values
#Split out training set into training and validation set
x_train, x_test, y_train, y_test = cross_validation.train_test_split(train, target, test_size=0.4, random_state=42)
In [15]:
#Now we estimate how well does our prediction algorithm works on a validation set
forest = RandomForestRegressor(n_jobs=3, n_estimators=1000)
fit = forest.fit(x_train, y_train)
print fit.score(x_test, y_test)
Random Forest regressor gives 38% accuracy.
In [139]:
ada = AdaBoostRegressor()
fit = ada.fit(x_train, y_train)
print fit.score(x_test, y_test)
Ada boost regressor gives 29% accuracy.
In [144]:
extra_tree = ExtraTreesRegressor(n_jobs=3, n_estimators=1000)
fit = extra_tree.fit(x_train, y_train)
print fit.score(x_test, y_test)
Extra trees regressor gives 38% accuracy.
In [146]:
gb = GradientBoostingRegressor(n_estimators=1000)
fit = gb.fit(x_train, y_train)
print fit.score(x_test, y_test)
Gradient Boosting regressor gives 41% accuracy.
In [148]:
clf = svm.SVC()
fit = gb.fit(x_train, y_train)
print fit.score(x_test, y_test)
Support vector machines gives you 41% accuracy.
41% is not ok. I need either to clean and normalize this data, or obtain more data... Goal is 80%+ accuracy.
In [ ]: