by: Bryan Dannowitz
Our proton beam shoots at our target table. The table shifts back and forth, allowing us to hit one of seven targets. For a certain range of data, we're not 100% sure what our target position was. We want to be certain before we analyze it.
We have many (90+) readouts of esoteric measures like trigger rates and radiation levels. Here, I aggregate these readouts, exclude non-useful features, and train a Random Forest Classifier to predict our target position.
In [16]:
import os # Check if files exist
import sys # Import my own modules
In [17]:
import MySQLdb as mdb # Raw data source is MySQL
import pandas as pd # Workhorse data management tool
import numpy as np # For matrices, arrays, matrix math, and nan's
from math import floor
In [18]:
%matplotlib inline
pd.set_option("max_rows", 10)
np.set_printoptions(precision=3)
In [19]:
import matplotlib.pyplot as plt # For plotting some distributions
import seaborn as sns # For easy, pretty plotting
sns.set_style("darkgrid")
sns.set_context("talk", font_scale=1.4)
In [23]:
sys.path.append('./modules')
from spill import get_bad_spills
In [24]:
server = 'e906-db3.fnal.gov' # Source MySQL server
schema = 'merged_roadset62_R004_V005' # Source MySQL schema
analysis_schema = 'user_dannowitz_test1' # A schema name for temporary storage
analysis_table = 'target_analysis' # A table name for that schema
In [25]:
# Aggregate data into our analysis schema and table.
# The table is "long" (entity-attribute-value) format: one row per
# (spillID, feature name) pair. It gets pivoted to wide format later.
# Table defined here:
create_query = """
CREATE TABLE IF NOT EXISTS %s.%s
(
spillID MEDIUMINT NOT NULL,
name VARCHAR(64),
value DOUBLE NOT NULL,
targetPos INT NOT NULL
)"""
In [26]:
# Here is one MySQL query that will grab all the data we want
# Most of this requires a bit of domain expertise to understand
# as it's specific to our experiment's data structure
# NOTE(review): the exclusion range 416709-424180 lies entirely inside
# 409000-430000 and is therefore redundant here; the beam_query below
# uses different sub-ranges -- confirm which spill ranges are intended.
scaler_query = """
INSERT INTO %s.%s
### Get data from our Scaler table, along with the target position.
### This contains features from our triggering systems (data taking rates)
SELECT s.spillID, scalerName AS `name`, value, targetPos
FROM Scaler
INNER JOIN Spill s # Source of targetPos
USING(spillID)
WHERE scalerName IS NOT NULL AND
s.spillID NOT BETWEEN 409000 AND 430000 AND
s.spillID NOT BETWEEN 416709 AND 424180 AND
s.spillID NOT BETWEEN 482574 AND 484924 AND
spillType='EOS'
"""
In [27]:
# NOTE(review): unlike scaler_query, this query has no spillType='EOS'
# filter and excludes a different set of spill ranges -- confirm the
# asymmetry is intentional.
beam_query = """
INSERT INTO %s.%s
### Get data from our Beam table, along with the target position
### This contains features from our proton beam and radiation monitors
SELECT s.spillID, name, value, targetPos
FROM Beam
INNER JOIN Spill s # Source of targetPos
USING(spillID)
WHERE name IS NOT NULL AND
LEFT(name,3)!='F:M' AND # Exclude features that are always NULL
name!='F:NM2SEM' AND #
name!='U:TODB25' AND #
name!='S:KTEVTC' AND #
s.spillID NOT BETWEEN 409000 AND 430000 AND
s.spillID NOT BETWEEN 416709 AND 423255 AND
s.spillID NOT BETWEEN 423921 AND 424180 AND
s.spillID NOT BETWEEN 482574 AND 484924
"""
In [28]:
# The query for reading out the aggregated information
fetch_query = """SELECT * FROM %s.%s"""
In [22]:
# Run the query and read the resultset into a DataFrame.
# Builds the analysis schema from scratch each run: drop-if-exists,
# create schema, create table, fill from Scaler and Beam, then fetch.
# NOTE(review): 'server_dict' is not defined anywhere in this notebook,
# so this cell raises a NameError unless it was defined in a
# since-removed cell -- confirm the port mapping (MySQL default: 3306).
try:
    db = mdb.connect(read_default_file='./.my.cnf',  # Keep my login credentials secure
                     read_default_group='guest',  # Read-only access to important data
                     host=server,
                     db=schema,
                     port=server_dict[server]['port'])
    cur = db.cursor()
    cur.execute("SHOW DATABASES LIKE '%s'" % analysis_schema)  # See if schema exists
    if cur.rowcount != 0:
        cur.execute("DROP DATABASE %s" % analysis_schema)  # Drop if it does
    cur.execute("CREATE DATABASE %s" % analysis_schema)  # Create analysis schema
    cur.execute(create_query % (analysis_schema, analysis_table))  # Create analysis table
    cur.execute(scaler_query % (analysis_schema, analysis_table))  # Fill table with scaler data
    cur.execute(beam_query % (analysis_schema, analysis_table))  # Fill table with beam data
    data_df = pd.read_sql(fetch_query %  # Read data into DataFrame
                          (analysis_schema, analysis_table), db)
    if db:
        db.close()
except mdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
In [28]:
# Write to file, and you can read it back instead of querying again
data_df.to_csv('insight_demo_roadset62_long.csv')
In [12]:
# Read the previously-written CSV back instead of querying again
# (the original comment said "Write to file"; this cell reads).
# NOTE(review): the query cell above wrote 'insight_demo_roadset62_long.csv';
# confirm 'insight_demo_roadset62.csv' is the intended long-format file,
# since the pivot below requires the 'name'/'value' columns.
data_df = pd.read_csv('insight_demo_roadset62.csv', index_col='Unnamed: 0')
In [13]:
data_df.head() # Peek at the data
Out[13]:
In [14]:
data_df.info() # ...and investigate data types.
In [15]:
# Cast as float
data_df[['value']] = data_df[['value']].astype(float); data_df.info()
In [16]:
# Fetch the bad-spill set once per session, then filter it out of the data.
try:
    # See if this has already been populated in this session
    bad_spill_set
except NameError:
    # Only "name not yet bound" is expected here; the original bare
    # except would also have swallowed real errors from get_bad_spills().
    # Get the set of bad (non-sense) spills for this data
    bad_spill_set = get_bad_spills(server, schema)
# Get rid of entries that correspond to bad spills
# NOTE(review): recent pandas requires '@bad_spill_set' to reference a
# local variable inside query() -- confirm against the pandas in use.
data_df = data_df.query('spillID not in bad_spill_set')
In [17]:
# How many features are we working with here?
value_names = data_df.name.unique()
print len(value_names), "unique features"
In [23]:
# Take a look at the first eight
# Inspect the first eight features: one violin plot per feature,
# split by target position, on a 4x2 grid of axes.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12, 20))
for index, value_name in enumerate(value_names[:8]):
    subset_df = data_df[(data_df.name == value_name)]
    # Integer floor division: math.floor() returns a float, which is
    # not a valid index into the numpy array of axes.
    axis = axes[index // 2, index % 2]
    sns.violinplot(subset_df.value,      # We want to inspect the feature
                   subset_df.targetPos,  # the distributions, and how they
                   color="Paired",       # differ for each target position.
                   bw=0.7,               # The side-by-side, normalized nature
                   ax=axis)              # of violin plots are ideal for this.
    axis.set_title(value_name)
    axis.set_xlabel('Target Position', fontsize=20)
    axis.set_ylabel('Value', fontsize=20)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
plt.show()
In [24]:
# We want to see our scalerNames as column indexes
pivoted_df = data_df.pivot('spillID', 'name', 'value'); pivoted_df.head()
Out[24]:
In [26]:
# Replace sentinel values with NaN's and then drop those rows
pivoted_df = pivoted_df.replace(-9999,np.nan).dropna(axis=0,how='any')
In [27]:
# We take a peek to see what the values in each look like overall
pivoted_df.describe()
Out[27]:
In [28]:
# It's sufficient to say that if the standard deviation is 0, then it's certainly not useful
# A feature whose standard deviation is 0 carries no information.
# (.loc replaces the deprecated/removed .ix indexer; identical for a
# label lookup on describe()'s 'std' row.)
zero_std_series = (pivoted_df.describe().loc['std'] == 0)
# Array of all feature names with zero standard deviation
# (index the boolean Series with itself instead of comparing == True)
zero_std_features = zero_std_series[zero_std_series].index.values; zero_std_features
Out[28]:
In [29]:
# Get rid of these features
_ = pivoted_df.drop(zero_std_features, axis=1, inplace=True)
In [30]:
# Let's prepare the labels, i.e. our target positions:
# one (spillID, targetPos) row per spill, ordered by spillID.
# NOTE(review): DataFrame.sort() was removed in pandas 0.20; use
# sort_values('spillID') when upgrading.
targpos_df = data_df[['spillID','targetPos']].drop_duplicates().sort('spillID')
targpos_df.head()
Out[30]:
In [31]:
full_df = pd.merge(pivoted_df, targpos_df, how='left', right_on='spillID', left_index=True)
In [32]:
full_df = full_df.set_index('spillID')
full_df.head()
Out[32]:
In [33]:
# Write it to file for use in the next part.
full_df.to_csv('insight_demo_roadset62.csv')
In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation # We'll want to cross-validate our RF
sns.set_context("poster")
In [6]:
# If the data is already written, and we're re-visiting,
# just load the prepared data here
full_df = pd.read_csv('insight_demo_roadset62.csv', index_col='spillID')
In [7]:
# Split the DataFrame up into 'data' and 'labels'
labels = full_df.values[:,-1]
In [8]:
# Rescale the training data to beam intensity
# Beam intensity is a big number (O(10^12)), so multiply by a big constant
# to bring it back up to normal feature ranges
engineered_df = pd.DataFrame( (full_df.drop('targetPos', axis=1).values / full_df[['S:G2SEM']].values) * 5000000000000.0,
columns=full_df.columns[:-1] )
# After dividing by itself, S:G2SEM is a constant column -- drop it
_ = engineered_df.drop('S:G2SEM', axis=1, inplace=True)
In [9]:
data = engineered_df.values
In [10]:
scale = StandardScaler().fit(data)
In [11]:
data_scaled = scale.transform(data)
In [12]:
d_train, d_test, l_train, l_test \
= cross_validation.train_test_split(data_scaled, labels, test_size=0.33, random_state=2)
In [13]:
# Random forest: 100 unpruned trees, sqrt(n_features) per split.
# NOTE(review): min_samples_split=1 is rejected by modern scikit-learn
# (must be >= 2); this ran on an older version -- update if upgrading.
rfc = RandomForestClassifier(n_estimators=100, max_depth=None, max_features='sqrt',
min_samples_split=1, random_state=2)
In [14]:
rfc.fit(d_train, l_train)
Out[14]:
In [15]:
result = rfc.predict(d_test)
print("RF prediction accuracy = {0:5.1f}%".format(100.0 * rfc.score(d_test, l_test)))
In [79]:
def confusion(labels, results, names):
    """Draw a confusion-matrix heatmap of true vs. predicted targets.

    Returns the raw 2D histogram counts so callers can compute
    per-target accuracies from the same binning.
    """
    plt.figure(figsize=(10, 10))
    # Bin the truth/prediction pairs into an NxN count matrix
    pts, _, _ = np.histogram2d(labels.astype(int), results.astype(int),
                               bins=len(names))
    # Flip vertically so the matrix reads top-down in the heatmap
    count_df = pd.DataFrame(np.flipud(pts.astype(int)),
                            index=np.flipud(names), columns=names)
    # Annotated heatmap, integer counts, no color bar
    sns.heatmap(count_df, annot=True, fmt="d", cbar=False)
    # Rotate both sets of tick labels 45 degrees for readability
    _, xtick_labels = plt.xticks()
    _, ytick_labels = plt.yticks()
    plt.setp(ytick_labels, rotation=45)
    plt.setp(xtick_labels, rotation=45)
    plt.xlabel("Actual", size=22)
    plt.ylabel("Prediction", size=22)
    return pts
def per_target_accuracy(hist2d_pts, names):
for i in range(len(names)):
rowsum = np.sum(hist2d_pts.T[i])
if rowsum>0:
print names[i] + ": \t" + str(round((hist2d_pts[i][i] / np.sum(hist2d_pts.T[i]))*100,2)) + "%"
else:
print names[i] + ": \tN/A"
In [80]:
# Define names for the target positions
names = ['Hydrogen','Empty','Deuterium','None','Carbon','Iron','Tungsten']
pts = confusion(l_test, result, names)
per_target_accuracy(pts, names)
In [81]:
def relabel(label_array):
    """Merge target position 4 into category 2, then close the gap.

    Positions 5, 6, 7 each shift down by one so the label values stay
    contiguous. The input array is left untouched; a relabeled copy is
    returned.
    """
    remap = {4: 2, 5: 4, 6: 5, 7: 6}
    relabeled = label_array.copy()
    # Mask against the ORIGINAL array so earlier remaps can never be
    # picked up by a later rule.
    for old_label, new_label in remap.items():
        relabeled[label_array == old_label] = new_label
    return relabeled
In [82]:
# Call the new re-labelling function
# and modify the names array
labels_revised = relabel(labels)
names = ['Hydrogen','Empty/None','Deuterium','Carbon','Iron','Tungsten']
In [83]:
d_train, d_test, l_train, l_test \
= cross_validation.train_test_split(data_scaled, labels_revised, test_size=0.33, random_state=5)
rfc = RandomForestClassifier(n_estimators=100, max_depth=None, max_features='sqrt',
min_samples_split=1, random_state=2).fit(d_train, l_train)
result = rfc.predict(d_test)
print("RF prediction accuracy = {0:5.1f}%\n".format(100.0 * rfc.score(d_test, l_test)))
In [84]:
pts = confusion(l_test, result, names) # Show confusion matrix
per_target_accuracy(pts, names) # Print per-target accuracy
In [87]:
features = full_df.drop(['S:G2SEM','targetPos'], axis=1).columns.values
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
useful_feature_list = []
for f in range(20):
print("%d. Feature '%s' (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))
useful_feature_list.append(features[indices[f]])
In [89]:
reduced_df = pd.DataFrame( (full_df[useful_feature_list].values / full_df[['S:G2SEM']].values) * 5000000000000.0,
columns=useful_feature_list )
data_reduced = reduced_df.values
scale_reduced = StandardScaler().fit(data_reduced)
data_reduced_scaled = scale_reduced.transform(data_reduced)
In [90]:
# We've reduced the number of features and normalized them all to beam
# intensity. Let's see how that's affected our predictor.
d_train, d_test, l_train, l_test \
    = cross_validation.train_test_split(data_reduced_scaled, labels_revised, test_size=0.33, random_state=6)
rfc = RandomForestClassifier(n_estimators=100, max_depth=None, max_features='sqrt',
                             min_samples_split=1, random_state=2).fit(d_train, l_train)
result = rfc.predict(d_test)
# Bug fix: score with the classifier fitted above ('rfc');
# 'rfc_reduced' was never defined and raised a NameError here.
print("RF prediction accuracy = {0:5.1f}%".format(100.0 * rfc.score(d_test, l_test)))
pts = confusion(l_test, result, names)   # Show confusion matrix
per_target_accuracy(pts, names)          # Print per-target accuracy
In [91]:
kf_total = cross_validation.StratifiedKFold(labels_revised, n_folds=10, shuffle=True, random_state=4)
rfc_scores = cross_validation.cross_val_score(rfc, data_reduced_scaled, labels_revised, cv=kf_total)
In [92]:
print "RFC Scores: ", rfc_scores
print "Accuracy: %0.2f (+/- %0.2f)" % (rfc_scores.mean(), rfc_scores.std() * 2)
In [ ]:
from sklearn import neighbors
from sklearn import svm
from sklearn import preprocessing
In [ ]:
# Fit a zoo of alternative classifiers on the same training split
# so their accuracies can be compared against the random forest.
C = 1.0           # SVM regularization parameter
k_neighbors = 15  # Neighborhood size for the KNN classifier
svc = svm.SVC(kernel='linear', C=C).fit(d_train, l_train)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(d_train, l_train)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(d_train, l_train)
lin_svc = svm.LinearSVC(C=C).fit(d_train, l_train)
# Bug fix: the original referenced undefined names 'n_neighbors' and
# 'weights' (NameError). Use the k_neighbors constant above and the
# scikit-learn default 'uniform' weighting.
# NOTE(review): confirm 'uniform' (vs. 'distance') matches the intent.
knclf = neighbors.KNeighborsClassifier(k_neighbors, weights='uniform').fit(d_train, l_train)
rfc = RandomForestClassifier(n_estimators=100, max_depth=None, max_features='sqrt',
                             min_samples_split=1, random_state=2).fit(d_train, l_train)
In [ ]:
svc_result = svc.predict(d_test)
rbf_svc_result = rbf_svc.predict(d_test)
poly_svc_result = poly_svc.predict(d_test)
lin_svc_result = lin_svc.predict(d_test)
knclf_result = knclf.predict(d_test)
rfc_result = rfc.predict(d_test)
In [ ]:
print("SVC prediction accuracy = {0:5.1f}%".format(100.0 * svc.score(d_test, l_test)))
print("RBF SVC prediction accuracy = {0:5.1f}%".format(100.0 * rbf_svc.score(d_test, l_test)))
print("Polynomial SVC prediction accuracy = {0:5.1f}%".format(100.0 * poly_svc.score(d_test, l_test)))
print("Linear SVC prediction accuracy = {0:5.1f}%".format(100.0 * lin_svc.score(d_test, l_test)))
print("K-Neighbor prediction accuracy = {0:5.1f}%".format(100.0 * knclf.score(d_test, l_test)))
print("RFC prediction accuracy = {0:5.1f}%".format(100.0 * rfc.score(d_test, l_test)))
In [337]:
from sklearn.externals import joblib
In [31]:
# Train the final model using our full (reduced, scaled) data set.
# NOTE(review): max_features=None differs from the 'sqrt' used in every
# earlier fit -- confirm this change is intentional.
rfc_final = RandomForestClassifier(n_estimators=100, max_depth=None, max_features=None,
min_samples_split=1, random_state=2).fit(data_reduced_scaled, labels_revised)
In [39]:
data_reduced[:5]
Out[39]:
In [13]:
test_df = pd.read_csv('testrun.csv', index_col='spillID')
In [16]:
test_labels = test_df.values[:,-1]
test_labels = relabel(test_labels)
Out[16]:
In [30]:
test_data = test_df[useful_feature_list]
In [32]:
# Predict on the held-out run and report accuracy.
# NOTE(review): rfc_final was trained on data normalized by S:G2SEM and
# scaled by scale_reduced (StandardScaler), but test_data is fed in raw.
# Unless testrun.csv was pre-processed identically, apply the same
# transforms before predicting -- confirm the provenance of testrun.csv.
result = rfc_final.predict(test_data)
print("RF prediction accuracy = {0:5.1f}%".format(100.0 * rfc_final.score(test_data, test_labels)))
pts = confusion(test_labels, result, names)
per_target_accuracy(pts, names)
In [35]:
from sklearn.metrics import confusion_matrix
In [38]:
print test_labels
print result
In [36]:
confusion_matrix(test_labels, result)
Out[36]:
In [356]:
# Write our pickled classifier to file
rfc_pickle_name = "models/target_rfc_roadset62.pkl"
# Remove any stale model first so joblib writes a fresh set of files
if os.path.exists(rfc_pickle_name):
    os.remove(rfc_pickle_name)
joblib.dump(rfc_final, rfc_pickle_name)
Out[356]:
In [344]:
data_reduced.shape
reduced_df.columns
Out[344]:
In [345]:
useful_feature_list
Out[345]:
In [ ]: