In [1]:
from __future__ import division
from IPython.display import display
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random, sys, os, re
In [2]:
# Read the official submission template and keep its row IDs, in file order.
with open('../submissions/Submission_Format.csv', 'r') as template:
    template_rows = [row.split(',') for row in template.read().splitlines()]
# Each row is expected to hold exactly two comma-separated fields (ID, probability);
# rows whose ID field is empty (such as the header row) are skipped.
id_list = [row_id for row_id, row_prob in template_rows if row_id != '']
In [3]:
def get_filepaths(directory):
    """
    Return the full path of every file found under `directory`.

    The tree rooted at `directory` (including `directory` itself) is walked
    with os.walk, and each file name is joined onto the directory that
    contains it.

    directory -- root of the directory tree to search.

    Returns a list of path strings, in the order os.walk reports them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
    ]
In [4]:
# Gather the path of every file under the submissions directory
# (get_filepaths walks the whole tree, so sub-directories are included).
file_list = get_filepaths('../submissions')
file_list
Out[4]:
In [5]:
# Keep only individual model submissions: drop the submission template and any
# ensemble files so they are not averaged back in.
#
# The previous version called file_list.remove() while iterating over file_list.
# Removing an element shifts the remaining ones left, so the iterator skips the
# element that follows every removal -- that is why one pass "didn't work" and
# the loop had to be repeated. It would also raise ValueError for a name that
# matched two of the patterns. A single filtering pass has neither problem.
# The slice assignment keeps the update in place, as before.
EXCLUDED_MARKERS = ('Format', 'Ensemble', 'ensemble')
file_list[:] = [
    file_name for file_name in file_list
    if not any(marker in file_name for marker in EXCLUDED_MARKERS)
]
# Sort on characters 26..31 of each path, i.e. the six characters that follow a
# 26-character prefix such as '../submissions/submission_'.
# NOTE(review): presumably that is a score embedded in the file name -- confirm.
file_list.sort(key=lambda x: x[26:32])
from copy import copy
# Independent snapshot of the filtered, sorted list; later cells keep using it
# after file_list itself has been narrowed further.
file_list_all = copy(file_list)
file_list
Out[5]:
In [6]:
from collections import defaultdict
# aggregates: ID -> list of probability strings, one per submission file read.
# averages:   ID -> mean of those probabilities.
# NOTE(review): averages is a defaultdict(list), so looking up an ID that was
# never seen silently yields [] instead of failing -- confirm that is wanted.
aggregates = defaultdict(list)
averages = defaultdict(list)
# 1. collect the probabilities for each ID from all the submission files
# ======================================================================
for file_name in file_list:
    with open(file_name, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on the unpack below
        ID, prob = line.split(',')
        if ID == '':
            continue  # rows with an empty ID (the header row) carry no prediction
        aggregates[ID].append(prob)
# 2. find the average of all the probabilities for each ID
# ========================================================
# A list is built explicitly: on Python 3 map() returns an iterator, which
# np.mean cannot average.
averages.update((ID, np.mean([float(prob) for prob in probs]))
                for ID, probs in aggregates.items())
aggregates['1'], averages['1']
Out[6]:
In [7]:
# Both mappings are keyed by ID, so the two sizes are expected to match.
len(aggregates),len(averages)
Out[7]:
In [8]:
# f = open("../submissions/submission_EnsembleOfAveragesALL.csv", "w")
# f.write(",Made Donation in March 2007\n")
# for ID in id_list:
# f.write("{},{}\n".format(ID, averages[ID]))
# f.close()
In [9]:
# Display the current contents of file_list.
file_list
Out[9]:
In [11]:
# Narrow file_list down to the 'bagged_nolearn' submissions, still excluding the
# submission template ('Format') and ensemble outputs ('Ensemble').
#
# The previous version repeated file_list.remove() inside nested range() loops
# because removing from a list while iterating over it skips the element after
# each removal; it could also raise ValueError when one path hit more than one
# of the remove() calls. A single filtering pass gives the intended result.
# The slice assignment keeps the update in place, as before.
#
# NOTE(review): earlier notes in this cell said that scores of 0.4... or 0.3...
# are good and that files with SEED... are good-scoring models re-run with
# different random seeds, but the only selection the code makes is on
# 'bagged_nolearn' -- confirm that this is the intended subset.
file_list[:] = [
    file_name for file_name in file_list
    if 'bagged_nolearn' in file_name
    and 'Format' not in file_name
    and 'Ensemble' not in file_name
]
file_list
Out[11]:
In [12]:
from collections import defaultdict
# aggregates: ID -> list of probability strings, one per submission file read.
# averages:   ID -> mean of those probabilities.
# NOTE(review): averages is a defaultdict(list), so looking up an ID that was
# never seen silently yields [] instead of failing -- confirm that is wanted.
aggregates = defaultdict(list)
averages = defaultdict(list)
# 1. collect the probabilities for each ID from the files currently in file_list
# ==============================================================================
for file_name in file_list:
    with open(file_name, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on the unpack below
        ID, prob = line.split(',')
        if ID == '':
            continue  # rows with an empty ID (the header row) carry no prediction
        aggregates[ID].append(prob)
# 2. find the average of all the probabilities for each ID
# ========================================================
# A list is built explicitly: on Python 3 map() returns an iterator, which
# np.mean cannot average.
averages.update((ID, np.mean([float(prob) for prob in probs]))
                for ID, probs in aggregates.items())
aggregates['1'], averages['1']
Out[12]:
In [13]:
# Both mappings are keyed by ID, so the two sizes are expected to match.
len(aggregates),len(averages)
Out[13]:
In [14]:
# Write the averaged probabilities as a submission file, one row per ID in the
# order given by id_list. The `with` block closes the file even if a write
# fails part-way through (the explicit open()/close() pair did not).
# NOTE(review): averages is a defaultdict(list); an ID with no collected
# predictions would be written as "[]" -- confirm every ID is covered.
with open("../submissions/submission_EnsembleOfAveragesBEST_SEED.csv", "w") as f:
    f.write(",Made Donation in March 2007\n")
    for ID in id_list:
        f.write("{},{}\n".format(ID, averages[ID]))
In [15]:
from os.path import split
# Start the table from the first submission. Its column label is the file name
# with the first 11 characters and the last 4 characters cut off.
first_path = file_list_all[0]
first_label = split(first_path)[1][11:-4]
corr_table = pd.read_csv(
    first_path,
    names=['id', first_label],
    header=0,
    index_col=0,
)
corr_table.head()
Out[15]:
In [16]:
# Append one column per remaining submission; each column is labelled with its
# file name minus the first 11 and last 4 characters.
for file_path in file_list_all[1:]:
    model_label = split(file_path)[1][11:-4]
    single_model = pd.read_csv(file_path, names=['id', model_label], header=0, index_col=0)
    column = single_model.columns[0]
    corr_table[column] = single_model[[column]]
corr_table.head()
Out[16]:
In [17]:
import seaborn as sns
# Pairwise correlation between the columns of corr_table.
corr_matrix = corr_table.corr()
# Boolean mask that hides the upper triangle (diagonal included), so each pair
# is drawn once. The builtin `bool` is used because the `np.bool` alias was
# deprecated in NumPy 1.20 and later removed.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=.9,
            square=True, xticklabels=4, yticklabels=3,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.show()
In [18]:
# Pairs of columns whose correlation is below this value are reported.
corr_threshold = 0.20
# Integer (row, column) positions of every cell of corr_matrix below the threshold.
row_positions, col_positions = np.where((corr_matrix < corr_threshold).values)
# One (row label, column label, correlation) tuple per pair. `x < y` keeps only
# the upper triangle, which drops the diagonal and the mirrored duplicate of
# each pair. The positions come from np.where, so the lookup is positional:
# .iloc replaces .ix, which has been removed from pandas.
indices = [(corr_matrix.index[x], corr_matrix.columns[y], corr_matrix.iloc[x, y])
           for x, y in zip(row_positions, col_positions)
           if x < y]
from operator import itemgetter
# Least correlated pairs first.
indices.sort(key=itemgetter(2))
len(indices), indices
Out[18]:
In [19]:
# Paths of every submission that appears in at least one of the pairs in
# `indices`, rebuilt from the two labels of each pair.
first_members = {'../submissions/submission_' + a + '.csv' for a, b, c in indices}
second_members = {'../submissions/submission_' + b + '.csv' for a, b, c in indices}
least_corr = first_members | second_members
len(least_corr), least_corr
Out[19]:
In [20]:
from collections import defaultdict
aggregates = defaultdict(list)
averages = defaultdict(list)
# 1. collect the probabilities for each ID from all the submission files
# ======================================================================
for file_name in least_corr:
with open(file_name, 'r') as f:
lines = f.read().splitlines()
for line in lines:
ID,prob = line.split(',')
if ID == '': continue
aggregates[ID].append(prob)
# 2. find the average of all the probabilities for each ID
# ========================================================
averages.update((ID, np.mean(map(float, probs))) for ID, probs in aggregates.items())
aggregates['1'],averages['1']
Out[20]:
In [21]:
# f = open("../submissions/submission_EnsembleOfAveragesLeastCorr.csv", "w")
# f.write(",Made Donation in March 2007\n")
# for ID in id_list:
# f.write("{},{}\n".format(ID, averages[ID]))
# f.close()
In [ ]:
In [ ]:
In [ ]:
In [ ]: