In [ ]:
# disable autosave functionality;
# this script is taxing enough, and autosaving tends to push it over the edge
# also, autosaving seems to zero out the file before restoring it from the backup,
# so an autosave that causes a crash will actually delete the file rather than save it!
%autosave 0
In [ ]:
#basic imports
%pylab inline
import time
import pickle
import logging as L
L.basicConfig(level=L.ERROR) # switch to L.INFO for more detail
import numpy
import matplotlib.pyplot as plt
import scipy.stats
import os
import pylab
import sklearn
import scipy
import sklearn.linear_model
import re
pylab.rcParams['figure.figsize'] = 10,10 # change the default image size for this session
pylab.ion()
In [ ]:
import mne
from mne.connectivity import spectral_connectivity, seed_target_indices
from mne.viz import plot_topo_tfr
In [ ]:
#custom imports
%cd ../scripts
# brian's prototype routines
from protoMEEGutils import *
import protoSpectralWinFFTMapper as specfft
In [ ]:
# OUTLINE, 19th Nov 2014
#
# script for initial "signs of life" analysis of a single MEG file
#
# load in a MEG file in EEGlab format
# load in the word properties
# choose a "languagey" channel (left-temporal, clean, with expected ERF patterns)
# plot some ERFs (e.g. all nouns vs all preps) as a sanity check
#### SUBROUTINES ####
# plot the time-scale for ERFs and other epoch figures
def commonPlotProps():
    #zeroSample = (abs(epochStart)/float(epochLength)*epochNumTimepoints)
    #pylab.plot((0,epochNumTimepoints),(0,0),'k--')
    #pylab.ylim((-2.5e-13,2.5e-13)) #((-5e-14,5e-14)) # better way just to get the ones it chooses itself?
    #pylab.plot((zeroSample,zeroSample),(0,0.01),'k--')
    pylab.xticks(numpy.linspace(0,epochNumTimepoints,7), epochStart+(numpy.linspace(0,epochNumTimepoints,7)/samplingRate))
    pylab.xlabel('time (s) relative to auditory onset') #+refEvent)
    pylab.xlim((62,313))
    # draw the reference lines before show() so they end up on the displayed figure
    pylab.axhline(0, color='k', linestyle='--')
    pylab.axvline(125, color='k', linestyle='--')
    pylab.show()
# adjust R2 down for the artificial inflation you get by increasing the number of explanatory features
def adjustR2(R2, numFeatures, numSamples):
    #1/0
    #return R2
    return R2 - (1-R2)*(float(numFeatures)/(numSamples-numFeatures-1))
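# a quick worked check of the adjustment above, using made-up illustrative values (not from the data):
# R2=0.20 with 5 features and 100 samples shrinks to 0.20 - 0.80*(5/94) ~= 0.157
print(adjustR2(0.20, 5, 100))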
# normalise (z-scale) the scale of variables (for the explanatory ones, so the magnitude of beta values are comparably interpretable)
def mynormalise(A):
    A = scipy.stats.zscore(A)
    A[numpy.isnan(A)] = 0
    return A
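# tiny illustration with made-up values (a constant column z-scores to NaN, which mynormalise then maps to 0):
print(mynormalise(numpy.array([[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]])))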
In [ ]:
#change to the data directory to load in the data
%cd ../MEG_data
# choose a file - I found participant V to be pretty good, and 0.01 to 50Hz filter is pretty conservative #*#
(megFileTag1, megFile1) = ('V_TSSS_0.01-50Hz_@125', 'v_hod_allRuns_tsss_audiobookPrepro_stPad1_lp50_resamp125_frac10ICAed.set')#_hp0.010000.set')
(megFileTag2, megFile2) = ('A_TSSS_0.01-50Hz_@125', 'aud_hofd_a_allRuns_tsss_audiobookPrepro_stPad1_lp50_resamp125_frac10ICAed_hp0.010000.set')
(megFileTag3, megFile3) = ('C_TSSS_0.01-50Hz_@125', 'aud_hofd_c_allRuns_tsss_audiobookPrepro_stPad1_lp50_resamp125_frac10ICAed_hp0.010000.set')
## put your own properties in here, as a .tab file of similar format (tab delimited, with field names in a first comment line - should be easy to do in excel...)
#to get the V5.tab:
# python ../scripts/buildtab.py hod_JoeTimes_LoadsaFeaturesV3.tab hod.wsj02to21-comparativized-gcg15-1671-4sm.fullberk.parsed.gcgbadwords > hod_JoeTimes_LoadsaFeaturesV4.tab
# python ../scripts/addsentid.py hod_JoeTimes_LoadsaFeaturesV4.tab > hod_JoeTimes_LoadsaFeaturesV5.tab
tokenPropsFile = 'hod_JoeTimes_LoadsaFeaturesV5.tab' # best yet! have automatic Stanford tagging, several word length and freq measures, and also the 2 and 3 token back-grams
# WHICH CHANNELS TO LOOK AT AS ERFS
# decide which channels to use - channels of interest are the first few you can look at in an ERF, and then from them you can choose one at a time with "channelToAnalyse" for the actual regression analysis #*#
channelLabels = ['MEG1533', 'MEG0323', 'MEG0343']
#channelLabels = ['MEG0111', 'MEG0121', 'MEG0131', 'MEG0211', 'MEG0212', 'MEG0213', 'MEG0341']
#?# this way of doing things was a slightly clumsy work-around, cos I didn't have enough memory to epoch all 306 channels at one time
# LOAD WORD PROPS
# change dtype to suit the files in your .tab file #*#
tokenProps = scipy.genfromtxt(tokenPropsFile,
delimiter='\t',names=True,
dtype="i4,f4,f4,S50,S50,i2,i2,i2,S10,f4,f4,f4,f4,f4,f4,f4,f4,f4,f4,f4,i1,>i4")
# ... and temporarily save as a pickle archive to satisfy the way I programmed the convenience function loadBookMEGWithAudio (it expects to find the same info in a pickle file, and so doesn't need to know about the number and type of fields)
tokenPropsPickle = tokenPropsFile+'.cpk'
pickle.dump(tokenProps, open(tokenPropsPickle, 'wb'))
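# optional sanity check (a minimal sketch): reload the archive just written and confirm the fields and row count survived the round trip
tokenPropsCheck = pickle.load(open(tokenPropsPickle, 'rb'))
print(tokenPropsCheck.dtype.names[:5])
print(tokenPropsCheck.shape)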
In [ ]:
triggersOfInterest=['s%d' % i for i in range(1,10)]
refEvent = 'onTime' #,'offTime']
# guess an epoch of -0.5 to +1s should be enough #*#
epochStart = -1; # stimulus ref event
epochEnd = +2; #
epochLength = epochEnd-epochStart;
baseline = False #[-1,0]
In [ ]:
# Get the goods on subject 1
(contSignalData1, metaData1, trackTrials, tokenProps, audioSignal, samplingRate, numChannels) = loadBookMEGWithAudio(megFile1, tokenPropsPickle, triggersOfInterest, epochEnd, epochStart, icaComps=False)
channelsOfInterest = [i for i in range(len(metaData1.chanlocs)) if metaData1.chanlocs[i].labels in channelLabels]
severalMagChannels1 = contSignalData1[channelsOfInterest,:]
(wordTrials1, epochedSignalData1, epochSliceTimepoints, wordTimesAbsolute, numTrials, epochNumTimepoints) = wordTrialEpochify(severalMagChannels1, samplingRate, tokenProps, trackTrials, refEvent, epochEnd, epochStart)
In [ ]:
del contSignalData1
del severalMagChannels1
In [ ]:
# Get the goods on subject 2
(contSignalData2, metaData2, trackTrials, tokenProps, audioSignal, samplingRate, numChannels) = loadBookMEGWithAudio(megFile2, tokenPropsPickle, triggersOfInterest, epochEnd, epochStart, icaComps=False)
severalMagChannels2 = contSignalData2[channelsOfInterest,:]
(wordTrials2, epochedSignalData2, epochSliceTimepoints, wordTimesAbsolute, numTrials, epochNumTimepoints) = wordTrialEpochify(severalMagChannels2, samplingRate, tokenProps, trackTrials, refEvent, epochEnd, epochStart)
In [ ]:
del contSignalData2
del severalMagChannels2
In [ ]:
# Get the goods on subject 3
(contSignalData3, metaData3, trackTrials, tokenProps, audioSignal, samplingRate, numChannels) = loadBookMEGWithAudio(megFile3, tokenPropsPickle, triggersOfInterest, epochEnd, epochStart, icaComps=False)
severalMagChannels3 = contSignalData3[channelsOfInterest,:]
(wordTrials3, epochedSignalData3, epochSliceTimepoints, wordTimesAbsolute, numTrials, epochNumTimepoints) = wordTrialEpochify(severalMagChannels3, samplingRate, tokenProps, trackTrials, refEvent, epochEnd, epochStart)
In [ ]:
del contSignalData3
del severalMagChannels3
In [ ]:
epochedSignalData = numpy.concatenate((epochedSignalData1,epochedSignalData2,epochedSignalData3), axis=0)
print(epochedSignalData.shape)
tokenProps = numpy.concatenate((tokenProps,tokenProps,tokenProps),axis=0)
print(tokenProps.shape)
In [ ]:
#pylab.figure()
#pylab.subplot(2,1,1)
#pylab.plot(audioSignal)
#pylab.title('audio signal')
#pylab.subplot(2,1,2)
#pylab.plot(contSignalData[0,:])
#pylab.title('first MEG signal')
#pylab.figure()
#pylab.title('ERF over all tokens, selected channels')
#pylab.plot( numpy.mean(epochedSignalData,axis=0).T)
#pylab.legend(channelLabels, loc=4)
#commonPlotProps()
In [ ]:
# REDUCE TRIALS TO JUST THOSE THAT CONTAIN A REAL WORD (NOT PUNCTUATION, SPACES, ...)
wordTrialsBool = numpy.array([p != '' for p in tokenProps['stanfPOS']])
print(wordTrialsBool[:10])
# REDUCE TRIALS TO JUST THOSE THAT HAVE A DECENT DEPTH ESTIMATE
parsedTrialsBool = numpy.array([d != -1 for d in tokenProps['syndepth']])
print(parsedTrialsBool[:10])
d1TrialsBool = numpy.array([d == 1 for d in tokenProps['syndepth']])
d2TrialsBool = numpy.array([d == 2 for d in tokenProps['syndepth']])
d3TrialsBool = numpy.array([d == 3 for d in tokenProps['syndepth']])
In [ ]:
# Set up the dev and test sets
devsizerecip = 3 # the reciprocal of the dev size, so devsizerecip = 3 means the dev set is 1/3 and the test set is 2/3
devitems = numpy.arange(1,max(tokenProps['sentid']),devsizerecip)
devTrialsBool = numpy.array([s in devitems for s in tokenProps['sentid']])
testTrialsBool = numpy.array([s not in devitems for s in tokenProps['sentid']])
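# quick check on the split: dev and test should be disjoint and together cover every trial
print('dev trials: %d, test trials: %d, total: %d' % (devTrialsBool.sum(), testTrialsBool.sum(), len(tokenProps)))
assert not numpy.any(devTrialsBool & testTrialsBool)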
In [ ]:
inDataset = devTrialsBool
In [ ]:
print tokenProps.shape,epochedSignalData.shape
wordEpochs = epochedSignalData[wordTrialsBool & parsedTrialsBool & inDataset]
wordFeatures = tokenProps[wordTrialsBool & parsedTrialsBool & inDataset]
print wordFeatures.shape, wordEpochs.shape
In [ ]:
print tokenProps.shape,epochedSignalData.shape
d1wordEpochs = epochedSignalData[wordTrialsBool & d1TrialsBool & inDataset]
d1wordFeatures = tokenProps[wordTrialsBool & d1TrialsBool & inDataset]
print d1wordFeatures.shape, d1wordEpochs.shape
d2wordEpochs = epochedSignalData[wordTrialsBool & d2TrialsBool & inDataset]
d2wordFeatures = tokenProps[wordTrialsBool & d2TrialsBool & inDataset]
print d2wordFeatures.shape, d2wordEpochs.shape
d3wordEpochs = epochedSignalData[wordTrialsBool & d3TrialsBool & inDataset]
d3wordFeatures = tokenProps[wordTrialsBool & d3TrialsBool & inDataset]
print d3wordFeatures.shape, d3wordEpochs.shape
In [ ]:
freq_decomp = 'wavelet'
NJOBS = 20 #dignam has 24 processors
fmin = 7 #minimum frequency of interest (wavelet); 7
fmax = 38 #maximum frequency of interest (wavelet); 30
In [ ]:
names = channelLabels
seed = list(numpy.arange(len(names)))*len(names)
targets = numpy.array(seed)
seed = numpy.sort(seed)
print seed
print targets
#use indices = seed_target_indices for full cross connectivity (not useful for coherence)
#indices = seed_target_indices(seed, targets)
#use indices = None for lower triangular connectivity
indices = None
In [ ]:
#if freq_decomp == 'fourier':
# con, freqs, times, _, _ = spectral_connectivity(wordEpochs, indices=indices,
# method='coh', mode='fourier', sfreq=samplingRate, n_jobs=NJOBS, verbose='WARNING')
#elif freq_decomp == 'wavelet':
# #use morlet wavelet decomposition
cwt_frequencies = numpy.arange(fmin, fmax, 2)
cwt_n_cycles = cwt_frequencies / 7.
d1con, freqs, times, _, _ = spectral_connectivity(d1wordEpochs, indices=indices,
method='coh', mode='cwt_morlet', sfreq=samplingRate,
cwt_frequencies=cwt_frequencies, cwt_n_cycles=cwt_n_cycles, n_jobs=NJOBS, verbose='WARNING')
In [ ]:
#if freq_decomp == 'fourier':
# con, freqs, times, _, _ = spectral_connectivity(wordEpochs, indices=indices,
# method='coh', mode='fourier', sfreq=samplingRate, n_jobs=NJOBS, verbose='WARNING')
#elif freq_decomp == 'wavelet':
# #use morlet wavelet decomposition
cwt_frequencies = numpy.arange(fmin, fmax, 2)
cwt_n_cycles = cwt_frequencies / 7.
d2con, freqs, times, _, _ = spectral_connectivity(d2wordEpochs, indices=indices,
method='coh', mode='cwt_morlet', sfreq=samplingRate,
cwt_frequencies=cwt_frequencies, cwt_n_cycles=cwt_n_cycles, n_jobs=NJOBS, verbose='WARNING')
In [ ]:
#if freq_decomp == 'fourier':
# con, freqs, times, _, _ = spectral_connectivity(wordEpochs, indices=indices,
# method='coh', mode='fourier', sfreq=samplingRate, n_jobs=NJOBS, verbose='WARNING')
#elif freq_decomp == 'wavelet':
# #use morlet wavelet decomposition
cwt_frequencies = numpy.arange(fmin, fmax, 2)
cwt_n_cycles = cwt_frequencies / 7.
d3con, freqs, times, _, _ = spectral_connectivity(d3wordEpochs, indices=indices,
method='coh', mode='cwt_morlet', sfreq=samplingRate,
cwt_frequencies=cwt_frequencies, cwt_n_cycles=cwt_n_cycles, n_jobs=NJOBS, verbose='WARNING')
In [ ]:
print freqs.shape
print d1con.shape
In [ ]:
n_rows, n_cols = d1con.shape[:2]
vmin = min(min(d1con.ravel()),min(d2con.ravel()),min(d3con.ravel()))
vmax = max(max(d1con.ravel()),max(d2con.ravel()),max(d3con.ravel()))
con = d1con
fig, axes = plt.subplots(n_rows, n_cols, sharex=True, sharey=True)
plt.suptitle('Between sensor connectivity (coherence)')
for i in range(n_rows):
    for j in range(n_cols):
        if i == j:
            axes[i, j].set_axis_off()
            continue
        cax = axes[i, j].imshow(con[i, j, :], vmin=vmin, vmax=vmax, aspect='auto')
        if epochStart < 0:
            axes[i, j].axvline(x=epochStart * -1 * samplingRate, c='black', lw=1)
        #else: onset isn't on graph
        if j == 0:
            axes[i, j].set_ylabel(names[i])
            axes[i, j].set_yticks(numpy.arange(len(cwt_frequencies)))
            axes[i, j].set_yticklabels(cwt_frequencies)
            axes[0, i].set_title(names[i])
        if i == (n_rows - 1):
            axes[i, j].set_xlabel(names[j])
            axes[i, j].set_xticks(numpy.arange(0.0,con.shape[3],50))
            axes[i, j].set_xticklabels(numpy.arange(epochStart,epochEnd,(epochEnd-epochStart)/(con.shape[3]/50.)))
        axes[i, j].set_ylim([0.0, con.shape[2]-1])
fig.colorbar(cax)
plt.savefig('small_d1coh.png')
plt.show()
In [ ]:
con = d2con
fig, axes = plt.subplots(n_rows, n_cols, sharex=True, sharey=True)
plt.suptitle('Between sensor connectivity (coherence)')
for i in range(n_rows):
    for j in range(n_cols):
        if i == j:
            axes[i, j].set_axis_off()
            continue
        cax = axes[i, j].imshow(con[i, j, :], vmin=vmin, vmax=vmax, aspect='auto')
        if epochStart < 0:
            axes[i, j].axvline(x=epochStart * -1 * samplingRate, c='black', lw=1)
        #else: onset isn't on graph
        if j == 0:
            axes[i, j].set_ylabel(names[i])
            axes[i, j].set_yticks(numpy.arange(len(cwt_frequencies)))
            axes[i, j].set_yticklabels(cwt_frequencies)
            axes[0, i].set_title(names[i])
        if i == (n_rows - 1):
            axes[i, j].set_xlabel(names[j])
            axes[i, j].set_xticks(numpy.arange(0.0,con.shape[3],50))
            axes[i, j].set_xticklabels(numpy.arange(epochStart,epochEnd,(epochEnd-epochStart)/(con.shape[3]/50.)))
        axes[i, j].set_ylim([0.0, con.shape[2]-1])
fig.colorbar(cax)
plt.savefig('small_d2coh.png')
plt.show()
In [ ]:
con = d3con
fig, axes = plt.subplots(n_rows, n_cols, sharex=True, sharey=True)
plt.suptitle('Between sensor connectivity (coherence)')
for i in range(n_rows):
    for j in range(n_cols):
        if i == j:
            axes[i, j].set_axis_off()
            continue
        cax = axes[i, j].imshow(con[i, j, :], vmin=vmin, vmax=vmax, aspect='auto')
        if epochStart < 0:
            axes[i, j].axvline(x=epochStart * -1 * samplingRate, c='black', lw=1)
        #else: onset isn't on graph
        if j == 0:
            axes[i, j].set_ylabel(names[i])
            axes[i, j].set_yticks(numpy.arange(len(cwt_frequencies)))
            axes[i, j].set_yticklabels(cwt_frequencies)
            axes[0, i].set_title(names[i])
        if i == (n_rows - 1):
            axes[i, j].set_xlabel(names[j])
            axes[i, j].set_xticks(numpy.arange(0.0,con.shape[3],50))
            axes[i, j].set_xticklabels(numpy.arange(epochStart,epochEnd,(epochEnd-epochStart)/(con.shape[3]/50.)))
        axes[i, j].set_ylim([0.0, con.shape[2]-1])
fig.colorbar(cax)
plt.savefig('small_d3coh.png')
plt.show()
In [ ]:
#test reshape outcomes
a = np.arange(18).reshape((3,2,3))
print(a) #target: 3 epochs, 2 channels, 3 frequency_features
b = np.arange(18).reshape((3,6))
print(b) #fft output: 3 epochs, 2 channels x 3 frequency_features
c = b.reshape((3,2,3))
print(c) #reshaped output: 3 epochs, 2 channels, 3 frequency_features
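# since a and b were built from the same arange(18), reshaping b should recover a exactly
print(numpy.array_equal(a, c)) # expect True: the channels x features axis unflattens cleanly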
In [ ]:
# The FFT script collapses across channels
# index0: epoch
# index1: channels x fft_feature_types x frequencies
# solution: reshape the output as (epochs,channels,-1)
print 'wordEpochs: ',wordEpochs.shape
# Spectrally decompose the epochs
(mappedTrialFeatures, spectralFrequencies) = specfft.mapFeatures(wordEpochs,samplingRate,windowShape='hann',featureType='amp')
# Reshape output to get epochs x channels x frequency
mappedTrialFeatures = mappedTrialFeatures.reshape((wordEpochs.shape[0],wordEpochs.shape[1],-1))
print 'FFT output: ', mappedTrialFeatures.shape, spectralFrequencies.shape
In [ ]:
# The FFT script collapses across channels
# index0: epoch
# index1: channels x fft_feature_types x frequencies
# solution: reshape the output as epochs,channels,-1
#print 'wordEpochs: ',wordEpochs.shape
#create a dummy array to permit looping
#mappedTrialFeatures = numpy.zeros((wordEpochs.shape[0],1,wordEpochs.shape[2]))
#FIRST = True
#for i in range(wordEpochs.shape[1]):
#for each channel, get a spectral decomposition of it
#if FIRST:
# print 'dummy: ', mappedTrialFeatures.shape, '\n'
#else:
# print 'real: ', mappedTrialFeatures.shape, '\n'
#singleChannelEpochs = wordEpochs[:,i,:].reshape(wordEpochs.shape[0],1,-1)
##(mappedTrialFeaturesRun, spectralFrequencies) = specfft.mapFeatures(singleChannelEpochs,samplingRate,windowShape='hann',featureType='amp')
#mappedTrialFeaturesRun = singleChannelEpochs[:,:,:-1]
#print 'run: ',mappedTrialFeaturesRun.shape
#mappedTrialFeaturesRun = mappedTrialFeaturesRun.reshape(mappedTrialFeaturesRun.shape[0],1,-1)
#if FIRST:
# mappedTrialFeatures = numpy.copy(mappedTrialFeaturesRun)
#else:
# mappedTrialFeatures = numpy.concatenate((mappedTrialFeatures,mappedTrialFeaturesRun),axis=1)
#FIRST = False
#print 'final: ',mappedTrialFeatures.shape
In [ ]:
print(spectralFrequencies)
freqsource = None #Can be 'weiss', 'wiki', or an interpolation of the two
if freqsource == 'weiss':
    # Weiss et al. 05
    theta = numpy.nonzero( (spectralFrequencies >= 4) & (spectralFrequencies <= 7) )
    beta1 = numpy.nonzero( (spectralFrequencies >= 13) & (spectralFrequencies <= 18) )
    beta2 = numpy.nonzero( (spectralFrequencies >= 20) & (spectralFrequencies <= 28) )
    gamma = numpy.nonzero( (spectralFrequencies >= 30) & (spectralFrequencies <= 34) )
elif freqsource == 'wiki':
    # end of http://en.wikipedia.org/wiki/Theta_rhythm
    delta = numpy.nonzero( (spectralFrequencies >= 0.1) & (spectralFrequencies <= 3) )
    theta = numpy.nonzero( (spectralFrequencies >= 4) & (spectralFrequencies <= 7) )
    alpha = numpy.nonzero( (spectralFrequencies >= 8) & (spectralFrequencies <= 15) )
    beta = numpy.nonzero( (spectralFrequencies >= 16) & (spectralFrequencies <= 31) )
    gamma = numpy.nonzero( (spectralFrequencies >= 32) & (spectralFrequencies <= 100) )
else:
    # interpolate between weiss and wiki
    #print(numpy.nonzero((spectralFrequencies >= 4) & (spectralFrequencies <= 7)))
    theta = numpy.nonzero( (spectralFrequencies >= 4) & (spectralFrequencies <= 7) )
    alpha = numpy.nonzero( (spectralFrequencies >= 8) & (spectralFrequencies < 13) )
    beta1 = numpy.nonzero( (spectralFrequencies >= 13) & (spectralFrequencies <= 18) )
    beta2 = numpy.nonzero( (spectralFrequencies >= 20) & (spectralFrequencies <= 28) )
    gamma = numpy.nonzero( (spectralFrequencies >= 30) & (spectralFrequencies <= 34) )
print(theta)
print(theta[0])
In [ ]:
print(channelLabels)
channelToAnalyse = 2 # index into channelLabels above to run the regression analysis on (must be a valid index into the 3-channel list)
print 'Analyzing:', channelLabels[channelToAnalyse]
In [ ]:
# REGULARISATION VALUES TO TRY (e.g. in Ridge GCV)
regParam = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 1e+2, 2e+2, 5e+2, 1e+3, 2e+3, 5e+3]
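# illustration only (not run by default): these alphas are intended for the commented-out RidgeCV option in the regression loop further down, e.g.
#lm = sklearn.linear_model.RidgeCV(fit_intercept=True, normalize=True, alphas=regParam)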
# SELECT AND DESCRIBE THE REGRESSORS WE'RE CHOOSING TO USE
# these strings should match the names of the fields in tokenProps
# here you should list the features you're choosing from your .tab file (just one?) #*#
features = [
#'logFreq_ANC',
#'surprisal2back_COCA',
#'bigramEntropy_COCA_here',
'syndepth'
]
# ... this has shorthand versions of the variable names, for display, and also has to include the "position" one that this version of the script inserts by default #*#
labelMap = {
#'logFreq_ANC': 'freq',
#'surprisal2back_COCA': 'surprisal',
#'bigramEntropy_COCA_here': 'entropy',
#'sentenceSerial': 'position',
'syndepth': 'depth'
}
legendLabels = features
In [ ]:
# SLOT REGRESSORS IN ONE BY ONE
explanatoryFeatures = numpy.zeros((wordFeatures.shape)) # dummy
#explanatoryFeatures = numpy.array([])
for feature in features:
    print feature
    explanatoryFeatures = numpy.vstack((explanatoryFeatures, wordFeatures[feature]))
explanatoryFeatures = explanatoryFeatures[1:].T # strip zeros out again
# PLOT EFFECTS X EPOCHS BACK
# I guess you don't want to do the history thing (though it's good initially as a sanity check), so you can leave this at 0 #*#
epochHistory = 0
In [ ]:
modelTrainingFit = []
modelTestCorrelation = []
modelParameters = []
legendLabels = features
#tmpFeatures = explanatoryFeatures.copy()
#tmpLegend = legendLabels[:]
#for epochsBack in range(1,epochHistory+1):
# epochFeatures = numpy.zeros(tmpFeatures.shape)
# epochFeatures[epochsBack:,:] = tmpFeatures[:-epochsBack,:]
# explanatoryFeatures = numpy.hstack((explanatoryFeatures,epochFeatures))
# legendLabels = legendLabels + [l+'-'+str(epochsBack) for l in tmpLegend]
## put in sentence serial - can't leave in history, cos is too highly correlated across history...
#explanatoryFeatures = numpy.vstack((explanatoryFeatures.T, wordFeatures['sentenceSerial'])).T
#features.append('sentenceSerial')
#legendLabels.append('sentenceSerial')
In [ ]:
# STEP THROUGH EACH TIME POINT IN THE EPOCH, RUNNING REGRESSION FOR EACH ONE
for t in theta[0]: #range(1):
    #print 'fitting at timepoint',t
    # NOTES # tried a load of different versions, and straight linear regression does as well as any of them, measured in terms of R^2
    # WHICH VARIETY OF REGRESSION TO USE?
    # I get pretty similar results with all three of the options below. The most generic (i.e. fewest extra assumptions) is plain LinearRegression. I guess RidgeCV should do best in terms of R^2, but it has discontinuities in betas, as different regularisation parameters are optimal at each time step. LassoLars is something of a compromise. #*#
    #lm = sklearn.linear_model.LinearRegression(fit_intercept=True, normalize=True)
    #lm = sklearn.linear_model.RidgeCV(fit_intercept=True, normalize=True, alphas=regParam) #, 10000, 100000])
    lm = sklearn.linear_model.LassoLars(alpha=0.0001) #(alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=2.2204460492503131e-16, copy_X=True)
    # NORMALISE THE EXPLANATORY VARIABLES? (for comparable beta magnitude interpretation)
    # choose whether to scale inputs #*#
    trainX = mynormalise(explanatoryFeatures)
    trainY = mynormalise(mappedTrialFeatures[:,channelToAnalyse,t])
    #trainX = mynormalise(explanatoryFeatures)
    #trainY = mynormalise(wordEpochs[:,channelToAnalyse,t])
    #trainX = explanatoryFeatures
    #trainY = wordEpochs[:,channelToAnalyse,t]
    trainedLM = lm.fit(trainX,trainY)
    modelParameters.append(lm)
    #print(lm.score(trainX,trainY),trainX.shape[1], trainX.shape[0])
    #modelTrainingFit.append(adjustR2(lm.score(trainX,trainY), trainX.shape[1], trainX.shape[0]))
    modelTrainingFit.append(lm.score(trainX,trainY)) # for a single feature, no adjustment for feature count is needed
In [ ]:
print(modelTrainingFit)
print(numpy.sort(modelTrainingFit)[::-1])
print 'ave fit: ', numpy.mean(modelTrainingFit)
print 'max fit: ', numpy.max(modelTrainingFit)
In [ ]:
# DETERMINE IF THERE IS CORRELATION BETWEEN THE EXPLANATORY VARIABLES
betaMatrix = numpy.array([p.coef_ for p in modelParameters])
print(betaMatrix.shape)
neatLabels = [l.replace(re.match(r'[^-]+',l).group(0), labelMap[re.match(r'[^-]+',l).group(0)]) for l in legendLabels if re.match(r'[^-]+',l).group(0) in labelMap]
legendLabels = numpy.array(legendLabels)
#numFeaturesDisplay = len(legendLabels)
neatLabels = numpy.array(neatLabels)
In [ ]:
# DO BIG SUMMARY PLOT OF FEATURE CORRELATIONS, R^2 OVER TIMECOURSE, BETAS OVER TIME COURSE, AND ERF/ERP
f = pylab.figure(figsize=(10,10))
s = pylab.subplot(2,2,1)
pylab.title('R-squared '+str(trainedLM))
pylab.plot(modelTrainingFit, linewidth=2)
commonPlotProps()
s = pylab.subplot(2,2,2)
if betaMatrix.shape[1] > 7:
    pylab.plot(betaMatrix[:,:7], '-', linewidth=2)
    pylab.plot(betaMatrix[:,7:], '--', linewidth=2)
else:
    pylab.plot(betaMatrix, '-', linewidth=2)
pylab.legend(neatLabels)
#pylab.legend(legendLabels)
pylab.title('betas for all (normed) variables')
commonPlotProps()
s = pylab.subplot(3,3,2)
pylab.title('correlations between explanatory variables')
pylab.imshow(numpy.abs(numpy.corrcoef(explanatoryFeatures.T)),interpolation='nearest', origin='upper') # leave out the dummy one
pylab.clim(0,1)
pylab.yticks(range(len(neatLabels)),neatLabels)
pylab.ylim((-0.5,len(neatLabels)-0.5))
pylab.xticks(range(len(neatLabels)),neatLabels, rotation=90)
pylab.xlim((-0.5,len(neatLabels)-0.5))
pylab.colorbar()
#fontP = FontProperties()
#fontP.set_size('small')
#legend([s], "title", prop = fontP)
#s = pylab.subplot(2,2,4)
#pylab.plot(numpy.mean(epochedSignalData[wordTrialsBool,channelToAnalyse],axis=0).T, linewidth=2)
#pylab.title('ERF')
#commonPlotProps()
#print 'history %d, mean model fit over -0.5s to +1.0s: %.5f, max is %.5f' % (epochHistory, numpy.mean(modelTrainingFit[62:250]), numpy.max(modelTrainingFit[62:250]))
#pylab.savefig('meg_testfig_%s.png' % (channelLabels[channelToAnalyse]))