In [221]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR
%matplotlib inline
In [222]:
with h5py.File('data.h5', 'r') as f:
    mnist = f['mnist'][:]
    natural_images = f['natural_images'][:]
# mnist.shape - (n_images, x_px, y_px) - (60000, 18, 18)
# natural_images.shape - (n_images, x_px, y_px) - (292864, 16, 16)
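The file contents can also be inspected without reading the full arrays into memory; a minimal sketch, assuming only the same data.h5 used above:
In [ ]:
# List each top-level dataset with its shape and dtype.
with h5py.File('data.h5', 'r') as f:
    for name in f:
        print(name, f[name].shape, f[name].dtype)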
In [223]:
mnist_idx = 0
natural_images_idx = 0
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.imshow(mnist[mnist_idx], cmap='gray', interpolation='nearest')
ax1.set_title('mnist digit {}'.format(mnist_idx))
ax2.imshow(natural_images[natural_images_idx], cmap='gray', interpolation='nearest')
ax2.set_title('natural image {}'.format(natural_images_idx))
for ax in [ax1, ax2]:
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
f.tight_layout()
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.imshow(mnist.std(axis=0), cmap='gray', interpolation='nearest')
ax1.set_title('mnist pixel std')
ax2.imshow(natural_images.std(axis=0), cmap='gray', interpolation='nearest')
ax2.set_title('natural image pixel std')
for ax in [ax1, ax2]:
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
f.tight_layout()
In [224]:
from sklearn.decomposition import PCA
In [236]:
data_type = 'natural images'
if data_type == 'natural images':
    data = natural_images
else:
    data = mnist
# Flatten images to vectors and standardize each pixel across images.
# Cast to float so the in-place operations work on integer image data.
data = data.reshape(-1, data.shape[-1]**2).astype(float)
data -= data.mean(axis=0)
data /= data.std(axis=0)
dim = data.shape[-1]
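As a cross-check, scikit-learn's StandardScaler applies the same per-column mean/std standardization; `data_alt` is a throwaway name here, and this assumes data_type == 'natural images' as set above:
In [ ]:
from sklearn.preprocessing import StandardScaler

# StandardScaler subtracts the per-column mean and divides by the
# per-column (population) std, matching the manual steps above.
data_alt = StandardScaler().fit_transform(
    natural_images.reshape(-1, natural_images.shape[-1]**2).astype(float))
print(np.allclose(data, data_alt))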
In [237]:
p = PCA()
data_pcs = p.fit_transform(data)
variance = p.explained_variance_
# Shuffle each pixel independently across images: this destroys
# cross-pixel correlations but preserves each pixel's marginal statistics.
data_random = data.copy()
for ii in range(dim):
    order = np.random.permutation(data.shape[0])
    data_random[:, ii] = data_random[:, ii][order]
p2 = PCA()
p2.fit(data_random)
variance2 = p2.explained_variance_
data_random_pcs = p2.transform(data_random)
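Since the column-wise shuffle only reorders each pixel's values, the total variance (the sum of the eigenvalues, i.e. the trace of the covariance) should agree between the original and shuffled data even though its spread across modes changes; a quick check:
In [ ]:
# Both sums equal the trace of the respective covariance matrices,
# which the shuffle leaves unchanged up to floating-point noise.
print(variance.sum(), variance2.sum())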
In [238]:
pcts = np.around(np.logspace(-2, 0, 15) * dim) / dim
num_keep = (pcts * dim).astype(int)
fourth = np.full((2, pcts.size, dim), np.nan)
for ii, keep in enumerate(num_keep):
    dpi = data_pcs[:, :keep]
    drpi = data_random_pcs[:, :keep]
    # Normalized fourth moment E[x^4] / E[x^2]^2 of each retained mode;
    # equals 3 for Gaussian data.
    fourth[0, ii, :keep] = (dpi**4).mean(axis=0) / (dpi**2).mean(axis=0)**2
    fourth[1, ii, :keep] = (drpi**4).mean(axis=0) / (drpi**2).mean(axis=0)**2
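For a Gaussian, the normalized fourth moment E[x^4]/E[x^2]^2 is exactly 3, which is why the reference line in the figure below sits at 3. A minimal sanity check on i.i.d. standard normal draws (`g` is a hypothetical throwaway variable):
In [ ]:
# Estimate the normalized fourth moment of N(0, 1) samples;
# it should land close to the Gaussian value of 3.
g = np.random.randn(100000)
print((g**4).mean() / (g**2).mean()**2)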
In [239]:
f, (ax1, ax2) = plt.subplots(2, figsize=(10, 10))
rank = np.arange(1, dim + 1)
ax1.loglog(rank, variance, 'b', label=data_type)
ax1.loglog(rank, variance, 'b.')
ax1.loglog(rank, variance2, 'g', label='{} shuffled'.format(data_type))
ax1.loglog(rank, variance2, 'g.')
ax1.set_ylabel('eigenvalue')
ax1.set_xlabel('rank')
# Power-law ('1/f') fit: linear regression in log-log coordinates.
model = LR()
model.fit(np.log(rank)[:, np.newaxis], np.log(variance)[:, np.newaxis])
x = np.array([1., dim])
y = np.squeeze(np.exp(model.coef_ * np.log(x) + model.intercept_))
ax1.plot(x, y, 'k', label='1/f fit')
meds = np.zeros(pcts.size)
lows = np.zeros(pcts.size)
highs = np.zeros(pcts.size)
for ii, pct in enumerate(pcts):
    y = fourth[0, ii]
    y = y[np.logical_not(np.isnan(y))]
    ax2.loglog(np.tile(pct, y.size), y, 'b.', alpha=1.)
    meds[ii] = np.median(y)
    q25, q75 = np.percentile(y, [25, 75], interpolation='nearest')
    lows[ii] = q25
    highs[ii] = q75
# Gaussian reference: a normalized fourth moment of 3.
ax2.axhline(3, linestyle='--', color='green')
ax2.errorbar(pcts * 1.1, meds, yerr=[meds - lows, highs - meds], fmt='b.')
ax2.set_xlim(1e-2/1.5, 1.5)
ax2.set_ylabel('normalized fourth moment')
ax2.set_xlabel('fraction of modes remaining')
ax1.legend()
f.tight_layout()
plt.savefig('{}.pdf'.format(data_type))