In [333]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
NOBS = 100
In [334]:
# generate fake hubble diagram data
dist = np.random.beta(1, 5, NOBS) * 10
velocity = dist * 3 - 0.1*dist**2 + np.random.randn(NOBS) * 0.1
dist = dist[:,np.newaxis]
In [335]:
# visualize data
plt.scatter(dist, velocity)
plt.xlabel("distance (some large unit)")
plt.ylabel("velocity (much fast)");
In [346]:
# do the machine learnings
model = GaussianProcessRegressor()
model.fit(dist, velocity)
Out[346]:
In [347]:
# plot the machine learnings
xTest = np.linspace(dist.min(), dist.max())[:,np.newaxis]
plt.plot(xTest, model.predict(xTest))
plt.scatter(dist, velocity)
plt.xlabel("distance (some large unit)")
plt.ylabel("velocity (much fast)");
In [341]:
print("rmse:", np.sqrt(np.mean(np.square(velocity - model.predict(dist)))))
In [342]:
# but actually we should use training and testing data
xTrain, xTest, yTrain, yTest = train_test_split(dist, velocity)
In [343]:
# do some more machine learnings
model.fit(xTrain, yTrain)
Out[343]:
In [344]:
# show predictions on test data
plt.scatter(xTest, model.predict(xTest), color="r")
plt.scatter(xTrain, yTrain)
plt.xlabel("Distance")
plt.ylabel("Velocity")
plt.xlabel("distance (some large unit)")
plt.ylabel("velocity (much fast)");
In [345]:
print("rmse:", np.sqrt(np.mean(np.square(yTest - model.predict(xTest)))))
In [352]:
# generate fake datas
NLCS = 100
NOBS = 200
t = np.random.rand(NLCS, NOBS) * np.pi * 10
t = np.sort(t, axis=1)
freq1a = np.abs(np.random.randn(NLCS//2)*0.3 + 1)[:,np.newaxis]
freq1b = np.abs(np.random.randn(NLCS//2)*0.3 + 3)[:,np.newaxis]
freq2a = np.abs(np.random.randn(NLCS//2)*0.4 + 2)[:,np.newaxis]
freq2b = np.abs(np.random.randn(NLCS//2)*0.4 + 4)[:,np.newaxis]
lcs = np.zeros(t.shape)
lcs[:NLCS//2,:] = np.sin((t[:NLCS//2,:] * freq1a)) + np.sin((t[:NLCS//2,:] * freq1b))
lcs[NLCS//2:,:] = np.sin((t[NLCS//2:,:] * freq2a)) + np.sin((t[NLCS//2:,:] * freq2b))
X = np.vstack([np.hstack([freq1a, freq1b]), np.hstack([freq2a, freq2b])])
y = np.arange(2).repeat(NLCS//2)
In [353]:
# visualize datas
plt.plot(t[0],lcs[0])
plt.plot(t[1],lcs[1])
plt.title("Class 1")
plt.xlabel("Time")
plt.ylabel("Something like flux");
plt.figure()
plt.title("Class 2")
plt.plot(t[50], lcs[50])
plt.plot(t[51], lcs[51])
plt.xlabel("Time")
plt.ylabel("Something like flux");
In [354]:
# histogram of one of the frequency peaks
plt.hist(freq1a, bins=np.linspace(0,6,50));
plt.hist(freq2a, bins=np.linspace(0,6,50));
plt.xlabel("One Frequency")
plt.ylabel("Count");
In [355]:
# histogram of both frequency peaks
plt.scatter(freq1a, freq1b, color="r")
plt.scatter(freq2a, freq2b, color="b")
plt.xlabel("One frequency")
plt.ylabel("Another frequency");
In [356]:
# always split into train/test!
XTrain, XTest, yTrain, yTest = train_test_split(X, y)
# this machine is learning so much today
model = RandomForestClassifier()
model.fit(XTrain, yTrain)
# visualize test performance
yPred = model.predict(XTest)
plt.scatter(XTrain[:,0][yTrain==0], XTrain[:,1][yTrain==0], color="r", alpha=0.5, label="Train")
plt.scatter(XTrain[:,0][yTrain==1], XTrain[:,1][yTrain==1], color="b", alpha=0.5, label="Train")
plt.scatter(XTest[:,0][yTest==0], XTest[:,1][yTest==0], color="r", s=100, label="Test")
plt.scatter(XTest[:,0][yTest==1], XTest[:,1][yTest==1], color="b", s=100, label="Test")
plt.legend()
plt.xlabel("One frequency")
plt.ylabel("Another frequency");
In [357]:
# use the same data but no class labels any more
plt.scatter(X[:,0], X[:,1], color="r");
plt.xlabel("One frequency")
plt.ylabel("Another frequency");
In [358]:
# ready, set, machine learning!
model = GaussianMixture(n_components=2)
model.fit(X)
# who knew that plotting was 4x harder than training a gaussian mixture model!
plt.scatter(X[:,0], X[:,1], color="r")
plt.scatter(model.means_[:,0], model.means_[:,1], marker="+", s=200)
samples1 = np.random.multivariate_normal(model.means_[0], model.covariances_[0], 10000)
samples2 = np.random.multivariate_normal(model.means_[1], model.covariances_[1], 10000)
samples = np.vstack([samples1, samples2])
plt.hist2d(samples[:,0], samples[:,1], bins=50)
plt.xlim(X[:,0].min(), X[:,0].max())
plt.ylim(X[:,1].min(), X[:,1].max())
plt.xlabel("One frequency")
plt.ylabel("Another frequency");
In [359]:
# generate fake spectra
NLINES = 10
WAVELEN_MIN = 400
WAVELEN_MAX = 700
NSPECS = 1000
wavelen = np.arange(WAVELEN_MIN,WAVELEN_MAX,1)
specs = np.random.randn(NSPECS, wavelen.size)
# use same line locations for each spectrum
lineWavelens = np.random.rand(NLINES) * (WAVELEN_MAX - WAVELEN_MIN) + WAVELEN_MIN
lineWeights = (np.arange(NLINES) + 1)
np.random.shuffle(lineWeights)
for specId in range(NSPECS):
for i in range(NLINES):
lineHeights = np.random.randn(NLINES) * lineWeights
specs[specId,:] += lineHeights[i]*np.exp(-(wavelen - lineWavelens[i])**2/10)
In [360]:
# visualize fake spectra
plt.plot(wavelen, specs[0])
plt.plot(wavelen, specs[1])
plt.plot(wavelen, specs[2])
plt.xlabel("wavelength")
plt.ylabel("something like flux");
In [362]:
# you guessed it -- machine learning!
model = PCA(n_components = NLINES)
model.fit(specs)
# show the components for some spectrum
transformed = model.transform(specs)
plt.scatter(np.arange(transformed.shape[1]), transformed[1])
plt.xlabel("Component ID")
plt.ylabel("Component Weight");
In [363]:
# check how good the reconstruction is
plt.plot(specs[0], label="original")
plt.plot(model.inverse_transform(transformed[0]), label="reconstructed")
plt.legend()
plt.xlabel("wavelength")
plt.ylabel("something like flux");
In [365]:
# look at the individual components
plt.plot(model.inverse_transform([10,0,0,0,0,0,0,0,0,0]), label="component 0")
plt.plot(model.inverse_transform([0,0,0,0,0,0,10,0,0,0]), label="component 1")
plt.plot(model.inverse_transform([0,0,10,0,0,0,0,0,0,0]), label="component 2")
plt.legend()
plt.xlabel("wavelength")
plt.ylabel("something like flux");
nah just kidding