In [99]:
def read_hist_file(filename):
import numpy as np
data = []
for line in open(filename):
line_arr = line.strip().split()
haplotypes = map(float, line_arr)
data.append(haplotypes)
data = np.asarray(data)
print data.shape
p_fail = data[:,3] / np.sum(data, axis=1)
x, y, x1,y1 = [],[],[],[]
for i,f in enumerate(p_fail):
if f == np.nan:
continue
else:
x.append(i)
y.append(f)
print p_fail
return x, y
In [109]:
x_x, y_x = read_hist_file("min_4_hist.txt")
x_x19, y_x19 = read_hist_file("hist_04_x19.txt")
In [50]:
from pylab import *
%matplotlib inline
In [108]:
fig = figure(figsize=(19,10))
bar(x_x, y_x, label="X chromosome only", color='blue')
bar(x_x19, y_x19, label='X chromosome and chr19', color='green')
xlabel("Number of observations at site i", fontsize=24)
ylabel("Probability of 4 gametes", fontsize=24)
xlim(min(x), max(x))
savefig("fg_proba.pdf")
In [ ]:
print len(data)
In [ ]:
import random
def build_random():
a = ["0", "1"]
c1, c2 = random.choice(a), random.choice(a)
return c1+c2
In [ ]:
filename = 'min_20_ijiuj.hist.txt'
data = []
for line in open(filename):
line_arr = line.strip().split()
data.append([map(int, inner.split(";")[:-1]) for inner in line_arr])
print len(data), len(data[0]), len(data[0][0]), len(data[-1]), len(data[0][-1])
In [ ]:
fig = figure()
ax = fig.add_subplot(111, projection='3D')
ax.plot_wireframe(zip(*data))
In [ ]:
from mpl_toolkits.mplot3d import axes3d, Axes3D #<-- Note the capitalization!
data = np.asarray(data)
print data.shape
for i, d in enumerate(data):
if len(data[i]) != 103:
print "i", i, len(data[i])
for j, e in enumerate(d):
if len(data[i][j]) != 103:
print "j", i, j,len(data[i][j])
a = zip(*data)
print len(a), len(b), len(c)
In [ ]:
data = np.asarray(data)
X_train = []
y_train = []
for x_i, x in enumerate(data):
for y_i, y in enumerate(x):
for z_i, z in enumerate(y):
for _ in range(z):
X_train.append([x_i,y_i])
y_train.append(z_i)
print len(X_train)
In [1]:
from lr import build_test_train
In [2]:
X_train, y_train = build_test_train()
In [5]:
import numpy as np
In [6]:
print np.mean(y_train)
In [81]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False, n_jobs=-1)
lr.fit(X_train, y_train)
Out[81]:
In [82]:
lr.predict([50,50])
Out[82]:
In [83]:
lr.score([[20,20]], [100000000000.])
Out[83]:
In [84]:
res0= lr.predict([(1,i) for i in range(103)])
res1 = lr.predict([(i,1) for i in range(103)])
resb = lr.predict([(i,i) for i in range(103)])
In [36]:
from pylab import *
%matplotlib inline
In [66]:
res0= lr.predict([(1,i) for i in range(103)])
res1 = lr.predict([(i,1) for i in range(103)])
resb = lr.predict([(i,i) for i in range(103)])
fig = figure()
plot(range(103), res0, label="n_obs_i = 1, n_obs_j = x")
plot(range(103), res1, label="n_obs_i = x, n_obs_j = 1")
plot(range(103), resb, label="n_obs_i = x, n_obs_j = x")
legend()
xlim(0,103)
ylim(lr.intercept_,103)
xlabel("W_x0 = " + str(lr.coef_[0]))
ylabel("W_x1 = " + str(lr.coef_[1]))
print lr.coef_
In [69]:
print lr.intercept_
eq = .24 * x + .24 * y - 5
#fits the estimation that about half of the total sites from i j will be covered
#can we adjust it for adjacency?
In [67]:
from random import sample
X_r = []
y_r = []
n_tries = 20
for i in range(103):
for j in range(103):
for k in range(n_tries):
X_r.append([i,j])
intersect = set(sample(range(103), i)) & set(sample(range(103), j))
y_r.append(len(intersect))
In [79]:
lr_r = LinearRegression(fit_intercept=False, normalize=True)
lr_r.fit(X_r, y_r)
print lr_r.coef_, lr_r.intercept_
In [107]:
res0_r = lr_r.predict([(1,i) for i in range(103)])
res1_r = lr_r.predict([(i,1) for i in range(103)])
resb_r = lr_r.predict([(i,i) for i in range(103)])
fig = figure(figsize=(19,10))
plot(range(103), res0_r, c='blue',label="n_obs_i = 1, n_obs_j = x, random")
plot(range(103), res1_r+1, c='red',label="n_obs_i = x, n_obs_j = 1, random")
plot(range(103), resb_r+1, c='green', label="n_obs_i = x, n_obs_j = x, random")
plot(range(103), res0+2, c='cyan', label="n_obs_i = 1, and n_obs_j = x")
plot(range(103), res1+3, c='magenta', label="n_obs_i = x, n_obs_j = 1")
plot(range(103), resb+2, c='orange',label="n_obs_i = x, n_obs_j = x")
legend()
xlim(0,103)
ylim(lr.intercept_,103)
xlabel("W_xr0 = " + str(lr_r.coef_[0]) + " W_x0 = " + str(lr.coef_[0]), fontsize=24)
ylabel("W_xr1 = " + str(lr_r.coef_[1])+ " W_x1 = " + str(lr.coef_[1]), fontsize=24)
savefig("our_data_v_random_lr.pdf")
In [ ]: