In [ ]:
# read the data
agg_alleles_datapath = '../data/genotypes_person.xlsx'
result_filepath = '../results/sec_allels_per_gender'
import pandas as pd
from pprint import pprint
xls = pd.ExcelFile(agg_alleles_datapath)
df = xls.parse(xls.sheet_names[0])
polys = list(df.columns)[2:]
print polys
pairs_of_allels = [['A','G'],['C','G'],['G','A'],['C','G'],['T','C']]
print pairs_of_allels
In [ ]:
# do the same procedure for each polymorphism
# but first group data per gender
df_grouped = df.groupby(['GENDER'])[polys].sum()
import numpy as np
# gather information about a specific polymorphism for all populations
alleles_agg_data = {}
for i in range(len(pairs_of_allels)):
alleles = pairs_of_allels[i]
poly = polys[i]
data = np.empty((2,2))
for j in range(len(alleles)):
data[j][0] = df_grouped[poly].loc['A'].count(alleles[j])
data[j][1] = df_grouped[poly].loc['F'].count(alleles[j])
data = data.astype(int)
alleles_agg_data[poly] = {}
alleles_agg_data[poly]['data'] = data
alleles_agg_data[poly]['labels'] = alleles
In [ ]:
import math
import scipy.stats as stats
def compute_odds_ratio_CI_95(data,odds_ratio):
val_in_list = list(data.flatten())
val_in_list = map(lambda x: 1/float(x),val_in_list)
sum_of_vals = sum(val_in_list)
error = 1.96 * math.sqrt(sum_of_vals)
ln_or = math.log(odds_ratio)
uci = math.exp(ln_or + error)
lci = math.exp(ln_or - error)
return lci, uci
def compute_odds_ratio(data):
if data[0][1] == 0 or data[1][0] == 0:
return 0
else:
return float(data[0][0] * data[1][1]) / (data[0][1] * data[1][0])
def mean_confidence_interval(data, confidence=0.95):
a = 1.0*np.array(data)
n = len(a)
m, se = np.mean(a), scipy.stats.sem(a)
h = se * sp.stats.t._ppf((1+confidence)/2., n-1)
return m, m-h, m+h
In [ ]:
import xlwt
def get_stats_from_matrix(data):
oddsratio, pval_f = stats.fisher_exact(data)
# print oddsratio, pval_f
# print compute_odds_ratio(obs)
lci, uci = compute_odds_ratio_CI_95(data,oddsratio)
if pval_f < 0.0001:
new_p_str = '< 0.0001'
else:
new_p_str = round(pval_f,4)
return "{},{},{}-{}".format(new_p_str,round(oddsratio,4),round(lci,4), round(uci,4))
# for each polymorphism just do the same distr. comparisons in between populations
def compare_between_populations_for_a_polymorphism(data,labels,poly,book):
ws = book.add_sheet(str(poly).split("/")[-1])
fout = open(result_filepath+'_'+str(poly).split("/")[-1]+'.txt','w')
header_str = 'Group_1,Group_2,p_value_fischer,odds_ratio,Confidence_Interval_95'
fout.write('Group_1,Group_2,p_value_fischer,odds_ratio,Confidence_Interval_95%\n')
for j in range(len(header_str.split(','))):
ws.write(0,j,header_str.split(',')[j])
i = 1
try:
matrix = data
except:
print "Polymorphism: {} does not contain comparison tuple {}".format(poly,comp_tuple)
stats = get_stats_from_matrix(data)
result_str = 'Males'+','+'Females'+','+stats
fout.write(result_str+ "\n")
print result_str
for j in range(len(result_str.split(','))):
ws.write(i,j,result_str.split(',')[j])
i += 2
ws.write(i,1,'M')
ws.write(i,2,'F')
i+=1
for z in range(2):
ws.write(i+z,0,labels[z])
for z in range(data.shape[0]):
for k in range(data.shape[1]):
ws.write(i+z,1+k,data[z][k])
fout.close()
return book
book = xlwt.Workbook()
for poly,v in alleles_agg_data.items():
book = compare_between_populations_for_a_polymorphism(v['data'],v['labels'],poly,book)
book.save(result_filepath+'.xls')
In [ ]: