In [ ]:
# Read the aggregated allele data: one Excel sheet per polymorphism.
import pandas as pd
from pprint import pprint

agg_alleles_datapath = '../data/aggregated_alleles_new.xlsx'
result_filepath = '../results/aggregated_allels_comparisons'

xls = pd.ExcelFile(agg_alleles_datapath)
df = xls.parse(xls.sheet_names[0])
# The last sheet does not hold a polymorphism and is excluded.
valid_sheets = xls.sheet_names[:-1]
print(valid_sheets)
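Before indexing rows by position in the next cell, it helps to eyeball the sheet layout. The sketch below is a minimal preview, assuming each sheet carries a 'GENOTYPES' column as used later; adjust the number of rows shown as needed.
In [ ]:
# Minimal preview of the first polymorphism sheet (diagnostic only).
preview = xls.parse(valid_sheets[0])
print(preview.columns.tolist())
print(preview.head(10))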
In [ ]:
# Gather the allele counts for a specific polymorphism across all populations.
alleles_agg_data = {}
for poly in valid_sheets:
    df = xls.parse(poly)
    # Rows 6 and 7 hold the allele counts in every sheet,
    # so all sheets must be aligned the same way.
    alleles_indexes = [6, 7]
    alleles_labels = [df['GENOTYPES'].loc[i] for i in alleles_indexes]
    alleles_agg_data[poly] = {
        'data': df.loc[alleles_indexes],
        'labels': alleles_labels,
    }
print(alleles_agg_data)
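The hard-coded row indexes assume every sheet is aligned identically. A quick diagnostic like the sketch below (an illustrative addition, not part of the original analysis) prints the labels picked up from each sheet so a misaligned one stands out immediately.
In [ ]:
# Alignment check: the labels found at rows 6 and 7 of every sheet
# should all be allele labels; anything else indicates a misaligned sheet.
for poly, entry in alleles_agg_data.items():
    print(poly, entry['labels'])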
In [ ]:
import math
import numpy as np
import scipy.stats as stats

def compute_odds_ratio_CI_95(data, odds_ratio):
    # Woolf's log method: CI = exp(ln(OR) +/- 1.96 * sqrt(1/a + 1/b + 1/c + 1/d))
    cell_counts = list(np.asarray(data).flatten())
    reciprocals = [1.0 / float(x) for x in cell_counts]
    error = 1.96 * math.sqrt(sum(reciprocals))
    ln_or = math.log(odds_ratio)
    uci = math.exp(ln_or + error)
    lci = math.exp(ln_or - error)
    return lci, uci

def compute_odds_ratio(data):
    # OR = (a*d) / (b*c); return 0 when a denominator cell is zero.
    if data[0][1] == 0 or data[1][0] == 0:
        return 0
    return float(data[0][0] * data[1][1]) / (data[0][1] * data[1][0])

def mean_confidence_interval(data, confidence=0.95):
    # Mean with a t-based confidence interval.
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2.0, n - 1)
    return m, m - h, m + h
# Scratch checks kept for reference: example 2x2 tables, a chi-square attempt,
# and a Fisher exact test with the Woolf 95% CI.
# from scipy.stats import chisquare
# obs = np.array([[44, 468], [23, 477]])
# obs = np.array([[180, 90], [60, 170]])
# obs = np.array([[668, 338], [1585, 679]]).T
# obs = np.array([[1089, 3919], [679, 1585]])
# obs = np.array([[977, 29], [2069, 483]])
# obs = np.array([[4214, 794], [2069, 483]])
# chisq, pval_chi = chisquare(obs, ddof=0)
# oddsratio, pval_f = stats.fisher_exact(obs)
# lci, uci = compute_odds_ratio_CI_95(obs, oddsratio)
# print("{},{},{}-{}".format(pval_f, oddsratio, lci, uci))
In [ ]:
import xlwt

def get_stats_from_matrix(data):
    # Fisher's exact test plus the Woolf 95% CI for the odds ratio,
    # formatted as 'p_value,odds_ratio,lci-uci'.
    oddsratio, pval_f = stats.fisher_exact(data)
    lci, uci = compute_odds_ratio_CI_95(data, oddsratio)
    if pval_f < 0.0001:
        new_p_str = '< 0.0001'
    else:
        new_p_str = round(pval_f, 4)
    return "{},{},{}-{}".format(new_p_str, round(oddsratio, 4), round(lci, 4), round(uci, 4))

# For each polymorphism, run the same allele-distribution comparisons between populations.
def compare_between_populations_for_a_polymorphism(data, labels, poly, book):
    # All desired comparisons for producing stats.
    comparison_list = [['GREEKS', 'AFRICANS'],
                       ['SEC', 'AMERICANS'],
                       ['SEC', 'EAST ASIANS'],
                       ['SEC', 'SOUTH ASIANS'],
                       ['SEC', 'EUROPEANS'],
                       ['SEC', 'GROUP1=CEU+FIN+GBR'],
                       ['GROUP1=CEU+FIN+GBR', 'GROUP2=IBS+TSI+SEC'],
                       ['SEC', 'GROUP3=IBS+TSI'],
                       ['SEC', 'ALL']]
    ws = book.add_sheet(str(poly))
    fout = open(result_filepath + '_' + str(poly) + '.txt', 'w')
    header_str = 'Group_1,Group_2,p_value_fischer,odds_ratio,Confidence_Interval_95%'
    fout.write(header_str + '\n')
    for j, col in enumerate(header_str.split(',')):
        ws.write(0, j, col)
    i = 1
    for comp_tuple in comparison_list:
        try:
            matrix = data[comp_tuple].values
        except KeyError:
            print("Polymorphism: {} does not contain comparison tuple {}".format(poly, comp_tuple))
            continue
        stats_str = get_stats_from_matrix(matrix)
        result_str = str(comp_tuple[0]) + ',' + comp_tuple[1] + ',' + stats_str
        fout.write(result_str + "\n")
        print(result_str)
        print(matrix)
        for j, field in enumerate(result_str.split(',')):
            ws.write(i, j, field)
        i += 1
    fout.close()
    return book

book = xlwt.Workbook()
for poly, v in alleles_agg_data.items():
    book = compare_between_populations_for_a_polymorphism(v['data'], v['labels'], poly, book)
book.save(result_filepath + '.xls')
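A quick way to confirm the export worked is to read the workbook back with pandas. This is a minimal sketch; reading legacy .xls files with pandas requires the xlrd package to be installed.
In [ ]:
# Read the generated workbook back and preview each comparison sheet.
results = pd.read_excel(result_filepath + '.xls', sheet_name=None)
for poly, sheet in results.items():
    print(poly)
    print(sheet.head())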
In [ ]:
# Standalone check of Fisher's exact test on a single 2x2 table.
import numpy as np
obs = np.array([[1930, 405], [112, 23]])
print(stats.fisher_exact(obs))
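The commented-out chi-square scratch earlier suggests a cross-check; the sketch below (an illustrative addition, not part of the original analysis) runs scipy's chi-square test of independence on the same table for comparison with the Fisher result.
In [ ]:
# Cross-check: chi-square test of independence on the same 2x2 table.
from scipy.stats import chi2_contingency
chi2, pval_chi, dof, expected = chi2_contingency(obs)
print("chi2 = {:.4f}, p = {:.4g}, dof = {}".format(chi2, pval_chi, dof))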
In [ ]: