In [1]:
import mysql.connector
La idea de este taller es manipular archivos (leerlos, parsearlos y escribirlos) y hacer lo mismo con bases de datos estructuradas.
In [2]:
import pandas as pd
df= pd.read_csv('C:/Users/Alex/Documents/eafit/semestres/X semestre/programacion/taller2.tsv', sep = '\t')
df[:1]
Out[2]:
Qué Entidades (tablas) puede definir?
Cree la base de datos (copie el código SQL que se usó)
In [5]:
CREATE TABLE enfermedad
(
id_enfermedad int AUTO_INCREMENT PRIMARY KEY,
nombre varchar(255)
);
create table plataforma
(
id_plataforma int AUTO_INCREMENT primary key,
nombre varchar(255)
);
CREATE TABLE loci
(
id_loci int AUTO_INCREMENT PRIMARY KEY,
id_plataforma int,
foreign key (id_plataforma) references plataforma(id_plataforma),
region varchar(255),
chrom varchar(255),
pos varchar(255),
genes_reportados varchar(255),
gen_mapped varchar(255),
gen_upstream varchar(255),
gen_downstream varchar(255),
SNP_GENE_IDS varchar(255),
UPSTREAM_GENE_DISTANCE varchar(255),
DOWNSTREAM_GENE_DISTANCE varchar(255),
STRONGEST_SP_RISK varchar(255),
SNPS varchar(255),
MERGED varchar(255),
SNP_ID_CURRENT varchar(255),
CONTEXTO varchar(500),
risk_allele varchar(255)
#PVAl int,
#PVALUE_MLOG int,
#PVALUE_txt varchar(255),
#BETA int,
#novCI varchar(255)
);
CREATE TABLE enfermedad_loci
(
id_loci int ,
id_enfermedad int,
PRIMARY KEY (id_loci, id_enfermedad),
foreign key (id_loci) references loci(id_loci),
foreign key (id_enfermedad) references enfermedad(id_enfermedad)
);
CREATE TABLE journal
(
id_journal int AUTO_INCREMENT PRIMARY KEY,
nombre varchar(255)
);
CREATE TABLE estudio
(
id_estudio int AUTO_INCREMENT PRIMARY KEY,
id_journal int,
id_enfermedad int,
foreign key (id_journal) references journal(id_journal),
foreign key (id_enfermedad) references enfermedad(id_enfermedad),
nombre varchar(255),
tamano_muestra varchar(500),
replicas varchar(500)
);
create table publicacion
(
id_publicacion int AUTO_INCREMENT PRIMARY KEY,
id_journal int,
id_estudio int,
foreign key (id_journal) references journal(id_journal),
foreign key (id_estudio) references estudio(id_estudio),
id_pubmed int,
autor varchar (255),
fecha_pub varchar (20),
link varchar (500)
);
In [6]:
df.head(1)
Out[6]:
In [7]:
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'
def doQuery( conn ) :
cur = conn.cursor()
cur.execute( "select * from enfermedad_loci" )
for id_nombre, nombre_enf in cur.fetchall() :
print (id_nombre, nombre_enf)
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
myConnection.close()
In [27]:
def get_diseaseId(disease_name):
cur = myConnection.cursor()
cur.execute( """select * from enfermedad where nombre = "%s" """ % (disease_name) )
id_enf = None
for id_, dissease in cur.fetchall() :
id_enf = id_
if not id_enf:
#print("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
cur.execute("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
cur.execute("SELECT LAST_INSERT_ID()")
id_enf = cur.fetchall()[0][0]
myConnection.commit()
return id_enf
In [28]:
def get_platId(plat_name):
cur = myConnection.cursor()
cur.execute( """select * from plataforma where nombre = "%s" """ % (plat_name) )
id_plat = None
for id_, nombre_plat in cur.fetchall() :
id_plat = id_
if not id_plat:
#print("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
cur.execute("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
cur.execute("SELECT LAST_INSERT_ID()")
id_plat = cur.fetchall()[0][0]
myConnection.commit()
return id_plat
In [29]:
def get_jourId(jour_name):
cur = myConnection.cursor()
cur.execute( """select * from journal where nombre = "%s" """ % (jour_name) )
id_jour = None
for id_, nombre_jour in cur.fetchall() :
id_jour = id_
if not id_jour:
#print("""insert into journal values (NULL, "%s" )""" % (jour_name))
cur.execute("""insert into journal values (NULL, "%s" )""" % (jour_name))
cur.execute("SELECT LAST_INSERT_ID()")
id_jour = cur.fetchall()[0][0]
myConnection.commit()
return id_jour
In [30]:
def get_estId(id_journal, id_enfermedad, est_name, tam_mues, replica):
cur = myConnection.cursor()
cur.execute( """select * from estudio where nombre = "%s" """ % (est_name))
id_est = None
for id_, id_journal, id_enfermedad, nombre_est, tam_mues, replica in cur.fetchall() :
id_est = id_
if not id_est:
#print("""insert into estudio values (NULL, "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_enfermedad, est_name, tam_mues, replica))
cur.execute("""insert into estudio values (NULL, "%s", "%s", "%s", "%s", "%s")""" % ( id_journal,id_enfermedad,est_name, tam_mues, replica))
cur.execute("SELECT LAST_INSERT_ID()")
id_est = cur.fetchall()[0][0]
myConnection.commit()
return id_est
In [31]:
def get_locId(id_plataforma, region, chrom, pos, genes_reportados, gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, CONTEXTO, risk_allele):
cur = myConnection.cursor()
cur.execute( """select * from loci where chrom = "%s" and pos="%s" """ % (chrom,pos))
id_loc = None
for id_,id_plataforma, region, chrom, pos, genes_reportados, gen_mapped, gen_upstream, gen_downstream, SNP_GENE_ID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, CONTEXTO, risk_allele in cur.fetchall() :
id_est = id_
if not id_loc:
#print("""insert into loci values (NULL, "%s","%s", "%s", "%s", "%s",
#"%s","%s", "%s", "%s", "%s",
#"%s","%s", "%s", "%s", "%s",
#"%s","%s")""" % (id_plataforma, region,chrom,pos,genes_reportados,
# gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID,
# UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE,
# STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT,
# CONTEXTO, risk_allele))
cur.execute("""insert into loci values (NULL, "%s","%s", "%s", "%s", "%s",
"%s","%s", "%s", "%s", "%s",
"%s","%s", "%s", "%s", "%s",
"%s","%s")""" % (id_plataforma, region,chrom,pos,genes_reportados,
gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID,
UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE,
STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT,
CONTEXTO, risk_allele))
cur.execute("SELECT LAST_INSERT_ID()")
id_loc = cur.fetchall()[0][0]
myConnection.commit()
return id_loc
In [32]:
def get_pubId(id_journal, id_estudio, id_pubmed, autor, fecha_pub, link):
cur = myConnection.cursor()
cur.execute( """select * from publicacion where id_pubmed = "%s" """ % (id_pubmed))
id_pub = None
for id_, id_journal, id_estudio, idpubmed, autor, fecha_pub, link in cur.fetchall() :
id_pub = id_
if not id_pub:
#print("""insert into publicacion values (NULL, "%s", "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_estudio, id_pubmed, autor, fecha_pub, link))
cur.execute("""insert into publicacion values (NULL, "%s", "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_estudio, id_pubmed, autor, fecha_pub, link))
cur.execute("SELECT LAST_INSERT_ID()")
id_pub = cur.fetchall()[0][0]
myConnection.commit()
return id_pub
In [33]:
def get_enfloId(loci_id, id_enfermedad,):
cur=myConnection.cursor()
cur.execute( """select * from enfermedad_loci where id_loci = "%s" and id_enfermedad = "%s" """ % (id_loci, id_enfermedad))
existe = None
for id_, loci_id, id_enfermedad in cur.fetchall() :
existe = True
if not existe:
#print("""insert into enfermedad_loci values ("%s","%s")""" % (loci_id, id_enfermedad))
cur.execute("""insert into enfermedad_loci values ("%s", "%s")""" % (loci_id, id_enfermedad))
myConnection.commit()
In [35]:
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'
myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )
cur=myConnection.cursor()
for index, row in df.iterrows():
#identidad enfermedad
dissease_name = row['DISEASE/TRAIT']
dissease_id = get_diseaseId(dissease_name)
#identidad plataforma
plat_name = row['PLATFORM [SNPS PASSING QC]']
plat_id = get_platId(plat_name)
#identidad loci
id_plataforma = plat_id
region=row['REGION']
chrom=row['CHR_ID']
pos=row['CHR_POS']
genes_reportados=row['REPORTED GENE(S)']
gen_mapped=row['MAPPED_GENE']
gen_upstream=row['UPSTREAM_GENE_ID']
gen_downstream=row['DOWNSTREAM_GENE_ID']
SNP_GENE_ID=row['SNP_GENE_IDS']
UPSTREAM_GENE_DISTANCE=row['UPSTREAM_GENE_DISTANCE']
DOWNSTREAM_GENE_DISTANCE=row['DOWNSTREAM_GENE_DISTANCE']
STRONGEST_SP_RISK=row['STRONGEST SNP-RISK ALLELE']
SNPS=row['SNPS']
MERGED=row['MERGED']
SNP_ID_CURRENT=row['SNP_ID_CURRENT']
CONTEXTO=row['CONTEXT']
risk_allele=row['RISK ALLELE FREQUENCY']
#PVAL=row['P-VALUE']
#Pvalue_MLOG=row['PVALUE_MLOG']
#PVALUE_txt=row['P-VALUE (TEXT)']
#BETA=row['OR or BETA']
#novCI= row['95% CI (TEXT)']
id_loci= get_locId(id_plataforma, region,
chrom,pos,genes_reportados,
gen_mapped, gen_upstream,gen_downstream,
SNP_GENE_ID, UPSTREAM_GENE_DISTANCE,
DOWNSTREAM_GENE_DISTANCE,
STRONGEST_SP_RISK, SNPS, MERGED,
SNP_ID_CURRENT, CONTEXTO, risk_allele)
#identidad loci enfermedad
id_enfermedad=dissease_id
loci_id=id_loci
get_enfloId(loci_id, id_enfermedad)
#identidad journal
jour_name = row['JOURNAL']
jour_id = get_jourId(jour_name)
#identidad estudio
id_enfermedad=dissease_id
id_journal=jour_id
est_name = row['STUDY']
tam_mues = row['INITIAL SAMPLE SIZE']
replica = row['REPLICATION SAMPLE SIZE']
est_id = get_estId(id_journal, id_enfermedad, est_name, tam_mues, replica)
#identidad publicacion
id_journal=jour_id
id_estudio=est_id
idpubmed= row['PUBMEDID']
autor= row['FIRST AUTHOR']
fecha_pub= row['DATE ADDED TO CATALOG']
link= row['LINK']
id_publicacion = get_pubId(id_journal, id_estudio, idpubmed, autor, fecha_pub, link)
myConnection.close()
In [70]:
#¿Cual es el numero de locus afectados para n enfermedades? con ejemplo para 4 enfermedades
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'
myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )
def enfermedades_info_1(nombres_enfermedades):
cur=myConnection.cursor()
cromosoma=set()
regionafec= set()
posi=set()
N=len(nombres_enfermedades)
lol = []
num=0
count=0
for i in range(len(nombres_enfermedades)):
cur.execute("""select * FROM enfermedad where nombre= "%s" """ % nombres_enfermedades[i])
for id_, nombre_enf in cur.fetchall() :
print ("***Enfermedad escogida: " + nombre_enf)
print ("- Codigo enfermedad: " )
print (id_)
num=0
cur.execute("""select * FROM enfermedad_loci where id_enfermedad= "%s" """ % id_)
for id_2, cod_enf in cur.fetchall() :
num=num + 1
print("- numero de locus afectados:")
print (num)
print()
print()
enfermedades=("Atopic dermatitis",
"Allergic rhinitis in asthma",
"Asthma", "Allergic rhinitis")
enfermedades_info_1(enfermedades)
myConnection.close()
In [74]:
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'
myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )
def enfermedades_info_2(nombres_enfermedades):
cur=myConnection.cursor()
cromosoma=set()
N= len(nombres_enfermedades)+1
lol = [[] for _ in range(N)]
lol[0].append("enfermedades")
lol[0].append("Codigo enfermedad")
lol[0].append("numero de locus afectados")
for i in range(len(nombres_enfermedades)):
cur.execute("""select * FROM enfermedad where nombre= "%s" """ % nombres_enfermedades[i])
for id_, nombre_enf in cur.fetchall() :
lol[i+1].append(nombre_enf)
lol[i+1].append(id_)
num=0
cur.execute("""select * FROM enfermedad_loci where id_enfermedad= "%s" """ % id_)
for id_2, cod_enf in cur.fetchall() :
num=num + 1
lol[i+1].append(num)
import csv
with open('enfermedades_c.csv', 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(lol)
return lol
enfermedades=("Asthma", "Allergic rhinitis",
"Allergic rhinitis in asthma",
"Atopic dermatitis")
enfermedades_info_2(enfermedades)
Out[74]: