Archivos y Bases de datos



In [1]:

    
import mysql.connector

La idea de este taller es manipular archivos (leerlos, parsearlos y escribirlos) y hacer lo mismo con bases de datos estructuradas.

Ejercicio 1

Baje el archivo de "All associations with added ontology annotations" del GWAS Catalog.

https://www.ebi.ac.uk/gwas/docs/file-downloads

Describa las columnas del archivo (que información estamos mirando? Para qué sirve? Por qué la hicieron?)



In [2]:

    
import pandas as pd
df= pd.read_csv('C:/Users/Alex/Documents/eafit/semestres/X semestre/programacion/taller2.tsv', sep = '\t')
df[:1]









    



C:\Users\Alex\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (12,23,27) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)






    Out[2]:






  
    
      
      DATE ADDED TO CATALOG
      PUBMEDID
      FIRST AUTHOR
      DATE
      JOURNAL
      LINK
      STUDY
      DISEASE/TRAIT
      INITIAL SAMPLE SIZE
      REPLICATION SAMPLE SIZE
      ...
      CONTEXT
      INTERGENIC
      RISK ALLELE FREQUENCY
      P-VALUE
      PVALUE_MLOG
      P-VALUE (TEXT)
      OR or BETA
      95% CI (TEXT)
      PLATFORM [SNPS PASSING QC]
      CNV
    
  
  
    
      0
      2009-09-28
      18403759
      Ober C
      2008-04-09
      N Engl J Med
      www.ncbi.nlm.nih.gov/pubmed/18403759
      Effect of variation in CHI3L1 on serum YKL-40 ...
      YKL-40 levels
      632 Hutterite individuals
      443 European ancestry cases, 491 European ance...
      ...
      upstream_gene_variant
      0.0
      0.29
      1e-13
      13.0
      NaN
      0.3
      [NR] ng/ml decrease
      Affymetrix [290325]
      N
    
  

1 rows × 34 columns

Qué Entidades (tablas) puede definir?

Enfermedad
Plataforma (tecnologia de secuenciacion)
Loci
Enfermedad-loci
Journal
Estudio
Publicacion

Cree la base de datos (copie el código SQL que se usó)



In [5]:

    
CREATE TABLE enfermedad
(
id_enfermedad int AUTO_INCREMENT PRIMARY KEY,
nombre varchar(255)
);

create table plataforma
(
id_plataforma int AUTO_INCREMENT primary key, 
nombre varchar(255)
);

CREATE TABLE loci
(
id_loci int AUTO_INCREMENT PRIMARY KEY,
id_plataforma int,
foreign key (id_plataforma) references plataforma(id_plataforma),
region varchar(255),
chrom varchar(255), 
pos varchar(255),
genes_reportados varchar(255),
gen_mapped varchar(255),
gen_upstream varchar(255),
gen_downstream varchar(255),
SNP_GENE_IDS varchar(255),
UPSTREAM_GENE_DISTANCE varchar(255),
DOWNSTREAM_GENE_DISTANCE varchar(255),
STRONGEST_SP_RISK varchar(255),
SNPS varchar(255),
MERGED varchar(255),
SNP_ID_CURRENT varchar(255),
CONTEXTO varchar(500),
risk_allele varchar(255)
#PVAl int,
#PVALUE_MLOG int,
#PVALUE_txt varchar(255),
#BETA int,  
#novCI varchar(255)
);

CREATE TABLE enfermedad_loci
(
id_loci int ,
id_enfermedad int,
PRIMARY KEY (id_loci, id_enfermedad),
foreign key (id_loci) references loci(id_loci),
foreign key (id_enfermedad) references enfermedad(id_enfermedad)
);

CREATE TABLE journal
(
id_journal int AUTO_INCREMENT PRIMARY KEY,
nombre varchar(255)
);

CREATE TABLE estudio
(
id_estudio int AUTO_INCREMENT PRIMARY KEY,
id_journal int,
id_enfermedad int,
foreign key (id_journal) references journal(id_journal),
foreign key (id_enfermedad) references enfermedad(id_enfermedad),
nombre varchar(255),
tamano_muestra varchar(500),
replicas varchar(500)
);

create table publicacion
(
id_publicacion int AUTO_INCREMENT PRIMARY KEY,
id_journal int,
id_estudio int,
foreign key (id_journal) references journal(id_journal),
foreign key (id_estudio) references estudio(id_estudio),
id_pubmed int,
autor varchar (255),
fecha_pub varchar (20),
link varchar (500)
);









    



  File "<ipython-input-5-3975823bd552>", line 1
    CREATE TABLE enfermedad
               ^
SyntaxError: invalid syntax

Ejercicio 2

Lea el archivo y guarde la infomación en la base de datos en las tablas que se definidieron en el Ejercicio 1.



In [6]:

    
df.head(1)









    Out[6]:






  
    
      
      DATE ADDED TO CATALOG
      PUBMEDID
      FIRST AUTHOR
      DATE
      JOURNAL
      LINK
      STUDY
      DISEASE/TRAIT
      INITIAL SAMPLE SIZE
      REPLICATION SAMPLE SIZE
      ...
      CONTEXT
      INTERGENIC
      RISK ALLELE FREQUENCY
      P-VALUE
      PVALUE_MLOG
      P-VALUE (TEXT)
      OR or BETA
      95% CI (TEXT)
      PLATFORM [SNPS PASSING QC]
      CNV
    
  
  
    
      0
      2009-09-28
      18403759
      Ober C
      2008-04-09
      N Engl J Med
      www.ncbi.nlm.nih.gov/pubmed/18403759
      Effect of variation in CHI3L1 on serum YKL-40 ...
      YKL-40 levels
      632 Hutterite individuals
      443 European ancestry cases, 491 European ance...
      ...
      upstream_gene_variant
      0.0
      0.29
      1e-13
      13.0
      NaN
      0.3
      [NR] ng/ml decrease
      Affymetrix [290325]
      N
    
  

1 rows × 34 columns



In [7]:

    
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

def doQuery( conn ) :
    cur = conn.cursor()

    cur.execute( "select * from enfermedad_loci" )

    for id_nombre, nombre_enf in cur.fetchall() :
        print (id_nombre, nombre_enf)

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
myConnection.close()



In [27]:

    
def get_diseaseId(disease_name):
    cur = myConnection.cursor()

    cur.execute( """select * from enfermedad where nombre = "%s" """ % (disease_name) )
    
    id_enf = None
    for id_, dissease in cur.fetchall() :
        id_enf = id_
        
    if not id_enf:
        #print("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
        cur.execute("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        id_enf = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_enf



In [28]:

    
def get_platId(plat_name):
    cur = myConnection.cursor()

    cur.execute( """select * from plataforma where nombre = "%s" """ % (plat_name) )
    
    id_plat = None
    for id_, nombre_plat in cur.fetchall() :
        id_plat = id_
        
    if not id_plat:
        #print("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
        cur.execute("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
                
        id_plat = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_plat



In [29]:

    
def get_jourId(jour_name):
    cur = myConnection.cursor()

    cur.execute( """select * from journal where nombre = "%s" """ % (jour_name) )
    
    id_jour = None
    for id_, nombre_jour in cur.fetchall() :
        id_jour = id_
        
    if not id_jour:
        #print("""insert into journal values (NULL, "%s" )""" % (jour_name))
        cur.execute("""insert into journal values (NULL, "%s" )""" % (jour_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
                
        id_jour = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_jour



In [30]:

    
def get_estId(id_journal, id_enfermedad, est_name, tam_mues, replica):
    cur = myConnection.cursor()

    cur.execute( """select * from estudio where nombre = "%s" """ % (est_name))
    
    id_est = None
    for id_, id_journal, id_enfermedad, nombre_est, tam_mues, replica in cur.fetchall() :
        id_est = id_
        
    if not id_est:
        #print("""insert into estudio values (NULL, "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_enfermedad,  est_name, tam_mues, replica))
        cur.execute("""insert into estudio values (NULL, "%s", "%s", "%s", "%s", "%s")""" % ( id_journal,id_enfermedad,est_name, tam_mues, replica))
        cur.execute("SELECT LAST_INSERT_ID()")
    
               
        id_est = cur.fetchall()[0][0]
    
        myConnection.commit()
        
    return id_est



In [31]:

    
def get_locId(id_plataforma, region, chrom, pos, genes_reportados, gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, CONTEXTO, risk_allele):
    cur = myConnection.cursor()

    cur.execute( """select * from loci where chrom = "%s" and pos="%s" """ % (chrom,pos))
    
    id_loc = None
    for id_,id_plataforma, region, chrom, pos, genes_reportados, gen_mapped, gen_upstream, gen_downstream, SNP_GENE_ID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, CONTEXTO, risk_allele in cur.fetchall() :
        id_est = id_
        
    if not id_loc:
        #print("""insert into loci values (NULL, "%s","%s", "%s", "%s", "%s",
        #"%s","%s", "%s", "%s", "%s",
        #"%s","%s", "%s", "%s", "%s",
        #"%s","%s")""" % (id_plataforma, region,chrom,pos,genes_reportados,
        #                       gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID, 
        #                        UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, 
        #                        STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, 
        #                        CONTEXTO, risk_allele))
        
        cur.execute("""insert into loci values (NULL, "%s","%s", "%s", "%s", "%s",
        "%s","%s", "%s", "%s", "%s",
        "%s","%s", "%s", "%s", "%s",
        "%s","%s")""" % (id_plataforma, region,chrom,pos,genes_reportados,
                                gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID, 
                                UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, 
                                STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, 
                                CONTEXTO, risk_allele))
        cur.execute("SELECT LAST_INSERT_ID()")
    
               
        id_loc = cur.fetchall()[0][0]
    
        myConnection.commit()
        
    return id_loc



In [32]:

    
def get_pubId(id_journal, id_estudio, id_pubmed, autor, fecha_pub, link):
    cur = myConnection.cursor()

    cur.execute( """select * from publicacion where id_pubmed = "%s" """ % (id_pubmed))
    
    id_pub = None
    for id_, id_journal, id_estudio, idpubmed, autor, fecha_pub, link in cur.fetchall() :
        id_pub = id_
        
    if not id_pub:
        #print("""insert into publicacion values (NULL, "%s", "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_estudio, id_pubmed, autor, fecha_pub, link))
        cur.execute("""insert into publicacion values (NULL, "%s", "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_estudio, id_pubmed, autor, fecha_pub, link))
        cur.execute("SELECT LAST_INSERT_ID()")
    
               
        id_pub = cur.fetchall()[0][0]
    
        myConnection.commit()
        
    return id_pub



In [33]:

    
def get_enfloId(loci_id, id_enfermedad,):
    cur=myConnection.cursor()
    
    cur.execute( """select * from enfermedad_loci where id_loci = "%s" and id_enfermedad = "%s" """ % (id_loci, id_enfermedad))
    
    existe = None
    for id_, loci_id, id_enfermedad in cur.fetchall() :
        existe = True
        
    if not existe:
        #print("""insert into enfermedad_loci values ("%s","%s")""" % (loci_id, id_enfermedad))
        cur.execute("""insert into enfermedad_loci values ("%s", "%s")""" % (loci_id, id_enfermedad))
        myConnection.commit()



In [35]:

    
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )
cur=myConnection.cursor()

for index, row in df.iterrows():
    #identidad enfermedad
    dissease_name = row['DISEASE/TRAIT']
    dissease_id = get_diseaseId(dissease_name)
        
    #identidad plataforma
    plat_name = row['PLATFORM [SNPS PASSING QC]']
    plat_id = get_platId(plat_name)
    
    #identidad loci
    id_plataforma = plat_id
    region=row['REGION']
    chrom=row['CHR_ID']
    pos=row['CHR_POS']
    genes_reportados=row['REPORTED GENE(S)']
    gen_mapped=row['MAPPED_GENE']
    gen_upstream=row['UPSTREAM_GENE_ID']
    gen_downstream=row['DOWNSTREAM_GENE_ID']
    SNP_GENE_ID=row['SNP_GENE_IDS']
    UPSTREAM_GENE_DISTANCE=row['UPSTREAM_GENE_DISTANCE']
    DOWNSTREAM_GENE_DISTANCE=row['DOWNSTREAM_GENE_DISTANCE']
    STRONGEST_SP_RISK=row['STRONGEST SNP-RISK ALLELE']
    SNPS=row['SNPS']
    MERGED=row['MERGED']
    SNP_ID_CURRENT=row['SNP_ID_CURRENT']
    CONTEXTO=row['CONTEXT']
    risk_allele=row['RISK ALLELE FREQUENCY']
    #PVAL=row['P-VALUE']
    #Pvalue_MLOG=row['PVALUE_MLOG']
    #PVALUE_txt=row['P-VALUE (TEXT)']
    #BETA=row['OR or BETA']
    #novCI= row['95% CI (TEXT)']
    id_loci= get_locId(id_plataforma, region,
                       chrom,pos,genes_reportados, 
                       gen_mapped, gen_upstream,gen_downstream,
                       SNP_GENE_ID, UPSTREAM_GENE_DISTANCE,
                       DOWNSTREAM_GENE_DISTANCE,
                       STRONGEST_SP_RISK, SNPS, MERGED, 
                        SNP_ID_CURRENT, CONTEXTO, risk_allele)
   
    #identidad loci enfermedad
    id_enfermedad=dissease_id 
    loci_id=id_loci 
    get_enfloId(loci_id, id_enfermedad)
    
    #identidad journal
    jour_name = row['JOURNAL']
    jour_id = get_jourId(jour_name)
    
    #identidad estudio
    id_enfermedad=dissease_id
    id_journal=jour_id
    est_name =  row['STUDY']
    tam_mues = row['INITIAL SAMPLE SIZE']
    replica = row['REPLICATION SAMPLE SIZE']
    est_id = get_estId(id_journal, id_enfermedad, est_name, tam_mues, replica)
    
    #identidad publicacion
    id_journal=jour_id
    id_estudio=est_id
    idpubmed= row['PUBMEDID']
    autor= row['FIRST AUTHOR']
    fecha_pub= row['DATE ADDED TO CATALOG']
    link= row['LINK']
    id_publicacion = get_pubId(id_journal, id_estudio, idpubmed, autor, fecha_pub, link)
    
                  
myConnection.close()









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-35-bf2a2d008b78> in <module>()
     50     id_enfermedad=dissease_id
     51     loci_id=id_loci
---> 52     get_enfloId(loci_id, id_enfermedad)
     53 
     54     #identidad journal

<ipython-input-33-ec50b5916f8e> in get_enfloId(loci_id, id_enfermedad)
     11         #print("""insert into enfermedad_loci values ("%s","%s")""" % (loci_id, id_enfermedad))
     12         cur.execute("""insert into enfermedad_loci values ("%s", "%s")""" % (loci_id, id_enfermedad))
---> 13         myConnection.commit()

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in commit(self)
   1476     def commit(self):
   1477         """Commit current transaction"""
-> 1478         self._execute_query("COMMIT")
   1479 
   1480     def rollback(self):

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in _execute_query(self, query)
   1497             raise errors.InternalError("Unread result found.")
   1498 
-> 1499         self.cmd_query(query)
   1500 
   1501     def _info_query(self, query):

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in cmd_query(self, query)
    720         if not isinstance(query, bytes):
    721             query = query.encode('utf-8')
--> 722         result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
    723 
    724         if self._have_next_result:

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in _send_cmd(self, command, argument, packet_number, packet, expect_response)
    508         if not expect_response:
    509             return None
--> 510         return self._socket.recv()
    511 
    512     def _send_data(self, data_file, send_empty_packet=False):

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\network.py in recv_plain(self)
    224             packet_len = 0
    225             while packet_len < 4:
--> 226                 chunk = self.sock.recv(4 - packet_len)
    227                 if not chunk:
    228                     raise errors.InterfaceError(errno=2013)

KeyboardInterrupt:

Ejercicio 3

Realize de la base de datos una consulta que le responda una pregunta biológica (e.g. qué genes estan relacionados con cuales enfermedades)



In [70]:

    
#¿Cual es el numero de locus afectados para n enfermedades? con ejemplo para 4 enfermedades

hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )


def enfermedades_info_1(nombres_enfermedades):
    cur=myConnection.cursor()
    cromosoma=set()
    regionafec= set()
    posi=set()
    N=len(nombres_enfermedades)
    lol = []
    num=0
    count=0
        
    for i in range(len(nombres_enfermedades)):
        cur.execute("""select * FROM enfermedad where nombre= "%s" """ % nombres_enfermedades[i])
        for id_, nombre_enf in cur.fetchall() :
            print ("***Enfermedad escogida: " + nombre_enf)
            print ("- Codigo enfermedad: " )
            print (id_)
    
        num=0
        cur.execute("""select * FROM enfermedad_loci where id_enfermedad= "%s" """  % id_)
        for id_2, cod_enf in cur.fetchall() :
            num=num + 1
            
        
                   
                    
                                
          
        print("- numero de locus afectados:")
        print (num)
        print()
        print()
   
    
enfermedades=("Atopic dermatitis",
              "Allergic rhinitis in asthma",
              "Asthma", "Allergic rhinitis")
enfermedades_info_1(enfermedades)
myConnection.close()









    



***Enfermedad escogida: Atopic dermatitis
- Codigo enfermedad: 
173
- numero de locus afectados:
4


***Enfermedad escogida: Allergic rhinitis in asthma
- Codigo enfermedad: 
719
- numero de locus afectados:
1


***Enfermedad escogida: Asthma
- Codigo enfermedad: 
73
- numero de locus afectados:
82


***Enfermedad escogida: Allergic rhinitis
- Codigo enfermedad: 
290
- numero de locus afectados:
19

Ejercicio 4

Guarde el resultado de la consulta anterior en un archivo csv



In [74]:

    
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )

def enfermedades_info_2(nombres_enfermedades):
    cur=myConnection.cursor()
    cromosoma=set()
    N= len(nombres_enfermedades)+1
    lol = [[] for _ in range(N)]
    lol[0].append("enfermedades")
    lol[0].append("Codigo enfermedad") 
    lol[0].append("numero de locus afectados") 
    for i in range(len(nombres_enfermedades)):
        cur.execute("""select * FROM enfermedad where nombre= "%s" """ % nombres_enfermedades[i])
        for id_, nombre_enf in cur.fetchall() :
                lol[i+1].append(nombre_enf)
                lol[i+1].append(id_)
        
        num=0
        cur.execute("""select * FROM enfermedad_loci where id_enfermedad= "%s" """  % id_)
        for id_2, cod_enf in cur.fetchall() :
            num=num + 1
                
              
        lol[i+1].append(num)
        
       
       
        import csv
        with open('enfermedades_c.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(lol)
                  
       
    return lol

enfermedades=("Asthma", "Allergic rhinitis", 
              "Allergic rhinitis in asthma",
             "Atopic dermatitis")
enfermedades_info_2(enfermedades)









    Out[74]:





[['enfermedades', 'Codigo enfermedad', 'numero de locus afectados'],
 ['Asthma', 73, 82],
 ['Allergic rhinitis', 290, 19],
 ['Allergic rhinitis in asthma', 719, 1],
 ['Atopic dermatitis', 173, 4]]