Archivos y Bases de datos



In [1]:

    
import mysql.connector

La idea de este taller es manipular archivos (leerlos, parsearlos y escribirlos) y hacer lo mismo con bases de datos estructuradas.

Ejercicio 1

Baje el archivo de "All associations with added ontology annotations" del GWAS Catalog.

https://www.ebi.ac.uk/gwas/docs/file-downloads

Describa las columnas del archivo (que información estamos mirando? Para qué sirve? Por qué la hicieron?)



In [2]:

    
import pandas as pd
df= pd.read_csv('C:/Users/Alex/Documents/eafit/semestres/X semestre/programacion/taller2.tsv', sep = '\t')
df[:1]









    



C:\Users\Alex\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (12,23,27) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)






    Out[2]:






  
    
      
      DATE ADDED TO CATALOG
      PUBMEDID
      FIRST AUTHOR
      DATE
      JOURNAL
      LINK
      STUDY
      DISEASE/TRAIT
      INITIAL SAMPLE SIZE
      REPLICATION SAMPLE SIZE
      ...
      CONTEXT
      INTERGENIC
      RISK ALLELE FREQUENCY
      P-VALUE
      PVALUE_MLOG
      P-VALUE (TEXT)
      OR or BETA
      95% CI (TEXT)
      PLATFORM [SNPS PASSING QC]
      CNV
    
  
  
    
      0
      2009-09-28
      18403759
      Ober C
      2008-04-09
      N Engl J Med
      www.ncbi.nlm.nih.gov/pubmed/18403759
      Effect of variation in CHI3L1 on serum YKL-40 ...
      YKL-40 levels
      632 Hutterite individuals
      443 European ancestry cases, 491 European ance...
      ...
      upstream_gene_variant
      0.0
      0.29
      1e-13
      13.0
      NaN
      0.3
      [NR] ng/ml decrease
      Affymetrix [290325]
      N
    
  

1 rows × 34 columns

Qué Entidades (tablas) puede definir?

Enfermedad
Plataforma (tecnologia de secuenciacion)
Loci
Enfermedad-loci
Journal
Estudio
Publicacion

Cree la base de datos (copie el código SQL que se usó)



In [5]:

    
CREATE TABLE enfermedad
(
id_enfermedad int PRIMARY KEY,
nombre varchar(255)
);

create table plataforma
(
id_plataforma int primary key, 
nombre varchar(255)
);

CREATE TABLE loci
(
id_loci int NOT NULL PRIMARY KEY,
region varchar(255),
chrom varchar(255), 
pos int,
genes_reportados int,
gen_mapped varchar(255),
gen_upstream int,
gen_downstream int,
SNP_GENE_IDS int,
UPSTREAM_GENE_DISTANCE int,
DOWNSTREAM_GENE_DISTANCE int,
STRONGEST_SP_RISK varchar(255),
SNPS varchar(255),
MERGED int,
SNP_ID_CURRENT varchar(255),
CONTEXTO varchar(255),
risk_allele varchar(255),
PVAl int,
PVALUE_MLOG int,
PVALUE_txt varchar(255),
BETA int,  
novCI varchar(255),
id_plataforma int,
foreign key (id_plataforma) references plataforma(id_plataforma)
);

CREATE TABLE enfermedad_loci
(
id_enfermedad int,
id_loci int,
PRIMARY KEY (id_enfermedad, id_loci),
foreign key (id_enfermedad) references enfermedad(id_enfermedad),
foreign key (id_loci) references loci(id_loci)
);

CREATE TABLE journal
(
id_journal int primary key,
nombre varchar(255)
);

create table publicacion
(
id_publicacion int,
id_pubmed int,
autor varchar (255),
fecha_pub varchar (20),
link varchar (255),
id_journal int,
id_estudio int,
foreign key (id_journal) references journal(id_journal),
foreign key (id_estudio) references estudio(id_estudio)
);

CREATE TABLE estudio
(
nombre varchar(255),
id_estudio int primary key,
id_enfermedad int,
id_publicacion int,
foreign key (id_publicacion) references publicacion(id_publicacion),
foreign key (id_enfermedad) references enfermedad(id_enfermedad),
tamano_muestra int,
replicas int
);









    



  File "<ipython-input-5-d1b77f55e9ba>", line 1
    CREATE TABLE enfermedad
               ^
SyntaxError: invalid syntax

Ejercicio 2

Lea el archivo y guarde la infomación en la base de datos en las tablas que se definidieron en el Ejercicio 1.



In [6]:

    
df.head(1)









    Out[6]:






  
    
      
      DATE ADDED TO CATALOG
      PUBMEDID
      FIRST AUTHOR
      DATE
      JOURNAL
      LINK
      STUDY
      DISEASE/TRAIT
      INITIAL SAMPLE SIZE
      REPLICATION SAMPLE SIZE
      ...
      CONTEXT
      INTERGENIC
      RISK ALLELE FREQUENCY
      P-VALUE
      PVALUE_MLOG
      P-VALUE (TEXT)
      OR or BETA
      95% CI (TEXT)
      PLATFORM [SNPS PASSING QC]
      CNV
    
  
  
    
      0
      2009-09-28
      18403759
      Ober C
      2008-04-09
      N Engl J Med
      www.ncbi.nlm.nih.gov/pubmed/18403759
      Effect of variation in CHI3L1 on serum YKL-40 ...
      YKL-40 levels
      632 Hutterite individuals
      443 European ancestry cases, 491 European ance...
      ...
      upstream_gene_variant
      0.0
      0.29
      1e-13
      13.0
      NaN
      0.3
      [NR] ng/ml decrease
      Affymetrix [290325]
      N
    
  

1 rows × 34 columns



In [3]:

    
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

def doQuery( conn ) :
    cur = conn.cursor()

    cur.execute( "select * from enfermedad" )

    for id_nombre, nombre_enf in cur.fetchall() :
        print (id_nombre, nombre_enf)

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
myConnection.close()









    



1 YKL-40 levels
2 Psoriasis
3 Lung cancer



In [7]:

    
def get_diseaseId(disease_name):
    cur = myConnection.cursor()

    cur.execute( """select * from enfermedad where nombre = "%s" """ % (disease_name) )
    
    id_enf = None
    for id_, nombre_enf in cur.fetchall() :
        id_enf = id_
        
    if not id_enf:
        cur.execute("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        
        
        id_enf = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_enf



In [47]:

    
def get_platId(plat_name):
    cur = myConnection.cursor()

    cur.execute( """select * from plataforma where nombre = "%s" """ % (plat_name) )
    
    id_plat = None
    for id_, nombre_plat in cur.fetchall() :
        id_plat = id_
        
    if not id_plat:
        print("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
        cur.execute("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
                
        id_plat = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_plat

for index, row in df.iterrows():
    plat_name = row['PLATFORM [SNPS PASSING QC]']
    plat_id = get_platId(plat_name)









    



---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
<ipython-input-47-7fa23b42eb96> in <module>()
     22 for index, row in df.iterrows():
     23     plat_name = row['PLATFORM [SNPS PASSING QC]']
---> 24     plat_id = get_platId(plat_name)

<ipython-input-47-7fa23b42eb96> in get_platId(plat_name)
      1 def get_platId(plat_name):
----> 2     cur = myConnection.cursor()
      3 
      4     cur.execute( """select * from plataforma where nombre = "%s" """ % (plat_name) )
      5 

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in cursor(self, buffered, raw, prepared, cursor_class, dictionary, named_tuple)
   1381             raise errors.InternalError("Unread result found.")
   1382         if not self.is_connected():
-> 1383             raise errors.OperationalError("MySQL Connection not available.")
   1384         if cursor_class is not None:
   1385             if not issubclass(cursor_class, CursorBase):

OperationalError: MySQL Connection not available.



In [8]:

    
def get_lociId(loci_name):
    cur = myConnection.cursor()

    cur.execute( """select * from loci where nombre = "%s" """ % (disease_name) )
    
    id_loci = None
    for id_, nombre_enf in cur.fetchall() :
        id_loci = id_
        
    if not id_loci:
        print("""insert into enfermedad values (NULL, "%s", )""" % (disease_name))
        cur.execute("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        
        
        id_enf = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_enf



In [9]:

    
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )

for index, row in df.iterrows():
    dis_name = row['DISEASE/TRAIT']
    dissease_id = get_diseaseId(dis_name)


print()

myConnection.close()

Ejercicio 3

Realize de la base de datos una consulta que le responda una pregunta biológica (e.g. qué genes estan relacionados con cuales enfermedades)



In [ ]:

    
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html

Ejercicio 4

Guarde el resultado de la consulta anterior en un archivo csv



In [ ]: