Esta hoja muestra cómo acceder a bases de datos SQL y también a conectar la salida con Jupyter. Las partes en SQL también se pueden realizar directamente en MySQL ejecutando el programa mysql
del contenedor.
Igual que en la práctica de introducción, los contenedores se pueden lanzar automáticamente usando docker-compose
desde el directorio bdge/sql
una vez bajado el repositorio Git de las prácticas:
$ git clone https://github.com/dsevilla/bdge.git
$ cd bdge/sql
$ docker-compose up
Dentro del Notebook, la base de datos está disponible en el host con nombre mysql
.
Instalación de los paquetes Python necesarios:
In [ ]:
!pip install pymysql ipython-sql
In [ ]:
%load_ext sql
#%config SqlMagic.feedback = False # Evitar que muestre el número de filas
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')
In [ ]:
%%sql
mysql+pymysql://root:root@mysql/?charset=utf8mb4&local_infile=1
1
: AcceptedByOriginator2
: UpMod3
: DownMod4
: Offensive5
: Favorite - if VoteTypeId = 5 UserId will be populated6
: Close7
: Reopen8
: BountyStart9
: BountyClose10
: Deletion11
: Undeletion12
: Spam13
: InformModeratorDescargar todos los CSVs:
In [ ]:
import os
import os.path as path
from urllib.request import urlretrieve
def download_csv_upper_dir(baseurl, filename):
file = path.abspath(path.join(os.getcwd(),os.pardir,filename))
if not os.path.isfile(file):
urlretrieve(baseurl + '/' + filename, file)
baseurl = 'http://neuromancer.inf.um.es:8080/es.stackoverflow/'
download_csv_upper_dir(baseurl, 'Posts.csv')
download_csv_upper_dir(baseurl, 'Users.csv')
download_csv_upper_dir(baseurl, 'Tags.csv')
download_csv_upper_dir(baseurl, 'Comments.csv')
download_csv_upper_dir(baseurl, 'Votes.csv')
In [ ]:
%%sql
DROP SCHEMA IF EXISTS stackoverflow;
CREATE SCHEMA stackoverflow CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
In [ ]:
%%sql
USE stackoverflow;
Se tiene que habilitar esto para que se permita importar CSVs.
In [ ]:
%%sql
SET GLOBAL local_infile = true;
In [ ]:
%%sql
DROP TABLE IF EXISTS Posts;
CREATE TABLE Posts (
Id INT,
AcceptedAnswerId INT NULL DEFAULT NULL,
AnswerCount INT DEFAULT 0,
Body TEXT,
ClosedDate DATETIME(6) NULL DEFAULT NULL,
CommentCount INT DEFAULT 0,
CommunityOwnedDate DATETIME(6) NULL DEFAULT NULL,
CreationDate DATETIME(6) NULL DEFAULT NULL,
FavoriteCount INT DEFAULT 0,
LastActivityDate DATETIME(6) NULL DEFAULT NULL,
LastEditDate DATETIME(6) NULL DEFAULT NULL,
LastEditorDisplayName TEXT,
LastEditorUserId INT NULL DEFAULT NULL,
OwnerDisplayName TEXT,
OwnerUserId INT NULL DEFAULT NULL,
ParentId INT NULL DEFAULT NULL,
PostTypeId INT, -- 1 = Question, 2 = Answer
Score INT DEFAULT 0,
Tags TEXT,
Title TEXT,
ViewCount INT DEFAULT 0,
PRIMARY KEY(Id)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
In [ ]:
%%sql
LOAD DATA LOCAL INFILE "../Posts.csv" INTO TABLE Posts
CHARACTER SET utf8mb4
COLUMNS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
ESCAPED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 LINES
(Id,
@AcceptedAnswerId,
@AnswerCount,
Body,
@ClosedDate,
@CommentCount,
@CommunityOwnedDate,
CreationDate,
@FavoriteCount,
@LastActivityDate,
@LastEditDate,
LastEditorDisplayName,
@LastEditorUserId,
OwnerDisplayName,
@OwnerUserId,
@ParentId,
PostTypeId,
Score,
Tags,
Title,
@ViewCount)
SET ParentId = nullif (@ParentId, ''),
ClosedDate = nullif(@ClosedDate, ''),
LastEditorUserId = nullif(@OLastEditorUserId, ''),
LastActivityDate = nullif(@LastActivityDate, ''),
LastEditDate = nullif(@LastEditDate, ''),
AcceptedAnswerId = nullif (@AcceptedAnswerId, ''),
OwnerUserId = nullif(@OwnerUserId, ''),
LastEditorUserId = nullif(@LastEditorUserId, ''),
CommunityOwnedDate = nullif(@CommunityOwnedDate, ''),
FavoriteCount = if(@FavoriteCount = '',0,@FavoriteCount),
CommentCount = if(@CommentCount = '',0,@CommentCount),
ViewCount = if(@ViewCount = '',0,@ViewCount),
AnswerCount = if(@AnswerCount = '',0,@AnswerCount)
;
In [ ]:
%%sql
select count(*) from Posts;
In [ ]:
%%sql
select Id,Title,CreationDate from Posts LIMIT 2;
In [ ]:
%%sql
DROP TABLE IF EXISTS Users;
CREATE TABLE Users (
Id INT,
AboutMe TEXT,
AccountId INT,
Age INT NULL DEFAULT NULL,
CreationDate DATETIME(6) NULL DEFAULT NULL,
DisplayName TEXT,
DownVotes INT DEFAULT 0,
LastAccessDate DATETIME(6) NULL DEFAULT NULL,
Location TEXT,
ProfileImageUrl TEXT,
Reputation INT DEFAULT 0,
UpVotes INT DEFAULT 0,
Views INT DEFAULT 0,
WebsiteUrl TEXT,
PRIMARY KEY(Id)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
In [ ]:
%%sql
LOAD DATA LOCAL INFILE "../Users.csv" INTO TABLE Users
CHARACTER SET utf8mb4
COLUMNS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
ESCAPED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 LINES
(Id,AboutMe,@AccountId,@Age,@CreationDate,DisplayName,DownVotes,LastAccessDate,Location,ProfileImageUrl,
Reputation,UpVotes,Views,WebsiteUrl)
SET LastAccessDate = nullif(@LastAccessDate,''),
Age = nullif(@Age, ''),
CreationDate = nullif(@CreationDate,''),
AccountId = nullif(@AccountId, '')
;
In [ ]:
%%sql
select count(*) from Users;
In [ ]:
%%sql
DROP TABLE IF EXISTS Tags;
CREATE TABLE Tags (
Id INT,
Count INT DEFAULT 0,
ExcerptPostId INT NULL DEFAULT NULL,
TagName TEXT,
WikiPostId INT NULL DEFAULT NULL,
PRIMARY KEY(Id)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
In [ ]:
%%sql
LOAD DATA LOCAL INFILE "../Tags.csv" INTO TABLE Tags
CHARACTER SET utf8mb4
COLUMNS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
ESCAPED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 LINES
(Id,Count,@ExcerptPostId,TagName,@WikiPostId)
SET WikiPostId = nullif(@WikiPostId, ''),
ExcerptPostId = nullif(@ExcerptPostId, '')
;
In [ ]:
%%sql
DROP TABLE IF EXISTS Comments;
CREATE TABLE Comments (
Id INT,
CreationDate DATETIME(6) NULL DEFAULT NULL,
PostId INT NULL DEFAULT NULL,
Score INT DEFAULT 0,
Text TEXT,
UserDisplayName TEXT,
UserId INT NULL DEFAULT NULL,
PRIMARY KEY(Id)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
In [ ]:
%%sql
LOAD DATA LOCAL INFILE "../Comments.csv" INTO TABLE Comments
CHARACTER SET utf8mb4
COLUMNS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
ESCAPED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 LINES
(Id,@CreationDate,@PostId,Score,Text,@UserDisplayName,@UserId)
SET UserId = nullif(@UserId, ''),
PostId = nullif(@PostId, ''),
CreationDate = nullif(@CreationDate,''),
UserDisplayName = nullif(@UserDisplayName,'')
;
In [ ]:
%%sql
SELECT Count(*) FROM Comments;
In [ ]:
%%sql
DROP TABLE IF EXISTS Votes;
CREATE TABLE Votes (
Id INT,
BountyAmount INT DEFAULT 0,
CreationDate DATETIME(6) NULL DEFAULT NULL,
PostId INT NULL DEFAULT NULL,
UserId INT NULL DEFAULT NULL,
VoteTypeId INT,
PRIMARY KEY(Id)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
In [ ]:
%%sql
LOAD DATA LOCAL INFILE "../Votes.csv" INTO TABLE Votes
CHARACTER SET utf8mb4
COLUMNS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
ESCAPED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 LINES
(Id,@BountyAmount,@CreationDate,@PostId,@UserId,VoteTypeId)
SET UserId = nullif(@UserId, ''),
PostId = nullif(@PostId, ''),
BountyAmount = if(@BountyAmount = '',0,@BountyAmount),
CreationDate = nullif(@CreationDate, '')
;
In [ ]:
%%sql
ALTER TABLE Posts ADD FOREIGN KEY (ParentId) REFERENCES Posts(Id);
ALTER TABLE Posts ADD FOREIGN KEY (OwnerUserId) REFERENCES Users(Id);
ALTER TABLE Posts ADD FOREIGN KEY (LastEditorUserId) REFERENCES Users(Id);
ALTER TABLE Posts ADD FOREIGN KEY (AcceptedAnswerId) REFERENCES Posts(Id);
In [ ]:
%%sql
ALTER TABLE Tags ADD FOREIGN KEY (WikiPostId) REFERENCES Posts(Id);
ALTER TABLE Tags ADD FOREIGN KEY (ExcerptPostId) REFERENCES Posts(Id);
In [ ]:
%%sql
ALTER TABLE Comments ADD FOREIGN KEY (PostId) REFERENCES Posts(Id);
ALTER TABLE Comments ADD FOREIGN KEY (UserId) REFERENCES Users(Id);
In [ ]:
%%sql
ALTER TABLE Votes ADD FOREIGN KEY (PostId) REFERENCES Posts(Id);
ALTER TABLE Votes ADD FOREIGN KEY (UserId) REFERENCES Users(Id);
In [ ]:
%%sql
EXPLAIN
SELECT Y.PostId,Y.Present
FROM (SELECT v.PostId AS PostId, COALESCE(p.Id,CONCAT('No: ', v.PostId)) AS Present
FROM Votes v LEFT JOIN Posts p ON v.PostId = p.Id) AS Y
WHERE Y.Present LIKE 'No%';
In [ ]:
%%sql
EXPLAIN SELECT PostId from Votes WHERE PostId NOT IN (select Id from Posts);
In [ ]:
%%sql
select * from Votes LIMIT 20;
In [ ]:
%%sql
SELECT Y.Id, Y.PostId, Y.Present
FROM (SELECT v.PostId AS PostId, v.Id AS Id, p.Id AS Pid, COALESCE(p.Id, CONCAT('No: ', v.PostId)) AS Present
FROM Votes v LEFT JOIN Posts p ON v.PostId = p.Id) AS Y
WHERE Y.Pid IS NULL
LIMIT 1000
In [ ]:
%%sql
-- DELETE FROM Votes WHERE ...;
In [ ]:
%%sql
-- Y ahora sí
ALTER TABLE Votes ADD FOREIGN KEY (PostId) REFERENCES Posts(Id);
ALTER TABLE Votes ADD FOREIGN KEY (UserId) REFERENCES Users(Id);
In [ ]:
%sql use stackoverflow
In [ ]:
%%sql
SHOW TABLES;
In [ ]:
%%sql
DESCRIBE Posts;
In [ ]:
top_tags = %sql SELECT Id, TagName, Count FROM Tags ORDER BY Count DESC LIMIT 40;
¡¡Los resultados de %sql
se pueden convertir a un DataFrame
!!
In [ ]:
top_tags_df = top_tags.DataFrame()
In [ ]:
# invert_y_axis() hace que el más usado aparezca primero. Por defecto es al revés.
top_tags_df.plot(kind='barh',x='TagName', y='Count', figsize=(14,14*2/3)).invert_yaxis()
In [ ]:
top_tags
In [ ]:
%%sql
select Id,TagName,Count from Tags WHERE Count > 5 ORDER BY Count ASC LIMIT 40;
Voy a hacer unas consultas para comparar la eficiencia con HBase. Calcularé el tamaño medio del texto de los comentarios de un post en particular (he seleccionado el 7251, que es el que más tiene comentarios, 32). Hago el cálculo en local porque aunque existe la función AVG
de SQL, es posible que la función que tuviéramos que calcular no la tuviera la base de datos, con lo que tenemos que obtener todos los datos y calcularla en local. Eso también nos dará una idea de la eficiencia de recuperación de la base de datos.
In [ ]:
%%sql
SELECT p.Id, MAX(p.CommentCount) AS c FROM Posts p GROUP BY p.Id ORDER BY c DESC LIMIT 1;
In [ ]:
%sql SELECT AVG(CHAR_LENGTH(Text)) from Comments WHERE PostId = 7251;
In [ ]:
from functools import reduce
def doit():
q = %sql select Text from Comments WHERE PostId = 7251;
(s,n) = reduce(lambda res, e: (res[0]+len(e[0]), res[1]+1), q, (0,0))
return (s/n)
%timeit doit()
In [ ]:
%%sql
-- Preguntas con más respuestas (20 primeras)
In [ ]:
%%sql
select Title from Posts where Id = 5;
In [ ]:
# Calcular la suma de posts cada Tag de manera eficiente
import re
# Obtener los datos iniciales de los Tags
results = %sql SELECT Id, Tags FROM Posts where Tags IS NOT NULL;
tagcount = {}
for result in results:
# Inserta las tags en la tabla Tag
tags = re.findall('<(.*?)>', result[1])
for tag in tags:
tagcount[tag] = tagcount.get(tag,0) + 1;
# Comprobar que son iguales las cuentas
for k in tagcount:
res = %sql select TagName,SUM(Count) from Tags WHERE TagName = :k GROUP BY TagName;
if tagcount[k] != res[0][1]:
print("Tag %s NO coincide (%d)!!" % (k, res[0][1]))
In [ ]:
tagcount
In [ ]:
df = pd.DataFrame({'count' : pd.Series(list(tagcount.values()),
index=list(tagcount.keys()))})
df
In [ ]:
sort_df = df.sort_values(by='count',ascending=False)
In [ ]:
sort_df
In [ ]:
sort_df[:100].plot(kind='bar',figsize=(20,20*2/3))
In [ ]:
sort_df[-100:].plot(kind='bar',figsize=(20,20*2/3))