In [1]:
# 유저 정보와 주소 정보를 한꺼번에 가져오기
# ( 단, 현재 유저 정보는 customer table에, 주소 정보는 address table 에 저장되어 있는 상태다 )
In [2]:
import pandas as pd
import MySQLdb
db = MySQLdb.connect(
"db.fastcamp.us", # DATABASE_HOST
"root", # DATABASE_USERNAME
"dkstncks", # DATABASE_PASSWORD
"sakila", # DATABASE_NAME
charset='utf8',
)
cursor = db.cursor()
In [3]:
# 일단 그냥 한번 불러와보자. 어떤 일이 생기는지 살펴볼 필요가 있다.
SQL_QUERY = """
SELECT COUNT(*) FROM customer;
"""
pd.read_sql(SQL_QUERY, db)
Out[3]:
In [4]:
SQL_QUERY = """
SELECT COUNT(*) FROM address;
"""
pd.read_sql(SQL_QUERY, db)
Out[4]:
In [5]:
SQL_QUERY = """
SELECT COUNT(*) FROM customer, address;
"""
pd.read_sql(SQL_QUERY, db)
Out[5]:
In [6]:
# 599 * 603 => 361197
In [7]:
SQL_QUERY = """
SELECT * FROM customer, address;
"""
df = pd.read_sql(SQL_QUERY, db)
df.head(5)
Out[7]:
In [8]:
# 제대로 된 정보를 얻으려면 이렇게 하셔야 합니다.
SQL_QUERY = """
SELECT
customer.*, address.address
FROM customer, address
WHERE
customer.address_id = address.address_id
;
"""
pd.read_sql(SQL_QUERY, db).head()
Out[8]:
In [9]:
# 근데 이걸 효과적으로 하는 방법은 SQL JOIN을 사용하는 것이다.
SQL_QUERY = """
SELECT customer.*, address
FROM customer
JOIN address ON customer.address_id = address.address_id
;
"""
pd.read_sql(SQL_QUERY, db).head()
Out[9]:
In [10]:
# 시간을 한번 재보자.
import time
# SELECT WHERE
start_time = time.clock()
SQL_QUERY = """
SELECT
customer.*, address.address
FROM customer, address
WHERE
customer.address_id = address.address_id
;
"""
pd.read_sql(SQL_QUERY, db)
print("SELECT WHERE: {time}s".format(time=time.clock() - start_time))
# SELECT JOIN
start_time = time.clock()
SQL_QUERY = """
SELECT customer.*, address
FROM customer
JOIN address ON customer.address_id = address.address_id
;
"""
pd.read_sql(SQL_QUERY, db)
print("SELECT JOIN : {time}s".format(time=time.clock() - start_time))
In [11]:
# Pandas JOIN
customer_df = pd.read_sql("SELECT * FROM customer;", db)
customer_df.head()
Out[11]:
In [12]:
address_df = pd.read_sql("SELECT * FROM address;", db)
address_df.head()
Out[12]:
In [13]:
df = pd.merge(customer_df, address_df, on="address_id")
In [14]:
df.head()
Out[14]: