In [1]:
# GROUP BY example
# 월별 렌탈 횟수 지표 뽑아내기

In [2]:
import numpy as np
import pandas as pd

import MySQLdb


db = MySQLdb.connect(
    "db.fastcamp.us",  # DATABASE_HOST
    "root",  # DATABASE_USERNAME
    "dkstncks",  # DATABASE_PASSWORD
    "sakila",  # DATABASE_NAME
    charset='utf8',
)

cursor = db.cursor()

In [3]:
SQL_QUERY = """
    SELECT
        LEFT(rental.rental_date, 7) "Monthly",
        COUNT(rental.rental_id) "Total Rental Count"
    FROM 
        rental
    GROUP BY
        1
    ORDER BY
        2 DESC
"""

pd.read_sql(SQL_QUERY, db)


Out[3]:
Monthly Total Rental Count
0 2005-07 6709
1 2005-08 5686
2 2005-06 2311
3 2005-05 1156
4 2006-02 182

In [4]:
SQL_QUERY = """
    SELECT * FROM rental;
"""

rental_df = pd.read_sql(SQL_QUERY, db)

In [5]:
rental_df.head()


Out[5]:
rental_id rental_date inventory_id customer_id return_date staff_id last_update
0 1 2005-05-24 22:53:30 367 130 2005-05-26 22:04:30 1 2006-02-15 21:30:53
1 2 2005-05-24 22:54:33 1525 459 2005-05-28 19:40:33 1 2006-02-15 21:30:53
2 3 2005-05-24 23:03:39 1711 408 2005-06-01 22:12:39 1 2006-02-15 21:30:53
3 4 2005-05-24 23:04:41 2452 333 2005-06-03 01:43:41 2 2006-02-15 21:30:53
4 5 2005-05-24 23:05:21 2079 222 2005-06-02 04:33:21 1 2006-02-15 21:30:53

In [6]:
# 일단은 "Month" 라는 새로운 Column에 데이터를 넣는다.

rental_df["Month"] = rental_df["rental_date"].apply(lambda x: str(x)[:7])

In [7]:
rental_df.head()


Out[7]:
rental_id rental_date inventory_id customer_id return_date staff_id last_update Month
0 1 2005-05-24 22:53:30 367 130 2005-05-26 22:04:30 1 2006-02-15 21:30:53 2005-05
1 2 2005-05-24 22:54:33 1525 459 2005-05-28 19:40:33 1 2006-02-15 21:30:53 2005-05
2 3 2005-05-24 23:03:39 1711 408 2005-06-01 22:12:39 1 2006-02-15 21:30:53 2005-05
3 4 2005-05-24 23:04:41 2452 333 2005-06-03 01:43:41 2 2006-02-15 21:30:53 2005-05
4 5 2005-05-24 23:05:21 2079 222 2005-06-02 04:33:21 1 2006-02-15 21:30:53 2005-05

In [8]:
rental_df.groupby("Month").size().sort_values(ascending=False)


Out[8]:
Month
2005-07    6709
2005-08    5686
2005-06    2311
2005-05    1156
2006-02     182
dtype: int64