notebook.community

Edit and run



In [1]:

    
# GROUP BY example
# 월별 렌탈 횟수 지표 뽑아내기



In [2]:

    
import numpy as np
import pandas as pd

import MySQLdb


db = MySQLdb.connect(
    "db.fastcamp.us",  # DATABASE_HOST
    "root",  # DATABASE_USERNAME
    "dkstncks",  # DATABASE_PASSWORD
    "sakila",  # DATABASE_NAME
    charset='utf8',
)

cursor = db.cursor()



In [3]:

    
SQL_QUERY = """
    SELECT
        LEFT(rental.rental_date, 7) "Monthly",
        COUNT(rental.rental_id) "Total Rental Count"
    FROM 
        rental
    GROUP BY
        1
    ORDER BY
        2 DESC
"""

pd.read_sql(SQL_QUERY, db)









    Out[3]:






  
    
      
      Monthly
      Total Rental Count
    
  
  
    
      0
      2005-07
      6709
    
    
      1
      2005-08
      5686
    
    
      2
      2005-06
      2311
    
    
      3
      2005-05
      1156
    
    
      4
      2006-02
      182



In [4]:

    
SQL_QUERY = """
    SELECT * FROM rental;
"""

rental_df = pd.read_sql(SQL_QUERY, db)



In [5]:

    
rental_df.head()









    Out[5]:






  
    
      
      rental_id
      rental_date
      inventory_id
      customer_id
      return_date
      staff_id
      last_update
    
  
  
    
      0
      1
      2005-05-24 22:53:30
      367
      130
      2005-05-26 22:04:30
      1
      2006-02-15 21:30:53
    
    
      1
      2
      2005-05-24 22:54:33
      1525
      459
      2005-05-28 19:40:33
      1
      2006-02-15 21:30:53
    
    
      2
      3
      2005-05-24 23:03:39
      1711
      408
      2005-06-01 22:12:39
      1
      2006-02-15 21:30:53
    
    
      3
      4
      2005-05-24 23:04:41
      2452
      333
      2005-06-03 01:43:41
      2
      2006-02-15 21:30:53
    
    
      4
      5
      2005-05-24 23:05:21
      2079
      222
      2005-06-02 04:33:21
      1
      2006-02-15 21:30:53



In [6]:

    
# 일단은 "Month" 라는 새로운 Column에 데이터를 넣는다.

rental_df["Month"] = rental_df["rental_date"].apply(lambda x: str(x)[:7])



In [7]:

    
rental_df.head()









    Out[7]:






  
    
      
      rental_id
      rental_date
      inventory_id
      customer_id
      return_date
      staff_id
      last_update
      Month
    
  
  
    
      0
      1
      2005-05-24 22:53:30
      367
      130
      2005-05-26 22:04:30
      1
      2006-02-15 21:30:53
      2005-05
    
    
      1
      2
      2005-05-24 22:54:33
      1525
      459
      2005-05-28 19:40:33
      1
      2006-02-15 21:30:53
      2005-05
    
    
      2
      3
      2005-05-24 23:03:39
      1711
      408
      2005-06-01 22:12:39
      1
      2006-02-15 21:30:53
      2005-05
    
    
      3
      4
      2005-05-24 23:04:41
      2452
      333
      2005-06-03 01:43:41
      2
      2006-02-15 21:30:53
      2005-05
    
    
      4
      5
      2005-05-24 23:05:21
      2079
      222
      2005-06-02 04:33:21
      1
      2006-02-15 21:30:53
      2005-05



In [8]:

    
rental_df.groupby("Month").size().sort_values(ascending=False)









    Out[8]:





Month
2005-07    6709
2005-08    5686
2005-06    2311
2005-05    1156
2006-02     182
dtype: int64

	rental_id	rental_date	inventory_id	customer_id	return_date	staff_id	last_update
0	1	2005-05-24 22:53:30	367	130	2005-05-26 22:04:30	1	2006-02-15 21:30:53
1	2	2005-05-24 22:54:33	1525	459	2005-05-28 19:40:33	1	2006-02-15 21:30:53
2	3	2005-05-24 23:03:39	1711	408	2005-06-01 22:12:39	1	2006-02-15 21:30:53
3	4	2005-05-24 23:04:41	2452	333	2005-06-03 01:43:41	2	2006-02-15 21:30:53
4	5	2005-05-24 23:05:21	2079	222	2005-06-02 04:33:21	1	2006-02-15 21:30:53