4T_Pandas로 배우는 SQL 시작하기 (4) - HAVING, SUB QUERY

SQL => 연산의 결과로 나온 데이터를 다시 Filtering ( HAVING )

SUB QUERY + TEMPORARY TABLE ( 임시 테이블 )

실습)

  • "5월 달에" / "지금까지" 렌탈 횟수가 30회 이상인 유저
  • 유저이름과 유저 이메일 => 마케팅
  • customer, rental
  • Pandas

In [1]:
import pymysql
import curl

In [2]:
db = pymysql.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "sakila",
    charset = "utf8",
)
customer_df = pd.read_sql("SELECT * FROM customer;", db)
rental_df = pd.read_sql("SELECT * FROM rental;", db)

In [3]:
df = rental_df.merge(customer_df, on="customer_id")

In [8]:
df.head(1)


Out[8]:
rental_id rental_date inventory_id customer_id return_date staff_id last_update_x store_id first_name last_name email address_id active create_date last_update_y
0 1 2005-05-24 22:53:30 367 130 2005-05-26 22:04:30 1 2006-02-15 21:30:53 1 CHARLOTTE HUNTER CHARLOTTE.HUNTER@sakilacustomer.org 134 1 2006-02-14 22:04:36 2006-02-15 04:57:20

In [4]:
rental_per_customer_groups = df.groupby("customer_id")

In [5]:
rental_per_customer_groups.get_group(1)


Out[5]:
rental_id rental_date inventory_id customer_id return_date staff_id last_update_x store_id first_name last_name email address_id active create_date last_update_y
1988 76 2005-05-25 11:30:37 3021 1 2005-06-03 12:00:37 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1989 573 2005-05-28 10:35:23 4020 1 2005-06-03 06:32:23 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1990 1185 2005-06-15 00:54:12 2785 1 2005-06-23 02:42:12 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1991 1422 2005-06-15 18:02:53 1021 1 2005-06-19 15:54:53 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1992 1476 2005-06-15 21:08:46 1407 1 2005-06-25 02:26:46 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1993 1725 2005-06-16 15:18:57 726 1 2005-06-17 21:05:57 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1994 2308 2005-06-18 08:41:48 197 1 2005-06-22 03:36:48 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1995 2363 2005-06-18 13:33:59 3497 1 2005-06-19 17:40:59 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1996 3284 2005-06-21 06:24:45 4566 1 2005-06-28 03:28:45 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1997 4526 2005-07-08 03:17:05 1443 1 2005-07-14 01:19:05 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1998 4611 2005-07-08 07:33:56 3486 1 2005-07-12 13:25:56 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
1999 5244 2005-07-09 13:24:07 3726 1 2005-07-14 14:01:07 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2000 5326 2005-07-09 16:38:01 797 1 2005-07-13 18:02:01 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2001 6163 2005-07-11 10:13:46 1330 1 2005-07-19 13:15:46 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2002 7273 2005-07-27 11:31:22 2465 1 2005-07-31 06:50:22 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2003 7841 2005-07-28 09:04:45 1092 1 2005-07-30 12:37:45 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2004 8033 2005-07-28 16:18:23 4268 1 2005-07-30 17:56:23 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2005 8074 2005-07-28 17:33:39 1558 1 2005-07-29 20:17:39 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2006 8116 2005-07-28 19:20:07 4497 1 2005-07-29 22:54:07 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2007 8326 2005-07-29 03:58:49 108 1 2005-08-01 05:16:49 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2008 9571 2005-07-31 02:42:18 2219 1 2005-08-02 23:26:18 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2009 10437 2005-08-01 08:51:04 14 1 2005-08-10 12:12:04 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2010 11299 2005-08-02 15:36:52 3232 1 2005-08-10 16:40:52 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2011 11367 2005-08-02 18:01:38 1440 1 2005-08-04 13:19:38 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2012 11824 2005-08-17 12:37:54 2639 1 2005-08-19 10:11:54 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2013 12250 2005-08-18 03:57:29 921 1 2005-08-22 23:05:29 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2014 13068 2005-08-19 09:55:16 3019 1 2005-08-20 14:44:16 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2015 13176 2005-08-19 13:56:54 2269 1 2005-08-23 08:50:54 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2016 14762 2005-08-21 23:33:57 4249 1 2005-08-23 01:30:57 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2017 14825 2005-08-22 01:27:57 1449 1 2005-08-27 07:01:57 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2018 15298 2005-08-22 19:41:37 1446 1 2005-08-28 22:49:37 1 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20
2019 15315 2005-08-22 20:03:46 312 1 2005-08-30 01:51:46 2 2006-02-15 21:30:53 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20

In [10]:
rental_per_customer_groups.agg({"customer_id": np.size})["customer_id"]


Out[10]:
customer_id
1      32
2      27
3      26
4      22
5      38
6      28
7      33
8      24
9      23
10     25
11     24
12     28
13     27
14     28
15     32
16     28
17     21
18     22
19     24
20     30
21     35
22     22
23     30
24     25
25     29
26     34
27     31
28     32
29     36
30     34
       ..
570    26
571    24
572    25
573    29
574    28
575    29
576    34
577    27
578    22
579    27
580    27
581    27
582    25
583    23
584    30
585    24
586    19
587    26
588    29
589    28
590    25
591    27
592    29
593    26
594    27
595    30
596    28
597    25
598    22
599    19
Name: customer_id, dtype: int64

In [6]:
# rental_per_customer_groups.agg({"customer_id": np.size})["customer_id"] > 30
is_many_rentals_user = rental_per_customer_groups.size() > 30
is_many_rentals_user


Out[6]:
customer_id
1       True
2      False
3      False
4      False
5       True
6      False
7       True
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15      True
16     False
17     False
18     False
19     False
20     False
21      True
22     False
23     False
24     False
25     False
26      True
27      True
28      True
29      True
30      True
       ...  
570    False
571    False
572    False
573    False
574    False
575    False
576     True
577    False
578    False
579    False
580    False
581    False
582    False
583    False
584    False
585    False
586    False
587    False
588    False
589    False
590    False
591    False
592    False
593    False
594    False
595    False
596    False
597    False
598    False
599    False
dtype: bool

In [13]:
#여기서는 어떤 문제로 되지 않는다. 다음에 알려주겠다.

In [14]:
# SQL로 하겠다
# 1. Sub Query - Query 안에 Query 가 들어있다.

In [17]:
SQL_QUERY = """
    SELECT
        r.customer_id,
        COUNT(*) rentals
    FROM rental r
        JOIN customer c
        ON r.customer_id = c.customer_id
    GROUP BY r.customer_id
    WHERE rentals > 30
"""
pd.read_sql(SQL_QUERY, db)

#순서가 FROM -> group by -> where(=>rental이 없다.) -> select에서 카운트가 마지막이라서


---------------------------------------------------------------------------
ProgrammingError                          Traceback (most recent call last)
C:\Anaconda3\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
   1563             else:
-> 1564                 cur.execute(*args)
   1565             return cur

C:\Anaconda3\lib\site-packages\pymysql\cursors.py in execute(self, query, args)
    165 
--> 166         result = self._query(query)
    167         self._executed = query

C:\Anaconda3\lib\site-packages\pymysql\cursors.py in _query(self, q)
    321         self._last_executed = q
--> 322         conn.query(q)
    323         self._do_get_result()

C:\Anaconda3\lib\site-packages\pymysql\connections.py in query(self, sql, unbuffered)
    836         self._execute_command(COMMAND.COM_QUERY, sql)
--> 837         self._affected_rows = self._read_query_result(unbuffered=unbuffered)
    838         return self._affected_rows

C:\Anaconda3\lib\site-packages\pymysql\connections.py in _read_query_result(self, unbuffered)
   1020             result = MySQLResult(self)
-> 1021             result.read()
   1022         self._result = result

C:\Anaconda3\lib\site-packages\pymysql\connections.py in read(self)
   1303         try:
-> 1304             first_packet = self.connection._read_packet()
   1305 

C:\Anaconda3\lib\site-packages\pymysql\connections.py in _read_packet(self, packet_type)
    982         packet = packet_type(buff, self.encoding)
--> 983         packet.check_error()
    984         return packet

C:\Anaconda3\lib\site-packages\pymysql\connections.py in check_error(self)
    394             if DEBUG: print("errno =", errno)
--> 395             err.raise_mysql_exception(self._data)
    396 

C:\Anaconda3\lib\site-packages\pymysql\err.py in raise_mysql_exception(data)
    101     errorclass = error_map.get(errno, InternalError)
--> 102     raise errorclass(errno, errval)

ProgrammingError: (1064, "42000You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'WHERE rentals > 30' at line 8")

During handling of the above exception, another exception occurred:

DatabaseError                             Traceback (most recent call last)
<ipython-input-17-430e60ea3f7a> in <module>()
      9     WHERE rentals > 30
     10 """
---> 11 pd.read_sql(SQL_QUERY, db)

C:\Anaconda3\lib\site-packages\pandas\io\sql.py in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
    497             sql, index_col=index_col, params=params,
    498             coerce_float=coerce_float, parse_dates=parse_dates,
--> 499             chunksize=chunksize)
    500 
    501     try:

C:\Anaconda3\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
   1597 
   1598         args = _convert_params(sql, params)
-> 1599         cursor = self.execute(*args)
   1600         columns = [col_desc[0] for col_desc in cursor.description]
   1601 

C:\Anaconda3\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
   1574             ex = DatabaseError(
   1575                 "Execution failed on sql '%s': %s" % (args[0], exc))
-> 1576             raise_with_traceback(ex)
   1577 
   1578     @staticmethod

C:\Anaconda3\lib\site-packages\pandas\compat\__init__.py in raise_with_traceback(exc, traceback)
    331         if traceback == Ellipsis:
    332             _, _, traceback = sys.exc_info()
--> 333         raise exc.with_traceback(traceback)
    334 else:
    335     # this version of raise is a syntax error in Python 3

C:\Anaconda3\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
   1562                 cur.execute(*args, **kwargs)
   1563             else:
-> 1564                 cur.execute(*args)
   1565             return cur
   1566         except Exception as exc:

C:\Anaconda3\lib\site-packages\pymysql\cursors.py in execute(self, query, args)
    164         query = self.mogrify(query, args)
    165 
--> 166         result = self._query(query)
    167         self._executed = query
    168         return result

C:\Anaconda3\lib\site-packages\pymysql\cursors.py in _query(self, q)
    320         conn = self._get_db()
    321         self._last_executed = q
--> 322         conn.query(q)
    323         self._do_get_result()
    324         return self.rowcount

C:\Anaconda3\lib\site-packages\pymysql\connections.py in query(self, sql, unbuffered)
    835                 sql = sql.encode(self.encoding, 'surrogateescape')
    836         self._execute_command(COMMAND.COM_QUERY, sql)
--> 837         self._affected_rows = self._read_query_result(unbuffered=unbuffered)
    838         return self._affected_rows
    839 

C:\Anaconda3\lib\site-packages\pymysql\connections.py in _read_query_result(self, unbuffered)
   1019         else:
   1020             result = MySQLResult(self)
-> 1021             result.read()
   1022         self._result = result
   1023         if result.server_status is not None:

C:\Anaconda3\lib\site-packages\pymysql\connections.py in read(self)
   1302     def read(self):
   1303         try:
-> 1304             first_packet = self.connection._read_packet()
   1305 
   1306             if first_packet.is_ok_packet():

C:\Anaconda3\lib\site-packages\pymysql\connections.py in _read_packet(self, packet_type)
    981 
    982         packet = packet_type(buff, self.encoding)
--> 983         packet.check_error()
    984         return packet
    985 

C:\Anaconda3\lib\site-packages\pymysql\connections.py in check_error(self)
    393             errno = self.read_uint16()
    394             if DEBUG: print("errno =", errno)
--> 395             err.raise_mysql_exception(self._data)
    396 
    397     def dump(self):

C:\Anaconda3\lib\site-packages\pymysql\err.py in raise_mysql_exception(data)
    100     errval = data[4:].decode('utf-8', 'replace')
    101     errorclass = error_map.get(errno, InternalError)
--> 102     raise errorclass(errno, errval)

DatabaseError: Execution failed on sql '
    SELECT
        r.customer_id,
        COUNT(*) rentals
    FROM rental r
        JOIN customer c
        ON r.customer_id = c.customer_id
    GROUP BY r.customer_id
    WHERE rentals > 30
': (1064, "42000You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'WHERE rentals > 30' at line 8")

In [18]:
# 1. Sub Query = Query 안에 Query가 들어있다.

In [23]:
SQL_QUERY = """
    SELECT rentals_per_customer.customer_id "Customer ID", rentals_per_customer.rentals
    FROM (
        SELECT
            r.customer_id,
            COUNT(*) rentals
        FROM rental r
            JOIN customer c
            ON r.customer_id = c.customer_id
        GROUP BY r.customer_id
    ) AS rentals_per_customer
    WHERE rentals > 30
    ;
"""

pd.read_sql(SQL_QUERY, db)


Out[23]:
Customer ID rentals
0 1 32
1 5 38
2 7 33
3 15 32
4 21 35
5 26 34
6 27 31
7 28 32
8 29 36
9 30 34
10 35 32
11 38 34
12 46 34
13 50 35
14 51 33
15 64 33
16 66 34
17 75 41
18 78 31
19 84 33
20 86 33
21 89 32
22 91 35
23 102 33
24 103 31
25 114 33
26 119 34
27 120 32
28 122 32
29 125 32
... ... ...
104 416 31
105 438 33
106 439 36
107 442 32
108 446 31
109 451 33
110 452 32
111 454 33
112 459 38
113 462 33
114 468 39
115 469 40
116 470 32
117 473 34
118 479 31
119 494 31
120 502 34
121 503 32
122 506 35
123 513 31
124 520 32
125 522 33
126 526 45
127 532 32
128 533 33
129 535 32
130 550 32
131 566 34
132 569 32
133 576 34

134 rows × 2 columns


In [25]:
# sub query스럽지 않아서 나눠 쓰면 보기가 좋다.

In [28]:
RENTALS_PER_CUSTOMER_SQL_QUERY = """
    SELECT
        r.customer_id,
        COUNT(*) rentals
    FROM rental r
        JOIN customer c
        ON r.customer_id = c.customer_id
    GROUP BY r.customer_id
    ;
"""

SQL_QUERY = """
    SELECT *
    FROM (
        {rentals_per_customer_sql_query}
    ) AS rentals_per_customer
    WHERE rentals > 30
    ;
""".format(
    rentals_per_customer_sql_query=RENTALS_PER_CUSTOMER_SQL_QUERY.replace(";", "")
)

pd.read_sql(SQL_QUERY, db)


Out[28]:
customer_id rentals
0 1 32
1 5 38
2 7 33
3 15 32
4 21 35
5 26 34
6 27 31
7 28 32
8 29 36
9 30 34
10 35 32
11 38 34
12 46 34
13 50 35
14 51 33
15 64 33
16 66 34
17 75 41
18 78 31
19 84 33
20 86 33
21 89 32
22 91 35
23 102 33
24 103 31
25 114 33
26 119 34
27 120 32
28 122 32
29 125 32
... ... ...
104 416 31
105 438 33
106 439 36
107 442 32
108 446 31
109 451 33
110 452 32
111 454 33
112 459 38
113 462 33
114 468 39
115 469 40
116 470 32
117 473 34
118 479 31
119 494 31
120 502 34
121 503 32
122 506 35
123 513 31
124 520 32
125 522 33
126 526 45
127 532 32
128 533 33
129 535 32
130 550 32
131 566 34
132 569 32
133 576 34

134 rows × 2 columns


In [29]:
print(SQL_QUERY)


    SELECT *
    FROM (
        
    SELECT
        r.customer_id,
        COUNT(*) rentals
    FROM rental r
        JOIN customer c
        ON r.customer_id = c.customer_id
    GROUP BY r.customer_id
    

    ) AS rentals_per_customer
    WHERE rentals > 30
    ;


In [32]:
# 30번 이상인 애들의 => 이름, 이메일 
RESULT_SQL_QUERY = """
    SELECT customer.last_name, customer.first_name, customer.email
    FROM ({SQL_QUERY}) many_rental_user 
        JOIN customer ON many_rental_user.customer_id = customer.customer_id
    ;
""".format(
    SQL_QUERY=SQL_QUERY.replace(";", "")
)

pd.read_sql(RESULT_SQL_QUERY, db)


Out[32]:
last_name first_name email
0 SMITH MARY MARY.SMITH@sakilacustomer.org
1 BROWN ELIZABETH ELIZABETH.BROWN@sakilacustomer.org
2 MILLER MARIA MARIA.MILLER@sakilacustomer.org
3 HARRIS HELEN HELEN.HARRIS@sakilacustomer.org
4 CLARK MICHELLE MICHELLE.CLARK@sakilacustomer.org
5 HALL JESSICA JESSICA.HALL@sakilacustomer.org
6 ALLEN SHIRLEY SHIRLEY.ALLEN@sakilacustomer.org
7 YOUNG CYNTHIA CYNTHIA.YOUNG@sakilacustomer.org
8 HERNANDEZ ANGELA ANGELA.HERNANDEZ@sakilacustomer.org
9 KING MELISSA MELISSA.KING@sakilacustomer.org
10 GREEN VIRGINIA VIRGINIA.GREEN@sakilacustomer.org
11 GONZALEZ MARTHA MARTHA.GONZALEZ@sakilacustomer.org
12 CAMPBELL CATHERINE CATHERINE.CAMPBELL@sakilacustomer.org
13 COLLINS DIANE DIANE.COLLINS@sakilacustomer.org
14 STEWART ALICE ALICE.STEWART@sakilacustomer.org
15 COX JUDITH JUDITH.COX@sakilacustomer.org
16 WARD JANICE JANICE.WARD@sakilacustomer.org
17 SANDERS TAMMY TAMMY.SANDERS@sakilacustomer.org
18 WOOD LORI LORI.WOOD@sakilacustomer.org
19 PERRY SARA SARA.PERRY@sakilacustomer.org
20 LONG JACQUELINE JACQUELINE.LONG@sakilacustomer.org
21 FLORES JULIA JULIA.FLORES@sakilacustomer.org
22 BUTLER LOIS LOIS.BUTLER@sakilacustomer.org
23 FORD CRYSTAL CRYSTAL.FORD@sakilacustomer.org
24 HAMILTON GLADYS GLADYS.HAMILTON@sakilacustomer.org
25 ELLIS GRACE GRACE.ELLIS@sakilacustomer.org
26 MARSHALL SHERRY SHERRY.MARSHALL@sakilacustomer.org
27 ORTIZ SYLVIA SYLVIA.ORTIZ@sakilacustomer.org
28 MURRAY THELMA THELMA.MURRAY@sakilacustomer.org
29 WEBB ETHEL ETHEL.WEBB@sakilacustomer.org
... ... ... ...
104 PINSON JEFFERY JEFFERY.PINSON@sakilacustomer.org
105 LOVELACE BARRY BARRY.LOVELACE@sakilacustomer.org
106 FENNELL ALEXANDER ALEXANDER.FENNELL@sakilacustomer.org
107 BUSTAMANTE LEROY LEROY.BUSTAMANTE@sakilacustomer.org
108 CULP THEODORE THEODORE.CULP@sakilacustomer.org
109 REA JIM JIM.REA@sakilacustomer.org
110 MILNER TOM TOM.MILNER@sakilacustomer.org
111 GRESHAM ALEX ALEX.GRESHAM@sakilacustomer.org
112 COLLAZO TOMMY TOMMY.COLLAZO@sakilacustomer.org
113 SHERROD WARREN WARREN.SHERROD@sakilacustomer.org
114 CARY TIM TIM.CARY@sakilacustomer.org
115 BULL WESLEY WESLEY.BULL@sakilacustomer.org
116 ALLARD GORDON GORDON.ALLARD@sakilacustomer.org
117 OLIVARES JORGE JORGE.OLIVARES@sakilacustomer.org
118 HITE ZACHARY ZACHARY.HITE@sakilacustomer.org
119 CHOATE RAMON RAMON.CHOATE@sakilacustomer.org
120 CORNWELL BRETT BRETT.CORNWELL@sakilacustomer.org
121 BARCLAY ANGEL ANGEL.BARCLAY@sakilacustomer.org
122 SEWARD LESLIE LESLIE.SEWARD@sakilacustomer.org
123 TUBBS DUANE DUANE.TUBBS@sakilacustomer.org
124 WESTMORELAND MITCHELL MITCHELL.WESTMORELAND@sakilacustomer.org
125 HAVENS ARNOLD ARNOLD.HAVENS@sakilacustomer.org
126 SEAL KARL KARL.SEAL@sakilacustomer.org
127 RENNER NEIL NEIL.RENNER@sakilacustomer.org
128 MILAM JESSIE JESSIE.MILAM@sakilacustomer.org
129 ELROD JAVIER JAVIER.ELROD@sakilacustomer.org
130 BROWNLEE GUY GUY.BROWNLEE@sakilacustomer.org
131 MENA CASEY CASEY.MENA@sakilacustomer.org
132 GARDINER DAVE DAVE.GARDINER@sakilacustomer.org
133 MCCARTER MORRIS MORRIS.MCCARTER@sakilacustomer.org

134 rows × 3 columns


In [35]:
# Temporary Table ( 임시 테이블 )

SQL_QUERY = """
    DROP TEMPORARY TABLE IF EXISTS rentals_per_customer;
    CREATE TEMPORARY TABLE rentals_per_customer
    
    SELECT
        r.customer_id,
        COUNT(*) rentals
    FROM rental r
        JOIN customer c
        ON r.customer_id = c.customer_id
    GROUP BY r.customer_id
    ;
"""
# pd.read_sql() => 이걸로 실행시키면 오류가 난다. 그래서 cursor로 실행
cursor = db.cursor()
cursor.execute(SQL_QUERY)


Out[35]:
0

In [39]:
SQL_QUERY = """
    SELECT rpc.customer_id, rpc.rentals
    FROM rentals_per_customer rpc
    WHERE rentals > 30
    ;
"""

pd.read_sql(SQL_QUERY, db)


Out[39]:
customer_id rentals

HAVING


In [41]:
SQL_QUERY = """
    SELECT r.customer_id, COUNT(*) rentals
    FROM rental r
        JOIN customer c
        ON r.customer_id = c.customer_id
    GROUP BY r.customer_id
    # WHERE rentals > 30  #연산에 대한 결과로 Filtering을 할 수 없다.
    # 연산에 대한 결과로 Filtering을 할 수 있는 기능
    HAVING rentals > 30
"""

pd.read_sql(SQL_QUERY, db)


Out[41]:
customer_id rentals
0 1 32
1 5 38
2 7 33
3 15 32
4 21 35
5 26 34
6 27 31
7 28 32
8 29 36
9 30 34
10 35 32
11 38 34
12 46 34
13 50 35
14 51 33
15 64 33
16 66 34
17 75 41
18 78 31
19 84 33
20 86 33
21 89 32
22 91 35
23 102 33
24 103 31
25 114 33
26 119 34
27 120 32
28 122 32
29 125 32
... ... ...
104 416 31
105 438 33
106 439 36
107 442 32
108 446 31
109 451 33
110 452 32
111 454 33
112 459 38
113 462 33
114 468 39
115 469 40
116 470 32
117 473 34
118 479 31
119 494 31
120 502 34
121 503 32
122 506 35
123 513 31
124 520 32
125 522 33
126 526 45
127 532 32
128 533 33
129 535 32
130 550 32
131 566 34
132 569 32
133 576 34

134 rows × 2 columns

실습)

  • 제일 많이 매출을 발생시킨 "영화 제목"
  • payment, film, inventory, rental
  • 영화별 ( GROUP BY )
  • 매출 ( SUM )을 뽑아서
  • 정렬하자 ( ORDER BY )
  • 데이터가 어디에 들어있는가?

In [2]:
db = pymysql.connect(
    "db.fastcamp.us",
    "root",
    "dkstncks",
    "sakila",
    charset="utf8"
)
film_df = pd.read_sql("SELECT * FROM film;", db)
rental_df = pd.read_sql("SELECT * FROM rental;", db)
payment_df = pd.read_sql("SELECT * FROM payment;", db)
inventory_df = pd.read_sql("SELECT * FROM inventory;", db)

film_df => film_id, title inventory_df => inventory_id, film_id rental_df => rental_id, inventory_id payment_df => rental_id, amount


In [23]:
SQL_QUERY = """
    SELECT f.film_id, f.title, SUM(p.amount) "revenue"
    FROM 
        film f,
        rental r,
        payment p,
        inventory i
    WHERE
        f.film_id = i.film_id
        AND i.inventory_id = r.inventory_id
        AND r.rental_id = p.rental_id
    GROUP BY f.film_id
    ORDER BY revenue DESC
    ;
"""
pd.read_sql(SQL_QUERY, db)


Out[23]:
film_id title revenue
0 879 TELEGRAPH VOYAGE 231.73
1 973 WIFE TURN 223.69
2 1000 ZORRO ARK 214.69
3 369 GOODFELLAS SALUTE 209.69
4 764 SATURDAY LAMBS 204.72
5 893 TITANS JERK 201.71
6 897 TORQUE BOUND 198.72
7 403 HARRY IDAHO 195.70
8 460 INNOCENT USUAL 191.74
9 444 HUSTLER PARTY 190.78
10 670 PELICAN COMFORTS 188.74
11 127 CAT CONEHEADS 181.70
12 284 ENEMY ODDS 180.71
13 103 BUCKET BROTHERHOOD 180.66
14 715 RANGE MOONWALKER 179.73
15 563 MASSACRE USUAL 179.70
16 941 VIDEOTAPE ARSENIC 178.71
17 239 DOGMA FAMILY 178.70
18 31 APACHE DIVINE 178.69
19 938 VELVET TERMINATOR 177.74
20 244 DORADO NOTTING 176.73
21 327 FOOL MOCKINGBIRD 175.77
22 979 WITCHES PANIC 173.70
23 159 CLOSER BANG 172.72
24 767 SCALAWAG DUCK 172.68
25 745 ROSES TREASURE 171.72
26 865 SUNRISE LEAGUE 170.76
27 791 SHOW LORD 170.75
28 624 NIGHTMARE CHILL 169.75
29 771 SCORPION APOLLO 168.77
... ... ... ...
928 262 DUMBO LUST 13.94
929 965 WATERSHIP FRONTIER 13.94
930 516 LEGEND JEDI 13.91
931 630 NOTTING SPEAKEASY 13.90
932 423 HOLLYWOOD ANONYMOUS 13.87
933 910 TREATMENT JEKYLL 12.93
934 751 RUNAWAY TENENBAUMS 12.92
935 822 SOUP WISDOM 12.92
936 866 SUNSET RACER 12.92
937 940 VICTORY ACADEMY 12.91
938 899 TOWERS HURRICANE 12.89
939 566 MAUDE MOD 11.92
940 355 GHOSTBUSTERS ELF 11.91
941 594 MONTEZUMA COMMAND 11.91
942 477 JAWBREAKER BROOKLYN 11.90
943 157 CLOCKWORK PARADISE 11.90
944 178 CONNECTION MICROCOSMOS 10.93
945 52 BALLROOM MOCKINGBIRD 10.92
946 168 COMANCHEROS ENEMY 9.94
947 523 LIGHTS DEER 9.92
948 401 HAROLD FRENCH 9.92
949 839 STALLION SUNDANCE 8.93
950 475 JAPANESE RUN 7.94
951 196 CRUELTY UNFORGIVEN 7.93
952 718 REBEL AIRPORT 7.92
953 261 DUFFEL APOCALYPSE 6.93
954 996 YOUNG LANGUAGE 6.93
955 335 FREEDOM CLEOPATRA 5.95
956 635 OKLAHOMA JUMANJI 5.94
957 885 TEXAS WATCH 5.94

958 rows × 3 columns


In [24]:
SQL_QUERY = """
    SELECT f.film_id, f.title, SUM(p.amount) "revenue"
    FROM payment p
        JOIN rental r ON p.rental_id = r.rental_id
        JOIN inventory i ON i.inventory_id = r.inventory_id
        JOIN film f ON f.film_id = i.film_id
    GROUP BY f.film_id
    ORDER BY revenue DESC
    ;
"""

pd.read_sql(SQL_QUERY, db)


Out[24]:
film_id title revenue
0 879 TELEGRAPH VOYAGE 231.73
1 973 WIFE TURN 223.69
2 1000 ZORRO ARK 214.69
3 369 GOODFELLAS SALUTE 209.69
4 764 SATURDAY LAMBS 204.72
5 893 TITANS JERK 201.71
6 897 TORQUE BOUND 198.72
7 403 HARRY IDAHO 195.70
8 460 INNOCENT USUAL 191.74
9 444 HUSTLER PARTY 190.78
10 670 PELICAN COMFORTS 188.74
11 127 CAT CONEHEADS 181.70
12 284 ENEMY ODDS 180.71
13 103 BUCKET BROTHERHOOD 180.66
14 715 RANGE MOONWALKER 179.73
15 563 MASSACRE USUAL 179.70
16 941 VIDEOTAPE ARSENIC 178.71
17 239 DOGMA FAMILY 178.70
18 31 APACHE DIVINE 178.69
19 938 VELVET TERMINATOR 177.74
20 244 DORADO NOTTING 176.73
21 327 FOOL MOCKINGBIRD 175.77
22 979 WITCHES PANIC 173.70
23 159 CLOSER BANG 172.72
24 767 SCALAWAG DUCK 172.68
25 745 ROSES TREASURE 171.72
26 865 SUNRISE LEAGUE 170.76
27 791 SHOW LORD 170.75
28 624 NIGHTMARE CHILL 169.75
29 771 SCORPION APOLLO 168.77
... ... ... ...
928 965 WATERSHIP FRONTIER 13.94
929 262 DUMBO LUST 13.94
930 516 LEGEND JEDI 13.91
931 630 NOTTING SPEAKEASY 13.90
932 423 HOLLYWOOD ANONYMOUS 13.87
933 910 TREATMENT JEKYLL 12.93
934 866 SUNSET RACER 12.92
935 751 RUNAWAY TENENBAUMS 12.92
936 822 SOUP WISDOM 12.92
937 940 VICTORY ACADEMY 12.91
938 899 TOWERS HURRICANE 12.89
939 566 MAUDE MOD 11.92
940 355 GHOSTBUSTERS ELF 11.91
941 594 MONTEZUMA COMMAND 11.91
942 477 JAWBREAKER BROOKLYN 11.90
943 157 CLOCKWORK PARADISE 11.90
944 178 CONNECTION MICROCOSMOS 10.93
945 52 BALLROOM MOCKINGBIRD 10.92
946 168 COMANCHEROS ENEMY 9.94
947 523 LIGHTS DEER 9.92
948 401 HAROLD FRENCH 9.92
949 839 STALLION SUNDANCE 8.93
950 475 JAPANESE RUN 7.94
951 196 CRUELTY UNFORGIVEN 7.93
952 718 REBEL AIRPORT 7.92
953 996 YOUNG LANGUAGE 6.93
954 261 DUFFEL APOCALYPSE 6.93
955 335 FREEDOM CLEOPATRA 5.95
956 635 OKLAHOMA JUMANJI 5.94
957 885 TEXAS WATCH 5.94

958 rows × 3 columns

실습 추가)

  • 결제 누적액이 많은 유저 상위 10명 ( customer, payment )

In [3]:
customer_df = pd.read_sql("SELECT * FROM customer;", db)

In [4]:
customer_df.head(1)


Out[4]:
customer_id store_id first_name last_name email address_id active create_date last_update
0 1 1 MARY SMITH MARY.SMITH@sakilacustomer.org 5 1 2006-02-14 22:04:36 2006-02-15 04:57:20

In [5]:
payment_df.head(1)


Out[5]:
payment_id customer_id staff_id rental_id amount payment_date last_update
0 1 1 1 76.0 2.99 2005-05-25 11:30:37 2006-02-15 22:12:30

In [3]:
SQL_QUERY = """
    SELECT c.first_name, c.last_name, SUM(p.amount) "revenue"
    FROM customer c
        JOIN payment p
        ON c.customer_id = p.customer_id
    GROUP BY c.customer_id
    ORDER BY revenue DESC
    ;
"""
pd.read_sql(SQL_QUERY, db)


Out[3]:
first_name last_name revenue
0 KARL SEAL 221.55
1 ELEANOR HUNT 216.54
2 CLARA SHAW 195.58
3 MARION SNYDER 194.61
4 RHONDA KENNEDY 194.61
5 TOMMY COLLAZO 186.62
6 WESLEY BULL 177.60
7 TIM CARY 175.61
8 MARCIA DEAN 175.58
9 ANA BRADLEY 174.66
10 JUNE CARROLL 173.63
11 LENA JENSEN 170.67
12 DIANE COLLINS 169.65
13 ARNOLD HAVENS 167.67
14 CURTIS IRBY 167.62
15 MIKE WAY 166.65
16 DAISY BATES 162.62
17 TONYA CHAPMAN 161.68
18 LOUIS LEONE 161.65
19 GORDON ALLARD 160.68
20 BRITTANY RILEY 159.72
21 GUY BROWNLEE 159.68
22 WARREN SHERROD 159.67
23 MARGIE WADE 159.64
24 MAE FLETCHER 158.69
25 STEVE MACKENZIE 158.66
26 HARRY ARCE 157.65
27 PRISCILLA LOWE 157.65
28 ARTHUR SIMPKINS 155.68
29 MICHELLE CLARK 155.65
... ... ... ...
569 JO FOWLER 73.80
570 JUAN FRALEY 73.77
571 FELIX GAFFNEY 73.76
572 JONATHAN SCARBOROUGH 72.82
573 TERRY GRISSOM 72.80
574 EUGENE CULPEPPER 71.81
575 EDITH MCDONALD 71.81
576 ANTHONY SCHWAB 71.80
577 LAUREN HUDSON 71.80
578 SAMANTHA DUNCAN 71.77
579 IRMA PEARSON 70.82
580 WAYNE TRUONG 70.81
581 FLOYD GANDY 69.83
582 PENNY NEAL 68.82
583 JILL HAWKINS 68.79
584 VERA MCCOY 67.82
585 JOANN GARDNER 66.84
586 LLOYD DOWD 66.81
587 ALBERTO HENNING 66.79
588 LESTER KRAUS 65.84
589 KIRK STCLAIR 64.81
590 MATTIE HOFFMAN 64.78
591 ANITA MORALES 62.85
592 TIFFANY JORDAN 59.86
593 KATHERINE RIVERA 58.86
594 ANNIE RUSSELL 58.82
595 JOHNNY TURPIN 57.81
596 BRIAN WYMAN 52.88
597 LEONA OBRIEN 50.86
598 CAROLINE BOWMAN 50.85

599 rows × 3 columns

  • 영화를 흥행시킨 ( 매출이 많이 발생한 ) 배우 상위 10명
  • rental, payment ...

actor_df => actor_id, first_name, last_name film_actor_df => actor_id, film_id inventory_df => inventory_id, film_id rental_df => rental_id, inventory_id payment_df => rental_id, amount


In [12]:
SQL_QUERY = """
    SELECT a.first_name, a.last_name, SUM(p.amount) "revenue"
    FROM
        actor a,
        film_actor fa,
        inventory i,
        rental r,
        payment p
    WHERE
        a.actor_id = fa.actor_id
        AND fa.film_id = i.film_id
        AND i.inventory_id = r.inventory_id
        AND r.rental_id = p.rental_id
    GROUP BY a.actor_id
    ORDER BY revenue DESC
    ;
"""
pd.read_sql(SQL_QUERY, db)


Out[12]:
first_name last_name revenue
0 GINA DEGENERES 3442.49
1 MATTHEW CARREY 2742.19
2 MARY KEITEL 2689.25
3 SCARLETT DAMON 2655.28
4 WALTER TORN 2620.62
5 ANGELA WITHERSPOON 2614.46
6 CHRISTIAN AKROYD 2611.49
7 HENRY BERRY 2602.88
8 WOODY HOFFMAN 2546.40
9 CAMERON ZELLWEGER 2529.41
10 SANDRA KILMER 2485.96
11 BEN WILLIS 2459.49
12 SEAN GUINESS 2424.00
13 MICHAEL BOLGER 2401.48
14 JAYNE NOLTE 2400.86
15 KEVIN GARLAND 2393.36
16 WHOOPI HURT 2366.72
17 ANGELA HUDSON 2361.26
18 HELEN VOIGHT 2358.45
19 KIRSTEN PALTROW 2344.49
20 JULIA MCQUEEN 2330.51
21 ED MANSFIELD 2323.21
22 RENEE TRACY 2321.56
23 UMA WOOD 2314.64
24 REESE WEST 2299.47
25 JADA RYDER 2294.43
26 REESE KILMER 2273.52
27 RIP CRAWFORD 2271.88
28 WARREN NOLTE 2271.37
29 EWAN GOODING 2264.31
... ... ... ...
170 CAMERON STREEP 1421.83
171 CHRIS DEPP 1420.17
172 RITA REYNOLDS 1420.06
173 DAN TORN 1419.95
174 MINNIE KILMER 1417.09
175 JIM MOSTEL 1406.17
176 BETTE NICHOLSON 1402.21
177 MENA HOPPER 1396.44
178 AUDREY BAILEY 1390.68
179 JOE SWANK 1389.21
180 DAN STREEP 1380.12
181 CHRISTOPHER BERRY 1379.30
182 CUBA BIRCH 1370.38
183 CATE HARRIS 1360.37
184 SUSAN DAVIS 1354.79
185 CARY MCCONAUGHEY 1330.57
186 CAMERON WRAY 1327.97
187 ED CHASE 1322.89
188 JEFF SILVERSTONE 1278.74
189 KENNETH PESCI 1258.08
190 RUSSELL CLOSE 1251.05
191 PENELOPE GUINESS 1230.94
192 JULIA FAWCETT 1189.42
193 JULIA ZELLWEGER 1064.79
194 JENNIFER DAVIS 1052.27
195 SANDRA PECK 1040.12
196 JUDY DEAN 976.47
197 ADAM GRANT 974.19
198 SISSY SOBIESKI 902.65
199 EMILY DEE 883.85

200 rows × 3 columns