In [2]:
# Python and SQL and Pandas
## PostgreSQL with psycopg2
import psycopg2

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [28]:
# Connect to a database
conn = psycopg2.connect("dbname=testdb user=postgres")

# Open a cursor to perform db ops
cur = conn.cursor()

# Execute comand to create a new table
cur.execute("DROP TABLE IF EXISTS test;")
cur.execute("CREATE TABLE test (id serial PRIMARY KEY, num integer, data varchar);")

# Pass data to fill a query placeholders and let Psycopg perform
# the correct conversion (no more SQL injections!)
cur.execute("INSERT INTO test (num, data) VALUES (%s, %s)", (100, "'abc'def"))
cur.execute("INSERT INTO test (num, data) VALUES (%s, %s)", (200, "'abc'xyz"))

# Query the database and obtain data as Python objects
cur.execute("SELECT * FROM test;")
result = cur.fetchone()
print(result)
result = cur.fetchmany()
print(result)
result = cur.fetchall()
print(result)
            
# Make the changes to the database persistent / use rollback() for undo
conn.commit()

# Close communication with the database
cur.close()
conn.close()


(1, 100, "'abc'def")
[(2, 200, "'abc'xyz")]
[]

The problem with the query parameters

The SQL representation of many data types is often different from their Python string representation. The typical example is with single quotes in strings: in SQL single quotes are used as string literal delimiters, so the ones appearing inside the string itself must be escaped, whereas in Python single quotes can be left unescaped if the string is delimited by double quotes.

Because of the difference, sometime subtle, between the data types representations, a naïve approach to query strings composition, such as using Python strings concatenation, is a recipe for terrible problems:

SQL = "INSERT INTO authors (name) VALUES ('%s');" # NEVER DO THIS data = ("O'Reilly", ) cur.execute(SQL % data) # THIS WILL FAIL MISERABLY ProgrammingError: syntax error at or near "Reilly" LINE 1: INSERT INTO authors (name) VALUES ('O'Reilly') ^

If the variables containing the data to send to the database come from an untrusted source (such as a form published on a web site) an attacker could easily craft a malformed string, either gaining access to unauthorized data or performing destructive operations on the database. This form of attack is called SQL injection and is known to be one of the most widespread forms of attack to database servers. Before continuing, please print this page as a memo and hang it onto your desk.

Psycopg can automatically convert Python objects to and from SQL literals: using this feature your code will be more robust and reliable.

Never, never, NEVER use Python string concatenation (+) or string parameters interpolation (%) to pass variables to a SQL query string. Not even at gunpoint.

  • The correct way to pass variables in a SQL command is using the second argument of the execute() method:

SQL = "INSERT INTO authors (name) VALUES (%s);" # Note: no quotes data = ("O'Reilly", ) cur.execute(SQL, data) # Note: no % operator


In [29]:
# More on CSV files
dframe = pd.read_csv('../../data/data_rep.csv')
dframe


Out[29]:
Unnamed: 0 Monday Tuesday Wednesday Thursday Friday Saturday Sunday Server Host Runner Busser Bar-back Bar Pref. Emp. Pos. Pref. Days
0 Ryo 0 1 1 1 1 1 0 0 0 0 0 0 4 1 6 4
1 Joji 1 1 1 1 1 1 1 0 0 0 0 0 3 0 6 3
2 Erinn 1 1 1 1 1 1 1 0 1 0 0 2 0 0 5 2
3 Masa 1 1 1 1 1 1 0 2 5 1 0 0 1 1 2 5
4 Mari 1 1 1 1 1 1 1 3 2 1 0 0 0 0 1 4
5 Dan 1 1 0 1 0 1 1 3 0 0 0 0 0 1 1 2
6 Jay 1 1 1 1 1 0 0 2 0 0 0 0 0 0 1 2
7 Machiko 0 1 0 0 1 0 0 2 0 1 0 0 0 1 1 2
8 Masami 1 1 1 1 1 1 1 3 0 0 0 0 0 1 1 3
9 Nana 1 1 1 1 1 1 1 3 0 1 0 0 0 0 1 3
10 Shiho 1 1 1 1 1 1 1 4 0 4 4 0 0 0 4 4
11 Saya 1 1 0 1 1 1 1 0 0 2 0 0 0 1 3 2
12 Stephanie 0 0 0 0 1 1 0 0 0 2 0 0 0 1 3 2
13 Tamiko 1 1 0 0 0 0 1 2 0 2 0 0 0 0 1 2
14 Tomoyo 1 1 1 1 1 1 1 0 0 0 3 0 0 0 3 3
15 Naoko 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3

In [30]:
file = '../../data/data_rep.csv'
dframe = pd.read_csv(file, header=None)
dframe


Out[30]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 NaN Monday Tuesday Wednesday Thursday Friday Saturday Sunday Server Host Runner Busser Bar-back Bar Pref. Emp. Pos. Pref. Days
1 Ryo 0 1 1 1 1 1 0 0 0 0 0 0 4 1 6 4
2 Joji 1 1 1 1 1 1 1 0 0 0 0 0 3 0 6 3
3 Erinn 1 1 1 1 1 1 1 0 1 0 0 2 0 0 5 2
4 Masa 1 1 1 1 1 1 0 2 5 1 0 0 1 1 2 5
5 Mari 1 1 1 1 1 1 1 3 2 1 0 0 0 0 1 4
6 Dan 1 1 0 1 0 1 1 3 0 0 0 0 0 1 1 2
7 Jay 1 1 1 1 1 0 0 2 0 0 0 0 0 0 1 2
8 Machiko 0 1 0 0 1 0 0 2 0 1 0 0 0 1 1 2
9 Masami 1 1 1 1 1 1 1 3 0 0 0 0 0 1 1 3
10 Nana 1 1 1 1 1 1 1 3 0 1 0 0 0 0 1 3
11 Shiho 1 1 1 1 1 1 1 4 0 4 4 0 0 0 4 4
12 Saya 1 1 0 1 1 1 1 0 0 2 0 0 0 1 3 2
13 Stephanie 0 0 0 0 1 1 0 0 0 2 0 0 0 1 3 2
14 Tamiko 1 1 0 0 0 0 1 2 0 2 0 0 0 0 1 2
15 Tomoyo 1 1 1 1 1 1 1 0 0 0 3 0 0 0 3 3
16 Naoko 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3

In [31]:
dframe = pd.read_csv(file, sep=',')
dframe


Out[31]:
Unnamed: 0 Monday Tuesday Wednesday Thursday Friday Saturday Sunday Server Host Runner Busser Bar-back Bar Pref. Emp. Pos. Pref. Days
0 Ryo 0 1 1 1 1 1 0 0 0 0 0 0 4 1 6 4
1 Joji 1 1 1 1 1 1 1 0 0 0 0 0 3 0 6 3
2 Erinn 1 1 1 1 1 1 1 0 1 0 0 2 0 0 5 2
3 Masa 1 1 1 1 1 1 0 2 5 1 0 0 1 1 2 5
4 Mari 1 1 1 1 1 1 1 3 2 1 0 0 0 0 1 4
5 Dan 1 1 0 1 0 1 1 3 0 0 0 0 0 1 1 2
6 Jay 1 1 1 1 1 0 0 2 0 0 0 0 0 0 1 2
7 Machiko 0 1 0 0 1 0 0 2 0 1 0 0 0 1 1 2
8 Masami 1 1 1 1 1 1 1 3 0 0 0 0 0 1 1 3
9 Nana 1 1 1 1 1 1 1 3 0 1 0 0 0 0 1 3
10 Shiho 1 1 1 1 1 1 1 4 0 4 4 0 0 0 4 4
11 Saya 1 1 0 1 1 1 1 0 0 2 0 0 0 1 3 2
12 Stephanie 0 0 0 0 1 1 0 0 0 2 0 0 0 1 3 2
13 Tamiko 1 1 0 0 0 0 1 2 0 2 0 0 0 0 1 2
14 Tomoyo 1 1 1 1 1 1 1 0 0 0 3 0 0 0 3 3
15 Naoko 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3

In [32]:
pd.read_csv(file, nrows=2)


Out[32]:
Unnamed: 0 Monday Tuesday Wednesday Thursday Friday Saturday Sunday Server Host Runner Busser Bar-back Bar Pref. Emp. Pos. Pref. Days
0 Ryo 0 1 1 1 1 1 0 0 0 0 0 0 4 1 6 4
1 Joji 1 1 1 1 1 1 1 0 0 0 0 0 3 0 6 3

In [33]:
# Can export to file by dframe.to_csv('name.csv')
import sys
dframe.to_csv(sys.stdout, sep='_')


_Unnamed: 0_Monday_Tuesday_Wednesday_Thursday_Friday_Saturday_Sunday_Server_Host_Runner_Busser_Bar-back_Bar_Pref. Emp._Pos. Pref._Days
0_Ryo_0_1_1_1_1_1_0_0_0_0_0_0_4_1_6_4
1_Joji_1_1_1_1_1_1_1_0_0_0_0_0_3_0_6_3
2_Erinn_1_1_1_1_1_1_1_0_1_0_0_2_0_0_5_2
3_Masa_1_1_1_1_1_1_0_2_5_1_0_0_1_1_2_5
4_Mari_1_1_1_1_1_1_1_3_2_1_0_0_0_0_1_4
5_Dan_1_1_0_1_0_1_1_3_0_0_0_0_0_1_1_2
6_Jay_1_1_1_1_1_0_0_2_0_0_0_0_0_0_1_2
7_Machiko_0_1_0_0_1_0_0_2_0_1_0_0_0_1_1_2
8_Masami_1_1_1_1_1_1_1_3_0_0_0_0_0_1_1_3
9_Nana_1_1_1_1_1_1_1_3_0_1_0_0_0_0_1_3
10_Shiho_1_1_1_1_1_1_1_4_0_4_4_0_0_0_4_4
11_Saya_1_1_0_1_1_1_1_0_0_2_0_0_0_1_3_2
12_Stephanie_0_0_0_0_1_1_0_0_0_2_0_0_0_1_3_2
13_Tamiko_1_1_0_0_0_0_1_2_0_2_0_0_0_0_1_2
14_Tomoyo_1_1_1_1_1_1_1_0_0_0_3_0_0_0_3_3
15_Naoko_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_3

In [3]:
# HTML and XML
#from bs4 import BeautifulSoup
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
dframe_list = pd.io.html.read_html(url)
dframe = dframe_list[0]
dframe


Out[3]:
Bank Name City ST CERT Acquiring Institution Closing Date Updated Date Loss Share Type Agreement Terminated Termination Date
0 Hometown National Bank Longview WA 35156 Twin City Bank October 2, 2015 October 15, 2015 NaN NaN NaN
1 The Bank of Georgia Peachtree City GA 35259 Fidelity Bank October 2, 2015 October 15, 2015 NaN NaN NaN
2 Premier Bank Denver CO 34112 United Fidelity Bank, fsb July 10, 2015 December 17, 2015 none NaN NaN
3 Edgebrook Bank Chicago IL 57772 Republic Bank of Chicago May 8, 2015 July 23, 2015 none NaN NaN
4 Doral BankEn Espanol San Juan PR 32102 Banco Popular de Puerto Rico February 27, 2015 May 13, 2015 none NaN NaN
5 Capitol City Bank & Trust Company Atlanta GA 33938 First-Citizens Bank & Trust Company February 13, 2015 April 21, 2015 none NaN NaN
6 Highland Community Bank Chicago IL 20290 United Fidelity Bank, fsb January 23, 2015 April 21, 2015 none NaN NaN
7 First National Bank of Crestview Crestview FL 17557 First NBC Bank January 16, 2015 April 21, 2015 none NaN NaN
8 Northern Star Bank Mankato MN 34983 BankVista December 19, 2014 March 26, 2015 none NaN NaN
9 Frontier Bank, FSB D/B/A El Paseo Bank Palm Desert CA 34738 Bank of Southern California, N.A. November 7, 2014 March 26, 2015 none NaN NaN
10 The National Republic Bank of Chicago Chicago IL 916 State Bank of Texas October 24, 2014 March 26, 2015 none NaN NaN
11 NBRS Financial Rising Sun MD 4862 Howard Bank October 17, 2014 March 26, 2015 none NaN NaN
12 GreenChoice Bank, fsb Chicago IL 28462 Providence Bank, LLC July 25, 2014 July 28, 2015 none NaN NaN
13 Eastside Commercial Bank Conyers GA 58125 Community & Southern Bank July 18, 2014 July 28, 2015 none NaN NaN
14 The Freedom State Bank Freedom OK 12483 Alva State Bank & Trust Company June 27, 2014 July 18, 2014 none NaN NaN
15 Valley Bank Fort Lauderdale FL 21793 Landmark Bank, National Association June 20, 2014 June 29, 2015 none NaN NaN
16 Valley Bank Moline IL 10450 Great Southern Bank June 20, 2014 June 26, 2015 none NaN NaN
17 Slavie Federal Savings Bank Bel Air MD 32368 Bay Bank, FSB May 30, 2014 June 15, 2015 none NaN NaN
18 Columbia Savings Bank Cincinnati OH 32284 United Fidelity Bank, fsb May 23, 2014 May 28, 2015 none NaN NaN
19 AztecAmerica Bank En Espanol Berwyn IL 57866 Republic Bank of Chicago May 16, 2014 July 18, 2014 none NaN NaN
20 Allendale County Bank Fairfax SC 15062 Palmetto State Bank April 25, 2014 July 18, 2014 none NaN NaN
21 Vantage Point Bank Horsham PA 58531 First Choice Bank February 28, 2014 March 3, 2015 none NaN NaN
22 Millennium Bank, National Association Sterling VA 35096 WashingtonFirst Bank February 28, 2014 March 03, 2015 none NaN NaN
23 Syringa Bank Boise ID 34296 Sunwest Bank January 31, 2014 February 19, 2015 none NaN NaN
24 The Bank of Union El Reno OK 17967 BancFirst January 24, 2014 February 19, 2015 none NaN NaN
25 DuPage National Bank West Chicago IL 5732 Republic Bank of Chicago January 17, 2014 February 19, 2015 none NaN NaN
26 Texas Community Bank, National Association The Woodlands TX 57431 Spirit of Texas Bank, SSB December 13, 2013 December 29, 2014 none NaN NaN
27 Bank of Jackson County Graceville FL 14794 First Federal Bank of Florida October 30, 2013 November 05, 2014 none NaN NaN
28 First National Bank also operating as The Nat... Edinburg TX 14318 PlainsCapital Bank September 13, 2013 May 27, 2015 SFR/NSF NaN NaN
29 The Community's Bank Bridgeport CT 57041 No Acquirer September 13, 2013 December 7, 2015 none NaN NaN
... ... ... ... ... ... ... ... ... ... ...
512 ANB Financial, NA Bentonville AR 33901 Pulaski Bank and Trust Company May 9, 2008 August 28, 2012 none NaN NaN
513 Hume Bank Hume MO 1971 Security Bank March 7, 2008 August 28, 2012 none NaN NaN
514 Douglass National Bank Kansas City MO 24660 Liberty Bank and Trust Company January 25, 2008 October 26, 2012 none NaN NaN
515 Miami Valley Bank Lakeview OH 16848 The Citizens Banking Company October 4, 2007 August 28, 2012 none NaN NaN
516 NetBank Alpharetta GA 32575 ING DIRECT September 28, 2007 August 28, 2012 none NaN NaN
517 Metropolitan Savings Bank Pittsburgh PA 35353 Allegheny Valley Bank of Pittsburgh February 2, 2007 October 27, 2010 none NaN NaN
518 Bank of Ephraim Ephraim UT 1249 Far West Bank June 25, 2004 April 9, 2008 none NaN NaN
519 Reliance Bank White Plains NY 26778 Union State Bank March 19, 2004 April 9, 2008 none NaN NaN
520 Guaranty National Bank of Tallahassee Tallahassee FL 26838 Hancock Bank of Florida March 12, 2004 June 5, 2012 none NaN NaN
521 Dollar Savings Bank Newark NJ 31330 No Acquirer February 14, 2004 April 9, 2008 none NaN NaN
522 Pulaski Savings Bank Philadelphia PA 27203 Earthstar Bank November 14, 2003 July 22, 2005 none NaN NaN
523 First National Bank of Blanchardville Blanchardville WI 11639 The Park Bank May 9, 2003 June 5, 2012 none NaN NaN
524 Southern Pacific Bank Torrance CA 27094 Beal Bank February 7, 2003 October 20, 2008 none NaN NaN
525 Farmers Bank of Cheneyville Cheneyville LA 16445 Sabine State Bank & Trust December 17, 2002 October 20, 2004 none NaN NaN
526 Bank of Alamo Alamo TN 9961 No Acquirer November 8, 2002 March 18, 2005 none NaN NaN
527 AmTrade International BankEn Espanol Atlanta GA 33784 No Acquirer September 30, 2002 September 11, 2006 none NaN NaN
528 Universal Federal Savings Bank Chicago IL 29355 Chicago Community Bank June 27, 2002 April 9, 2008 none NaN NaN
529 Connecticut Bank of Commerce Stamford CT 19183 Hudson United Bank June 26, 2002 February 14, 2012 none NaN NaN
530 New Century Bank Shelby Township MI 34979 No Acquirer March 28, 2002 March 18, 2005 none NaN NaN
531 Net 1st National Bank Boca Raton FL 26652 Bank Leumi USA March 1, 2002 April 9, 2008 none NaN NaN
532 NextBank, NA Phoenix AZ 22314 No Acquirer February 7, 2002 February 5, 2015 none NaN NaN
533 Oakwood Deposit Bank Co. Oakwood OH 8966 The State Bank & Trust Company February 1, 2002 October 25, 2012 none NaN NaN
534 Bank of Sierra Blanca Sierra Blanca TX 22002 The Security State Bank of Pecos January 18, 2002 November 6, 2003 none NaN NaN
535 Hamilton Bank, NAEn Espanol Miami FL 24382 Israel Discount Bank of New York January 11, 2002 September 21, 2015 none NaN NaN
536 Sinclair National Bank Gravette AR 34248 Delta Trust & Bank September 7, 2001 February 10, 2004 none NaN NaN
537 Superior Bank, FSB Hinsdale IL 32646 Superior Federal, FSB July 27, 2001 August 19, 2014 none NaN NaN
538 Malta National Bank Malta OH 6629 North Valley Bank May 3, 2001 November 18, 2002 none NaN NaN
539 First Alliance Bank & Trust Co. Manchester NH 34264 Southern New Hampshire Bank & Trust February 2, 2001 February 18, 2003 none NaN NaN
540 National State Bank of Metropolis Metropolis IL 3815 Banterra Bank of Marion December 14, 2000 March 17, 2005 none NaN NaN
541 Bank of Honolulu Honolulu HI 21029 Bank of the Orient October 13, 2000 March 17, 2005 none NaN NaN

542 rows × 10 columns


In [41]:
# Merging Frames: like equijoins in SQL
dframe1 = DataFrame({'key':['x','z','y','z','x','x'], 'dataset_1':np.arange(6)})
dframe2 = DataFrame({'key':['q','y','z'], 'dataset_2':[1,2,3]})
pd.merge(dframe1,dframe2)


Out[41]:
dataset_1 key dataset_2
0 1 z 3
1 3 z 3
2 2 y 2

In [42]:
pd.merge(dframe1, dframe2, on='key') # Chooses what frame to merge on


Out[42]:
dataset_1 key dataset_2
0 1 z 3
1 3 z 3
2 2 y 2

In [43]:
pd.merge(dframe1,dframe2,on='key',how='left')


Out[43]:
dataset_1 key dataset_2
0 0 x NaN
1 1 z 3
2 2 y 2
3 3 z 3
4 4 x NaN
5 5 x NaN

In [44]:
pd.merge(dframe1,dframe2,how='right') # Can also apply a full outer join by 'outer' keyword


Out[44]:
dataset_1 key dataset_2
0 1 z 3
1 3 z 3
2 2 y 2
3 NaN q 1

In [ ]: