In [2]:
# Python and SQL and Pandas
## PostgreSQL with psycopg2
import psycopg2
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
In [28]:
# Connect to a database
conn = psycopg2.connect("dbname=testdb user=postgres")
# Open a cursor to perform db ops
cur = conn.cursor()
# Execute comand to create a new table
cur.execute("DROP TABLE IF EXISTS test;")
cur.execute("CREATE TABLE test (id serial PRIMARY KEY, num integer, data varchar);")
# Pass data to fill a query placeholders and let Psycopg perform
# the correct conversion (no more SQL injections!)
cur.execute("INSERT INTO test (num, data) VALUES (%s, %s)", (100, "'abc'def"))
cur.execute("INSERT INTO test (num, data) VALUES (%s, %s)", (200, "'abc'xyz"))
# Query the database and obtain data as Python objects
cur.execute("SELECT * FROM test;")
result = cur.fetchone()
print(result)
result = cur.fetchmany()
print(result)
result = cur.fetchall()
print(result)
# Make the changes to the database persistent / use rollback() for undo
conn.commit()
# Close communication with the database
cur.close()
conn.close()
The SQL representation of many data types is often different from their Python string representation. The typical example is with single quotes in strings: in SQL single quotes are used as string literal delimiters, so the ones appearing inside the string itself must be escaped, whereas in Python single quotes can be left unescaped if the string is delimited by double quotes.
Because of the difference, sometime subtle, between the data types representations, a naïve approach to query strings composition, such as using Python strings concatenation, is a recipe for terrible problems:
SQL = "INSERT INTO authors (name) VALUES ('%s');" # NEVER DO THIS data = ("O'Reilly", ) cur.execute(SQL % data) # THIS WILL FAIL MISERABLY ProgrammingError: syntax error at or near "Reilly" LINE 1: INSERT INTO authors (name) VALUES ('O'Reilly') ^
If the variables containing the data to send to the database come from an untrusted source (such as a form published on a web site) an attacker could easily craft a malformed string, either gaining access to unauthorized data or performing destructive operations on the database. This form of attack is called SQL injection and is known to be one of the most widespread forms of attack to database servers. Before continuing, please print this page as a memo and hang it onto your desk.
Psycopg can automatically convert Python objects to and from SQL literals: using this feature your code will be more robust and reliable.
SQL = "INSERT INTO authors (name) VALUES (%s);" # Note: no quotes data = ("O'Reilly", ) cur.execute(SQL, data) # Note: no % operator
In [29]:
# More on CSV files
dframe = pd.read_csv('../../data/data_rep.csv')
dframe
Out[29]:
In [30]:
file = '../../data/data_rep.csv'
dframe = pd.read_csv(file, header=None)
dframe
Out[30]:
In [31]:
dframe = pd.read_csv(file, sep=',')
dframe
Out[31]:
In [32]:
pd.read_csv(file, nrows=2)
Out[32]:
In [33]:
# Can export to file by dframe.to_csv('name.csv')
import sys
dframe.to_csv(sys.stdout, sep='_')
In [3]:
# HTML and XML
#from bs4 import BeautifulSoup
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
dframe_list = pd.io.html.read_html(url)
dframe = dframe_list[0]
dframe
Out[3]:
In [41]:
# Merging Frames: like equijoins in SQL
dframe1 = DataFrame({'key':['x','z','y','z','x','x'], 'dataset_1':np.arange(6)})
dframe2 = DataFrame({'key':['q','y','z'], 'dataset_2':[1,2,3]})
pd.merge(dframe1,dframe2)
Out[41]:
In [42]:
pd.merge(dframe1, dframe2, on='key') # Chooses what frame to merge on
Out[42]:
In [43]:
pd.merge(dframe1,dframe2,on='key',how='left')
Out[43]:
In [44]:
pd.merge(dframe1,dframe2,how='right') # Can also apply a full outer join by 'outer' keyword
Out[44]:
In [ ]: