SQL

Accessing data stored in databases is a routine exercise. I demonstrate a few helpful methods in the Jupyter Notebook.


In [1]:
%load_ext sql_magic



In [2]:
import sqlalchemy
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
sqlite_engine = create_engine('sqlite://')

In [4]:
%config SQL.conn_name = "sqlite_engine"

In [5]:
%config SQL


SQL options
---------
SQL.conn_name=<Unicode>
    Current: 'sqlite_engine'
    Object name for accessing computing resource environment
SQL.notify_result=<Bool>
    Current: True
    Notify query result to stdout
SQL.output_result=<Bool>
    Current: True
    Output query result to stdout

In [6]:
%config SQL.output_result = False
CREATE TABLE presidents (first_name, last_name, year_of_birth);
INSERT INTO presidents VALUES ('George', 'Washington', 1732);
INSERT INTO presidents VALUES ('John', 'Adams', 1735);
INSERT INTO presidents VALUES ('Thomas', 'Jefferson', 1743);
INSERT INTO presidents VALUES ('James', 'Madison', 1751);
INSERT INTO presidents VALUES ('James', 'Monroe', 1758);
INSERT INTO presidents VALUES ('Zachary', 'Taylor', 1784);
INSERT INTO presidents VALUES ('Abraham', 'Lincoln', 1809);
INSERT INTO presidents VALUES ('Theodore', 'Roosevelt', 1858);
INSERT INTO presidents VALUES ('Richard', 'Nixon', 1913);
INSERT INTO presidents VALUES ('Barack', 'Obama', 1961);

In [7]:
%%read_sql temp
CREATE TABLE presidents (first_name, last_name, year_of_birth);
INSERT INTO presidents VALUES ('George', 'Washington', 1732);
INSERT INTO presidents VALUES ('John', 'Adams', 1735);
INSERT INTO presidents VALUES ('Thomas', 'Jefferson', 1743);
INSERT INTO presidents VALUES ('James', 'Madison', 1751);
INSERT INTO presidents VALUES ('James', 'Monroe', 1758);
INSERT INTO presidents VALUES ('Zachary', 'Taylor', 1784);
INSERT INTO presidents VALUES ('Abraham', 'Lincoln', 1809);
INSERT INTO presidents VALUES ('Theodore', 'Roosevelt', 1858);
INSERT INTO presidents VALUES ('Richard', 'Nixon', 1913);
INSERT INTO presidents VALUES ('Barack', 'Obama', 1961);


Query started at 04:40:23 PM PST; Query executed in 0.00 m

In [8]:
%%read_sql df
SELECT * FROM presidents


Query started at 04:40:37 PM PST; Query executed in 0.00 m

In [9]:
df


Out[9]:
first_name last_name year_of_birth
0 George Washington 1732
1 John Adams 1735
2 Thomas Jefferson 1743
3 James Madison 1751
4 James Monroe 1758
5 Zachary Taylor 1784
6 Abraham Lincoln 1809
7 Theodore Roosevelt 1858
8 Richard Nixon 1913
9 Barack Obama 1961

Inline magic


In [10]:
later_presidents = %read_sql SELECT * FROM presidents WHERE year_of_birth > 1825
later_presidents


Query started at 04:40:48 PM PST; Query executed in 0.00 m
Out[10]:
first_name last_name year_of_birth
0 Theodore Roosevelt 1858
1 Richard Nixon 1913
2 Barack Obama 1961

In [11]:
%%read_sql later_presidents
SELECT * FROM presidents WHERE year_of_birth > 1825


Query started at 04:41:13 PM PST; Query executed in 0.00 m

Through pandas directly


In [12]:
birthyear = 1800

In [13]:
%%read_sql df1
SELECT first_name,
       last_name,
       year_of_birth
FROM presidents
WHERE year_of_birth > {birthyear}


Query started at 04:41:22 PM PST; Query executed in 0.00 m

In [14]:
df1


Out[14]:
first_name last_name year_of_birth
0 Abraham Lincoln 1809
1 Theodore Roosevelt 1858
2 Richard Nixon 1913
3 Barack Obama 1961

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [15]:
con = sqlite3.connect("presidents.sqlite")
# cur = con.cursor()

new_dataframe = pd.read_sql(f"""SELECT first_name, last_name, year_of_birth
                               FROM presidents
                               WHERE year_of_birth > {birthyear}
                               """, 
                            con=con)

con.close()

In [16]:
new_dataframe


Out[16]:
first_name last_name year_of_birth
0 Theodore Roosevelt 1858
1 Richard Nixon 1913
2 Barack Obama 1961

In [17]:
df.head()


Out[17]:
first_name last_name year_of_birth
0 George Washington 1732
1 John Adams 1735
2 Thomas Jefferson 1743
3 James Madison 1751
4 James Monroe 1758

In [18]:
coal = pd.read_csv("../data/coal_prod_cleaned.csv")
coal.head()


Out[18]:
MSHA_ID Average_Employees Company_Type Labor_Hours Mine_Basin Mine_County Mine_Name Mine_State Mine_Status Mine_Type Operating_Company Operating_Company_Address Operation_Type Production_short_tons Union_Code Year
0 103295 18.0 Independent Producer Operator 39175.0 Appalachia Southern Bibb Seymour Mine Alabama Active Surface Hope Coal Company Inc P.O. Box 249, Maylene, AL 35114 Mine only 105082.0 NaN 2008
1 103117 19.0 Operating Subsidiary 29926.0 Appalachia Southern Cullman Mine #2, #3, #4 Alabama Active, men working, not producing Surface Twin Pines Coal Company Inc 1874 County Road 15, Bremen, AL 35033 Mine only 10419.0 NaN 2008
2 103361 20.0 Operating Subsidiary 42542.0 Appalachia Southern Cullman Cold Springs West Mine Alabama Active Surface Twin Pines Coal Company 74 Industrial Parkway, Jasper, AL 35502 Mine only 143208.0 NaN 2008
3 100759 395.0 Operating Subsidiary 890710.0 Appalachia Southern Fayette North River # 1 Underground Mi Alabama Active Underground Chevron Mining Inc 3114 County Road 63 S, Berry, AL 35546 Mine and Preparation Plant 2923261.0 United Mine Workers of America 2008
4 103246 22.0 Independent Producer Operator 55403.0 Appalachia Southern Franklin Bear Creek Alabama Active Surface Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Mine only 183137.0 NaN 2008

In [19]:
new_dataframe.to_sql('presidents', con=sqlite_engine, if_exists='append', index=False)

In [20]:
coal.to_sql('coal', con=sqlite_engine, if_exists='append', index=False)

In [ ]:


In [ ]:


In [ ]:
%%read_sql example
SELECT * FROM coal

In [ ]:
example.head()

In [ ]:
example.columns