SQL

Accessing data stored in databases is a routine exercise. I demonstrate a few helpful methods in the Jupyter Notebook.



In [1]:

    
%load_ext sql_magic



In [2]:

    
import sqlalchemy
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
sqlite_engine = create_engine('sqlite://')



In [4]:

    
%config SQL.conn_name = "sqlite_engine"



In [5]:

    
%config SQL









    



SQL options
---------
SQL.conn_name=<Unicode>
    Current: 'sqlite_engine'
    Object name for accessing computing resource environment
SQL.notify_result=<Bool>
    Current: True
    Notify query result to stdout
SQL.output_result=<Bool>
    Current: True
    Output query result to stdout



In [6]:

    
%config SQL.output_result = False

CREATE TABLE presidents (first_name, last_name, year_of_birth);
INSERT INTO presidents VALUES ('George', 'Washington', 1732);
INSERT INTO presidents VALUES ('John', 'Adams', 1735);
INSERT INTO presidents VALUES ('Thomas', 'Jefferson', 1743);
INSERT INTO presidents VALUES ('James', 'Madison', 1751);
INSERT INTO presidents VALUES ('James', 'Monroe', 1758);
INSERT INTO presidents VALUES ('Zachary', 'Taylor', 1784);
INSERT INTO presidents VALUES ('Abraham', 'Lincoln', 1809);
INSERT INTO presidents VALUES ('Theodore', 'Roosevelt', 1858);
INSERT INTO presidents VALUES ('Richard', 'Nixon', 1913);
INSERT INTO presidents VALUES ('Barack', 'Obama', 1961);



In [7]:

    
%%read_sql temp
CREATE TABLE presidents (first_name, last_name, year_of_birth);
INSERT INTO presidents VALUES ('George', 'Washington', 1732);
INSERT INTO presidents VALUES ('John', 'Adams', 1735);
INSERT INTO presidents VALUES ('Thomas', 'Jefferson', 1743);
INSERT INTO presidents VALUES ('James', 'Madison', 1751);
INSERT INTO presidents VALUES ('James', 'Monroe', 1758);
INSERT INTO presidents VALUES ('Zachary', 'Taylor', 1784);
INSERT INTO presidents VALUES ('Abraham', 'Lincoln', 1809);
INSERT INTO presidents VALUES ('Theodore', 'Roosevelt', 1858);
INSERT INTO presidents VALUES ('Richard', 'Nixon', 1913);
INSERT INTO presidents VALUES ('Barack', 'Obama', 1961);









    



Query started at 04:40:23 PM PST; Query executed in 0.00 m



In [8]:

    
%%read_sql df
SELECT * FROM presidents









    



Query started at 04:40:37 PM PST; Query executed in 0.00 m



In [9]:

    
df









    Out[9]:







  
    
      
      first_name
      last_name
      year_of_birth
    
  
  
    
      0
      George
      Washington
      1732
    
    
      1
      John
      Adams
      1735
    
    
      2
      Thomas
      Jefferson
      1743
    
    
      3
      James
      Madison
      1751
    
    
      4
      James
      Monroe
      1758
    
    
      5
      Zachary
      Taylor
      1784
    
    
      6
      Abraham
      Lincoln
      1809
    
    
      7
      Theodore
      Roosevelt
      1858
    
    
      8
      Richard
      Nixon
      1913
    
    
      9
      Barack
      Obama
      1961

Inline magic



In [10]:

    
later_presidents = %read_sql SELECT * FROM presidents WHERE year_of_birth > 1825
later_presidents









    



Query started at 04:40:48 PM PST; Query executed in 0.00 m





    Out[10]:







  
    
      
      first_name
      last_name
      year_of_birth
    
  
  
    
      0
      Theodore
      Roosevelt
      1858
    
    
      1
      Richard
      Nixon
      1913
    
    
      2
      Barack
      Obama
      1961



In [11]:

    
%%read_sql later_presidents
SELECT * FROM presidents WHERE year_of_birth > 1825









    



Query started at 04:41:13 PM PST; Query executed in 0.00 m

Through pandas directly



In [12]:

    
birthyear = 1800



In [13]:

    
%%read_sql df1
SELECT first_name,
       last_name,
       year_of_birth
FROM presidents
WHERE year_of_birth > {birthyear}









    



Query started at 04:41:22 PM PST; Query executed in 0.00 m



In [14]:

    
df1









    Out[14]:







  
    
      
      first_name
      last_name
      year_of_birth
    
  
  
    
      0
      Abraham
      Lincoln
      1809
    
    
      1
      Theodore
      Roosevelt
      1858
    
    
      2
      Richard
      Nixon
      1913
    
    
      3
      Barack
      Obama
      1961



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [15]:

    
con = sqlite3.connect("presidents.sqlite")
# cur = con.cursor()

new_dataframe = pd.read_sql(f"""SELECT first_name, last_name, year_of_birth
                               FROM presidents
                               WHERE year_of_birth > {birthyear}
                               """, 
                            con=con)

con.close()



In [16]:

    
new_dataframe









    Out[16]:







  
    
      
      first_name
      last_name
      year_of_birth
    
  
  
    
      0
      Theodore
      Roosevelt
      1858
    
    
      1
      Richard
      Nixon
      1913
    
    
      2
      Barack
      Obama
      1961



In [17]:

    
df.head()









    Out[17]:







  
    
      
      first_name
      last_name
      year_of_birth
    
  
  
    
      0
      George
      Washington
      1732
    
    
      1
      John
      Adams
      1735
    
    
      2
      Thomas
      Jefferson
      1743
    
    
      3
      James
      Madison
      1751
    
    
      4
      James
      Monroe
      1758



In [18]:

    
coal = pd.read_csv("../data/coal_prod_cleaned.csv")
coal.head()









    Out[18]:







  
    
      
      MSHA_ID
      Average_Employees
      Company_Type
      Labor_Hours
      Mine_Basin
      Mine_County
      Mine_Name
      Mine_State
      Mine_Status
      Mine_Type
      Operating_Company
      Operating_Company_Address
      Operation_Type
      Production_short_tons
      Union_Code
      Year
    
  
  
    
      0
      103295
      18.0
      Independent Producer Operator
      39175.0
      Appalachia Southern
      Bibb
      Seymour Mine
      Alabama
      Active
      Surface
      Hope Coal Company Inc
      P.O. Box 249, Maylene, AL 35114
      Mine only
      105082.0
      NaN
      2008
    
    
      1
      103117
      19.0
      Operating Subsidiary
      29926.0
      Appalachia Southern
      Cullman
      Mine #2, #3, #4
      Alabama
      Active, men working, not producing
      Surface
      Twin Pines Coal Company Inc
      1874 County Road 15, Bremen, AL 35033
      Mine only
      10419.0
      NaN
      2008
    
    
      2
      103361
      20.0
      Operating Subsidiary
      42542.0
      Appalachia Southern
      Cullman
      Cold Springs West Mine
      Alabama
      Active
      Surface
      Twin Pines Coal Company
      74 Industrial Parkway, Jasper, AL 35502
      Mine only
      143208.0
      NaN
      2008
    
    
      3
      100759
      395.0
      Operating Subsidiary
      890710.0
      Appalachia Southern
      Fayette
      North River # 1 Underground Mi
      Alabama
      Active
      Underground
      Chevron Mining Inc
      3114 County Road 63 S, Berry, AL 35546
      Mine and Preparation Plant
      2923261.0
      United Mine Workers of America
      2008
    
    
      4
      103246
      22.0
      Independent Producer Operator
      55403.0
      Appalachia Southern
      Franklin
      Bear Creek
      Alabama
      Active
      Surface
      Birmingham Coal & Coke Co., In
      912 Edenton Street, Birmingham, AL 35242
      Mine only
      183137.0
      NaN
      2008



In [19]:

    
new_dataframe.to_sql('presidents', con=sqlite_engine, if_exists='append', index=False)



In [20]:

    
coal.to_sql('coal', con=sqlite_engine, if_exists='append', index=False)



In [ ]:



In [ ]:



In [ ]:

    
%%read_sql example
SELECT * FROM coal



In [ ]:

    
example.head()



In [ ]:

    
example.columns

	first_name	last_name	year_of_birth
0	George	Washington	1732
1	John	Adams	1735
2	Thomas	Jefferson	1743
3	James	Madison	1751
4	James	Monroe	1758
5	Zachary	Taylor	1784
6	Abraham	Lincoln	1809
7	Theodore	Roosevelt	1858
8	Richard	Nixon	1913
9	Barack	Obama	1961

	MSHA_ID	Average_Employees	Company_Type	Labor_Hours	Mine_Basin	Mine_County	Mine_Name	Mine_State	Mine_Status	Mine_Type	Operating_Company	Operating_Company_Address	Operation_Type	Production_short_tons	Union_Code	Year
0	103295	18.0	Independent Producer Operator	39175.0	Appalachia Southern	Bibb	Seymour Mine	Alabama	Active	Surface	Hope Coal Company Inc	P.O. Box 249, Maylene, AL 35114	Mine only	105082.0	NaN	2008
1	103117	19.0	Operating Subsidiary	29926.0	Appalachia Southern	Cullman	Mine #2, #3, #4	Alabama	Active, men working, not producing	Surface	Twin Pines Coal Company Inc	1874 County Road 15, Bremen, AL 35033	Mine only	10419.0	NaN	2008
2	103361	20.0	Operating Subsidiary	42542.0	Appalachia Southern	Cullman	Cold Springs West Mine	Alabama	Active	Surface	Twin Pines Coal Company	74 Industrial Parkway, Jasper, AL 35502	Mine only	143208.0	NaN	2008
3	100759	395.0	Operating Subsidiary	890710.0	Appalachia Southern	Fayette	North River # 1 Underground Mi	Alabama	Active	Underground	Chevron Mining Inc	3114 County Road 63 S, Berry, AL 35546	Mine and Preparation Plant	2923261.0	United Mine Workers of America	2008
4	103246	22.0	Independent Producer Operator	55403.0	Appalachia Southern	Franklin	Bear Creek	Alabama	Active	Surface	Birmingham Coal & Coke Co., In	912 Edenton Street, Birmingham, AL 35242	Mine only	183137.0	NaN	2008