In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
In [2]:
#In Python: Define a database name (we're using a dataset on births, so I call it
# birth_db), and your username for your computer (CHANGE IT BELOW).
dbname = 'actions'
username = 'rangel'
In [3]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print (engine.url)
In [4]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
create_database(engine.url)
print(database_exists(engine.url))
In [5]:
import numpy as np
# read a database from CSV and load it into a pandas dataframe
df = pd.DataFrame.from_csv('actions.csv',index_col=0)
df.userid = df.userid.astype(int)
#df.user = df.user.where(df.user > 0, np.nan)
# Add some null values
#df.iloc['user']
In [6]:
## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
df.to_sql('actions', engine, if_exists='replace')
#As a workaround at this moment (for pandas versions 0.14.1 and lower), you can manually convert the nan values to None with:
#df2 = df.astype(object).where(pd.notnull(df), None)
# First row (no NaNs)
#df1 = df[:1]
#and then write the dataframe to sql.
#This however converts all columns to object dtype.
#Because of this, you have to create the database table based
#on the original dataframe. Eg if your first row does not contain NaNs:
#df1.user = df1.user.astype(int)
#df1.to_sql('random_timestamps', engine, if_exists='replace')
#df2[1:].to_sql('random_timestamps', engine, if_exists='append')
In [7]:
# Insert null values:
#insert into bb values(null, null);
In [8]:
## Now try the same queries, but in python!
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username)
# query:
sql_query = """
SELECT * FROM \"actions\" ;
"""
query = pd.read_sql_query(sql_query,con)
query.head(10)
Out[8]:
In [ ]: