Convert a CSV file into a Postgres database

This follows the notebook: ../general-docs/python_sql_dev_setups.ipynb


In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

In [2]:
# In Python: define a database name (we're using a dataset of user actions,
# so we call it actions) and your username on this machine (CHANGE IT BELOW).
dbname = 'actions'
username = 'rangel'

In [3]:
## 'engine' is a connection to a database.
## Here we're using Postgres, but SQLAlchemy can connect to other databases too.
## Note: SQLAlchemy expects the 'postgresql://' dialect name
## ('postgres://' was removed in SQLAlchemy 1.4).
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
print(engine.url)


postgresql://rangel@localhost/actions
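If your local Postgres requires a password or runs on a non-default host or port, the same URL pattern takes them too (the credentials below are placeholders, not values from this notebook):

# Placeholder example only - substitute your own credentials:
#engine = create_engine('postgresql://some_user:some_password@localhost:5432/actions')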

In [4]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))


True
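As a quick sanity check (not in the original notebook), open a connection from the engine and run a trivial query; on SQLAlchemy 1.4+ raw SQL strings must be wrapped in text():

# Sketch: confirm the engine can open a connection and execute a query.
from sqlalchemy import text
with engine.connect() as conn:
    print(conn.execute(text('SELECT 1')).scalar())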

In [5]:
import numpy as np
# Read the CSV file into a pandas dataframe.
# (pd.DataFrame.from_csv has been removed from pandas; use pd.read_csv instead.)
df = pd.read_csv('actions.csv', index_col=0)

df.userid = df.userid.astype(int)
# To experiment with null values, blank out some user ids, e.g.:
#df.userid = df.userid.where(df.userid > 0, np.nan)
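Note that astype(int) fails once a column contains NaN, because NumPy integers have no missing-value representation. A minimal sketch of one way around this, using pandas' nullable Int64 dtype (assumes pandas 0.24 or later; not part of the original notebook):

# Sketch: keep missing user ids as <NA> instead of falling back to floats.
df_nullable = pd.read_csv('actions.csv', index_col=0)
df_nullable['userid'] = df_nullable['userid'].astype('Int64')  # capital I = nullable integer dtype
print(df_nullable.dtypes)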

In [6]:
## Insert data into the database from Python
## (proof of concept - this won't scale to big data, of course).
df.to_sql('actions', engine, if_exists='replace')

# Workaround for older pandas (0.14.1 and earlier), which did not map NaN to
# NULL on insert: manually convert the NaN values to None,
#df2 = df.astype(object).where(pd.notnull(df), None)
# and then write the dataframe to SQL. This, however, converts all columns to
# object dtype, so the database table has to be created from a slice of the
# original dataframe whose rows contain no NaNs (e.g. the first row), with the
# integer dtype restored:
#df1 = df[:1]
#df1.userid = df1.userid.astype(int)
#df1.to_sql('random_timestamps', engine, if_exists='replace')
#df2[1:].to_sql('random_timestamps', engine, if_exists='append')
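For larger tables, the single to_sql call above pushes everything at once. A hedged sketch of batching the insert instead (the chunk size is an arbitrary choice, and method='multi' requires pandas 0.23+):

# Sketch: write the dataframe in batches of 10,000 rows using multi-row INSERTs.
df.to_sql('actions', engine, if_exists='replace', chunksize=10000, method='multi')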

In [7]:
# Inserting null values directly in psql, e.g. into the actions table
# (columns: index, timestamp, userid, action):
#insert into actions values (null, null, null, null);
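The same from Python: with a parameterized query, psycopg2 maps Python None to SQL NULL. A minimal sketch against the actions table created above (the inserted values are arbitrary placeholders):

# Sketch: insert one row with NULL userid/action via a parameterized query.
con = psycopg2.connect(database=dbname, user=username)
cur = con.cursor()
cur.execute(
    'INSERT INTO actions ("index", timestamp, userid, action) VALUES (%s, %s, %s, %s)',
    (10, 1.514e9, None, None),  # None is sent as SQL NULL
)
con.commit()
cur.close()
con.close()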

In [8]:
## Now try the same queries, but in Python!

# connect:
con = psycopg2.connect(database=dbname, user=username)

# query:
sql_query = """
SELECT * FROM actions;
"""
query = pd.read_sql_query(sql_query, con)

query.head(10)


Out[8]:
   index     timestamp  userid  action
0      0  1.513757e+09       6    send
1      1  1.514016e+09       6  reject
2      2  1.513238e+09       1    send
3      3  1.513325e+09       1  accept
4      4  1.513498e+09       2    send
5      5  1.513584e+09       2  accept
6      6  1.514102e+09       4    send
7      7  1.514362e+09       4  accept
8      8  1.512115e+09       4    send
9      9  1.512202e+09       4  reject
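A final sketch (not in the original notebook): the same query restricted to one user, passing the user id as a bound parameter so psycopg2 handles the escaping rather than string formatting:

# Sketch: parameterized query - pandas forwards params to the psycopg2 driver.
sql_query = 'SELECT * FROM actions WHERE userid = %(uid)s;'
subset = pd.read_sql_query(sql_query, con, params={'uid': 4})
subset.head()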

In [ ]: