As a data analyst or scientist, you probably have one or more datasets scattered across different folders on your laptop or PC. To organize them better, you can write them all to a single SQLite database file and manage them from there.
With all your datasets in a single database file, you can list, inspect, load, and drop them with a few lines of code.
How do you do that? Follow the code in this notebook.
In [ ]:
# Create a SQLite database
# Open your terminal and change to the folder where you want to keep the database file.
# Type the following commands, replacing xxxx with the name you want to give the database. Mine is called datarepo:
"""
sqlite3 xxxx.db
.databases
"""
# Press Ctrl+D (or type .quit) to exit the sqlite3 shell.
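If you prefer to stay in Python, you can skip the terminal step entirely. A minimal sketch, assuming the same datarepo.db name used below: SQLAlchemy creates the SQLite file automatically on the first connection if it does not already exist.
In [ ]:
from sqlalchemy import create_engine, text
# Creating the engine is lazy and does not touch the disk;
# the datarepo.db file is created on the first real connection.
engine = create_engine('sqlite:///datarepo.db')
with engine.connect() as connection:
    connection.execute(text('SELECT 1'))  # force a connection so the file is created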
For convenience, put all the datasets you want to write to the SQLite file into a designated directory and define the path to that directory in the code.
The following formats are currently supported: .csv, .txt, .xlsx, and .xls.
Before you write csv/txt files, confirm the delimiter type (comma or tab) used in your file and change the sep value in the pd.read_csv call below if it differs from the default comma. If you are unsure, you can detect the delimiter programmatically, as in the sketch below.
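A minimal sketch for checking the delimiter, using Python's built-in csv.Sniffer; the file name sample.csv is a placeholder for one of your own files:
In [ ]:
import csv
# Hypothetical file name; substitute one of your own csv/txt files.
with open('sample.csv', newline='') as f:
    dialect = csv.Sniffer().sniff(f.read(2048))  # inspect the first 2 KB
print(repr(dialect.delimiter))  # e.g. ',' or '\t'; use this as the sep value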
In [ ]:
# Import relevant modules
import os
import sys
from sqlalchemy import create_engine
import pandas as pd
import datetime as dt
# Connect to the database
conn = create_engine('sqlite:///datarepo.db')
# Define full path to the directory containing all the datasets to be written
datapath = '/Users/xxxx/xxxx/dataset'
# lists to hold file information
datfile = []
file_in_database = []
new_datfile = []
# store the existing txt, csv, and Excel datasets in the list datfile
if os.path.isdir(datapath):
    for i in os.listdir(datapath):
        ipath = os.path.join(datapath, i)
        if os.path.isfile(ipath) and os.path.splitext(ipath)[1] in (
                '.txt', '.csv', '.xlsx', '.xls'):
            datfile.append(ipath)
# read the tables already in the database and store them in the list file_in_database
sql = "select tbl_name from sqlite_master where type='table'"
res = pd.read_sql(sql, conn)
for i in [j for k in res.values.tolist() for j in k]:
    file_in_database.append(os.path.join(datapath, i))
# get the list of new datasets that have not been written to the database
for file in datfile:
    if file not in file_in_database:
        new_datfile.append(file)
# check whether the list of new datasets is empty
if len(new_datfile) == 0:
    print("The datasets in the source folder have all already been written to the database.")
    sys.exit()
# otherwise, write the new datasets to the database
else:
    for file in new_datfile:
        tab_name = os.path.basename(file)  # strip the path and keep the file name only
        # write an Excel dataset in one go
        if file.split(".")[-1] in ('xlsx', 'xls'):
            start = dt.datetime.now()
            df = pd.read_excel(file)
            df.to_sql(tab_name, conn, if_exists='append')
            print('{} seconds: writing of {} to the database is completed.'.format(
                (dt.datetime.now() - start).seconds, tab_name))
        # write a csv or txt dataset in chunks
        if file.split(".")[-1] in ('csv', 'txt'):
            start = dt.datetime.now()
            chunksize = 1000  # you can increase the chunksize
            rows_written = 0
            index_start = 1
            # confirm the delimiter type (comma or tab) in your csv/txt file
            # and change the sep value accordingly
            for df in pd.read_csv(file, chunksize=chunksize, sep=",",
                                  encoding='utf-8'):
                df.index += index_start
                df.to_sql(tab_name, conn, if_exists='append')
                rows_written += len(df)
                index_start = df.index[-1] + 1
                print('{} seconds: completed {} rows'.format(
                    (dt.datetime.now() - start).seconds, rows_written))
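To sanity-check a write, you can count the rows of one of the newly written tables. A minimal sketch, assuming a table named winequality-red.csv exists; substitute one of your own file names:
In [ ]:
# Hypothetical table name; replace it with one of the files you just wrote.
pd.read_sql('SELECT COUNT(*) AS row_count FROM "winequality-red.csv"', conn)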
In [ ]:
from sqlalchemy import create_engine
import pandas as pd
# Connect to the database
conn = create_engine('sqlite:///datarepo.db')
# Get the list of current datasets in the database
def list_dataset():
    try:
        sql = "select tbl_name from sqlite_master where type='table'"
        return pd.read_sql(sql, conn)
    except Exception as err:
        print(err)
#list_dataset()
In [ ]:
from sqlalchemy import create_engine
import pandas as pd
# Connect to the database
conn = create_engine('sqlite:///datarepo.db')
# View a dataset's columns or fields
def view_dataset_cols(table_name):
    try:
        # PRAGMA table_info returns one row per column (name, type, etc.);
        # this is simpler and more robust than parsing the CREATE TABLE
        # statement out of sqlite_master.
        sql = "PRAGMA table_info('%s')" % table_name
        return pd.read_sql(sql, conn)
    except Exception as err:
        print(err)
#view_dataset_cols('winequality-red.csv')
In [ ]:
from sqlalchemy import create_engine
import pandas as pd
# Connect to the database
conn = create_engine('sqlite:///datarepo.db')
# Load a dataset into a dataframe
def load_dataset(table_name):
    try:
        sql = 'SELECT * FROM "%s"' % table_name
        return pd.read_sql(sql, conn)
    except Exception as err:
        print(err)
# You can amend the SQL code to select only the needed fields from a dataset.
#j = load_dataset('housing.csv')
#j.head()
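For example, a minimal sketch of selecting only specific fields; the housing.csv table and the column names here are assumptions, so substitute your own:
In [ ]:
# Hypothetical table and column names; replace them with your own.
sql = 'SELECT "longitude", "latitude" FROM "housing.csv"'
pd.read_sql(sql, conn).head()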
In [ ]:
from sqlalchemy import create_engine, text
import pandas as pd
# Connect to the database
conn = create_engine('sqlite:///datarepo.db')
# Drop a dataset
def drop_dataset(table_name):
    try:
        sql = 'DROP TABLE "%s"' % table_name
        # DROP TABLE is a statement, not a query, so execute it directly
        # rather than through pd.read_sql; print the message only on success
        with conn.begin() as connection:
            connection.execute(text(sql))
        print('The dataset ' + table_name + ' has been dropped from the database.')
    except Exception as err:
        print(err)
#drop_dataset('test.csv')
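After dropping a table, you can confirm it is gone by re-running list_dataset() from the earlier cell; the dropped table should no longer appear in the list.
In [ ]:
#drop_dataset('test.csv')
#list_dataset()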