Organize and manage datasets

As a data analyst or scientist, you probably have datasets scattered across different folders on your laptop or PC. To organize them better, you can write them all to a single SQLite database file and manage them from there.

With all your datasets in a single database file, you can:

  • add more datasets to the database at any time.
  • list and view all the datasets in the database at any time.
  • view the fields or columns of each dataset or table.
  • load any dataset directly from the database.
  • drop any dataset or table as you like.

How do you do that? Follow the code in this notebook.

Create a SQLite database file


In [ ]:
# Create a SQLite database
# Open your terminal and change to the folder where you want to keep the database file.
# Type the following commands, changing xxxx to the name you want to give the database
# (mine is called datarepo). The .databases command lists the databases that are attached:
"""
sqlite3 xxxx.db

.databases

"""
# Press Ctrl+D (or type .quit) to exit the sqlite3 shell.
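
Alternatively, you do not have to leave the notebook at all: connecting to a SQLite file from Python creates it if it does not already exist. A minimal sketch using the built-in sqlite3 module:

In [ ]:
import sqlite3

# connecting creates datarepo.db in the current folder if it does not exist yet
sqlite3.connect('datarepo.db').close()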

Write the datasets into the SQLite file

For convenience, put all the datasets you want to write to the SQLite file into a designated directory and set the path to that directory in the code.
The following formats are currently supported:

  • csv/txt
  • xls/xlsx

Before you write csv/txt files, confirm the delimiter (comma or tab) used in each file and change the sep value in the pd.read_table call if it differs from the default comma provided.


In [ ]:
# Import relevant modules
import os
import sys
from sqlalchemy import create_engine
import pandas as pd
import datetime as dt

# Connect to the database
conn = create_engine('sqlite:///datarepo.db')

# Define full path to the directory containing all the datasets to be written 
datapath = '/Users/xxxx/xxxx/dataset'

# lists to hold file information
datfile = []
file_in_database = []
new_datfile = []

# store the supported dataset files in a list datfile
if os.path.isdir(datapath):
    for i in os.listdir(datapath):
        ipath = os.path.join(datapath, i)
        if os.path.isfile(ipath):
            if os.path.splitext(ipath)[1] in ('.txt', '.csv', '.xlsx', '.xls'):
                datfile.append(ipath)

# read the tables already in the database and store their corresponding file paths in file_in_database
sql = "select tbl_name from sqlite_master where type='table'"
res = pd.read_sql(sql, conn)
for i in res['tbl_name'].tolist():
    file_in_database.append(os.path.join(datapath, i))

# get the list of new datasets that have not been written to the database
for file in datfile:
    if file not in file_in_database:
        new_datfile.append(file)

# check whether list of new datasets is empty         
if len(new_datfile) == 0:
    print(
        "The datasets in the source folder have all already been written to the database."
    )
    sys.exit()

# otherwise, write the new datasets to the database
else:
    for file in new_datfile:

        # write an excel dataset
        if file.endswith(('.xlsx', '.xls')):
            tab_name = os.path.basename(file)  # strip the path, keep the file name only
            start = dt.datetime.now()
            df = pd.read_excel(file)
            df.to_sql(tab_name, conn, if_exists='append')
            print('{} seconds: writing of {} to the database is completed.'.format(
                (dt.datetime.now() - start).seconds, tab_name))

        # write a csv or txt dataset in chunks, so large files do not exhaust memory
        if file.endswith(('.csv', '.txt')):
            tab_name = os.path.basename(file)  # strip the path, keep the file name only
            start = dt.datetime.now()
            chunksize = 1000  # you can increase the chunksize
            rows_written = 0
            index_start = 1

            # confirm the delimiter (comma or tab) in your csv/txt file
            # and change the sep value accordingly
            for df in pd.read_table(file, chunksize=chunksize, sep=",",
                                    encoding='utf-8'):
                df.index += index_start
                rows_written += len(df)

                print('{} seconds: completed {} rows'.format(
                    (dt.datetime.now() - start).seconds, rows_written))

                df.to_sql(tab_name, conn, if_exists='append')
                index_start = df.index[-1] + 1
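
If you are unsure which delimiter a csv/txt file uses, you can detect it before writing instead of guessing. A minimal sketch using Python's built-in csv.Sniffer (the detect_sep helper is only an illustration, not part of the code above):

In [ ]:
import csv

def detect_sep(path, sample_size=4096):
    """Guess the delimiter of a text file from a sample of its content."""
    with open(path, encoding='utf-8') as f:
        sample = f.read(sample_size)
    return csv.Sniffer().sniff(sample, delimiters=",\t").delimiter

# e.g. pass sep=detect_sep(file) to pd.read_table instead of a hard-coded sep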

List and view all the datasets in the database file


In [ ]:
from sqlalchemy import create_engine
import pandas as pd

# Connect to the database
conn = create_engine('sqlite:///datarepo.db')


# Get the list of current datasets in the database
def list_dataset():
    try:
        sql = "select tbl_name from sqlite_master where type='table'"
        return pd.read_sql(sql, conn)

    except Exception as err:
        print(err)


#list_dataset()
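
As an aside, SQLAlchemy's inspection API can list the tables for you without raw SQL. A minimal sketch:

In [ ]:
from sqlalchemy import inspect

# returns the table names in the database as a plain Python list
inspect(conn).get_table_names()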

View a dataset's fields or columns


In [ ]:
from sqlalchemy import create_engine
import pandas as pd

# Connect to the database
conn = create_engine('sqlite:///datarepo.db')


# View a dataset's columns or fields
def view_dataset_cols(table_name):
    try:
        sql = ("select sql from sqlite_master "
               "where type='table' and tbl_name = '%s'" % table_name)
        # the stored CREATE TABLE statement lists one column per line;
        # drop the opening "CREATE TABLE ... (" line and the closing ")"
        create_stmt = pd.read_sql(sql, conn).iloc[0, 0]
        return [line.strip() for line in create_stmt.splitlines()[1:-1]]

    except Exception as err:
        print(err)


#view_dataset_cols('winequality-red.csv')
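
Parsing the stored CREATE TABLE text works, but SQLite can also report a table's schema directly. A sketch using PRAGMA table_info, which should return one row per column with its name and declared type:

In [ ]:
# PRAGMA table_info returns one row per column: name, declared type, and more
pd.read_sql("PRAGMA table_info('winequality-red.csv')", conn)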

Load a dataset into a dataframe directly from the database file


In [ ]:
from sqlalchemy import create_engine
import pandas as pd

# Connect to the database
conn = create_engine('sqlite:///datarepo.db')


# Load a dataset into a dataframe
def load_dataset(table_name):
    try:
        tabname = "'" + table_name + "'"
        sql = " SELECT * FROM %s" % tabname
        return pd.read_sql(sql, conn)

    except Exception as err:
        print(err)

# You can amend the SQL to select only the fields you need from a dataset; see the example below.
        
#j = load_dataset('housing.csv')
#j.head()
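
For instance, to load only the fields you need, name them in the SELECT instead of using *. A sketch (col_a and col_b are placeholders for real column names in your dataset):

In [ ]:
# select only the needed fields; replace col_a and col_b with real column names
sql = "SELECT \"col_a\", \"col_b\" FROM 'housing.csv'"
pd.read_sql(sql, conn)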

Drop any dataset from the database file


In [ ]:
from sqlalchemy import create_engine, text

# Connect to the database
conn = create_engine('sqlite:///datarepo.db')


# Drop a dataset
def drop_dataset(table_name):
    try:
        sql = 'DROP TABLE "%s"' % table_name
        # DROP TABLE returns no rows, so execute it directly and commit,
        # rather than going through pd.read_sql
        with conn.begin() as connection:
            connection.execute(text(sql))
        print('The dataset ' + table_name + ' has been dropped from the database.')

    except Exception as err:
        print(err)


#drop_dataset('test.csv')
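
After dropping a table, you can run list_dataset() again to confirm that it no longer appears in the database.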