AWS DynamoDB Import Census Data

This is a companion notebook for the new Data Science Solutions book. The code is explained in the book.

IMPORTANT. The code will only run if you have a local DynamoDB server installed and running on localhost port 8000.


In [22]:
# data analysis and wrangling
import pandas as pd
import boto3
import numpy as np

already_imported = True

resource = boto3.resource(
    'dynamodb', endpoint_url='http://localhost:8000')

client = boto3.client(
    'dynamodb', endpoint_url='http://localhost:8000')

census_table = resource.Table('Census')
census_test_table = resource.Table('CensusTest')

column_names = [
    'age', 'workclass', 'fnlwgt', 
    'education', 'education-num', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 
    'capital-gain', 'capital-loss', 'hours-per-week', 
    'native-country', 'salary']

train_df = pd.read_csv(
    'data/aws/census/adult.data', 
    header=None, names=column_names, 
    sep=', ', engine='python')

test_df = pd.read_csv(
    'data/aws/census/adult.test', 
    header=None, names=column_names, 
    sep=', ', engine='python', skiprows=1)

train_df.shape, test_df.shape


Out[22]:
((32561, 15), (16281, 15))

In [23]:
train_df.head()


Out[23]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country salary
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

In [24]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
salary            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB

In [25]:
item_data_size = train_df[:1].squeeze().nbytes
attribute_names_size = sum([len(i) for i in column_names])
item_size = item_data_size + attribute_names_size

print(round(item_size/1024,2), "KB")


0.25 KB

In [26]:
train_df['uid'] = train_df.index
test_df['uid'] = test_df.index

train_df.head()


Out[26]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country salary uid
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K 0
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K 1
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K 2
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K 3
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K 4

In [27]:
if already_imported:
    print("Already imported data")
else:
    for index, row in train_df.iterrows():
        sample = row.squeeze().to_dict()

        for key, value in sample.items():
            if type(value) is np.int64:
                sample[key] = int(sample[key])

        census_table.put_item(Item=sample)
        print('Put {}'.format(index))


Already imported data

In [28]:
if already_imported:
    print("Already imported data")
else:
    for index, row in test_df.iterrows():
        sample = row.squeeze().to_dict()

        for key, value in sample.items():
            if type(value) is np.int64:
                sample[key] = int(sample[key])

        census_test_table.put_item(Item=sample)
        print('Put {}'.format(index))


Already imported data

In [29]:
table_description = client.describe_table(TableName='Census')
print(table_description)
table_description = client.describe_table(TableName='CensusTest')
print(table_description)


{'Table': {'AttributeDefinitions': [{'AttributeName': 'uid', 'AttributeType': 'N'}], 'TableName': 'Census', 'KeySchema': [{'AttributeName': 'uid', 'KeyType': 'HASH'}], 'TableStatus': 'ACTIVE', 'CreationDateTime': datetime.datetime(2017, 2, 6, 14, 48, 9, 91000, tzinfo=tzlocal()), 'ProvisionedThroughput': {'LastIncreaseDateTime': datetime.datetime(2017, 2, 6, 16, 1, 28, 553000, tzinfo=tzlocal()), 'LastDecreaseDateTime': datetime.datetime(2017, 2, 6, 17, 6, 17, 966000, tzinfo=tzlocal()), 'NumberOfDecreasesToday': 3, 'ReadCapacityUnits': 3, 'WriteCapacityUnits': 3}, 'TableSizeBytes': 0, 'ItemCount': 0, 'TableArn': 'arn:aws:dynamodb:eu-west-1:107349881872:table/Census'}, 'ResponseMetadata': {'RequestId': 'KC445SRH35CTED4S9J54KBALRBVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'KC445SRH35CTED4S9J54KBALRBVV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '2918135267', 'content-type': 'application/x-amz-json-1.0', 'content-length': '492', 'date': 'Mon, 06 Feb 2017 12:03:23 GMT'}, 'RetryAttempts': 0}}
{'Table': {'AttributeDefinitions': [{'AttributeName': 'uid', 'AttributeType': 'N'}], 'TableName': 'CensusTest', 'KeySchema': [{'AttributeName': 'uid', 'KeyType': 'HASH'}], 'TableStatus': 'ACTIVE', 'CreationDateTime': datetime.datetime(2017, 2, 6, 14, 48, 42, 128000, tzinfo=tzlocal()), 'ProvisionedThroughput': {'LastIncreaseDateTime': datetime.datetime(2017, 2, 6, 17, 1, 7, 773000, tzinfo=tzlocal()), 'LastDecreaseDateTime': datetime.datetime(2017, 2, 6, 17, 31, 24, 470000, tzinfo=tzlocal()), 'NumberOfDecreasesToday': 2, 'ReadCapacityUnits': 3, 'WriteCapacityUnits': 3}, 'TableSizeBytes': 0, 'ItemCount': 0, 'TableArn': 'arn:aws:dynamodb:eu-west-1:107349881872:table/CensusTest'}, 'ResponseMetadata': {'RequestId': 'QHDO2H7LMUICMK80GRT63HAIN7VV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'QHDO2H7LMUICMK80GRT63HAIN7VV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '2178789439', 'content-type': 'application/x-amz-json-1.0', 'content-length': '499', 'date': 'Mon, 06 Feb 2017 12:03:23 GMT'}, 'RetryAttempts': 0}}