AWS DynamoDB Import Census Data

This is a companion notebook for the new Data Science Solutions book. The code is explained in the book.

IMPORTANT. The code will only run if you have a local DynamoDB server installed and running on localhost port 8000.



In [22]:

    
# data analysis and wrangling
import pandas as pd
import boto3
import numpy as np

already_imported = True

resource = boto3.resource(
    'dynamodb', endpoint_url='http://localhost:8000')

client = boto3.client(
    'dynamodb', endpoint_url='http://localhost:8000')

census_table = resource.Table('Census')
census_test_table = resource.Table('CensusTest')

column_names = [
    'age', 'workclass', 'fnlwgt', 
    'education', 'education-num', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 
    'capital-gain', 'capital-loss', 'hours-per-week', 
    'native-country', 'salary']

train_df = pd.read_csv(
    'data/aws/census/adult.data', 
    header=None, names=column_names, 
    sep=', ', engine='python')

test_df = pd.read_csv(
    'data/aws/census/adult.test', 
    header=None, names=column_names, 
    sep=', ', engine='python', skiprows=1)

train_df.shape, test_df.shape









    Out[22]:





((32561, 15), (16281, 15))



In [23]:

    
train_df.head()









    Out[23]:






  
    
      
      age
      workclass
      fnlwgt
      education
      education-num
      marital-status
      occupation
      relationship
      race
      sex
      capital-gain
      capital-loss
      hours-per-week
      native-country
      salary
    
  
  
    
      0
      39
      State-gov
      77516
      Bachelors
      13
      Never-married
      Adm-clerical
      Not-in-family
      White
      Male
      2174
      0
      40
      United-States
      <=50K
    
    
      1
      50
      Self-emp-not-inc
      83311
      Bachelors
      13
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      0
      0
      13
      United-States
      <=50K
    
    
      2
      38
      Private
      215646
      HS-grad
      9
      Divorced
      Handlers-cleaners
      Not-in-family
      White
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      3
      53
      Private
      234721
      11th
      7
      Married-civ-spouse
      Handlers-cleaners
      Husband
      Black
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      4
      28
      Private
      338409
      Bachelors
      13
      Married-civ-spouse
      Prof-specialty
      Wife
      Black
      Female
      0
      0
      40
      Cuba
      <=50K



In [24]:

    
train_df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
salary            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB



In [25]:

    
item_data_size = train_df[:1].squeeze().nbytes
attribute_names_size = sum([len(i) for i in column_names])
item_size = item_data_size + attribute_names_size

print(round(item_size/1024,2), "KB")



In [26]:

    
train_df['uid'] = train_df.index
test_df['uid'] = test_df.index

train_df.head()









    Out[26]:






  
    
      
      age
      workclass
      fnlwgt
      education
      education-num
      marital-status
      occupation
      relationship
      race
      sex
      capital-gain
      capital-loss
      hours-per-week
      native-country
      salary
      uid
    
  
  
    
      0
      39
      State-gov
      77516
      Bachelors
      13
      Never-married
      Adm-clerical
      Not-in-family
      White
      Male
      2174
      0
      40
      United-States
      <=50K
      0
    
    
      1
      50
      Self-emp-not-inc
      83311
      Bachelors
      13
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      0
      0
      13
      United-States
      <=50K
      1
    
    
      2
      38
      Private
      215646
      HS-grad
      9
      Divorced
      Handlers-cleaners
      Not-in-family
      White
      Male
      0
      0
      40
      United-States
      <=50K
      2
    
    
      3
      53
      Private
      234721
      11th
      7
      Married-civ-spouse
      Handlers-cleaners
      Husband
      Black
      Male
      0
      0
      40
      United-States
      <=50K
      3
    
    
      4
      28
      Private
      338409
      Bachelors
      13
      Married-civ-spouse
      Prof-specialty
      Wife
      Black
      Female
      0
      0
      40
      Cuba
      <=50K
      4



In [27]:

    
if already_imported:
    print("Already imported data")
else:
    for index, row in train_df.iterrows():
        sample = row.squeeze().to_dict()

        for key, value in sample.items():
            if type(value) is np.int64:
                sample[key] = int(sample[key])

        census_table.put_item(Item=sample)
        print('Put {}'.format(index))









    



Already imported data



In [28]:

    
if already_imported:
    print("Already imported data")
else:
    for index, row in test_df.iterrows():
        sample = row.squeeze().to_dict()

        for key, value in sample.items():
            if type(value) is np.int64:
                sample[key] = int(sample[key])

        census_test_table.put_item(Item=sample)
        print('Put {}'.format(index))









    



Already imported data



In [29]:

    
table_description = client.describe_table(TableName='Census')
print(table_description)
table_description = client.describe_table(TableName='CensusTest')
print(table_description)









    



{'Table': {'AttributeDefinitions': [{'AttributeName': 'uid', 'AttributeType': 'N'}], 'TableName': 'Census', 'KeySchema': [{'AttributeName': 'uid', 'KeyType': 'HASH'}], 'TableStatus': 'ACTIVE', 'CreationDateTime': datetime.datetime(2017, 2, 6, 14, 48, 9, 91000, tzinfo=tzlocal()), 'ProvisionedThroughput': {'LastIncreaseDateTime': datetime.datetime(2017, 2, 6, 16, 1, 28, 553000, tzinfo=tzlocal()), 'LastDecreaseDateTime': datetime.datetime(2017, 2, 6, 17, 6, 17, 966000, tzinfo=tzlocal()), 'NumberOfDecreasesToday': 3, 'ReadCapacityUnits': 3, 'WriteCapacityUnits': 3}, 'TableSizeBytes': 0, 'ItemCount': 0, 'TableArn': 'arn:aws:dynamodb:eu-west-1:107349881872:table/Census'}, 'ResponseMetadata': {'RequestId': 'KC445SRH35CTED4S9J54KBALRBVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'KC445SRH35CTED4S9J54KBALRBVV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '2918135267', 'content-type': 'application/x-amz-json-1.0', 'content-length': '492', 'date': 'Mon, 06 Feb 2017 12:03:23 GMT'}, 'RetryAttempts': 0}}
{'Table': {'AttributeDefinitions': [{'AttributeName': 'uid', 'AttributeType': 'N'}], 'TableName': 'CensusTest', 'KeySchema': [{'AttributeName': 'uid', 'KeyType': 'HASH'}], 'TableStatus': 'ACTIVE', 'CreationDateTime': datetime.datetime(2017, 2, 6, 14, 48, 42, 128000, tzinfo=tzlocal()), 'ProvisionedThroughput': {'LastIncreaseDateTime': datetime.datetime(2017, 2, 6, 17, 1, 7, 773000, tzinfo=tzlocal()), 'LastDecreaseDateTime': datetime.datetime(2017, 2, 6, 17, 31, 24, 470000, tzinfo=tzlocal()), 'NumberOfDecreasesToday': 2, 'ReadCapacityUnits': 3, 'WriteCapacityUnits': 3}, 'TableSizeBytes': 0, 'ItemCount': 0, 'TableArn': 'arn:aws:dynamodb:eu-west-1:107349881872:table/CensusTest'}, 'ResponseMetadata': {'RequestId': 'QHDO2H7LMUICMK80GRT63HAIN7VV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'QHDO2H7LMUICMK80GRT63HAIN7VV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '2178789439', 'content-type': 'application/x-amz-json-1.0', 'content-length': '499', 'date': 'Mon, 06 Feb 2017 12:03:23 GMT'}, 'RetryAttempts': 0}}

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	salary
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K