In [ ]:
# NOTE: Update the config file first
# Copy the datastore configuration to the path the datastore library reads
# it from. Assumes the notebook runs as root (e.g. inside a container) —
# the config cell below reads it back from /root/.bdkd_datastore.conf.
!cp config/bdkd_datastore.conf /root/.bdkd_datastore.conf

In [ ]:
import os
import csv
import time
import wikipedia
import yaml

import datastorewrapper

In [ ]:
cities_file = 'data/country-capitals.csv'     # input: countries and their capital cities
cities_folder = 'data/cities'                 # where downloaded Wikipedia articles are saved
results_file = 'data/results_word_count.csv'  # output produced by the processing notebook

# Parse the datastore config. Use safe_load: this file is plain data, and
# yaml.load without an explicit Loader can construct arbitrary Python objects
# (and raises a warning/error in newer PyYAML). The context manager also
# closes the file handle, which open(...).read() left dangling.
with open('/root/.bdkd_datastore.conf') as conf_file:
    datastore_conf = yaml.safe_load(conf_file)

access_key = datastore_conf.get('hosts', {}).get('s3-sydney', {}).get('access_key')
secret_key = datastore_conf.get('hosts', {}).get('s3-sydney', {}).get('secret_key')

ds_repo = 'bdkd-sirca-public'  # target repository
ds_dataset = 'cities'          # dataset name within the repository

In [ ]:
# Download cities info
def download_cities(max_cities=3):
    """Fetch the Wikipedia article for the capital city of the first
    max_cities countries in the CSV file.

    Each article is written as UTF-8 text to cities_folder as
    'city_NNN.txt'. Returns the list of city names whose article was
    successfully downloaded and saved; failures are reported and skipped
    (the original appended every city, even ones whose download failed).
    """
    cities_downloaded = []
    if not os.path.exists(cities_folder):
        os.makedirs(cities_folder)
    with open(cities_file, 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for n, row in enumerate(reader):
            if n == 0:  # Skip header
                continue

            if n > max_cities:  # Only n cities
                break

            city_name = row[1]  # column 1 holds the capital city name
            try:
                wiki_content = wikipedia.page(city_name).content
                file_name = 'city_{0:03}.txt'.format(n)
                with open('{0}/{1}'.format(cities_folder, file_name), 'wb') as fw:
                    fw.write(wiki_content.encode('utf8'))
                # Record the city only after its article was actually saved.
                cities_downloaded.append(city_name)
            except Exception as exc:
                # Narrowed from a bare except; keep the loop best-effort but
                # make failures visible instead of silently swallowing them.
                print('Could not download page for {0}: {1}'.format(city_name, exc))

            time.sleep(1)  # Pause between each request (be polite to the API)

    return cities_downloaded

# Check whether the dataset already exists in the repository
def dataset_exists(ds_repo, ds_dataset):
    """Return True if ds_dataset is present in ds_repo's dataset listing."""
    existing = datastorewrapper.Datastore().list(ds_repo)
    return ds_dataset in existing

# Create dataset
def create_dataset():
    """Create the dataset, deleting any existing dataset of the same name.

    Returns the result of the create call. Reuses a single Datastore
    client (the original constructed up to three) and drops the dead
    'result = True' initializer whose value was never used.
    """
    datastore = datastorewrapper.Datastore()
    if dataset_exists(ds_repo, ds_dataset):
        # NOTE(review): the delete result was discarded in the original
        # (immediately overwritten by the create result); presumably a
        # boolean success flag — confirm against datastorewrapper.
        datastore.delete(ds_repo, ds_dataset)
    return datastore.create(ds_repo, ds_dataset)

# Upload every downloaded city file into the dataset
def add_files_to_dataset():
    """Add all files under cities_folder to the dataset, overwriting any
    that are already present. Returns the add_files result."""
    paths = []
    for entry in os.listdir(cities_folder):
        paths.append('{0}/{1}'.format(cities_folder, entry))
    datastore = datastorewrapper.Datastore()
    return datastore.add_files(ds_repo, ds_dataset, paths, overwrite=True)

# Print the name of every file currently in the dataset
def list_files():
    """List the dataset's files, one name per line."""
    datastore = datastorewrapper.Datastore()
    for file_name in datastore.get_file_list(ds_repo, ds_dataset):
        print(file_name)

# Add results to dataset
def add_results_to_dataset():
    """Add the word-count results CSV to the dataset.

    overwrite=True makes this consistent with add_files_to_dataset and
    keeps the notebook re-runnable: without it, re-adding the results
    file after reprocessing could fail or be skipped.
    """
    datastore = datastorewrapper.Datastore()
    files = [results_file]
    return datastore.add_files(ds_repo, ds_dataset, files, overwrite=True)

In [ ]:
# Fetch the Wikipedia articles (defaults to the first 3 capitals).
download_cities()

In [ ]:
# (Re)create the target dataset, deleting any previous version.
create_dataset()

In [ ]:
# Upload the downloaded city articles.
add_files_to_dataset()

In [ ]:
# Confirm the uploads by listing the dataset contents.
list_files()

In [ ]:
# Process data: run '03_process_data.ipynb'

In [ ]:
# Upload the word-count results produced by the processing notebook.
add_results_to_dataset()

In [ ]:
# Confirm the results file now appears in the dataset.
list_files()

In [ ]: