In [ ]:
# NOTE: Update the config file first
!cp config/bdkd_datastore.conf /root/.bdkd_datastore.conf
In [ ]:
import os
import csv
import time
import wikipedia
import yaml
import datastorewrapper
In [ ]:
# Input/output locations used by the functions below.
cities_file = 'data/country-capitals.csv'
cities_folder = 'data/cities'
results_file = 'data/results_word_count.csv'
# Load datastore credentials. safe_load refuses arbitrary YAML tags (yaml.load
# without a Loader can execute constructors), and the context manager closes
# the config file handle promptly instead of leaking it.
with open('/root/.bdkd_datastore.conf') as conf_fh:
    datastore_conf = yaml.safe_load(conf_fh)
# Chained .get() with {} defaults yields None (not an exception) if the
# 'hosts' / 's3-sydney' sections are missing from the config.
access_key = datastore_conf.get('hosts',{}).get('s3-sydney',{}).get('access_key')
secret_key = datastore_conf.get('hosts',{}).get('s3-sydney',{}).get('secret_key')
ds_repo = 'bdkd-sirca-public'
ds_dataset = 'cities'
In [ ]:
# Download cities info
def download_cities(max_cities=3):
    """Download Wikipedia page text for up to max_cities capital cities.

    Reads city names from column 1 of cities_file (header row skipped),
    writes each page's content as UTF-8 to cities_folder/city_NNN.txt,
    and returns the list of city names that were successfully downloaded.
    Cities whose Wikipedia lookup fails are skipped (best-effort).
    """
    cities_downloaded = []
    if not os.path.exists(cities_folder):
        os.makedirs(cities_folder)
    # NOTE: 'rb' mode matches the csv module's Python 2 requirement;
    # under Python 3 this would need mode 'r' with newline=''.
    with open(cities_file, 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for n, row in enumerate(reader):
            if n == 0:  # Skip header
                continue
            if n > max_cities:  # Only n cities
                break
            city_name = row[1]
            try:
                wiki_content = wikipedia.page(city_name).content
                file_name = 'city_{0:03}.txt'.format(n)
                with open('{0}/{1}'.format(cities_folder, file_name), 'wb') as fw:
                    fw.write(wiki_content.encode('utf8'))
                # Record the city only after a successful download + write;
                # the original appended unconditionally, so failures were
                # wrongly reported as downloaded.
                cities_downloaded.append(city_name)
            except Exception:
                # Best-effort: skip cities whose page lookup/write fails.
                # Narrowed from a bare 'except:' so Ctrl-C still works.
                pass
            time.sleep(1)  # Pause between each request (politeness delay)
    return cities_downloaded
# Check if dataset exists
def dataset_exists(ds_repo, ds_dataset):
    """Return True if *ds_dataset* appears in the repository's dataset list."""
    return ds_dataset in datastorewrapper.Datastore().list(ds_repo)
# Create dataset
def create_dataset():
    """(Re)create the dataset, deleting any existing copy first.

    Returns the result of the datastore create() call. The original code
    stored the delete() result in the same variable and immediately
    overwrote it with the create() result, so the delete outcome was
    never observable; that dead store is removed here.
    """
    if dataset_exists(ds_repo, ds_dataset):
        # Best-effort removal of the stale copy; create() below is the
        # authoritative result.
        datastorewrapper.Datastore().delete(ds_repo, ds_dataset)
    return datastorewrapper.Datastore().create(ds_repo, ds_dataset)
# Add files to dataset
def add_files_to_dataset():
    """Upload every file in cities_folder to the dataset, overwriting duplicates."""
    ds = datastorewrapper.Datastore()
    paths = []
    for entry in os.listdir(cities_folder):
        paths.append('{0}/{1}'.format(cities_folder, entry))
    return ds.add_files(ds_repo, ds_dataset, paths, overwrite = True)
# List files in dataset
def list_files():
    """Print the name of each file currently stored in the dataset."""
    ds = datastorewrapper.Datastore()
    for entry in ds.get_file_list(ds_repo, ds_dataset):
        # print(x) with a single argument behaves identically under
        # Python 2's print statement and Python 3's print function.
        print(entry)
# Add results to dataset
def add_results_to_dataset():
    """Upload the word-count results CSV to the dataset and return the result."""
    ds = datastorewrapper.Datastore()
    return ds.add_files(ds_repo, ds_dataset, [results_file])
In [ ]:
download_cities()
In [ ]:
create_dataset()
In [ ]:
add_files_to_dataset()
In [ ]:
list_files()
In [ ]:
# Process data: run '03_process_data.ipynb'
In [ ]:
add_results_to_dataset()
In [ ]:
list_files()
In [ ]: