In [ ]:
# Run this cell before the lab!
import numpy as np
import os
import os.path as op
import zipfile

# `urlretrieve` moved from `urllib` to `urllib.request` in Python 3;
# fall back so the cell also runs on Python 2.
try:
    from urllib.request import urlretrieve
except ImportError:  # Python 2 compat
    from urllib import urlretrieve
# BBC dataset: fetch the zip and extract it into ./bbc.
# Both steps check for an existing target first, so re-running the cell
# is a cheap no-op once the data is on disk.
BBC_DATASET_URL = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip"
zip_filename = BBC_DATASET_URL.rsplit('/', 1)[1]
BBC_DATASET_FOLDER = 'bbc'

if not op.exists(zip_filename):
    print("Downloading %s to %s..." % (BBC_DATASET_URL, zip_filename))
    urlretrieve(BBC_DATASET_URL, zip_filename)

if not op.exists(BBC_DATASET_FOLDER):
    # Context manager guarantees the archive handle is closed after extraction.
    with zipfile.ZipFile(zip_filename, 'r') as f:
        print("Extracting contents of %s..." % zip_filename)
        f.extractall('.')
# Pretrained GloVe word vectors (100k-word vocabulary, 100 dimensions).
# Same download-then-extract pattern as the BBC dataset above.
URL_REPRESENTATIONS = "https://github.com/m2dsupsdlclass/lectures-labs/releases/download/0.3/glove100k.100d.zip"
ZIP_REPRESENTATIONS = "glove100k.100d.zip"
# NOTE(review): the extracted filename uses a capital "K" while the zip name
# is lowercase — presumably matches the member name inside the archive; confirm.
FILE_REPRESENTATIONS = "glove100K.100d.txt"

if not op.exists(ZIP_REPRESENTATIONS):
    print('Downloading from %s to %s...' % (URL_REPRESENTATIONS, ZIP_REPRESENTATIONS))
    urlretrieve(URL_REPRESENTATIONS, './' + ZIP_REPRESENTATIONS)

if not op.exists(FILE_REPRESENTATIONS):
    print("extracting %s..." % ZIP_REPRESENTATIONS)
    # Use a context manager so the zip handle is closed after extraction
    # (the original left `myzip` open) — consistent with the BBC block.
    with zipfile.ZipFile(ZIP_REPRESENTATIONS) as myzip:
        myzip.extractall()
# Get the Nietzsche dataset.
# `get_file` downloads the raw text once and returns the local path to the
# cached copy (Keras caches under its default datasets directory), so
# re-running the cell does not re-download.
from tensorflow.keras.utils import get_file
URL = "https://s3.amazonaws.com/text-datasets/nietzsche.txt"
corpus_path = get_file('nietzsche.txt', origin=URL)
In [ ]: