When reading training data from a file, the necessary steps are as follows:
specify file_path, input_names, value_dim, is_sparse, etc., as
discussed in https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader. The first step is done with MinibatchSource().
In [2]:
import sys, os
import getpass
import numpy as np
# Directory of the MNIST data set inside the current user's CNTK checkout
# (under /home/<user>/); the trailing slash is kept so file names can be
# appended directly.
mnist_dir = ''.join(('/home/', getpass.getuser(), '/repos/cntk/Examples/Image/DataSets/MNIST/'))
# Full path of the training file in CNTK text format.
trn_data_file = ''.join((mnist_dir, 'Train-28x28_cntk_text.txt'))
# Sanity check: prints True when the training file is present on disk.
print(os.path.exists(trn_data_file))
In [3]:
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
In [4]:
# Stream definitions for the two fields read from the text file: a dense
# 784-wide 'features' field and a dense 10-wide 'labels' field.
input_stream = StreamDef(field='features', shape=784, is_sparse=False)
label_stream = StreamDef(field='labels', shape=10, is_sparse=False)
# Print both definitions for inspection.
for stream_def in (input_stream, label_stream):
    print(stream_def)
In [5]:
# Bundle the stream definitions under the names 'input' and 'label'; these
# are the keys later used to look the streams up on the minibatch source.
streams = StreamDefs(input=input_stream, label=label_stream)
print(streams)
In [6]:
# Create a deserializer for the training file, configured with the stream
# definitions held in `streams`, then print it for inspection.
# NOTE(review): "CTF" presumably stands for CNTK text format -- confirm
# against the cntk.io documentation.
ctf = CTFDeserializer(trn_data_file, streams)
print (ctf)
In [7]:
# Wrap the deserializer in a minibatch source and fetch the per-stream
# handles by the names given to StreamDefs ('input' and 'label').
source = MinibatchSource(ctf)
features_si, label_si = source['input'], source['label']
In [9]:
# Pull one minibatch (requested size: 10) and inspect the feature stream.
mb = source.next_minibatch(10)
feature_batch = mb[features_si]
# The same data viewed three ways: the .value attribute, the underlying
# m_data object converted via to_numpy(), and a plain np.asarray() of m_data.
print(feature_batch.value.shape)
print(feature_batch.m_data.data().to_numpy().shape)
print(np.asarray(feature_batch.m_data).shape)
# Number of samples reported for this stream in the minibatch.
print(feature_batch.num_samples)
In [57]:
from cntk.io import StreamConfiguration, text_format_minibatch_source
# Alternative reader setup using text_format_minibatch_source: the streams
# are declared as (name, dimension) StreamConfiguration entries and are
# looked up under their own field names.
mb_source = text_format_minibatch_source(
    trn_data_file,
    [StreamConfiguration('features', 784), StreamConfiguration('labels', 10)]
)
features_si, labels_si = mb_source['features'], mb_source['labels']
In [58]:
# Pull one minibatch (requested size: 10) from the alternative source and
# print the feature-stream shape through the same three access paths.
mb = mb_source.next_minibatch(10)
feature_batch = mb[features_si]
print(feature_batch.value.shape)
print(feature_batch.m_data.data().to_numpy().shape)
print(np.asarray(feature_batch.m_data).shape)
In [19]:
import sys, os
import getpass
import numpy as np
# Locate the checkout that contains the Teemo package: try
# /home/<user>/git_test first, then fall back to /home/<user>/huizhu/git_test.
Teemo_path = '/home/' + getpass.getuser() + '/git_test'
if not os.path.exists(Teemo_path):
    Teemo_path = '/home/' + getpass.getuser() + '/huizhu/git_test'
if not os.path.exists(Teemo_path):
    # sys.exit() instead of the bare exit(): the latter is a helper installed
    # by the `site` module for interactive use and is not guaranteed to exist.
    sys.exit("Can not find Teemo_path")

# Make the Teemo package importable; the import must come after this line.
sys.path.append(Teemo_path)
from Teemo.examples.mnist import load_data

# load_data.mnist() yields four arrays (train x/y, validation x/y); convert
# every one of them to float32.
trn_x, trn_y, valid_x, valid_y = (
    np.asarray(part, dtype=np.float32) for part in load_data.mnist()
)
print ('load data sucessfully')
In [54]:
def save_txt(file_name, features_ndarray, labels_ndarray):
    """Write samples to ``file_name`` in CNTK text format, one sample per line.

    Each line has the form ``|labels <v1> <v2> ... |features <v1> <v2> ...``,
    so the field names match the ``labels`` / ``features`` streams used when
    reading the file back. Values are written with ``ndarray.astype(str)``.

    Args:
        file_name: Destination path. Missing parent directories are created.
        features_ndarray: Iterable of 1-D numpy arrays, one per sample.
        labels_ndarray: Iterable of 1-D numpy arrays, paired with the
            features by position (extra items on either side are ignored,
            as with ``zip``).

    Returns:
        None. If ``file_name`` already exists it is left untouched and a
        message is printed instead.
    """
    dir_name = os.path.dirname(file_name)
    # dirname() is '' for a bare file name, and os.makedirs('') would raise.
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)

    if os.path.exists(file_name):
        print("File already exists: {0}".format(file_name))
        return

    # The context manager guarantees the file is flushed and closed, even if
    # formatting one of the rows fails.
    with open(file_name, 'w') as f:
        for feat, label in zip(features_ndarray, labels_ndarray):
            feat_str = ' '.join(feat.astype(str))
            label_str = ' '.join(label.astype(str))
            f.write('|labels {0} |features {1}\n'.format(label_str, feat_str))
In [55]:
# Export the float32 training arrays (trn_x / trn_y) to a text file.
# NOTE(review): the destination is hard-coded to the 'xtalpi' home directory,
# unlike the getpass-based paths used elsewhere -- confirm this is intended.
file_name = '/home/xtalpi/cntk_data/Train-28x28_cntk_text.txt'
save_txt(file_name, features_ndarray=trn_x, labels_ndarray=trn_y)
In [ ]:
In [ ]: