When reading training data from a file, the necessary steps are as follows:

  1. Collect information about the file data, including file_path, input_names, value_dim, is_sparse, etc., as discussed in https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader
  2. Pass the actual data (according to this information) to the network model.

The first step is done with MinibatchSource(), as sketched below.
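For orientation, here is a compact sketch of the whole pipeline these two steps describe; the cells below build the same thing piece by piece and inspect each object. This is a sketch only: trn_data_file is the MNIST file path set up in the next cell.

from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

# Sketch only: trn_data_file (defined in the next cell) points at an MNIST
# file in CNTK text format with 'features' (784-dim) and 'labels' (10-dim).
ctf = CTFDeserializer(trn_data_file, StreamDefs(
    input = StreamDef(field='features', shape=784, is_sparse=False),
    label = StreamDef(field='labels', shape=10, is_sparse=False)))
source = MinibatchSource(ctf)
mb = source.next_minibatch(10)   # dict mapping stream infos to MinibatchData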


In [2]:
import sys, os
import getpass
import numpy as np

mnist_dir = '/home/' + getpass.getuser() + '/repos/cntk/Examples/Image/DataSets/MNIST/'
trn_data_file = mnist_dir + 'Train-28x28_cntk_text.txt'

print (os.path.exists(trn_data_file))


True

Type I (get a numpy.ndarray from the CNTK reader)


In [3]:
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

In [4]:
input_stream = StreamDef(field='features', shape=784, is_sparse=False)
label_stream = StreamDef(field='labels', shape=10, is_sparse=False)
print (input_stream)
print (label_stream)


{'dim': 784, 'stream_alias': 'features', 'is_sparse': False}
{'dim': 10, 'stream_alias': 'labels', 'is_sparse': False}

In [5]:
streams = StreamDefs(
    input = input_stream,
    label = label_stream
)
print (streams)


{'input': {'dim': 784, 'stream_alias': 'features', 'is_sparse': False}, 'label': {'dim': 10, 'stream_alias': 'labels', 'is_sparse': False}}

In [6]:
ctf = CTFDeserializer(trn_data_file, streams)
print (ctf)


{'input': {'input': {'alias': 'features', 'format': 'dense', 'dim': 784}, 'label': {'alias': 'labels', 'format': 'dense', 'dim': 10}}, 'type': 'CNTKTextFormatDeserializer', 'file': '/home/xtalpi/repos/cntk/Examples/Image/DataSets/MNIST/Train-28x28_cntk_text.txt'}

In [7]:
source = MinibatchSource(ctf)
features_si = source['input']
label_si = source['label']

In [9]:
mb = source.next_minibatch(10)
print (mb[features_si].value.shape)
print (mb[features_si].m_data.data().to_numpy().shape)
print (np.asarray(mb[features_si].m_data).shape)
print (mb[features_si].num_samples)


(10, 1, 784)
(10, 1, 784)
(10, 1, 784)
10
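This covers step 1 from the intro. For step 2, passing the minibatch to a network model, the usual pattern is an input map from the model's input variables to the reader streams. Below is a minimal, hypothetical sketch: input_var, label_var, and trainer are placeholders that are not defined in this notebook.

# Hypothetical sketch only: input_var, label_var and trainer are assumed to be
# defined elsewhere (e.g. via input_variable(784), input_variable(10) and Trainer).
input_map = {
    input_var: mb[features_si],   # dense features, shape (10, 1, 784)
    label_var: mb[label_si]       # one-hot labels, shape (10, 1, 10)
}
trainer.train_minibatch(input_map)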

Type II (the same data read via text_format_minibatch_source)


In [57]:
from cntk.io import StreamConfiguration, text_format_minibatch_source
mb_source = text_format_minibatch_source(trn_data_file, [
        StreamConfiguration('features', 784),
        StreamConfiguration('labels', 10)
    ])
features_si = mb_source['features']
labels_si = mb_source['labels']

In [58]:
mb = mb_source.next_minibatch(10)
print (mb[features_si].value.shape)
print (mb[features_si].m_data.data().to_numpy().shape)
print (np.asarray(mb[features_si].m_data).shape)


(10, 1, 784)
(10, 1, 784)
(10, 1, 784)

Save a numpy.ndarray to a text file in CNTK format


In [19]:
import sys, os
import getpass
import numpy as np
# Teemo is a local helper repo that provides an MNIST loader.
Teemo_path = '/home/' + getpass.getuser() + '/git_test'
if not os.path.exists(Teemo_path):
    Teemo_path = '/home/' + getpass.getuser() + '/huizhu/git_test'
if not os.path.exists(Teemo_path):
    sys.exit("Cannot find Teemo_path")
sys.path.append(Teemo_path)
from Teemo.examples.mnist import load_data
trn_x, trn_y, valid_x, valid_y = load_data.mnist()
trn_x = np.asarray(trn_x, dtype=np.float32)
trn_y = np.asarray(trn_y, dtype=np.float32)
valid_x = np.asarray(valid_x, dtype=np.float32)
valid_y = np.asarray(valid_y, dtype=np.float32)
print ('Loaded data successfully')


Loaded data successfully

In [54]:
def save_txt(file_name, features_ndarray, labels_ndarray):
    """Write features/labels to a text file in CNTK text (CTF) format."""
    dir_name = os.path.dirname(file_name)
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)
    if os.path.exists(file_name):
        print ("File already exists: {0}".format(file_name))
    else:
        with open(file_name, 'w') as f:
            for feat, label in zip(features_ndarray, labels_ndarray):
                feat_str = ' '.join(feat.astype(str))
                label_str = ' '.join(label.astype(str))
                # CTF format: each stream starts with '|<stream name>'
                f.write('|labels {0} |features {1}\n'.format(label_str, feat_str))

In [55]:
file_name = '/home/xtalpi/cntk_data/Train-28x28_cntk_text.txt'

save_txt(file_name, trn_x, trn_y)
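To sanity-check the written file, it can be read back with the same reader used in the first part. This is a quick sketch; it assumes save_txt wrote the |labels / |features streams and that the cntk.io imports above are still in scope.

# Quick check: read the freshly written file back with CTFDeserializer
# and confirm the minibatch shapes match the dense 784/10 streams.
check_ctf = CTFDeserializer(file_name, StreamDefs(
    input = StreamDef(field='features', shape=784, is_sparse=False),
    label = StreamDef(field='labels', shape=10, is_sparse=False)))
check_source = MinibatchSource(check_ctf)
check_mb = check_source.next_minibatch(5)
print (check_mb[check_source['input']].value.shape)   # expected (5, 1, 784)
print (check_mb[check_source['label']].value.shape)   # expected (5, 1, 10)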
