In [59]:
import numpy as np

In [60]:
# Parse the tab-separated training file into two parallel lists of sentences:
#   input_sentences[i][j]  -> first column (token) of the j-th line of sentence i
#   target_sentences[i][j] -> second column (tag) of that same line, stripped
# A blank line terminates the current sentence.
input_sentences = []
target_sentences = []
with open("../data/train") as f:
    in_sentence = []
    target_sentence = []
    for line in f:
        # Remove only the line terminator, so CRLF blank lines and a final
        # line without a newline are treated like ordinary "\n" lines.
        record = line.rstrip("\r\n")
        if record:
            in_target = record.split('\t')
            in_sentence.append(in_target[0])
            target_sentence.append(in_target[1].strip())
        else:
            input_sentences.append(in_sentence)
            target_sentences.append(target_sentence)
            in_sentence = []
            target_sentence = []
    # Keep the last sentence when the file does not end with a blank line;
    # previously it was silently discarded.
    if in_sentence:
        input_sentences.append(in_sentence)
        target_sentences.append(target_sentence)

# Parenthesised form prints the same value on Python 2 and also runs on Python 3.
print(len(input_sentences))


2394

In [61]:
# Layout of the one-hot matrix built for each sentence: one column per
# character, plus one extra column after every word.
FIRST_MAPPED_CODE = 33    # code points 33..96 map to rows 0..63
LAST_MAPPED_CODE = 96
FIRST_TAIL_CODE = 123     # code points 123..126 map to rows 64..67
LAST_TAIL_CODE = 126
SKIPPED_CODES = 26        # width of the 97..122 gap (never seen after upper())
UNKNOWN_CHAR_ROW = 68     # any character outside the two mapped ranges
WORD_END_ROW = 69         # set on the column that follows each word
TAG_OTHER_ROW = 70        # target flag for characters of non-"geo-loc" words
TAG_GEO_LOC_ROW = 71      # target flag for characters of "geo-loc" words
TAG_WORD_END_ROW = 72     # target flag set on the column that follows each word
NUM_ROWS = 70 + 3
MAX_COLS = 500 * 2


def _char_row(char):
    """Return the input row index used to one-hot encode a single character."""
    code = ord(char)
    if FIRST_MAPPED_CODE <= code <= LAST_MAPPED_CODE:
        return code - FIRST_MAPPED_CODE
    if FIRST_TAIL_CODE <= code <= LAST_TAIL_CODE:
        return code - FIRST_MAPPED_CODE - SKIPPED_CODES
    return UNKNOWN_CHAR_ROW


def encode_sentence(words, tags, max_cols=MAX_COLS):
    """Encode one sentence as a (NUM_ROWS, max_cols) float32 one-hot matrix.

    Each character of each upper-cased word fills one column: its character
    row (see _char_row) and a target row (TAG_GEO_LOC_ROW when the word's tag
    contains "geo-loc", otherwise TAG_OTHER_ROW) are set to 1. After every
    word, including the last one, one more column is written with
    WORD_END_ROW and TAG_WORD_END_ROW set to 1. Unused columns stay zero.

    words -- sequence of token strings
    tags  -- sequence of tag strings, indexed in step with words
    Raises IndexError if the sentence needs more than max_cols columns or if
    tags is shorter than words.
    """
    encoded = np.zeros((NUM_ROWS, max_cols), dtype=np.float32)
    col_idx = 0
    for word_idx, word in enumerate(words):
        if "geo-loc" in tags[word_idx]:
            target_row = TAG_GEO_LOC_ROW
        else:
            target_row = TAG_OTHER_ROW
        # Upper-casing folds lower-case letters onto the rows of their
        # upper-case counterparts.
        for char in word.upper():
            encoded[_char_row(char), col_idx] = 1
            encoded[target_row, col_idx] = 1
            col_idx += 1
        encoded[WORD_END_ROW, col_idx] = 1
        encoded[TAG_WORD_END_ROW, col_idx] = 1
        col_idx += 1
    return encoded


# One encoded matrix per sentence, in the same order as input_sentences.
data = []
for sentence_idx, sentence in enumerate(input_sentences):
    data.append(encode_sentence(sentence, target_sentences[sentence_idx]))

In [63]:
# Shallow copy: a new list that references the same per-sentence arrays as `data`.
t = list(data)

In [66]:
# Combine the list of per-sentence matrices into one array whose first axis
# indexes sentences.
z = np.asarray(t)

In [70]:
# Slicing (rather than indexing with 0) keeps the leading axis in the result.
z[:1].shape


Out[70]:
(1, 73, 1000)

In [71]:
# Number of entries along z's first axis (one per encoded sentence).
len(z)


Out[71]:
2394

In [73]:
# Size of the first axis, read from the array's shape tuple.
z.shape[0]


Out[73]:
2394

In [74]:
# Small scratch list holding the integers 1 through 5.
a = list(range(1, 6))

In [76]:
# A one-element slice of a list is itself a list, not the bare element.
a[1:2]


Out[76]:
[2]

In [ ]: