In [59]:
import numpy as np
In [60]:
# Load the training corpus into two parallel lists:
#   input_sentences[i]  -> list of tokens of sentence i
#   target_sentences[i] -> list of tags of sentence i (same length, same order)
# Every non-blank line is split on tabs: field 0 is the token, field 1 the tag.
# A blank line ends the current sentence.
input_sentences = []
target_sentences = []
with open("../data/train") as f:
    in_sentence = []
    target_sentence = []
    for line in f:
        # strip() rather than comparing with "\n" so that "\r\n" or
        # whitespace-only separator lines also end a sentence instead of
        # failing on the missing tag field below.
        if line.strip():
            in_target = line.split('\t')
            in_sentence.append(in_target[0])
            target_sentence.append(in_target[1].strip())
        else:
            input_sentences.append(in_sentence)
            target_sentences.append(target_sentence)
            in_sentence = []
            target_sentence = []
    # The file may end without a trailing blank line; keep the last sentence
    # instead of silently dropping it.
    if in_sentence:
        input_sentences.append(in_sentence)
        target_sentences.append(target_sentence)
# Parenthesised form prints the same thing on Python 2 and also runs on Python 3.
print(len(input_sentences))
In [61]:
# Encode every sentence as a fixed-size float32 matrix with one column per
# character plus one extra column after each word.
#
# Row layout:
#   rows 0-63   characters with code points 33-96
#   rows 64-67  characters with code points 123-126
#   row  68     any other (unknown) character
#   row  69     input marker for the column that follows a word
#   row  70     target: word is not tagged "geo-loc"
#   row  71     target: word is tagged "geo-loc"
#   row  72     target marker for the column that follows a word
UNKNOWN_CHAR_ROW = 68
WORD_END_ROW = 69
TARGET_OTHER_ROW = 70
TARGET_GEO_LOC_ROW = 71
TARGET_WORD_END_ROW = 72
NUM_ROWS = 70 + 3
MAX_COLUMNS = 500 * 2


def _char_row(char):
    """Return the input row that one-hot encodes a single character."""
    code = ord(char)
    if 33 <= code <= 96:
        return code - 33
    if 123 <= code <= 126:
        # Skip the 26 lowercase code points (97-122); words are upper-cased
        # before encoding, so those rows would never be used.
        return code - 33 - 26
    return UNKNOWN_CHAR_ROW


def _encode_sentence(words, tags):
    """Return the (NUM_ROWS, MAX_COLUMNS) float32 matrix for one sentence.

    ``words`` and ``tags`` are parallel sequences. Each character of each
    upper-cased word fills one column with its character row and the target
    row of the word's tag; one more column with the two word-end markers
    follows every word. Columns that are never reached stay all-zero.

    A sentence needing more than MAX_COLUMNS columns raises IndexError from
    the array indexing.
    """
    matrix = np.zeros((NUM_ROWS, MAX_COLUMNS), dtype=np.float32)
    col = 0
    for word, tag in zip(words, tags):
        if "geo-loc" in tag:
            target_row = TARGET_GEO_LOC_ROW
        else:
            target_row = TARGET_OTHER_ROW
        for char in word.upper():
            matrix[_char_row(char), col] = 1
            matrix[target_row, col] = 1
            col += 1
        matrix[WORD_END_ROW, col] = 1
        matrix[TARGET_WORD_END_ROW, col] = 1
        col += 1
    return matrix


# input_sentences and target_sentences are expected to be parallel lists with
# one tag per token, so pairing them with zip visits every (word, tag).
data = [
    _encode_sentence(words, tags)
    for words, tags in zip(input_sentences, target_sentences)
]
In [63]:
# Shallow copy of the encoded sentences (the matrices themselves are shared).
t = list(data)
In [66]:
# Stack the list of per-sentence matrices into a single array.
z = np.asarray(t)
In [70]:
# Shape of a length-one slice along the first axis (the leading axis is kept).
z[:1].shape
Out[70]:
In [71]:
# Number of entries along the first axis of z.
z.shape[0]
Out[71]:
In [73]:
# Size of the first axis of z, read from its shape tuple.
tuple(z.shape)[0]
Out[73]:
In [74]:
# Small scratch list used to check slicing semantics.
a = list(range(1, 6))
In [76]:
# A one-element slice is still a list, not the bare element.
a[slice(1, 2)]
Out[76]:
In [ ]: