notebook.community

Edit and run



In [59]:

    
import numpy as np



In [60]:

    
# Load the training file: each non-empty line is "token<TAB>tag", and an
# empty line terminates the current sentence.
input_sentences = []   # one list of tokens per sentence
target_sentences = []  # one list of tags per sentence, parallel to input_sentences
with open("../data/train") as f:
    in_sentence = []
    target_sentence = []
    for line in f:
        # Strip only the line terminator, so that a CRLF blank line is also
        # recognised as a sentence boundary instead of failing in the split.
        if line.rstrip("\r\n"):
            in_target = line.split('\t')
            in_sentence.append(in_target[0])
            target_sentence.append(in_target[1].strip())
        else:
            input_sentences.append(in_sentence)
            target_sentences.append(target_sentence)
            in_sentence = []
            target_sentence = []
    # A file that does not end with an empty line still has a pending
    # sentence at this point; flush it so it is not silently dropped.
    if in_sentence:
        input_sentences.append(in_sentence)
        target_sentences.append(target_sentence)

# Parenthesised form prints the same value on Python 2 and Python 3.
print(len(input_sentences))



In [61]:

    
# Encode every sentence as a fixed-size one-hot matrix: one column per
# character, plus one separator column after each word.
#
# Row layout:
#   0-63   characters with code 33..96 (words are upper-cased first, so
#          ASCII letters always fall in this range)
#   64-67  characters with code 123..126
#   68     any other character
#   69     word separator
#   70     label: character of a word whose tag does not contain "geo-loc"
#   71     label: character of a word whose tag contains "geo-loc"
#   72     label: word separator
NUM_ROWS = 70 + 3
MAX_COLS = 500 * 2  # a sentence needing more columns raises IndexError
OTHER_CHAR_ROW = 68
SEPARATOR_ROW = 69
OTHER_LABEL_ROW = 70
GEO_LOC_LABEL_ROW = 71
SEPARATOR_LABEL_ROW = 72


def _char_row(char):
    """Return the one-hot row index for a single, already upper-cased character."""
    code = ord(char)
    if 33 <= code <= 96:
        return code - 33
    if 123 <= code <= 126:
        # Codes 97..122 never get a row of their own, so shift down by 26
        # to continue directly after row 63.
        return code - 33 - 26
    return OTHER_CHAR_ROW


def _encode_sentence(words, tags):
    """Build the (NUM_ROWS, MAX_COLS) float32 matrix for one sentence.

    words -- the tokens of the sentence
    tags  -- the tag of each token, in the same order as ``words``
    """
    matrix = np.zeros((NUM_ROWS, MAX_COLS), dtype=np.float32)
    col = 0
    for word, tag in zip(words, tags):
        label_row = GEO_LOC_LABEL_ROW if "geo-loc" in tag else OTHER_LABEL_ROW
        for char in word.upper():
            matrix[_char_row(char), col] = 1
            matrix[label_row, col] = 1
            col += 1
        # Every word, including the last one, is followed by a separator column.
        matrix[SEPARATOR_ROW, col] = 1
        matrix[SEPARATOR_LABEL_ROW, col] = 1
        col += 1
    return matrix


data = [
    _encode_sentence(words, tags)
    for words, tags in zip(input_sentences, target_sentences)
]



In [63]:

    
# Shallow copy: a new list object holding the same matrices as `data`.
t = list(data)



In [66]:

    
# Combine the list of matrices into a single array with a leading axis
# that indexes the list entries.
z = np.asarray(t)



In [70]:

    
# A range slice keeps the leading axis, so the result is still 3-D.
z[:1].shape









    Out[70]:





(1, 73, 1000)



In [71]:

    
# len() of an array is the size of its first axis.
len(z)









    Out[71]:





2394



In [73]:

    
# Same count as len(z), read from the first entry of the shape tuple.
np.shape(z)[0]









    Out[73]:





2394



In [74]:

    
# Small throwaway list used to try out slicing below.
a = list(range(1, 6))



In [76]:

    
# A slice yields a list even when it selects a single element.
a[1:2]









    Out[76]:





[2]



In [ ]: