In [12]:
import numpy as np
import pandas as pd
import re
import os

# Directory holding the competition files and the individual file names.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the data (ideally via a configurable location) before running.
dirname='/Users/mac/Desktop/Kaggle_datasets/Personalized_med/'
filename_1='training_text.txt'
filename_2='training_variants.csv'
filename_3='test_text.txt'
filename_4='test_variants.csv'
filename_5='submissionFile.csv'


# The variant tables and the submission file are ordinary comma-separated CSVs.
# os.path.join receives the directory and the file name as separate components
# so that it (not string concatenation) is responsible for the separator.
df_train = pd.read_csv(os.path.join(dirname, filename_2))
df_test = pd.read_csv(os.path.join(dirname, filename_4))
df_ans = pd.read_csv(os.path.join(dirname, filename_5))

# The text files separate ID and Text with a literal "||". A multi-character
# separator is treated as a regular expression by pandas, so the pipes are
# escaped (raw string, to avoid invalid-escape warnings) and the python engine
# is required. The first line is skipped and column names are given explicitly.
df_train_text = pd.read_csv(os.path.join(dirname, filename_1), sep=r"\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
df_test_text = pd.read_csv(os.path.join(dirname, filename_3), sep=r"\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

# Whole-file contents of the two text files as single strings. The context
# managers close the file handles; the encoding is made explicit (UTF-8, the
# same default pandas uses above) instead of depending on the platform locale.
with open(os.path.join(dirname, filename_1), 'r', encoding='utf-8') as train_file:
    train_text = train_file.read()
with open(os.path.join(dirname, filename_3), 'r', encoding='utf-8') as test_file:
    test_text = test_file.read()

In [13]:
# Preview the first five rows of the training variants table.
df_train.head(n=5)


Out[13]:
ID Gene Variation Class
0 0 FAM58A Truncating Mutations 1
1 1 CBL W802* 2
2 2 CBL Q249E 2
3 3 CBL N454D 3
4 4 CBL L399V 4

In [14]:
# Preview the first five rows of the test variants table.
df_test.head(n=5)


Out[14]:
ID Gene Variation
0 0 ACSL4 R570S
1 1 NAGLU P521L
2 2 PAH L333F
3 3 ING1 A148D
4 4 TMEM216 G77A

In [15]:
# Preview the first five rows of the submission file.
df_ans.head(n=5)


Out[15]:
ID class1 class2 class3 class4 class5 class6 class7 class8 class9
0 0 0 0 0 0 0 1 0 0 0
1 1 0 1 0 0 0 0 0 0 0
2 2 0 0 0 0 0 1 0 0 0
3 3 0 0 0 0 0 0 0 1 0
4 4 0 0 0 1 0 0 0 0 0

In [16]:
# Preview the first five rows of the parsed training texts (ID, Text).
df_train_text.head(n=5)


Out[16]:
ID Text
0 0 Cyclin-dependent kinases (CDKs) regulate a var...
1 1 Abstract Background Non-small cell lung canc...
2 2 Abstract Background Non-small cell lung canc...
3 3 Recent evidence has demonstrated that acquired...
4 4 Oncogenic mutations in the monomeric Casitas B...

In [ ]:
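# The corpus is too large to keep every word, so the vocabulary is limited to the most frequent words (num_words=10000 below).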
import keras
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# Word-level tokenizer whose vocabulary is capped by num_words.
token_train = keras.preprocessing.text.Tokenizer(num_words=10000)

# fit_on_texts and texts_to_sequences iterate over their argument, so they must
# be given a list of documents. A single string (the raw file contents, or one
# cell of the Text column) would be walked character by character, producing a
# character-level vocabulary and one sequence per character.
# Missing texts are replaced by '' so they yield an empty sequence instead of
# raising.
train_docs = df_train_text['Text'].fillna('').astype(str).tolist()
token_train.fit_on_texts(train_docs)

# Convert one document at a time so progress can still be reported; each
# document is wrapped in a one-element list and its single result unwrapped.
train_seqs = []
for i, doc in enumerate(train_docs):
    train_seqs.append(token_train.texts_to_sequences([doc])[0])

    if i != 0 and i % 100 == 0:
        print('Finished train texts: ',i)

# Assign the whole column in one step. Writing df['Seq'][i] = ... is chained
# indexing (SettingWithCopyWarning) and may silently modify a temporary copy.
df_train_text['Seq'] = pd.Series(train_seqs, index=df_train_text.index)

# Not done here: padding to a fixed length, e.g.
#   sequence.pad_sequences(df_train_text['Seq'], maxlen=100)

In [ ]:
# TODO(review): the assignment to x_train was left unfinished (a bare
# `x_train =` is a SyntaxError and stops Restart & Run All at this cell).
# Padding the token sequences to a fixed length was presumably intended, e.g.
#   x_train = sequence.pad_sequences(df_train_text['Seq'], maxlen=100)
# Confirm the intended input and maxlen before enabling.

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [9]:
# Disabled experiment: the whole cell is a bare string literal, so none of the
# code inside it is executed (the cell merely evaluates to that string).
# The disabled code splits each text into a list of lower-cased words with
# text_to_word_sequence and prints progress every 100 rows.
# NOTE(review): it indexes train_text['Text'], but train_text is assigned the
# raw file contents (a str) in the loading cell; df_train_text is presumably
# the intended frame — confirm before re-enabling.
'''
import keras
from keras.preprocessing import text

for i in range(len(train_text['Text'])):
    train_text['Text'][i] = keras.preprocessing.text.text_to_word_sequence(train_text['Text'][i],
                                                                           filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n',
                                                                           lower=True,
                                                                           split=" ")
    if i % 100 == 0: 
        print('Finished text study number: ', str(i))

print('Finished separating train texts.')
'''


Using TensorFlow backend.
//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Finished:  0.0
Finished:  1.0
Finished:  2.0
Finished:  3.0
Finished:  4.0
Finished:  5.0
Finished:  6.0
Finished:  7.0
Finished:  8.0
Finished:  9.0
Finished:  10.0
Finished:  11.0
Finished:  12.0
Finished:  13.0
Finished:  14.0
Finished:  15.0
Finished:  16.0
Finished:  17.0
Finished:  18.0
Finished:  19.0
Finished:  20.0
Finished:  21.0
Finished:  22.0
Finished:  23.0
Finished:  24.0
Finished:  25.0
Finished:  26.0
Finished:  27.0
Finished:  28.0
Finished:  29.0
Finished:  30.0
Finished:  31.0
Finished:  32.0
Finished:  33.0
Finished separating train texts.

In [ ]:


In [ ]:
# Disabled experiment: the whole cell is a bare string literal, so none of the
# code inside it is executed (the cell merely evaluates to that string).
# The disabled code splits each test text into a list of lower-cased words with
# text_to_word_sequence and prints progress every 100 rows.
# NOTE(review): it indexes test_text['Text'], but test_text is assigned the raw
# file contents (a str) in the loading cell; df_test_text is presumably the
# intended frame — confirm before re-enabling.
'''
from keras.preprocessing import text

for i in range(len(test_text['Text'])):
    test_text['Text'][i] = keras.preprocessing.text.text_to_word_sequence(test_text['Text'][i],
                                                                          filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n',
                                                                          lower=True,
                                                                          split=" ")
    
    if i % 100 == 0: 
        print('Finished test study number: ', str(i))

print('Finished separating test text.')
'''

In [ ]: