In [12]:
import numpy as np
import pandas as pd
import re
import os
# Data locations.
# NOTE(review): hardcoded absolute local path — parameterize (env var / config) for portability.
dirname = '/Users/mac/Desktop/Kaggle_datasets/Personalized_med/'
filename_1 = 'training_text.txt'
filename_2 = 'training_variants.csv'
filename_3 = 'test_text.txt'
filename_4 = 'test_variants.csv'
filename_5 = 'submissionFile.csv'

# Variant tables and the sample submission are plain CSV.
# BUG FIX: os.path.join(dirname + filename) concatenated the strings and passed a
# single argument to join — pass the directory and filename as separate arguments.
df_train = pd.read_csv(os.path.join(dirname, filename_2))
df_test = pd.read_csv(os.path.join(dirname, filename_4))
df_ans = pd.read_csv(os.path.join(dirname, filename_5))

# The text files use '||' as the ID/Text delimiter; a regex separator requires the
# python engine. Use a raw string so '\|' is not an (invalid) string escape.
df_train_text = pd.read_csv(os.path.join(dirname, filename_1), sep=r"\|\|", engine='python',
                            header=None, skiprows=1, names=["ID", "Text"])
df_test_text = pd.read_csv(os.path.join(dirname, filename_3), sep=r"\|\|", engine='python',
                           header=None, skiprows=1, names=["ID", "Text"])

# BUG FIX: open(...).read() leaked the file handles — use context managers so
# the files are closed deterministically.
with open(os.path.join(dirname, filename_1), 'r') as f:
    train_text = f.read()
with open(os.path.join(dirname, filename_3), 'r') as f:
    test_text = f.read()
In [13]:
# Preview the training variants table (Gene / Variation / Class columns per row loaded above).
df_train.head()
Out[13]:
In [14]:
# Preview the test variants table.
df_test.head()
Out[14]:
In [15]:
# Preview the sample-submission format.
df_ans.head()
Out[15]:
In [16]:
# Preview the parsed training text: one (ID, Text) row per document.
df_train_text.head()
Out[16]:
In [ ]:
# The corpus is large, so cap the vocabulary at the 10,000 most frequent words.
# NOTE(review): the original comment said 30,000 but num_words was 10000; kept 10000.
import keras
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

token_train = Tokenizer(num_words=10000)

# BUG FIX: fit_on_texts expects an iterable of texts. The original passed the raw
# concatenated file string, which made Keras treat every single character as a
# separate document — fit on the per-document Text column instead.
token_train.fit_on_texts(df_train_text['Text'])

# BUG FIX: texts_to_sequences likewise expects a list of texts; calling it on one
# string returned a sequence per character. Converting the whole column at once
# also removes the per-row loop and the chained-indexing assignment
# (df['Seq'][i] = ...) that raised SettingWithCopyWarning.
df_train_text['Seq'] = token_train.texts_to_sequences(df_train_text['Text'])
print('Finished train texts:', len(df_train_text))
In [ ]:
# Pad every word-index sequence to a fixed length so it can feed a Keras model.
# The original cell was left incomplete (`x_train =`); this completes it per the
# commented-out pad_sequences line in the tokenization cell above.
# assumes maxlen=100 matches the intended model input — TODO confirm
x_train = sequence.pad_sequences(df_train_text['Seq'], maxlen=100)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [9]:
# NOTE(review): dead code — an earlier per-row word-tokenization experiment kept
# disabled inside a string literal so it never executes. It also indexes the raw
# string `train_text` as if it were a DataFrame, so it would fail if re-enabled.
# Consider deleting this cell entirely.
'''
import keras
from keras.preprocessing import text
for i in range(len(train_text['Text'])):
train_text['Text'][i] = keras.preprocessing.text.text_to_word_sequence(train_text['Text'][i],
filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=" ")
if i % 100 == 0:
print('Finished text study number: ', str(i))
print('Finished separating train texts.')
'''
In [ ]:
In [ ]:
# NOTE(review): dead code — the test-set twin of the disabled tokenization cell
# above, kept inside a string literal so it never executes. It indexes the raw
# string `test_text` as if it were a DataFrame, so it would fail if re-enabled.
# Consider deleting this cell entirely.
'''
from keras.preprocessing import text
for i in range(len(test_text['Text'])):
test_text['Text'][i] = keras.preprocessing.text.text_to_word_sequence(test_text['Text'][i],
filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=" ")
if i % 100 == 0:
print('Finished test study number: ', str(i))
print('Finished separating test text.')
'''
In [ ]: