In [1]:
import pandas as pd
data_files = ['../data/data_by_ocean/eclipse/raw/0_summary_description.csv']
labels_files = ['../data/data_by_ocean/eclipse/raw/0_bug_id_date_who.csv']
test_data_files = ['../data/data_by_ocean/eclipse/raw/1_summary_description.csv']
test_labels_files = ['../data/data_by_ocean/eclipse/raw/1_bug_id_date_who.csv']
In [8]:
import re
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # the original repo uses " \( " etc. as the replacement, which leaves a
    # literal backslash in the text; plain parentheses are what is intended
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
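A quick sanity check of the tokenizer (the bug summary below is made up for illustration):

print(clean_str("Can't reproduce the NPE (see bug #123)!"))
# -> ca n't reproduce the npe ( see bug 123 ) !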
In [10]:
data = []
for data_file in data_files:
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
data = [clean_str(s) for s in data]

test_data = []
for test_data_file in test_data_files:
    with open(test_data_file, 'r', encoding='latin-1') as f:
        test_data.extend([s.strip() for s in f.readlines()])
test_data = [clean_str(s) for s in test_data]

# DataFrame.append returns a new frame rather than modifying in place,
# so accumulate the per-file frames with pd.concat instead
labels = pd.read_csv(labels_files[0])
for labels_file in labels_files[1:]:
    labels = pd.concat([labels, pd.read_csv(labels_file)], ignore_index=True)

test_labels = pd.read_csv(test_labels_files[0])
for test_labels_file in test_labels_files[1:]:
    test_labels = pd.concat([test_labels, pd.read_csv(test_labels_file)], ignore_index=True)
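Since the file lists are no longer consumed with pop, the same frames can also be built in one pass (a minimal equivalent sketch):

labels = pd.concat((pd.read_csv(f) for f in labels_files), ignore_index=True)
test_labels = pd.concat((pd.read_csv(f) for f in test_labels_files), ignore_index=True)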
In [11]:
data[:2]
Out[11]:
In [12]:
document_length = [len(x.split(" ")) for x in data]
In [13]:
document_l = pd.DataFrame(document_length)
In [14]:
document_l.describe()
Out[14]:
In [15]:
document_l.shape
Out[15]:
In [16]:
test_document_l = pd.DataFrame([len(x.split(" ")) for x in test_data])
In [17]:
test_document_l.describe()
Out[17]:
In [1]:
import numpy as np
import pandas as pd
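# tensorflow.contrib was removed in TensorFlow 2.x; this notebook assumes TensorFlow 1.x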
from tensorflow.contrib import learn
In [2]:
data = pd.read_csv("../data/data_by_ocean/eclipse/sort-text-id.csv", encoding='latin-1')
x = data.text
y = data.fixer
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(0.2 * float(len(y)))
x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]
# Prepare the training data:
# use the 0.8 quantile of the training document lengths as the maximum length
document_length_df = pd.DataFrame([len(xx.split(" ")) for xx in x_train])
document_length = np.int64(document_length_df.quantile(0.8))
# max_document_length=1: this processor maps each fixer label to a single integer id
vocabulary_processor = learn.preprocessing.VocabularyProcessor(1)
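document_length is computed from the training summaries but not used in this cell; a minimal sketch (hypothetical names text_processor, x_train_ids, x_dev_ids) of vectorizing the text with it:

# pad/truncate each summary to document_length tokens and map words to integer ids
text_processor = learn.preprocessing.VocabularyProcessor(document_length)
x_train_ids = np.array(list(text_processor.fit_transform(x_train)))
# reuse the fitted vocabulary for the dev split
x_dev_ids = np.array(list(text_processor.transform(x_dev)))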
In [3]:
# fit_transform returns a generator of word-id arrays, one per label
t_train = vocabulary_processor.fit_transform(y_train)
In [4]:
t_l_train = np.array(list(t_train))
In [5]:
t_l_train.shape
Out[5]:
In [6]:
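# with max_document_length=1, t_l_train has shape (n, 1); add a trailing axis -> (n, 1, 1)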
t_ed = np.expand_dims(t_l_train, -1)
In [7]:
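# squeeze drops both size-1 axes, leaving a flat (n,) vector of label ids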
np.squeeze(t_ed).shape
Out[7]:
In [8]:
len(vocabulary_processor.vocabulary_)
Out[8]:
In [9]:
len(set(y_train))
Out[9]:
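The two counts need not be equal: vocabulary_ reserves an <UNK> entry at id 0, and VocabularyProcessor's default tokenizer can split a fixer id (e.g. an email address) into several tokens. A one-id-per-fixer mapping (a hypothetical sketch, not what VocabularyProcessor does) would be:

fixer_ids = {fixer: i for i, fixer in enumerate(sorted(set(y_train)))}
y_train_ids = np.array([fixer_ids[f] for f in y_train])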