In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelBinarizer
from tensorflow.contrib import learn
In [16]:
data_files = ['../data/data_by_ocean/eclipse/raw/%d_summary_description.csv' % i
              for i in range(9)]
labels_files = ['../data/data_by_ocean/eclipse/raw/%d_bug_id_date_who.csv' % i
                for i in range(9)]
test_data_files = ['../data/data_by_ocean/eclipse/raw/%d_summary_description.csv' % i
                   for i in range(9, 11)]
test_labels_files = ['../data/data_by_ocean/eclipse/raw/%d_bug_id_date_who.csv' % i
                     for i in range(9, 11)]
In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
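A quick sanity check on an invented bug summary (not from the dataset) shows what the cleaner does: punctuation is padded with spaces, contractions are split, and everything is lowercased.

print(clean_str("NPE in Editor.close()! Can't reproduce?"))
# npe in editor close \( \) ! ca n't reproduce \?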
In [9]:
data = []
for data_file in data_files:
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
data = [clean_str(s) for s in data]
print('train data length: %d' % len(data))
In [17]:
labels_dfs = [pd.read_csv(f) for f in labels_files]
labels = pd.concat(labels_dfs, ignore_index=True)
print(len(labels))
In [18]:
labels.who
Out[18]:
Rewrite the text that was sorted in sort-text-id into numeric-id form and store it.
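A minimal sketch of how such a file could be assembled, assuming the cleaned summaries in data align row-for-row with the concatenated labels (the text and fixer column names match what the next cell reads):

sorted_df = pd.DataFrame({'text': data, 'fixer': labels.who})
sorted_df.to_csv('../data/data_by_ocean/eclipse/sort-text-id.csv', index=False)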
In [4]:
data = pd.read_csv("../data/data_by_ocean/eclipse/sort-text-id.csv", encoding='latin-1')
In [5]:
x = data.text
y = data.fixer
In [6]:
from tensorflow.contrib import learn
In [7]:
dev_sample_index = -1 * int(0.2 * float(len(y)))
x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]
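Note the split is positional, not random, so the dev set is the chronologically newest 20% of reports. For example (figures illustrative), with len(y) == 10000, dev_sample_index is -2000: x[:-2000] is used for training and x[-2000:] for validation.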
In [8]:
document_length_df = pd.DataFrame([len(xx.split(" ")) for xx in x_train])
# Pad/truncate every document to the 80th-percentile document length.
document_length = int(document_length_df[0].quantile(0.8))
vocabulary_processor = learn.preprocessing.VocabularyProcessor(document_length)
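VocabularyProcessor assigns each new token an integer id (0 is reserved for padding/unknown) and pads or truncates every document to document_length. A toy run on invented strings shows the contract:

toy = learn.preprocessing.VocabularyProcessor(max_document_length=4)
print(np.array(list(toy.fit_transform(['null pointer exception', 'ui freeze']))))
# [[1 2 3 0]
#  [4 5 0 0]]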
In [7]:
# fit_transform builds the word vocabulary on the training split;
# transform reuses that vocabulary for the dev split.
x_train = np.array(list(vocabulary_processor.fit_transform(x_train)), dtype=np.float32)
x_dev = np.array(list(vocabulary_processor.transform(x_dev)))
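The fitted vocabulary is not persisted anywhere in this notebook. VocabularyProcessor has save/restore helpers that would be needed to encode new reports later; the file name below is illustrative:

vocabulary_processor.save('eclipse_vocab')  # pickles the fitted processor
# later, e.g. at inference time:
restored = learn.preprocessing.VocabularyProcessor.restore('eclipse_vocab')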
In [11]:
# A length-1 "document" per label: each fixer name is mapped to a single integer id.
label_processor = learn.preprocessing.VocabularyProcessor(1)
In [12]:
y_train = np.array(list(label_processor.fit_transform(y_train)), dtype=np.float32)
y_dev = np.array(list(label_processor.transform(y_dev)))
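If a model's predictions come back as these ids, the fitted processor can map them back to fixer names through its vocabulary_; the id 5 here is an arbitrary illustration:

print(label_processor.vocabulary_.reverse(5))  # id -> original fixer token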
In [27]:
# Columns have different lengths; the shorter one is padded with NaN.
pd.DataFrame({'y_train': pd.Series(y_train.ravel()),
              'y_dev': pd.Series(y_dev.ravel())}).to_csv('tmpt.csv')