In [1]:
import pandas as pd

data_files = ['../data/data_by_ocean/eclipse/raw/0_summary_description.csv']
labels_files = ['../data/data_by_ocean/eclipse/raw/0_bug_id_date_who.csv']
test_data_files = ['../data/data_by_ocean/eclipse/raw/1_summary_description.csv']
test_labels_files = ['../data/data_by_ocean/eclipse/raw/1_bug_id_date_who.csv']

In [8]:
import re
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
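
A quick sanity check (not part of the original run) shows what clean_str produces on a made-up bug summary; note that "(" and ")" come out as the literal tokens "\(" and "\)", which is why they appear escaped in the cleaned data in Out[11] below.

In [ ]:
# Illustration only, on a hypothetical string
clean_str("Can't reproduce (see bug #123)!")
# repr: "ca n't reproduce \\( see bug 123 \\) !"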

In [10]:
data = []
for data_file in data_files:
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
data = [clean_str(s) for s in data]

test_data = []
for test_data_file in test_data_files:
    with open(test_data_file, 'r', encoding='latin-1') as f:
        test_data.extend([s.strip() for s in f.readlines()])
test_data = [clean_str(s) for s in test_data]

# DataFrame.append returns a new frame rather than modifying in place,
# so concatenate all label files instead of appending inside a loop.
labels = pd.concat([pd.read_csv(labels_file) for labels_file in labels_files],
                   ignore_index=True)
test_labels = pd.concat([pd.read_csv(test_labels_file) for test_labels_file in test_labels_files],
                        ignore_index=True)
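
Since the text lines and the label rows are expected to correspond one-to-one, a quick consistency check (not in the original notebook) is worth running; the exact row counts depend on the CSVs.

In [ ]:
# Sanity check: one label row per summary/description line (assumption about the data layout)
print(len(data), len(labels))
print(len(test_data), len(test_labels))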

In [11]:
data[:2]


Out[11]:
['dcr javadoc code assist should offer better support for closing tags \\( 1gkq0im \\) reply description dani megert 2001 10 10 23 07 46 edt when you want to write e g code test code you can not simple do the following 1 code assist , select code 2 type test 3 code assist , select code note at this point the content assist is empty does not come up , i have to enter a space first suggestions a \\) could offer code code as template \\( at 1 \\) b \\) go back on the line and search for nearest tag and offer its closing version i e if x is found then show the corresponding closing tag \\( if any \\) notes eg \\( 10 1 2001 12 04 43 am \\) templates support in the java doc partition should offer macros for code code etc reply comment 1 claude knaus 2001 10 11 04 06 30 edt suggestion a \\) has been implemented using templates the selection of the template happens via code assist add comment collapse all comments expand all comments , dcr javadoc code assist should offer better support for closing tags \\( 1gkq0im \\)',
 'test reply description jerome lanneluc 2001 10 11 04 37 42 edt testing testing 1 , 2 , 3 reply comment 1 jerome lanneluc 2001 10 11 04 56 34 edt fixed add comment collapse all comments expand all comments , test']

In [12]:
document_length = [len(x.split(" ")) for x in data]

In [13]:
document_l = pd.DataFrame(document_length)

In [14]:
document_l.describe()


Out[14]:
0
count 21067.000000
mean 394.410357
std 693.669842
min 37.000000
25% 134.000000
50% 221.000000
75% 433.000000
max 49088.000000

In [15]:
document_l.shape


Out[15]:
(21067, 1)

In [16]:
test_document_l = pd.DataFrame([len(x.split(" ")) for x in test_data])

In [17]:
test_document_l.describe()


Out[17]:
0
count 60345.000000
mean 472.331146
std 773.428355
min 37.000000
25% 153.000000
50% 256.000000
75% 503.000000
max 29469.000000

In [1]:
import numpy as np
import pandas as pd
from tensorflow.contrib import learn

In [2]:
data = pd.read_csv("../data/data_by_ocean/eclipse/sort-text-id.csv", encoding='latin-1')
x = data.text
y = data.fixer
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(0.2 * float(len(y)))
x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]

# Process the training data
# Use the 0.8 quantile of the document lengths as the maximum document length
document_length_df = pd.DataFrame([len(xx.split(" ")) for xx in x_train])
document_length = np.int64(document_length_df.quantile(0.8))
vocabulary_processor = learn.preprocessing.VocabularyProcessor(1)
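
Note that document_length (the 0.8 quantile of training-document lengths) is not used in the cells that follow, and VocabularyProcessor(1) is fit on the labels, so each fixer string is reduced to a single integer id. A sketch of how the text side would typically be processed with the same API (not executed here; the variable names are made up):

In [ ]:
# Sketch only: pad/truncate each bug report to document_length tokens and map
# words to integer ids, mirroring what is done for the labels below.
text_vocab_processor = learn.preprocessing.VocabularyProcessor(document_length)
x_ids_train = np.array(list(text_vocab_processor.fit_transform(x_train)))
x_ids_dev = np.array(list(text_vocab_processor.transform(x_dev)))
# x_ids_train.shape would be (len(x_train), document_length)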

In [3]:
t_train = vocabulary_processor.fit_transform(y_train)  # generator yielding one integer id per fixer label

In [4]:
t_l_train = np.array(list(t_train))

In [5]:
t_l_train.shape


Out[5]:
(159968, 1)

In [6]:
t_ed = np.expand_dims(t_l_train, -1)  # (159968, 1) -> (159968, 1, 1)

In [7]:
np.squeeze(t_ed).shape  # squeeze removes the size-1 axes again


Out[7]:
(159968,)

In [8]:
len(vocabulary_processor.vocabulary_)


Out[8]:
2643

In [9]:
len(set(y_train))


Out[9]:
2272
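
The vocabulary here (2643 entries) is larger than the number of distinct fixers (2272), most likely because VocabularyProcessor tokenizes each fixer string (so one fixer can contribute several tokens) and reserves an <UNK> entry. If a strict one-integer-per-fixer encoding is wanted instead, a simple alternative is pandas.factorize (a sketch, not the approach used above):

In [ ]:
# Sketch: map each fixer string to exactly one integer id
y_codes, y_uniques = pd.factorize(y_train)
print(y_codes.shape, len(y_uniques))  # len(y_uniques) should equal len(set(y_train))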
