In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelBinarizer
from tensorflow.contrib import learn

In [16]:
data_files = ['../data/data_by_ocean/eclipse/raw/0_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/1_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/2_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/3_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/4_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/5_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/6_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/7_summary_description.csv',
              '../data/data_by_ocean/eclipse/raw/8_summary_description.csv']
labels_files = ['../data/data_by_ocean/eclipse/raw/0_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/1_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/2_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/3_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/4_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/5_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/6_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/7_bug_id_date_who.csv',
                '../data/data_by_ocean/eclipse/raw/8_bug_id_date_who.csv']
test_data_files = ['../data/data_by_ocean/eclipse/raw/9_summary_description.csv',
                   '../data/data_by_ocean/eclipse/raw/10_summary_description.csv']
test_labels_files = ['../data/data_by_ocean/eclipse/raw/9_bug_id_date_who.csv',
                     '../data/data_by_ocean/eclipse/raw/10_bug_id_date_who.csv']

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [9]:
data = []
for data_file in data_files:
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
        data = [clean_str(s) for s in data]
print('train data length: %d' % len(data))


train data length: 163611

In [17]:
del labels
labels_dfs = [pd.read_csv(f) for f in labels_files]
labels = pd.concat(labels_dfs)
print(len(labels))


163611

In [18]:
labels.who


Out[18]:
0                     Claude_Knaus@oti.com
1               jerome_lanneluc@fr.ibm.com
2                     Claude_Knaus@oti.com
3                     Claude_Knaus@oti.com
4                        akiezun@gmail.com
5                        akiezun@gmail.com
6                        akiezun@gmail.com
7                        akiezun@gmail.com
8                        akiezun@gmail.com
9                        akiezun@gmail.com
10                       akiezun@gmail.com
11                       akiezun@gmail.com
12                       akiezun@gmail.com
13                    Claude_Knaus@oti.com
14                       akiezun@gmail.com
15                       akiezun@gmail.com
16                       akiezun@gmail.com
17                    Claude_Knaus@oti.com
18                    Claude_Knaus@oti.com
19                       akiezun@gmail.com
20                       akiezun@gmail.com
21               philippe_mulet@fr.ibm.com
22                       akiezun@gmail.com
23                       akiezun@gmail.com
24              kai-uwe_maetzel@ch.ibm.com
25                       akiezun@gmail.com
26                       akiezun@gmail.com
27                    Claude_Knaus@oti.com
28                       akiezun@gmail.com
29                       akiezun@gmail.com
                       ...                
18149            michael.norman@oracle.com
18150                   nathan@eclipse.org
18151                       wlu@us.ibm.com
18152                 mikekucera@gmail.com
18153                 mikekucera@gmail.com
18154                 mikekucera@gmail.com
18155              matt.macivor@oracle.com
18156             markus_keller@ch.ibm.com
18157           steffen.pingel@tasktop.com
18158                  ken.ryall@gmail.com
18159                  cwindatt@ca.ibm.com
18160                  ken.ryall@gmail.com
18161                  emoffatt@ca.ibm.com
18162            Michael_Rennie@ca.ibm.com
18163                  remy.suen@gmail.com
18164            Michael_Rennie@ca.ibm.com
18165                     cgold@us.ibm.com
18166              darin.eclipse@gmail.com
18167                yves.yang@soyatec.com
18168                   aniefer@ca.ibm.com
18169           john.cortell@freescale.com
18170              darius.jockel@itemis.de
18171           cameron.bateman@oracle.com
18172                 d_a_carver@yahoo.com
18173                 mikekucera@gmail.com
18174                 mikekucera@gmail.com
18175           cameron.bateman@oracle.com
18176    raghunathan.srinivasan@oracle.com
18177                     cgold@us.ibm.com
18178                  pascal@rapicorp.com
Name: who, dtype: object

将sort-text-id排好序的文本改写成编号的形式存储起来。


In [4]:
data = pd.read_csv("../data/data_by_ocean/eclipse/sort-text-id.csv", encoding='latin-1')

In [5]:
x = data.text
y = data.fixer

In [6]:
from tensorflow.contrib import learn

In [7]:
dev_sample_index = -1 * int(0.2 * float(len(y)))
x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]

In [8]:
document_length_df = pd.DataFrame([len(xx.split(" ")) for xx in x_train])
document_length = np.int64(document_length_df.quantile(0.8))
vocabulary_processor = learn.preprocessing.VocabularyProcessor(document_length)

In [9]:
t_train = vocabulary_processor.fit_transform(x_train)

In [1]:



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-60c1fcef09cf> in <module>()
----> 1 np.array(t_train)

NameError: name 'np' is not defined

In [7]:
x_train = np.array(list(vocabulary_processor.fit_transform(x_train)), dtype=np.float32)
x_dev = np.array(list(vocabulary_processor.transform(x_dev)))

In [11]:
label_processor = learn.preprocessing.VocabularyProcessor(1)

In [12]:
y_train = np.array(list(label_processor.fit_transform(y_train)), dtype=np.float32)
y_dev = np.array(list(label_processor.transform(y_dev)))

In [27]:
pd.DataFrame([y_train, y_dev]).to_csv('tmpt.csv')

In [ ]: