In [1]:
import pandas as pd

data_files = ['../data/data_by_ocean/eclipse/raw/0_summary_description.csv']
labels_files = ['../data/data_by_ocean/eclipse/raw/0_bug_id_date_who.csv']
test_data_files = ['../data/data_by_ocean/eclipse/raw/1_summary_description.csv']
test_labels_files = ['../data/data_by_ocean/eclipse/raw/1_bug_id_date_who.csv']

In [8]:
import re
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
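
A quick sanity check (not part of the original run) shows what clean_str produces on a made-up bug summary; note that "(" and ")" come out as the literal tokens "\(" and "\)", which is why they appear escaped in the cleaned data in Out[11] below.

In [ ]:
# Illustration only, on a hypothetical string
clean_str("Can't reproduce (see bug #123)!")
# repr: "ca n't reproduce \\( see bug 123 \\) !"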

In [10]:
data = []
for data_file in data_files:
    with open(data_file, 'r', encoding='latin-1') as f:
        data.extend([s.strip() for s in f.readlines()])
data = [clean_str(s) for s in data]

test_data = []
for test_data_file in test_data_files:
    with open(test_data_file, 'r', encoding='latin-1') as f:
        test_data.extend([s.strip() for s in f.readlines()])
test_data = [clean_str(s) for s in test_data]

# DataFrame.append returns a new frame rather than modifying in place,
# so concatenate all label files instead of appending inside a loop.
labels = pd.concat([pd.read_csv(labels_file) for labels_file in labels_files],
                   ignore_index=True)
test_labels = pd.concat([pd.read_csv(test_labels_file) for test_labels_file in test_labels_files],
                        ignore_index=True)
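
Since the text lines and the label rows are expected to correspond one-to-one, a quick consistency check (not in the original notebook) is worth running; the exact row counts depend on the CSVs.

In [ ]:
# Sanity check: one label row per summary/description line (assumption about the data layout)
print(len(data), len(labels))
print(len(test_data), len(test_labels))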

In [11]:
data[:2]


Out[11]:
['dcr javadoc code assist should offer better support for closing tags \\( 1gkq0im \\) reply description dani megert 2001 10 10 23 07 46 edt when you want to write e g code test code you can not simple do the following 1 code assist , select code 2 type test 3 code assist , select code note at this point the content assist is empty does not come up , i have to enter a space first suggestions a \\) could offer code code as template \\( at 1 \\) b \\) go back on the line and search for nearest tag and offer its closing version i e if x is found then show the corresponding closing tag \\( if any \\) notes eg \\( 10 1 2001 12 04 43 am \\) templates support in the java doc partition should offer macros for code code etc reply comment 1 claude knaus 2001 10 11 04 06 30 edt suggestion a \\) has been implemented using templates the selection of the template happens via code assist add comment collapse all comments expand all comments , dcr javadoc code assist should offer better support for closing tags \\( 1gkq0im \\)',
 'test reply description jerome lanneluc 2001 10 11 04 37 42 edt testing testing 1 , 2 , 3 reply comment 1 jerome lanneluc 2001 10 11 04 56 34 edt fixed add comment collapse all comments expand all comments , test']

In [12]:
document_length = [len(x.split(" ")) for x in data]

In [13]:
document_l = pd.DataFrame(document_length)

In [14]:
document_l.describe()


Out[14]:
0
count 21067.000000
mean 394.410357
std 693.669842
min 37.000000
25% 134.000000
50% 221.000000
75% 433.000000
max 49088.000000

In [15]:
document_l.shape


Out[15]:
(21067, 1)

In [16]:
test_document_l = pd.DataFrame([len(x.split(" ")) for x in test_data])

In [17]:
test_document_l.describe()


Out[17]:
0
count 60345.000000
mean 472.331146
std 773.428355
min 37.000000
25% 153.000000
50% 256.000000
75% 503.000000
max 29469.000000

In [1]:
import numpy as np
import pandas as pd
from tensorflow.contrib import learn

In [2]:
data = pd.read_csv("../data/data_by_ocean/eclipse/sort-text-id.csv", encoding='latin-1')
x = data.text
y = data.fixer
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(0.2 * float(len(y)))
x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]

# Process the training data
# Use the 0.8 quantile of the document lengths as the maximum document length
document_length_df = pd.DataFrame([len(xx.split(" ")) for xx in x_train])
document_length = np.int64(document_length_df.quantile(0.8))
vocabulary_processor = learn.preprocessing.VocabularyProcessor(1)
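
Note that document_length (the 0.8 quantile of training-document lengths) is not used in the cells that follow, and VocabularyProcessor(1) is fit on the labels, so each fixer string is reduced to a single integer id. A sketch of how the text side would typically be processed with the same API (not executed here; the variable names are made up):

In [ ]:
# Sketch only: pad/truncate each bug report to document_length tokens and map
# words to integer ids, mirroring what is done for the labels below.
text_vocab_processor = learn.preprocessing.VocabularyProcessor(document_length)
x_ids_train = np.array(list(text_vocab_processor.fit_transform(x_train)))
x_ids_dev = np.array(list(text_vocab_processor.transform(x_dev)))
# x_ids_train.shape would be (len(x_train), document_length)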

In [3]:
t_train = vocabulary_processor.fit_transform(y_train)  # generator yielding one integer id per fixer label

In [4]:
t_l_train = np.array(list(t_train))

In [5]:
t_l_train.shape


Out[5]:
(159968, 1)

In [6]:
t_ed = np.expand_dims(t_l_train, -1)  # (159968, 1) -> (159968, 1, 1)

In [7]:
np.squeeze(t_ed).shape  # squeeze removes the size-1 axes again


Out[7]:
(159968,)

In [8]:
len(vocabulary_processor.vocabulary_)


Out[8]:
2643

In [9]:
len(set(y_train))


Out[9]:
2272
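
The vocabulary here (2643 entries) is larger than the number of distinct fixers (2272), most likely because VocabularyProcessor tokenizes each fixer string (so one fixer can contribute several tokens) and reserves an <UNK> entry. If a strict one-integer-per-fixer encoding is wanted instead, a simple alternative is pandas.factorize (a sketch, not the approach used above):

In [ ]:
# Sketch: map each fixer string to exactly one integer id
y_codes, y_uniques = pd.factorize(y_train)
print(y_codes.shape, len(y_uniques))  # len(y_uniques) should equal len(set(y_train))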
