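Worked example: the toy data set from figure 2 of the paper below, filtered with the Peters and Burak instance filters from `tl_algs`: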

Peters, F., Menzies, T., & Marcus, A. (2013). Better cross company defect prediction. In IEEE International Working Conference on Mining Software Repositories (pp. 409–418). https://doi.org/10.1109/MSR.2013.6624057


In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.ensemble import RandomForestClassifier
import random
RAND_SEED = 2016 
random.seed(RAND_SEED) # change this to see new random data!

In [13]:
# Training pool: one (x, y, project) tuple per instance.
train_instances = [
    (0, 6, 0),
    (3, 4, 1),
    (4, 3, 0),
    (5, 7, 0),
    (5, 2, 0),
    (8, 8, 2),
]

# Target (test) instances: one (x, y) tuple each, with no project label.
test_instances = [
    (1, 1),
    (2.5, 4.5),
    (5, 3),
]

In [14]:
# Wrap the raw tuples in DataFrames; the training frame carries an extra
# `project` column identifying the source project of each instance.
feature_columns = ['x', 'y']
train_df = pd.DataFrame(data=train_instances, columns=feature_columns + ['project'])
test_df = pd.DataFrame(data=test_instances, columns=feature_columns)

In [15]:
# Draw one scatter series per training project, then the target instances,
# all on the same axes so they share a single legend.
fig, ax = plt.subplots()
for project_id in train_df.project.unique():
    in_project = train_df.project == project_id
    ax.scatter(
        train_df.x[in_project],
        train_df.y[in_project],
        label="train " + str(project_id),
    )
ax.scatter(test_df.x, test_df.y, label="target")
ax.legend()


Out[15]:
<matplotlib.legend.Legend at 0x10ec09810>

In [16]:
from tl_algs import peters, burak

In [17]:
# Placeholder labels: one False per training row (default 0..n-1 index).
peters_labels = pd.Series([False] * len(train_df))

# Build the Peters transfer learner over the toy training pool and target set.
p = peters.Peters(
    test_set_X=test_df,
    test_set_domain="doesn'tmatter",
    train_pool_X=train_df[['x', 'y']],
    train_pool_y=peters_labels,
    train_pool_domain=train_df.project,
    cluster_factor=9999,
    Base_Classifier=RandomForestClassifier,
    rand_seed=RAND_SEED,
)

In [18]:
# Run the Peters filter on the data stored on `p`; keep the result under a name
# and end the cell with it so it is still displayed as the cell output.
peters_filtered = p.filter_instances(p.train_pool_X, p.train_pool_y, p.test_set_X)
peters_filtered


Out[18]:
(     x    y
 1  3.0  4.0
 2  4.0  3.0, 0    False
 1    False
 dtype: bool)

In [16]:
# Placeholder labels: one False per training row (default 0..n-1 index).
burak_labels = pd.Series([False] * len(train_df))

# Build the Burak transfer learner over the same toy training pool and target set.
b = burak.Burak(
    test_set_X=test_df,
    test_set_domain="doesn'tmatter",
    train_pool_X=train_df[['x', 'y']],
    train_pool_y=burak_labels,
    train_pool_domain=train_df.project,
    cluster_factor=9999,
    Base_Classifier=RandomForestClassifier,
    rand_seed=RAND_SEED,
)

In [17]:
# Run the Burak filter on the Burak object's own data. The original call passed
# the Peters object's attributes (`p.train_pool_X`, ...), which made this cell
# depend on the Peters cells having been executed first.
# NOTE(review): assumes `b` exposes train_pool_X / train_pool_y / test_set_X the
# same way `p` does (both were built with identical keyword arguments) — confirm
# against tl_algs.
burak_k = 3  # presumably the number of nearest training neighbours per test instance — confirm against burak.Burak.filter_instances
# filter_instances returns an indexable result; element 0 is the filtered feature frame shown below.
burak_filtered_X = b.filter_instances(b.train_pool_X, b.train_pool_y, b.test_set_X, burak_k)[0]
burak_filtered_X


Out[17]:
x y
1 3.0 4.0
2 4.0 3.0
4 5.0 2.0
0 0.0 6.0
3 5.0 7.0