There are duplicate questions shared between the train and test sets; the code below finds those duplicates and stores them in a file.


In [1]:
import pandas as pd
import numpy as np
import time

In [ ]:
# Find Test rows whose Title also occurs in Train, accumulating the matched
# (test Id, Title, train Id, Tags) pairs in `duplicates2`.
#
# To resume a previous run, load the partial results instead of starting empty:
# duplicates2 = pd.read_csv("duplicates.csv", index_col='Unnamed: 0')
duplicates2 = pd.DataFrame()

# Chunks 1..RESUME_AFTER_CHUNK were already processed in a previous run and are
# skipped below; set to 0 to process everything from scratch.
RESUME_AFTER_CHUNK = 26

reader_train = pd.read_csv("Train.csv", chunksize=200000)
count = 1
for train in reader_train:
    tStart = time.time()
    if count > RESUME_AFTER_CHUNK:
        # DataFrame.sort(columns=...) was removed in pandas 0.20; sort_values
        # is the supported replacement.
        ordered_train = train.sort_values(by='Title')
        # Chunked readers are single-pass, so Test.csv must be re-opened for
        # every train chunk.
        reader_test = pd.read_csv("Test.csv", chunksize=200000)
        for test in reader_test:
            ordered_test = test.sort_values(by='Title')
            # Inner join on Title pairs each test question with every train
            # question sharing the same title (columns become Id_x, Title,
            # Id_y, Tags).
            matches = pd.merge(ordered_test[['Id', 'Title']],
                               ordered_train[['Id', 'Title', 'Tags']],
                               on='Title')
            # DataFrame.append was removed in pandas 2.0; use pd.concat.
            duplicates2 = pd.concat([duplicates2, matches], ignore_index=True)
        # Checkpoint after each train chunk so progress survives interruption.
        duplicates2.to_csv("duplicates.csv")
    print("finished chunk {0: d} in {1:.0f}s".format(count, time.time() - tStart))
    count += 1

In [23]:
# Manual checkpoint: persist the accumulated train/test duplicate pairs.
duplicates2.to_csv("duplicates.csv")

In [3]:
# DataFrame.sort(columns=...) was removed in pandas 0.20; sort_values is the
# replacement. Sorting by Id_x (the test-set Id) groups all train matches for
# a given test question together.
duplicates_sorted = duplicates2.sort_values(by='Id_x')

In [4]:
# Persist the Id_x-sorted duplicate pairs for the deduplication steps below.
duplicates_sorted.to_csv("duplicates_sorted.csv")

In [ ]:
# Reload the sorted duplicates; 'Unnamed: 0' is the unnamed index column that
# to_csv wrote out above.
duplicates = pd.read_csv("duplicates_sorted.csv", index_col='Unnamed: 0')

In [ ]:
# drop_duplicates(cols=...) was deprecated in pandas 0.17 and later removed;
# the keyword is now `subset`. Keeps the FIRST train match for each test Id.
duplicates_single = duplicates.drop_duplicates(subset='Id_x')

In [ ]:
# Persist the one-match-per-test-Id version (first match kept).
duplicates_single.to_csv("duplicates_single.csv")
Drop duplicates but retain the last row instead of the first; that row should be the one with the most labels. (TODO: verify this assumption.)

In [113]:
# Both old keywords were removed from pandas: cols= -> subset=, and
# take_last=True -> keep='last'. Keeps the LAST train match for each test Id
# instead of the first.
duplicates_single_v2 = duplicates.drop_duplicates(subset='Id_x', keep='last')

In [114]:
# Persist the one-match-per-test-Id version (last match kept).
duplicates_single_v2.to_csv("duplicates_single_v2.csv")