There are duplicates between the train and test set; the code below finds them by joining the two files on Title and stores the matches in a file.
In [1]:
import pandas as pd
import numpy as np
import time
In [ ]:
# this cell was resumed after an interrupted run: the partial results are
# reloaded and train chunks 1-26, which were already processed, are skipped
duplicates2 = pd.read_csv("duplicates.csv", index_col='Unnamed: 0')
# for a fresh run, start with an empty frame instead and drop the count check:
# duplicates2 = pd.DataFrame()

reader_train = pd.read_csv("Train.csv", chunksize=200000)
count = 1
for train in reader_train:
    tStart = time.time()
    if count > 26:  # chunks 1-26 were handled in the earlier run
        ordered_train = train.sort_values('Title')
        reader_test = pd.read_csv("Test.csv", chunksize=200000)
        for test in reader_test:
            ordered_test = test.sort_values('Title')
            # inner join on Title; the overlapping Id columns get the
            # suffixes _x (test) and _y (train)
            matches = pd.merge(ordered_test[['Id', 'Title']],
                               ordered_train[['Id', 'Title', 'Tags']],
                               on='Title')
            duplicates2 = pd.concat([duplicates2, matches])
        duplicates2.to_csv("duplicates.csv")  # checkpoint after each train chunk
    print("finished chunk {0: d} in {1:.0f}s".format(count, time.time() - tStart))
    count += 1
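A quick sanity check on the accumulated matches can be run once the loop has finished. This is a minimal sketch, not part of the original run; the Id_x/Id_y names come from the pandas merge suffixes used above.
In [ ]:
# rough sanity check: how many matches were found, and how many distinct
# test questions they cover (Id_x is the test Id, Id_y the train Id)
print(duplicates2.shape)
print(duplicates2['Id_x'].nunique())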
In [23]:
duplicates2.to_csv("duplicates.csv")  # final save of the accumulated matches
In [3]:
duplicates_sorted = duplicates2.sort_values('Id_x')  # sort by the test-set Id (the _x suffix from the merge)
In [4]:
duplicates_sorted.to_csv("duplicates_sorted.csv")
In [ ]:
duplicates = pd.read_csv("duplicates_sorted.csv", index_col='Unnamed: 0')
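A test Title can match more than one training row, so the merge yields several rows per test Id; the next cells keep just one of them. A minimal check of how often that happens (a sketch, assuming the merged column names above):
In [ ]:
# count how many train matches each test Id received; values > 1 explain
# why drop_duplicates on Id_x is needed below
matches_per_test_id = duplicates['Id_x'].value_counts()
print(matches_per_test_id.describe())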
In [ ]:
duplicates_single = duplicates.drop_duplicates(subset='Id_x')  # keep the first train match per test Id
In [ ]:
duplicates_single.to_csv("duplicates_single.csv")
In [113]:
duplicates_single_v2 = duplicates.drop_duplicates(subset='Id_x', keep='last')  # alternative: keep the last train match per test Id
In [114]:
duplicates_single_v2.to_csv("duplicates_single_v2.csv")
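One plausible follow-up (not shown in the original cells, so the lookup built here is an assumption) is to turn the deduplicated matches into a test-Id-to-Tags mapping, reusing the known training tags for duplicated test questions.
In [ ]:
# hypothetical follow-up: build a test-Id -> Tags lookup from the deduplicated matches
dup = pd.read_csv("duplicates_single_v2.csv", index_col='Unnamed: 0')
id_to_tags = dup.set_index('Id_x')['Tags']
print(id_to_tags.head())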