In [9]:
import datetime
import glob
import os
import time

import numpy as np
import pandas as pd
In [10]:
# Collect every CSV export under ./csv and stack them into one frame indexed by jobid.
# glob.glob returns paths in arbitrary (filesystem) order; sort them so the
# concatenation order -- and therefore which copy of a duplicated row counts as
# the "last" one downstream -- is the same on every run.
allFiles = sorted(glob.glob("./csv/*.csv"))
# NOTE(review): `jobs` is not referenced by any cell visible here; kept in case a
# cell outside this view relies on it. It requires this exact file to exist.
jobs = pd.read_csv('./csv/stackoverflow_jobs_macbook_20160912.csv', index_col='jobid',header=0)
# Read each file with its first row as the header and the 'jobid' column as index.
dataframes = [
    pd.read_csv(csv_path, index_col='jobid', header=0)
    for csv_path in allFiles
]
merged_jobs = pd.concat(dataframes)
In [11]:
# Report the row count before and after removing duplicate rows.
# print(...) with a single parenthesised argument behaves the same on
# Python 2 and Python 3, unlike the bare print statement.
count_before = len(merged_jobs.index)
print("Before de-duplication count is " + str(count_before))
# drop_duplicates compares column values only (the jobid index is ignored);
# keep='last' retains the final occurrence of each duplicated row.
# Reassigning instead of inplace=True leaves the same name bound to the result.
merged_jobs = merged_jobs.drop_duplicates(keep='last')
print("After de-duplication count is " + str(len(merged_jobs.index)))
In [12]:
# Write the merged frame to ./csv_out, naming the file after today's local date
# (YYYYMMDD) and the number of rows being written.
timestr = time.strftime("%Y%m%d")
out_dir = "./csv_out"
# to_csv does not create missing parent directories, so make sure the target
# exists first. An isdir check plus makedirs is used (rather than exist_ok=True)
# so the cell also runs on Python 2.
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_path = out_dir + "/jobs_merged_" + timestr + "_" + str(len(merged_jobs.index)) + ".csv"
# mode='w' overwrites any existing file with the same name.
merged_jobs.to_csv(out_path, mode='w')
In [ ]: