In [9]:
    
import datetime
import glob
import os
import time

import numpy as np
import pandas as pd
    
In [10]:
    
# Collect every CSV under ./csv. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them: the row order of the merged frame
# (and which copy a later keep='last' de-duplication retains) is then
# reproducible from run to run.
allFiles = sorted(glob.glob("./csv/*.csv"))

# NOTE(review): `jobs` is not referenced by any other visible cell -- confirm
# whether this hard-coded single-file load is still needed.
jobs = pd.read_csv('./csv/stackoverflow_jobs_macbook_20160912.csv', index_col='jobid', header=0)

# One DataFrame per input file, each indexed by its 'jobid' column.
# (The loop variable is not called `file`, which shadows the Python 2 builtin.)
dataframes = [pd.read_csv(csv_path, index_col='jobid', header=0) for csv_path in allFiles]

# Stack all per-file frames into a single frame; duplicates are still present.
merged_jobs = pd.concat(dataframes)
    
In [11]:
    
# Report the row count before and after removing duplicate rows.
# print(...) with a single parenthesised argument behaves the same under
# Python 2 and Python 3, unlike the bare print statement.
print("Before de-duplication count is " + str(len(merged_jobs.index)))

# drop_duplicates compares column values only (the index is not considered)
# and keep='last' retains the final occurrence of each duplicated row.
# NOTE(review): because 'jobid' is the index, two rows with the same jobid but
# differing column values are both kept -- confirm that is intended.
# Reassigning instead of inplace=True leaves the same name bound to the
# de-duplicated frame.
merged_jobs = merged_jobs.drop_duplicates(keep='last')

print("After de-duplication count is " + str(len(merged_jobs.index)))
    
    
In [12]:
    
# Write the merged frame to ./csv_out/jobs_merged_<YYYYMMDD>_<row count>.csv.
out_dir = "./csv_out"

# to_csv does not create missing parent directories, so make sure the target
# folder exists first (isdir check instead of exist_ok keeps Python 2 support).
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# Local date stamp, e.g. 20160912, used in the output file name.
timestr = time.strftime("%Y%m%d")
out_path = out_dir + "/jobs_merged_" + timestr + "_" + str(len(merged_jobs.index)) + ".csv"

# mode='w' overwrites any existing file with the same name.
merged_jobs.to_csv(out_path, mode='w')
    
In [ ]: