In [7]:
import pandas as pd
# Load the two Stata datasets from the local data directory
volume = pd.read_stata('./data/VolumePaperData.dta')
cites = pd.read_stata('./data/AssessingCodedData.dta')

# The coder column is not carried into the exported table
volume = volume.drop('volCoder', axis=1)

# Sum the replication/extension values per volID, then add their combined total
volume_cites = (
    cites.groupby('volID')[['replication', 'extension']]
    .sum()
    .assign(Both=lambda totals: totals['replication'] + totals['extension'])
)

# Left join keeps every row of `volume`; volIDs with no match in
# `volume_cites` get NaN in the three count columns (filled below)
merged = volume.merge(volume_cites, how='left', left_on='volID', right_index=True)

# Relabel the columns positionally; this relies on `merged` having exactly
# nine columns in this order (the assignment raises if the count differs)
merged.columns = [
    'ID',
    'Title',
    'Web of Science',
    'Top 200',
    'Google Scholar',
    'Field',
    'Replications',
    'Extensions',
    'Both',
]

# Rows without a match in the aggregated table count as zero
merged = merged.fillna({'Replications': 0, 'Extensions': 0, 'Both': 0})

# Write the combined table to disk without the row index
merged.to_csv('data.csv', index=False)
In [8]:
# Preview the first rows of the merged table (default: 5 rows)
merged.head()
Out[8]: