In [7]:
import pandas as pd

# Import Stata files
volume = pd.read_stata('./data/VolumePaperData.dta')
cites = pd.read_stata('./data/AssessingCodedData.dta')

# Drop coder column
volume.drop('volCoder', axis=1, inplace=True)

# Create new dataframe with total no. of replications/extensions for each paper
volume_cites = cites[['volID', 'replication', 'extension']].groupby('volID').sum()

# Add new variable
volume_cites['Both'] = volume_cites['replication'] + volume_cites['extension']

# Merge dataframes together
merged = pd.merge(volume, volume_cites, how='left', left_on='volID', right_index=True)

# Rename columns
merged.columns = ['ID', 'Title', 'Web of Science', 'Top 200', 'Google Scholar', 'Field', 'Replications', 'Extensions', 'Both']

# Fill in missing values
merged[['Replications', 'Extensions', 'Both']] = merged[['Replications', 'Extensions', 'Both']].fillna(0)

# Export to CSV
merged.to_csv('data.csv', index=False)

In [8]:
merged.head()


Out[8]:
ID Title Web of Science Top 200 Google Scholar Field Replications Extensions Both
0 2 Learning about a New Technology: Pineapple in ... 176 108.0 1246 Development 1.0 10.0 11.0
1 3 Multiple-Product Firms and Product Switching 111 74.0 764 Labor/IO 1.0 2.0 3.0
2 4 Momma's Got the Pill": How Anthony Comstock an... 23 15.0 98 Labor/IO 2.0 0.0 2.0
3 5 Matching and Sorting in Online Dating 75 23.0 326 Labor/IO 0.0 5.0 5.0
4 6 Entry, Exit, and Investment-Specific Technical... 9 9.0 38 Macro/Int/Trade 0.0 0.0 0.0