In [11]:
# Destination path of the cleaned dataset; passed to df.to_csv in the final cell.
output_file = "../data/cleaned_coalpublic2013.csv"
In [12]:
import pandas as pd
import numpy as np
In [13]:
# Load the raw 2013 coal spreadsheet. header=2 takes the column labels from
# the third row of the sheet (zero-based), and the "MSHA ID" column becomes
# the row index.
df = pd.read_excel(
    "../data/coalpublic2013.xls",
    header=2,
    index_col="MSHA ID",
)
In [14]:
# Inspect the distinct company-type labels to spot spelling variants.
df.loc[:, 'Company Type'].unique()
Out[14]:
In [15]:
# Correct the misspelled category 'Indepedent Producer Operator' found in the
# raw data so it matches 'Independent Producer Operator'.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: the in-place form
# operates on an intermediate Series, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)
In [16]:
# Normalise the column labels: every space becomes an underscore
# (splitting on " " and re-joining with "_" is the same substitution).
df.rename(columns=lambda label: "_".join(label.split(" ")), inplace=True)
In [17]:
# Drop coal mines without any production (keep rows with production > 0).
# The explicit .copy() makes the filtered frame independent of the original,
# so assigning new columns to it afterwards does not raise
# SettingWithCopyWarning.
df = df[df['Production_(short_tons)'] > 0].copy()
In [18]:
# Add the natural logarithm of production as a new column.
df['log_production'] = df['Production_(short_tons)'].pipe(np.log)
In [19]:
# Persist the cleaned frame to the configured output path as CSV.
df.to_csv(path_or_buf=output_file)