Data Cleaning

September 04, 2017 06:27 - E. Asa Bour - bourea@hotmail.com - Cleaned up the data and removed coal mines with zero production



In [11]:

    
# Destination path for the cleaned dataset; it is passed to df.to_csv in the
# last cell of this notebook.
output_file = "../data/cleaned_coalpublic2013.csv"



In [12]:

    
import pandas as pd
import numpy as np



In [13]:

    
# Load the raw 2013 coal spreadsheet.
#   header=2    -> column labels are taken from the row at 0-based position 2
#   index_col   -> the "MSHA ID" column becomes the row index
df = pd.read_excel(
    io="../data/coalpublic2013.xls",
    header=2,
    index_col="MSHA ID",
)



In [14]:

    
# Show the distinct labels present in the "Company Type" column so that
# inconsistent or misspelled categories can be spotted.
df['Company Type'].unique()









    Out[14]:





array([u'Indepedent Producer Operator', u'Operating Subsidiary',
       u'Contractor'], dtype=object)



In [15]:

    
# Rename the misspelled label "Indepedent" to "Independent" in Company Type.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['Company Type']: the in-place form operates
# on an intermediate Series, which pandas flags as chained assignment and
# which leaves df unchanged when Copy-on-Write is enabled.
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator')



In [16]:

    
# Normalise the column labels: every space becomes an underscore
# (e.g. "Company Type" -> "Company_Type").
df.rename(
    columns={label: label.replace(" ", "_") for label in df.columns},
    inplace=True,
)



In [17]:

    
# Keep only the coal mines that reported production: rows for which
# Production_(short_tons) > 0 is not true are dropped.
# .copy() makes the filtered result an independent DataFrame, so adding a
# column to it afterwards does not raise SettingWithCopyWarning.
df = df[df['Production_(short_tons)'] > 0].copy()



In [18]:

    
# Store the natural logarithm of production in a new column.
# Series.pipe(np.log) evaluates np.log on the whole column at once.
df['log_production'] = df['Production_(short_tons)'].pipe(np.log)



In [19]:

    
# Write the cleaned frame to the configured CSV path; with to_csv's default
# settings the row index (MSHA ID) is written as the first column.
df.to_csv(path_or_buf=output_file)