Data Cleaning

by: Bad Wizard

Cleaned up the data: corrected a misspelled company type, replaced spaces in column names with underscores, and removed coal mines with no production.



In [11]:

    
# Destination path for the cleaned dataset written at the end of the notebook.
output_file = "../data/cleaned_coalpublic2013.csv"



In [12]:

    
import pandas as pd
import numpy as np



In [13]:

    
# Load the raw spreadsheet: the column labels are taken from row index 2
# (header=2) and the column at position 1 becomes the row index (index_col=1).
df = pd.read_excel(
    '../data/coalpublic2013.xls',
    header=2,
    index_col=1,
)



In [14]:

    
# List the distinct company types. Note the misspelling in the raw data:
# "Indepedent" should read "Independent".
pd.unique(df['Company Type'])









    Out[14]:





array(['Indepedent Producer Operator', 'Operating Subsidiary', 'Contractor'], dtype=object)



In [15]:

    
# Correct the misspelled company type. The result is assigned back to the
# column instead of calling replace(..., inplace=True) on df['Company Type']:
# that in-place call operates on an intermediate Series, and pandas does not
# guarantee the change reaches df (it has no effect under Copy-on-Write).
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)



In [16]:

    
# Replace every space in the column labels with an underscore.
df = df.rename(columns=lambda label: label.replace(" ", "_"))



In [17]:

    
# NOTE: rows are dropped here!
# Only mines with strictly positive coal production are kept.
# An explicit copy is taken so that adding columns to the filtered frame later
# does not raise SettingWithCopyWarning.
has_production = df['Production_(short_tons)'] > 0
df = df.loc[has_production].copy()



In [18]:

    
# Number of rows remaining in the frame.
df.shape[0]









    Out[18]:





1061



In [19]:

    
# Add the natural log of production as a new column. assign() returns a new
# frame, which avoids the SettingWithCopyWarning that item assignment raises
# when df is the result of an earlier boolean-mask selection.
df = df.assign(log_production=np.log(df['Production_(short_tons)']))



In [20]:

    
# Write the cleaned frame (including its index) to the configured CSV path.
df.to_csv(path_or_buf=output_file)



In [ ]: