Data cleaning

by: Simon



In [1]:

    
# Destination path for the cleaned dataset; used by the final df.to_csv(...) call.
# NOTE(review): the data is written with to_csv, so the file contains CSV text
# even though its name ends in ".xls" — confirm what downstream readers expect
# before renaming it to ".csv".
output_file = "../data/cleaned_coalpublic2013.xls"



In [2]:

    
import pandas as pd
import numpy as np



In [5]:

    
# Load the raw workbook: column names are taken from sheet row index 2
# (the third row), and the "MSHA ID" column becomes the row index.
df = pd.read_excel(
    "../data/coalpublic2013.xls",
    index_col="MSHA ID",
    header=2,
)



In [6]:

    
# Show the distinct labels in the "Company Type" column (in order of first
# appearance) so misspelled categories are easy to spot.
pd.unique(df["Company Type"])









    Out[6]:





array([u'Indepedent Producer Operator', u'Operating Subsidiary',
       u'Contractor'], dtype=object)



In [7]:

    
# Correct the misspelled "Indepedent" category label found in the raw data.
# The cleaned column is assigned back explicitly: calling
# .replace(..., inplace=True) on the Series returned by df[...] is a chained
# in-place update, which is not guaranteed to write through to df (and never
# does when pandas Copy-on-Write is enabled).
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)



In [8]:

    
# Replace spaces with underscores in every column name, e.g.
# "Company Type" -> "Company_Type".
df = df.rename(columns=lambda name: name.replace(" ", "_"))



In [9]:

    
# Keep only rows with strictly positive production; the logarithm taken in a
# later cell is undefined for zero or negative values.
# .copy() makes the filtered frame independent of the frame it was selected
# from, so adding a column to it afterwards does not raise
# SettingWithCopyWarning.
df = df[df["Production_(short_tons)"] > 0].copy()



In [10]:

    
# Number of rows remaining after the positive-production filter.
df.shape[0]









    Out[10]:





1061



In [11]:

    
# Add the natural logarithm of production as a new "log_production" column.
df["log_production"] = df["Production_(short_tons)"].apply(np.log)



In [12]:

    
# Write the cleaned frame as CSV text to output_file; to_csv includes the
# row index as the first column by default.
df.to_csv(path_or_buf=output_file)