Data cleaning

by: Jonathan

Cleaned up the data -- removed coal mines with zero production.


In [14]:
# Destination of the cleaned table; the final cell passes it to `df.to_csv`.
output_file = '../data/cleaned_coalpublic2013.csv'

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw spreadsheet: `header=2` takes the column labels from the third
# row of the sheet, and the 'MSHA ID' column becomes the row index.
df = pd.read_excel(
    io="../data/coalpublic2013.xls",
    header=2,
    index_col='MSHA ID',
)

In [6]:
# List the distinct company types. The raw data has a spelling mistake here
# ("Indepedent" instead of "Independent"), which the next cell corrects.
pd.unique(df['Company Type'])


Out[6]:
array(['Independent Producer Operator', u'Operating Subsidiary',
       u'Contractor'], dtype=object)

In [7]:
# Correct the misspelled company type ("Indepedent" -> "Independent").
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df['Company Type']`: that in-place call acts
# on an intermediate Series, which under pandas Copy-on-Write leaves `df`
# itself unchanged (and raises a chained-assignment warning in pandas 2.x).
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)

In [8]:
# Replace every space in the column labels with an underscore
# (e.g. 'Company Type' -> 'Company_Type').
df = df.rename(columns=lambda label: label.replace(" ", "_"))

In [10]:
# We are removing data here!
# Coal mines without ANY coal production are removed.
# `.copy()` makes the filtered result an independent frame, so the column added
# to it in a later cell does not trigger pandas' SettingWithCopyWarning.
df = df[df['Production_(short_tons)'] > 0].copy()

In [11]:
# Number of rows (mines) left after the zero-production filter.
df.shape[0]


Out[11]:
1061

In [12]:
# Add the natural log of production as a new column. Rows with production <= 0
# were filtered out earlier, so the log is finite for every remaining row.
production = df['Production_(short_tons)']
df['log_production'] = np.log(production)

In [15]:
# Persist the cleaned frame (index included) to the path held in `output_file`.
df.to_csv(path_or_buf=output_file)

In [ ]: