Data cleaning

by: Simon


In [1]:
# Destination path for the cleaned dataset; used by the export cell at the end
# of this notebook.
# NOTE(review): the name ends in ".xls", but the export is done with
# DataFrame.to_csv, so the file will contain CSV text. Consider renaming to
# ".csv" after checking anything downstream that reads this path.
output_file = "../data/cleaned_coalpublic2013.xls"

In [2]:
import pandas as pd
import numpy as np

In [5]:
# Load the raw spreadsheet. header=2 takes the column names from the row at
# zero-based position 2, and the 'MSHA ID' column becomes the row index.
df = pd.read_excel(
    "../data/coalpublic2013.xls",
    header=2,
    index_col='MSHA ID',
)

In [6]:
# List the distinct company-type labels to spot spelling problems in the data.
df.loc[:, 'Company Type'].unique()


Out[6]:
array([u'Indepedent Producer Operator', u'Operating Subsidiary',
       u'Contractor'], dtype=object)

In [7]:
# Correct the misspelled 'Indepedent' label found in the raw data.
# The corrected Series is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Company Type']: an in-place call on a
# selected column is chained assignment, which warns on recent pandas and
# leaves df unchanged when Copy-on-Write is enabled.
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)

In [8]:
# Replace spaces in every column label with underscores so the names are
# easier to reference later. Rebinding df (rather than inplace=True) keeps the
# cell a plain "input -> output" step.
df = df.rename(columns=lambda label: label.replace(" ", "_"))

In [9]:
# Keep only rows with strictly positive production; rows where the comparison
# is False are dropped.
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df = df[df["Production_(short_tons)"] > 0].copy()

In [10]:
# Number of rows currently in df.
df.shape[0]


Out[10]:
1061

In [11]:
# Add the natural log of production as a new 'log_production' column.
# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# direct column assignment raises when df is the result of a row filter.
df = df.assign(log_production=np.log(df["Production_(short_tons)"]))

In [12]:
# Write the cleaned frame (index included) to the configured output path.
# to_csv emits CSV text whatever extension the path carries.
df.to_csv(path_or_buf=output_file)