In [11]:
# Destination path for the cleaned dataset that this notebook writes out.
output_file = "../data/cleaned_coalpublic2013.csv"
In [12]:
import pandas as pd
import numpy as np
In [13]:
# Load the raw spreadsheet.
#   header=2    -> column labels come from the row at 0-based position 2
#   index_col=1 -> the column at 0-based position 1 becomes the row index
df = pd.read_excel(
    '../data/coalpublic2013.xls',
    header=2,
    index_col=1,
)
In [14]:
# Inspect the distinct company types to spot data-entry errors
# (the raw data spells "Independent" as "Indepedent").
df.loc[:, 'Company Type'].unique()
Out[14]:
In [15]:
# Correct the misspelled category 'Indepedent Producer Operator'.
#
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['Company Type']: that form mutates an
# intermediate Series (chained assignment). It emits a warning on pandas 2.x
# and, with Copy-on-Write enabled, leaves `df` unchanged, so the fix would
# silently be lost.
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)
In [16]:
# Replace every space in the column labels with an underscore,
# modifying `df` in place.
df.rename(
    lambda label: label.replace(" ", "_"),
    axis="columns",
    inplace=True,
)
In [17]:
# NOTE: rows are dropped here!
# Only mines with strictly positive coal production are kept.
has_production = df['Production_(short_tons)'].gt(0)
df = df.loc[has_production]
In [18]:
# Number of rows remaining in the frame.
df.shape[0]
Out[18]:
In [19]:
# Add the natural logarithm of production as a new column.
df['log_production'] = df['Production_(short_tons)'].pipe(np.log)
In [20]:
# Write the cleaned frame (index included) to the configured output path.
df.to_csv(path_or_buf=output_file)
In [ ]: