In [30]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set();
In [3]:
# Load the coalpublic2013 spreadsheet. Column names are taken from the third
# row of the sheet (header=2 is zero-based) and rows are indexed by 'MSHA ID'.
df = pd.read_excel(
    io="../data/coalpublic2013.xls",
    index_col='MSHA ID',
    header=2,
)
In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)
Out[4]:
In [5]:
# List the distinct labels found in the 'Company Type' column.
df.loc[:, 'Company Type'].unique()
Out[5]:
In [7]:
# Correct the misspelled label 'Indepedent Producer Operator'.
# The result is assigned back to the column rather than using
# replace(inplace=True) on df['Company Type']: the in-place form acts on an
# intermediate Series, which pandas does not guarantee to be a view of df
# (with copy-on-write enabled it never updates df at all).
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)
In [8]:
# Show the first five 'Company Type' values.
df.loc[:, 'Company Type'].head(n=5)
Out[8]:
In [9]:
# Replace spaces in every column name with underscores so that columns can be
# reached with attribute access (e.g. df.Average_Employees).
df = df.rename(columns=lambda name: name.replace(" ", "_"))
In [11]:
# Preview the first five rows after the column rename.
df.head(n=5)
Out[11]:
In [12]:
# Number of rows currently in the frame.
df.shape[0]
Out[12]:
In [14]:
# Scatter plot of labor hours against average employee count.
plt.scatter(x=df['Average_Employees'], y=df['Labor_Hours'])
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked")
Out[14]:
In [17]:
# Draw the scatter with a fitted regression line and save the figure to disk.
# x and y are passed by keyword: current seaborn releases accept them only as
# keyword arguments, so the positional form raises a TypeError.
sns.regplot(x=df.Average_Employees, y=df.Labor_Hours)
plt.savefig("../figures/2015-11-21" + "-employees_vs_hours.png")
In [19]:
# Print every column name, one per line.
# print(...) is written as a function call so the cell runs on Python 3
# (the statement form `print column` is a SyntaxError there).
for column in df.columns:
    print(column)
In [20]:
# Scatter plot of production (short tons) against labor hours.
plt.scatter(x=df['Labor_Hours'], y=df['Production_(short_tons)'])
Out[20]:
In [21]:
# Histogram of production, using the default of 10 bins.
df.loc[:, 'Production_(short_tons)'].hist(bins=10)
Out[21]:
In [26]:
# We are removing data here!
# Keep only the rows whose production is strictly positive. The explicit
# .copy() makes the filtered result an independent frame, so adding a column
# to it afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['Production_(short_tons)'] > 0].copy()
In [27]:
# Number of rows left after the filter.
df.shape[0]
Out[27]:
In [31]:
# Store the natural logarithm of production in a new 'log_production' column.
df['log_production'] = df['Production_(short_tons)'].pipe(np.log)
In [32]:
# Histogram of the log-transformed production, using the default of 10 bins.
df['log_production'].hist(bins=10)
Out[32]:
In [33]:
# Write the cleaned frame, index included, to a CSV file.
df.to_csv(path_or_buf="../data/cleaned_coalpublic2013.csv", index=True)
In [ ]: