In [47]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()
In [10]:
# Load the raw coal spreadsheet. Column names are taken from the row at
# zero-based position 2, and the "MSHA ID" column becomes the row index.
df = pd.read_excel(
    "../data/coalpublic2013.xls",
    header=2,
    index_col="MSHA ID",
)
In [11]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)
Out[11]:
In [12]:
# List the distinct labels in the "Company Type" column to spot bad values.
df.loc[:, 'Company Type'].unique()
Out[12]:
In [18]:
# Correct the misspelled category label ("Indepedent" -> "Independent").
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Company Type']: that chained in-place call
# acts on an intermediate Series, which pandas warns about and which leaves
# df unchanged when Copy-on-Write is enabled.
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)
In [19]:
# Re-check the first five rows after correcting the company-type label.
df.head(n=5)
Out[19]:
In [24]:
# Swap spaces for underscores in every column name so that columns can be
# reached with attribute access (e.g. df.Labor_Hours) further down.
df = df.rename(columns=lambda label: label.replace(" ", "_"))
In [25]:
# Confirm the renamed columns by viewing the first five rows.
df.head(n=5)
Out[25]:
In [26]:
# Number of rows currently in the frame.
df.shape[0]
Out[26]:
In [29]:
# Scatter of average employee count against total labor hours.
plt.scatter(x=df['Average_Employees'], y=df['Labor_Hours'])
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked")
Out[29]:
In [31]:
# Scatter plus fitted regression line of labor hours against employee count.
# x and y are passed by keyword: newer seaborn releases treat the first
# positional argument as `data` and require x/y to be keyword arguments, so
# the positional form fails there. The keyword form also works on older
# releases.
sns.regplot(x=df.Average_Employees, y=df.Labor_Hours)
# Save the current figure to disk alongside the other dated figures.
plt.savefig("../figures/2017-09-03" + "-employees_vs_hours.png")
In [32]:
# Print every column name, one per line. print(...) with a single argument
# behaves the same on Python 2 and Python 3, unlike the statement form
# `print column`, which is a SyntaxError on Python 3.
for column in df.columns:
    print(column)
In [34]:
# Scatter of total labor hours against production (short tons).
plt.scatter(x=df['Labor_Hours'], y=df['Production_(short_tons)'])
Out[34]:
In [36]:
# Histogram of production before any rows are filtered out.
df.loc[:, 'Production_(short_tons)'].hist()
Out[36]:
In [45]:
# Keep only the rows whose production is strictly positive (rows with
# Production <= 0 are dropped). The explicit .copy() makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df['Production_(short_tons)'] > 0].copy()
In [46]:
# Histogram of production after the non-positive rows were removed.
df.loc[:, 'Production_(short_tons)'].hist()
Out[46]:
In [48]:
# Store the natural logarithm of production in a new 'log_production' column.
df['log_production'] = np.log(df.loc[:, 'Production_(short_tons)'])
In [51]:
# Histogram of the log-transformed production; the author's note describes
# production as following a log-normal distribution.
df.loc[:, 'log_production'].hist()
Out[51]:
In [55]:
# Write the cleaned frame (including its index) to CSV for later notebooks.
df.to_csv(path_or_buf="../data/cleaned_coalpublic2013.csv")
In [ ]: