In [65]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Apply seaborn's default plot styling; the trailing ';' suppresses the cell's output.
sns.set();
In [16]:
# Load the 2013 public coal spreadsheet. Sheet row index 2 supplies the column
# names (header=2) and the column at position 1 becomes the index (index_col=1).
df = pd.read_excel(
    '../data/coalpublic2013.xls',
    header=2,
    index_col=1,
)
# Preview the first rows.
df.head()
Out[16]:
In [17]:
# Show the distinct values found in the 'Company Type' column.
df['Company Type'].unique()
Out[17]:
In [18]:
# Correct the misspelled label 'Indepedent Producer Operator' in 'Company Type'.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on `df[...]`: an in-place method on a selected
# column is chained assignment, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator',
)
In [19]:
# Preview the first rows of the 'Company Type' column.
df['Company Type'].head()
Out[19]:
In [40]:
# Swap every space in the column names for an underscore.
df = df.rename(columns=lambda name: name.replace(" ", "_"))
In [41]:
# Display the column labels of df.
df.columns
Out[41]:
In [39]:
# Build a separate frame from a copy of df, with 'Mine' replaced by 'Yours'
# in every column name; df itself is not modified.
df1 = df.copy().rename(
    columns=lambda name: name.replace('Mine', 'Yours'),
)
df1.head()
Out[39]:
In [32]:
# (number of rows, number of columns) of df.
df.shape
Out[32]:
In [35]:
# Scatter plot: Average_Employees on the x axis, Labor_Hours on the y axis.
plt.scatter(x=df['Average_Employees'], y=df['Labor_Hours'])
plt.xlabel('Number of employees')
plt.ylabel('Total Hours Worked')
Out[35]:
In [37]:
# Regression plot of Labor_Hours against Average_Employees, saved to disk.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` as a
# positional argument, so the positional form raises TypeError there.
sns.regplot(x=df.Average_Employees, y=df.Labor_Hours)
plt.savefig("../figures/2016-04-05" + "-employees_vs_hours.png")
In [42]:
# Print each column name on its own line.
# The loop body must be indented; without it the cell is an IndentationError.
for column in df.columns:
    print(column)
In [43]:
# Scatter plot: Labor_Hours on the x axis, Production_(short_tons) on the y axis.
plt.scatter(x=df['Labor_Hours'], y=df['Production_(short_tons)'])
Out[43]:
In [44]:
# Histogram of the 'Production_(short_tons)' column.
df['Production_(short_tons)'].hist()
Out[44]:
In [45]:
# Smallest production value. Series.min() is vectorized and skips NaN, whereas
# the builtin min() iterates in Python and returns an order-dependent result
# when NaN values are present.
df['Production_(short_tons)'].min()
Out[45]:
In [61]:
# Shape of the subset of rows whose production is exactly 0.
zero_production = df['Production_(short_tons)'] == 0
df[zero_production].shape
Out[61]:
In [62]:
# Display the rows whose production is strictly greater than 0.
has_production = df['Production_(short_tons)'] > 0
df[has_production]
Out[62]:
In [63]:
# WARNING: we are removing data here!
# Keep only rows with strictly positive production; rows with zero, negative
# or missing production are dropped. `.copy()` makes df an independent frame,
# so assigning new columns to it afterwards does not raise
# SettingWithCopyWarning.
df = df[df['Production_(short_tons)'] > 0].copy()
In [66]:
# Store the natural logarithm of production in a new 'log_production' column.
production = df['Production_(short_tons)']
df['log_production'] = np.log(production)
In [67]:
# Histogram of the 'log_production' column.
df['log_production'].hist()
Out[67]:
In [68]:
# Write the current df (index included) to a CSV file.
df.to_csv(path_or_buf='../data/cleaned_coalpublic2013.csv')
In [ ]: