Q3


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Load the semicolon-delimited extract into a DataFrame.
data = pd.read_csv(
    'extract_medium.csv',
    sep=';',
)

In [3]:
# Peek at the first record to check the column layout.
data.head(n=1)


Out[3]:
State House_id Weight House_relation Sex Age Race Marriage Education Ancestry Language Employment_status Traveltime Industry Occupation Hours Weeks Salary Income Earnings
0 Arizona 3399818 18 3 2 14 8 5 4 210 0 0 0 0 0 0 0 0 0 0

In [4]:
# Human-readable tick labels for the coded columns used in the heat maps.
# Each list is ordered so that its position matches the order of the
# corresponding codes in the grouped results.
Genders = ['Male', 'Female']

# 17 education levels (codes 0-16 in the grouped output).
Education = [
    'Not in universe (Under 3 years)',
    'No schooling completed',
    'Nursery school to 4th grade',
    '5th grade or 6th grade',
    '7th,8th grade',
    '9th grade',
    '10th grade',
    '11th grade',
    '12th grade,no diploma',
    'High school graduate',
    'college,less than 1 year',
    'college 1+ years, no degree',
    'Associate degree',
    "Bachelor's degree",   # was 'Bachelor,s degree' (mistyped apostrophe)
    "Master's degree",     # was 'Master.s degree' (mistyped apostrophe)
    'Professional degree',
    'Doctorate degree',
]

MarriageState = ['married', 'Widowed', 'Divorced', 'Separated', 'Never married']

In [77]:
# Mean earnings for every (Sex, Education) pair, written two equivalent ways.
# The string alias 'mean' selects pandas' built-in mean; recent pandas
# releases emit a FutureWarning when the np.mean callable is passed instead.
table = pd.pivot_table(data, values='Earnings', index=['Sex', 'Education'], aggfunc='mean')
# Alternative: the same aggregation expressed as a groupby.
temp1 = data.groupby(['Sex', 'Education']).Earnings.mean()

In [78]:
# Display the grouped mean earnings, indexed by (Sex, Education).
temp1


Out[78]:
Sex  Education
1    0                0.000000
     1             3504.784689
     2              690.859232
     3             5083.671988
     4             4073.961606
     5             9596.498516
     6            11185.848485
     7            11167.638889
     8            19404.356436
     9            24012.275826
     10           28201.210614
     11           28488.347335
     12           35081.001821
     13           53294.264282
     14           70755.173611
     15           94245.204918
     16           61467.676768
2    0                0.000000
     1              947.790323
     2              387.342995
     3             2771.055195
     4             2256.137405
     5             3326.657609
     6             4281.338798
     7             5003.164557
     8             9200.885781
     9            11515.832571
     10           16305.045514
     11           16949.317489
     12           21991.480263
     13           33193.713496
     14           44412.231834
     15           46821.961290
     16           41476.065574
Name: Earnings, dtype: float64

In [79]:
# Flatten the (Sex, Education) means into a plain array and fold them into a
# Sex-by-Education grid (one row per gender) for the heat map.
# np.asarray accepts both the original Series and an already-converted
# ndarray, so re-running this cell no longer fails the way `.values` does
# once temp1 has been rebound to an ndarray.
temp1 = np.asarray(temp1)
# Tie the grid shape to the label lists so the data and tick labels cannot
# drift apart silently; a mismatch raises in reshape.
test = temp1.reshape(len(Genders), len(Education))

In [80]:
# Confirm the grid has one row per gender and one column per education level.
np.shape(test)


Out[80]:
(2, 17)

In [82]:
# Heat map of mean earnings: one row per gender, one column per education level.
fig, ax = plt.subplots()
ax.pcolor(test, cmap=plt.cm.Reds, vmin=np.min(test), vmax=np.max(test))

# pcolor draws cell (row i, column j) over x in [j, j+1] and y in [i, i+1],
# so the cell centres are at k + 0.5. Putting the ticks there lines every
# label up with its own cell instead of with a cell boundary.
ax.set_yticks(np.arange(len(Genders)) + 0.5)
ax.set_yticklabels(Genders)
ax.set_xticks(np.arange(len(Education)) + 0.5)
ax.set_xticklabels(Education, rotation=90)

# Leave room under the axes for the long, vertical education labels.
fig.subplots_adjust(bottom=0.20)
ax.set_title('Heat map of average Earnings Gender Vs Education')
plt.show()
plt.close(fig)


Q4


In [68]:
# Mean earnings for every (Sex, Marriage) pair, folded into a
# Sex-by-Marriage grid (one row per gender) for the heat map.
# The string alias 'mean' selects pandas' built-in mean; recent pandas
# releases emit a FutureWarning when the np.mean callable is passed instead.
table = pd.pivot_table(data, values='Earnings', index=['Sex', 'Marriage'], aggfunc='mean')
table = table.values
# Tie the grid shape to the label lists so a category mismatch raises here
# rather than producing a mislabelled plot.
test = table.reshape(len(Genders), len(MarriageState))

In [74]:
# Heat map of mean earnings: one row per gender, one column per marital status.
fig, ax = plt.subplots()
ax.pcolor(test, cmap=plt.cm.Reds, vmin=np.min(test), vmax=np.max(test))

# pcolor draws cell (row i, column j) over x in [j, j+1] and y in [i, i+1],
# so the cell centres are at k + 0.5. One tick per label, placed at the
# centre of its cell (the previous range(6) produced six ticks for five
# labels, which current matplotlib rejects).
ax.set_yticks(np.arange(len(Genders)) + 0.5)
ax.set_yticklabels(Genders)
ax.set_xticks(np.arange(len(MarriageState)) + 0.5)
ax.set_xticklabels(MarriageState, rotation=45)

# Leave room under the axes for the slanted labels.
fig.subplots_adjust(bottom=0.20)
ax.set_title('Heat map of average Earnings Gender Vs Marriage')
plt.show()
plt.close(fig)


Q5


In [35]:
# Mean earnings for every (Sex, Marriage, Hours) combination.
# The string alias 'mean' selects pandas' built-in mean; recent pandas
# releases emit a FutureWarning when the np.mean callable is passed instead.
table = pd.pivot_table(data, values='Earnings', index=['Sex', 'Marriage', 'Hours'], aggfunc='mean')

In [36]:
# Inspect which container type pivot_table returned for the three-level index.
type(table)


Out[36]:
pandas.core.series.Series