Q3



In [1]:

    
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt



In [2]:

    
# Load the semicolon-separated extract; the path is relative to the kernel's
# working directory.
data = pd.read_csv(
    'extract_medium.csv',
    sep=';',
)



In [3]:

    
# Show only the first row to inspect which columns are available.
data.head(n=1)









    Out[3]:






  
    
      
      State
      House_id
      Weight
      House_relation
      Sex
      Age
      Race
      Marriage
      Education
      Ancestry
      Language
      Employment_status
      Traveltime
      Industry
      Occupation
      Hours
      Weeks
      Salary
      Income
      Earnings
    
  
  
    
      0
       Arizona
       3399818
       18
       3
       2
       14
       8
       5
       4
       210
       0
       0
       0
       0
       0
       0
       0
       0
       0
       0



In [4]:

    
# Human-readable tick labels for the coded categorical columns. They are
# applied positionally to the heat-map axes, so their order has to follow the
# ascending numeric codes of the corresponding column.
# NOTE(review): the code-to-label mapping itself is assumed here -- confirm it
# against the data dictionary of the extract.
Genders = ['Male', 'Female']
Education = [
    'Not in universe (Under 3 years)',
    'No schooling completed',
    'Nursery school to 4th grade',
    '5th grade or 6th grade',
    '7th,8th grade',
    '9th grade',
    '10th grade',
    '11th grade',
    '12th grade,no diploma',
    'High school graduate',
    'college,less than 1 year',
    'college 1+ years, no degree',
    'Associate degree',
    "Bachelor's degree",  # was mistyped with a comma instead of an apostrophe
    "Master's degree",    # was mistyped with a period instead of an apostrophe
    'Professional degree',
    'Doctorate degree',
]
MarriageState = ['married', 'Widowed', 'Divorced', 'Separated', 'Never married']



In [77]:

    
# Mean Earnings for every (Sex, Education) combination, computed two
# equivalent ways. The aggregation is named by the string 'mean' because
# passing the NumPy callable to pandas aggregations is deprecated in recent
# pandas releases.
table = pd.pivot_table(
    data,
    values='Earnings',
    index=['Sex', 'Education'],
    aggfunc='mean',
)
# Same aggregation via groupby; the result is a Series indexed by
# (Sex, Education).
temp1 = data.groupby(['Sex', 'Education'])['Earnings'].mean()



In [78]:

    
# Display the grouped mean Earnings per (Sex, Education) pair.
temp1









    Out[78]:





Sex  Education
1    0                0.000000
     1             3504.784689
     2              690.859232
     3             5083.671988
     4             4073.961606
     5             9596.498516
     6            11185.848485
     7            11167.638889
     8            19404.356436
     9            24012.275826
     10           28201.210614
     11           28488.347335
     12           35081.001821
     13           53294.264282
     14           70755.173611
     15           94245.204918
     16           61467.676768
2    0                0.000000
     1              947.790323
     2              387.342995
     3             2771.055195
     4             2256.137405
     5             3326.657609
     6             4281.338798
     7             5003.164557
     8             9200.885781
     9            11515.832571
     10           16305.045514
     11           16949.317489
     12           21991.480263
     13           33193.713496
     14           44412.231834
     15           46821.961290
     16           41476.065574
Name: Earnings, dtype: float64



In [79]:

    
# Strip the index from the grouped means. np.asarray accepts both the grouped
# Series and an already-converted ndarray, so re-running this cell no longer
# fails the way `temp1.values` does once temp1 has become an array.
temp1 = np.asarray(temp1)
# Lay the 34 means out as a 2 x 17 grid (rows: Sex codes, columns: Education
# codes). The row-major reshape relies on the values being ordered by Sex and
# then Education, with every combination present.
test = temp1.reshape(2, 17)



In [80]:

    
# Sanity check of the heat-map grid: expected to be (2, 17).
test.shape









    Out[80]:





(2, 17)



In [82]:

    
# Heat map of mean Earnings: one row per Sex code, one column per Education
# code, darker red meaning higher average earnings.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.pcolor(test, cmap=plt.cm.Reds, vmin=np.min(test), vmax=np.max(test))

# pcolor draws cell (row i, column j) over x in [j, j+1] and y in [i, i+1],
# so the centre of each cell is at index + 0.5. Ticks are placed there so
# that every label lines up with its own cell rather than with a cell edge.
n_rows, n_cols = test.shape
ax.set_yticks([row + 0.5 for row in range(n_rows)])
ax.set_yticklabels(Genders)
ax.set_xticks([col + 0.5 for col in range(n_cols)])
ax.set_xticklabels(Education)
for tick in ax.get_xticklabels():
    tick.set_rotation(90)

# Leave room below the axes for the rotated education labels.
fig.subplots_adjust(bottom=0.20)
ax.set_title('Heat map of average Earnings Gender Vs Education')
plt.show()
plt.close(fig)

Q4



In [68]:

    
# Mean Earnings for every (Sex, Marriage) combination. The aggregation is
# named by the string 'mean' because passing the NumPy callable to pandas
# aggregations is deprecated in recent pandas releases.
table = pd.pivot_table(
    data,
    values='Earnings',
    index=['Sex', 'Marriage'],
    aggfunc='mean',
)
# Drop the index and lay the 10 means out as a 2 x 5 grid (rows: Sex codes,
# columns: Marriage codes). The row-major reshape relies on the values being
# ordered by Sex and then Marriage, with every combination present.
table = table.values
# NOTE: this reuses the name `test` and replaces the grid from the previous
# question.
test = table.reshape(2, 5)



In [74]:

    
# Heat map of mean Earnings: one row per Sex code, one column per Marriage
# code, darker red meaning higher average earnings.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.pcolor(test, cmap=plt.cm.Reds, vmin=np.min(test), vmax=np.max(test))

# pcolor draws cell (row i, column j) over x in [j, j+1] and y in [i, i+1],
# so the centre of each cell is at index + 0.5. One tick per cell is needed:
# the previous range(6) produced six ticks for five labels, which current
# matplotlib rejects with a ValueError.
n_rows, n_cols = test.shape
ax.set_yticks([row + 0.5 for row in range(n_rows)])
ax.set_yticklabels(Genders)
ax.set_xticks([col + 0.5 for col in range(n_cols)])
ax.set_xticklabels(MarriageState)
for tick in ax.get_xticklabels():
    tick.set_rotation(45)

# Leave room below the axes for the rotated marriage-status labels.
fig.subplots_adjust(bottom=0.20)
ax.set_title('Heat map of average Earnings Gender Vs Marriage')
plt.show()
plt.close(fig)

Q5



In [35]:

    
# Mean Earnings for every (Sex, Marriage, Hours) combination. The aggregation
# is named by the string 'mean' because passing the NumPy callable to pandas
# aggregations is deprecated in recent pandas releases.
table = pd.pivot_table(
    data,
    values='Earnings',
    index=['Sex', 'Marriage', 'Hours'],
    aggfunc='mean',
)



In [36]:

    
# Inspect what kind of object pivot_table returned (a Series in the recorded
# run). NOTE(review): newer pandas versions are expected to return a DataFrame
# for this call -- confirm for the installed version.
type(table)









    Out[36]:





pandas.core.series.Series