In [4]:
import pandas as pd
import numpy as np
import os
import sys
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.cluster import KMeans
# Apply seaborn's "darkgrid" style globally; it affects every figure drawn after this call.
sns.set(style="darkgrid")
In [30]:
# Read the raw file. header=None means no row is consumed as a header, so the
# columns are addressed by integer position (0, 1, 2, ...) in the cells below.
data_path = 'adult.data.txt'
ad_data = pd.read_csv(data_path, header=None)
# (rows, columns) of the loaded frame.
ad_data.shape
Out[30]:
In [31]:
# Preview the first rows of the raw frame. The original cell ended in a
# dangling attribute access ("ad_data."), which is a SyntaxError and stops
# the notebook from running top to bottom.
ad_data.head()
Out[31]:
In [52]:
# Column 14 is a string column (presumably the '>50K' / '<=50K' income label
# -- TODO confirm). A row is flagged True when that string contains '>'.
income_label = ad_data[14]
ad_data['annual_revenue_above_50k'] = income_label.str.contains('>')
# Show the new boolean column.
ad_data['annual_revenue_above_50k']
Out[52]:
In [54]:
# One histogram of the above-50k flag per distinct value of column 9
# (presumably gender, going by the variable names used later -- TODO confirm).
above_50k_flag = ad_data['annual_revenue_above_50k']
gender_column = ad_data[9]
above_50k_flag.hist(by=gender_column)
plt.show()
In [107]:
# Share of rows flagged above 50k within each value of column 9.
groups_gender = ad_data.groupby(9)['annual_revenue_above_50k']
rate_by_gender = groups_gender.mean()
# Unpacking iterates the per-group means in groupby's sorted key order.
# NOTE(review): this assumes exactly two groups whose sorted order is
# (female, male) -- confirm against the actual labels in column 9.
female_prior, male_prior = rate_by_gender
Out[107]:
In [94]:
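# Disabled scratch code: would draw 300 Bernoulli (n=1) samples with success probability male_prior.
#np.random.binomial(1, male_prior, size=300)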
In [101]:
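# Disabled scratch code: would draw 100 Bernoulli (n=1) samples with success probability female_prior.
#np.random.binomial(1, female_prior, size=100)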
In [103]:
# One histogram of the above-50k flag per distinct value of column 3
# (presumably education level -- TODO confirm); the large figure keeps the
# many panels readable.
above_50k_flag = ad_data['annual_revenue_above_50k']
education_column = ad_data[3]
above_50k_flag.hist(by=education_column, figsize=(20, 20))
plt.show()
In [108]:
# Share of rows flagged above 50k within each value of column 3.
by_education = ad_data.groupby(3)
groups_education = by_education['annual_revenue_above_50k']
results = groups_education.mean()
# Display the per-group rates.
results
Out[108]:
In [114]:
# Empirical share of rows flagged above 50k within each (column 3, column 9)
# group, i.e. presumably each (education, gender) combination.
#
# The earlier version multiplied the per-gender rate by the per-education
# rate. A product of two marginal rates is not the rate within the combined
# group, and it can never exceed either factor, so it systematically
# understated every bar in the plots that follow.
rate_by_education_and_gender = (
    ad_data.groupby(by=[3, 9])['annual_revenue_above_50k']
    .mean()
    .unstack()  # rows: values of column 3, columns: values of column 9
)
# Columns come out in sorted key order. NOTE(review): like the unpacking of
# the per-gender means, this assumes exactly two values in column 9 whose
# sorted order is (female, male) -- confirm against the data.
female_label, male_label = rate_by_education_and_gender.columns
# Both results stay Series indexed by the values of column 3, as before.
male_with_education_prior = rate_by_education_and_gender[male_label]
female_with_education_prior = rate_by_education_and_gender[female_label]
In [122]:
# Bar chart with one bar per index entry of the male series.
male_ax = male_with_education_prior.plot.bar()
male_ax.set_title('Male prior to earn more than 50k per year')
plt.show()
In [123]:
# Bar chart with one bar per index entry of the female series.
female_ax = female_with_education_prior.plot.bar()
female_ax.set_title('Female prior to earn more than 50k per year')
plt.show()