In [0]:
import pandas as pd
import numpy as np

In [0]:
# Source: UCI Machine Learning Repository, "Adult" dataset
# https://archive.ics.uci.edu/ml/datasets/Adult

"""
age: continuous.
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
fnlwgt: continuous.
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
education-num: continuous.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
sex: Female, Male.
capital-gain: continuous.
capital-loss: continuous.
hours-per-week: continuous.
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
"""

# Labels assigned to the file's columns, in file order. Because `names` is
# passed explicitly, pandas treats the first line of the file as data.
adult_columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    '<=50k',
]

df = pd.read_csv('../data/adult.data', names=adult_columns)

In [0]:
# Seed NumPy's global generator so the random masks drawn here -- and any
# later np.random draw executed in order after this cell -- are reproducible
# across "Restart Kernel -> Run All".
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# One Bernoulli(0.2) draw per row: 1 marks the row for the test set,
# 0 for the training set (see the split cell below).
splitter = np.random.binomial(1, 0.2, size=len(df))

# Take 10% of women and 80% of men.
# The 0/1 draws are cast to bool explicitly so that `&` / `|` are plain
# element-wise logical operations between boolean operands.
# NOTE(review): the labels are matched with a leading space (' Female');
# rows whose `sex` value matches neither label are never selected.
is_female = df['sex'] == ' Female'
is_male = df['sex'] == ' Male'
biased_splitter = (
        (np.random.binomial(1, 0.1, size=len(df)).astype(bool) & is_female) |
        (np.random.binomial(1, 0.8, size=len(df)).astype(bool) & is_male)
    )

In [0]:
# Overall split: rows whose `splitter` draw is 1 go to test, the rest to train.
in_test = np.asarray(splitter, dtype=bool)
df_train = df[~in_test]
df_test = df[in_test]

# Partition the full frame with the biased mask: b_1 holds the selected rows,
# b_2 holds everything else.
in_b_1 = biased_splitter.astype(bool)
df_b_1 = df[in_b_1]
df_b_2 = df[~in_b_1]

# Each partition gets its own Bernoulli(0.2) train/test draw (1 = test).
# NOTE(review): these draws are independent of `splitter`, so a row in
# df_b_1_test / df_b_2_test may also be present in df_train -- confirm that
# this overlap is intended before evaluating across splits.
b_1_splitter = np.random.binomial(1, 0.2, size=len(df_b_1))
df_b_1_train, df_b_1_test = df_b_1[b_1_splitter == 0], df_b_1[b_1_splitter == 1]

b_2_splitter = np.random.binomial(1, 0.2, size=len(df_b_2))
df_b_2_train, df_b_2_test = df_b_2[b_2_splitter == 0], df_b_2[b_2_splitter == 1]

In [0]:
# Print the (rows, columns) shape of every split, one per line, in the order
# the frames were built.
frames_in_order = (
    df_train,
    df_test,
    df_b_1,
    df_b_1_train,
    df_b_1_test,
    df_b_2,
    df_b_2_train,
    df_b_2_test,
)
for split_frame in frames_in_order:
    print(split_frame.shape)

In [0]:
# Write each train/test split to its own CSV file; index=False keeps the
# DataFrame row index out of the output.
csv_outputs = [
    ('../data/adult.data.train.csv', df_train),
    ('../data/adult.data.test.csv', df_test),
    ('../data/adult.data.b_1_train.csv', df_b_1_train),
    ('../data/adult.data.b_1_test.csv', df_b_1_test),
    ('../data/adult.data.b_2_train.csv', df_b_2_train),
    ('../data/adult.data.b_2_test.csv', df_b_2_test),
]
for csv_path, split_frame in csv_outputs:
    split_frame.to_csv(csv_path, index=False)

In [0]: