In [1]:
import pandas as pd
import numpy as np
pd.set_option('precision', 2)
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.max_colwidth = 100

In [2]:
# helper function to clean variables
def strip_spaces(x):
    try:
        clean_x = x.strip()
        return clean_x
    except:
        return x

In [3]:
# load data via pandas 
df = pd.read_excel("./tab9-6.xlsx" , header=2)
# clean up column names
df.columns = [col.strip() for col in df.columns]
# keep only the data we need for the visualization
df = df.iloc[6:47,:]

In [4]:
# strip white spaces
df['Highest degree and occupation'] = df['Highest degree and occupation'].apply(lambda x: strip_spaces(x))

# create some lists to make the data preprocessing easier
rows_to_clean = df[df['Highest degree and occupation'] == 'Postsecondary teacher']['Highest degree and occupation'].index.tolist()
science_categories = ['Biological/life scientist', 'Computer and information scientist', 'Mathematical scientist', 'Physical scientist', 'Psychologist', 'Social scientist', 'Engineering occupations']

In [5]:
# rename some of the variables with names that arent visualization friendly 
for row, replacement_val in zip(rows_to_clean, science_categories):
    df.loc[row, 'Highest degree and occupation'] = replacement_val + ' professor'

In [6]:
# clean up dataframe by dropping rows with missing data
df = df.dropna(subset=['Highest degree and occupation'])
df = df.dropna(axis=1,how='all')
df = df.dropna(axis=0,thresh=5)

In [7]:
# breakdown = ['S&E-related occupations', 'Non-S&E occupations']

In [8]:
# create occupation categories from the data
occupations = []
label = 'HEADER'
for row in df['Highest degree and occupation']:
    if row in science_categories:
        label = row
        occupations.append(label)
    else:
        occupations.append(label)
        
df['occupation'] = occupations

In [9]:
# refacoting notes and assumptions
# * = estimate < 500.
# D = suppressed for data confidentiality reasons. 
# S = suppressed for reliability; coefficient of variation exceeds publication standards.

# refactor * to 500
df = df.replace('*', 500)
# refactor supressed data to 100
df = df.replace('D', 100)
# refactor supressed data to np.nan
df = df.replace('S', np.nan)
# fill missing values as 0
df = df.fillna(0)
# add sequential count to keep track of headers
df["sequential_count"] = df.groupby("occupation").cumcount() + 1

In [10]:
# rename columns
df.columns = ['Highest degree and occupation',
              'All',
              'Hispanic or Latino',
              'American Indian or Alaska Native',
              'Asian',
              'Black or African American',
              'Native Hawaiian or Other Pacific Islander',
              'White',
              'More than one race',
              'occupation',
              'sequential_count']

In [11]:
# remove subset header -- it contains aggerated infomation per variable
viz_df = df[df['sequential_count'] != 1].drop(['sequential_count'], axis=1)
# restructure the data into long format for visualization
viz_df = pd.melt(viz_df, id_vars=['Highest degree and occupation', 'occupation'], value_name='total')
# save data to tsv
viz_df.to_csv('./temp/data.tsv', sep='\t', index=False)

In [12]:
# open data data to convert thousands seperator into ints
viz_df = pd.read_csv("./temp/data.tsv",  sep='\t', thousands=",")
# rename columns for visualization
viz_df.columns = ['Occupation', 'Category', 'Race', 'Total']
# remove whitespace from Occupation column
viz_df['Occupation'] = viz_df['Occupation'].apply(lambda x: x.strip())
# rename variables for visualization
viz_df['Occupation'] = viz_df['Occupation'].replace('Psychologist', 'Psychologists')
viz_df['Occupation'] = viz_df['Occupation'].replace('Mathematical scientist', 'Mathematical scientists')
# save data into project folder
viz_df.to_csv("./project/data.tsv", sep='\t', index=False, encoding='utf-8')