In [98]:
import pandas as pd
import numpy as np
import os.path as p

In [99]:
# Source: SPSSI Survey 1 (timepoint 1) export; row 0 still holds the question
# titles at this point (dropped in the next cell).
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
DATA_PATH = "/home/rmyeid/notebooks/compsocial/SPSSI_Survey_1_Complete.csv"
# Alternative export of the same survey:
#DATA_PATH = "/home/rmyeid/notebooks/compsocial/SPSSI_2nd year project_timepoint 1.csv"
df_title = pd.read_csv(DATA_PATH, header=0)

In [100]:
df = df_title.drop(0)

In [101]:
print len(df)


357

Data Cleaning

  1. Create a new column that labels monoculturals, biculturals, and multiculturals
  2. Categorize cultural labels into Census groups (quantify demographics)
  3. Determine subgroups of monoculturals and biculturals (based on ingroup prototypicality)
  4. Create a sub-dataset for the SPSSI poster
  5. Check for missing data

In [102]:
# Label each respondent by how many cultural identities they reported.
# Fix: the original masks overlapped (every non-null IDEN_1A was first
# labelled "monocultural") and correctness relied on later assignments
# overwriting earlier ones; the masks below are mutually exclusive.
# Also uses `&`/`~` (element-wise boolean ops) instead of `*` on Series.
df["identity_categorization"] = np.nan
has_iden1 = df.IDEN_1A.notnull()
has_iden2 = df.IDEN_2A.notnull()
has_iden3 = df.IDEN_3A.notnull()
df.loc[has_iden1 & ~has_iden2, "identity_categorization"] = "monocultural"
df.loc[has_iden1 & has_iden2 & ~has_iden3, "identity_categorization"] = "bicultural"
df.loc[has_iden1 & has_iden2 & has_iden3, "identity_categorization"] = "multicultural"
df.identity_categorization.value_counts()


Out[102]:
monocultural     153
bicultural       152
multicultural     35
dtype: int64

Missing Data Cleaning

We need to remove respondents who filled in neither the census question nor the primary identity question (both fields missing)


In [104]:
df[df.identity_categorization.isnull()][["IDEN_1A", "CEN_1_1"]]


Out[104]:
IDEN_1A CEN_1_1
160 NaN 1
184 NaN 1
192 NaN NaN
223 NaN NaN
229 NaN NaN
336 NaN 1
337 NaN 1
338 NaN 1
339 NaN 1
340 NaN 1
341 NaN 1
342 NaN 1
343 NaN 1
346 NaN 1
347 NaN NaN
348 NaN 1
354 NaN 1

In [118]:
df= df.drop(df.index[df.IDEN_1A.isnull() * df.CEN_1_1.isnull()])

Identity Categorization (according to census)


In [138]:
# Pool every free-text identity field into one flat set of distinct labels.
ID_cat_columns = ["IDEN_1A", "IDEN_2A", "IDEN_3A", "IDEN_4A", "CEN_7", "GEN_M_1A",
                  "GEN_M_2A", "GEN_M_3A", "GEN_M_4A"]
raw_identities = df[ID_cat_columns].values.flatten()
# Lower-case, strip the literal substring "and", and split comma lists.
# NOTE(review): replace("and", "") also hits "and" inside words
# (e.g. "rwandan" -> "rwn") — confirm this is intended.
normalized_identities = [entry.lower().replace("and", "").split(",")
                         for entry in raw_identities if isinstance(entry, str)]
unique_idens = {token.strip() for tokens in normalized_identities for token in tokens}

In [145]:
# Dump the distinct labels to CSV so they can be hand-mapped to census
# categories (the curated mapping is loaded from a Google Sheet below).
idens_df = pd.DataFrame(list(unique_idens))
idens_df.to_csv("identities_map.csv")

Mapping


In [150]:
from StringIO import StringIO  # got moved to io in python3.
import requests

In [153]:
# Pull the hand-curated identity -> census-category mapping sheet as CSV.
SHEET_URL = ("https://docs.google.com/spreadsheets/d/1vHCDGgb8CjnBb4pA5e-htHOlhdsA55JToZWs3USvEJk/"
             "export?format=csv&id=1vHCDGgb8CjnBb4pA5e-htHOlhdsA55JToZWs3USvEJk&gid=665505531")
r = requests.get(SHEET_URL)
r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
courses_data = r.content
courses_df = pd.read_csv(StringIO(courses_data), index_col=0)
courses_df.head()


Out[153]:
original 2010 Census categorization New Census Categorization
0 irish/italian/ british white white
1 indian(hindu) asian asian
2 mexican hispanic or latino hispanic or latino
3 chinese asian asian
4 eastern european white white

In [180]:
# Column 0: original label; column 1: 2010 census bucket; column 2: new bucket.
census10_map = dict(zip(courses_df.values[:, 0], courses_df.values[:, 1]))
census12_map = dict(zip(courses_df.values[:, 0], courses_df.values[:, 2]))

In [191]:
# Map each respondent's full identity list into census buckets, once under
# the 2010 scheme and once under the new scheme.
census10 = []
census12 = []

for person in df[ID_cat_columns].values:
  # Normalize each field the same way the mapping keys were built;
  # missing fields become the sentinel "na".
  tokens = [field.lower().replace("and", "").split(",") if isinstance(field, str) else ["na"]
            for field in person]
  all_idens = sorted(set(t.strip() for group in tokens for t in group))
  buckets10 = list(set(census10_map[iden] for iden in all_idens))
  buckets12 = list(set(census12_map[iden] for iden in all_idens))

  # "n/a" only survives when it is the respondent's sole bucket.
  if len(buckets10) > 1 and "n/a" in buckets10:
    buckets10.remove("n/a")
  if len(buckets12) > 1 and "n/a" in buckets12:
    buckets12.remove("n/a")

  census10.append(",".join(sorted(buckets10)))
  census12.append(",".join(sorted(buckets12)))

In [192]:
# Attach the derived census categorizations (built in the previous cell,
# one entry per row of df, in row order) as new columns.
df["census10"] = census10
df["census12"] = census12

In [195]:
df[ID_cat_columns + ["census10", "census12"]].head()


Out[195]:
IDEN_1A IDEN_2A IDEN_3A IDEN_4A CEN_7 GEN_M_1A GEN_M_2A GEN_M_3A GEN_M_4A census10 census12
1 white NaN NaN NaN NaN WHITE white white white white white
2 Chinese NaN NaN NaN NaN Chinese Chinese Chinese Chinese asian asian
3 Asian Indian NaN NaN NaN NaN Asian Indian Asian Indian Asian Indian Asian Indian asian asian
4 Indian(Hindu) American NaN NaN NaN Indian Indian(Hindu) Indian(Hindu) Indian(Hindu) asian,white asian,white
5 jamaicam NaN NaN NaN Black n/a n/a n/a n/a black or african american black or african american

In [219]:
df.to_csv("SPSSI_poster_clean.csv")

In [228]:
df2 = df.drop(df.index[df.identity_categorization.isnull()]) #[["IDEN_1A", "IDEN_2A", "IDEN_3A", "IDEN_4A"])

In [ ]:
from collections import defaultdict

Census 2010 stats


In [229]:
# Tally each 2010 census bucket; multi-bucket respondents ("a,b") count once
# toward every bucket they list. Fixes the Python-2-only tuple-parameter
# lambda (`lambda(x,y):y`) and uses defaultdict(int).
identity_counts = defaultdict(int)
census10_counts = df2.census10.value_counts()
for label, count in zip(census10_counts.index.values, census10_counts.values):
  for bucket in (part.strip() for part in label.split(',')):
    identity_counts[bucket] += count
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)


Out[229]:
[('white', 236),
 ('asian', 135),
 ('hispanic or latino', 60),
 ('black or african american', 32),
 ('american indian or alaskan native', 5),
 ('other', 2),
 ('hipanic/white', 1),
 ('black or african american/asian', 1),
 ('native hawaiian or pacific islander', 1)]

Census 2020 stats


In [230]:
# Same tally as above but under the new ("census12") categorization.
# Fixes the Python-2-only tuple-parameter lambda and uses defaultdict(int).
identity_counts = defaultdict(int)
census12_counts = df2.census12.value_counts()
for label, count in zip(census12_counts.index.values, census12_counts.values):
  for bucket in (part.strip() for part in label.split(',')):
    identity_counts[bucket] += count
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)


Out[230]:
[('white', 223),
 ('asian', 135),
 ('hispanic or latino', 60),
 ('black or african american', 32),
 ('middle eastern', 28),
 ('american indian or alaskan native', 5),
 ('other', 2),
 ('hipanic/white', 1),
 ('black or african american/asian', 1),
 ('native hawaiian or pacific islander', 1)]

In [237]:
# 2010-census bucket tally restricted to monoculturals.
# Fixes the Python-2-only tuple-parameter lambda and uses defaultdict(int).
# NOTE(review): this tally is copy-pasted for mono/bi/multi — a shared
# helper function would remove the duplication.
df_mono = df2[df2.identity_categorization=="monocultural"]
identity_counts = defaultdict(int)
census10_counts = df_mono.census10.value_counts()
for label, count in zip(census10_counts.index.values, census10_counts.values):
  for bucket in (part.strip() for part in label.split(',')):
    identity_counts[bucket] += count
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)


Out[237]:
[('asian', 71),
 ('white', 62),
 ('hispanic or latino', 30),
 ('black or african american', 12),
 ('american indian or alaskan native', 3),
 ('native hawaiian or pacific islander', 1)]

In [238]:
# 2010-census bucket tally restricted to biculturals.
# Fixes the Python-2-only tuple-parameter lambda and uses defaultdict(int).
df_bi = df2[df2.identity_categorization=="bicultural"]
identity_counts = defaultdict(int)
census10_counts = df_bi.census10.value_counts()
for label, count in zip(census10_counts.index.values, census10_counts.values):
  for bucket in (part.strip() for part in label.split(',')):
    identity_counts[bucket] += count
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)


Out[238]:
[('white', 140),
 ('asian', 55),
 ('hispanic or latino', 27),
 ('black or african american', 16),
 ('other', 1),
 ('american indian or alaskan native', 1),
 ('black or african american/asian', 1)]

In [239]:
# 2010-census bucket tally restricted to multiculturals.
# Fixes the Python-2-only tuple-parameter lambda and uses defaultdict(int).
df_multi = df2[df2.identity_categorization=="multicultural"]
identity_counts = defaultdict(int)
census10_counts = df_multi.census10.value_counts()
for label, count in zip(census10_counts.index.values, census10_counts.values):
  for bucket in (part.strip() for part in label.split(',')):
    identity_counts[bucket] += count
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)


Out[239]:
[('white', 34),
 ('asian', 9),
 ('black or african american', 4),
 ('hispanic or latino', 3),
 ('hipanic/white', 1),
 ('american indian or alaskan native', 1),
 ('other', 1)]

In [276]:
# Monoculturals' stated identity vs. generational identity fields, plus the
# derived 2010 census bucket, for manual inspection.
# (Name `tmp` kept — it may be referenced further down the notebook.)
mono_mask = df2.identity_categorization == "monocultural"
tmp = df2.loc[mono_mask, ["IDEN_1A", "GEN_M_1A", "GEN_M_2A",
                          "GEN_M_3A", "GEN_M_4A", "census10"]]

In [274]:
rows = [42,46,75, 99, 100, 102, 116, 137, 152, 156, 167, 168, 170, 193, 200, 203, 263, 299, 331]
[len(x.split(",")) for x in df2.loc[rows, "census10"]]


Out[274]:
[2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 2, 2, 2]