In [1]:
import pandas as pd
import numpy as np
import os.path as p
from collections import defaultdict

In [2]:
df = pd.read_csv("/home/rmyeid/notebooks/compsocial/SPSSI_2nd year project_timepoint 1.csv",  header=0)

In [3]:
df.head()


Out[3]:
Unnamed: 0 VERSION ID_1 ID_2 ID_3 CEN_0 CEN_1_1 CEN_1_2 CEN_1_3 CEN_1_4 ... MOTH_EDU_TEXT RES NOT_0 NOT_0_TEXT LocationLatitude LocationLongitude LocationAccuracy identity_categorization census10.1 census12.1
0 1 SP Emily Marlow emily.marlow@stonybrook.edu ejmarlow@gmail.com 1 1 NaN NaN NaN ... NaN 2 1 NaN 40.906403 -73.131897 -1 monocultural white white
1 2 SP Michael Chen michael.chen.2@stonybrook.edu mchen1496@yahoo.com 1 1 NaN NaN NaN ... Associate's 2 1 NaN 40.931702 -73.114197 -1 monocultural asian asian
2 3 SP Milvin Shroff milvin.shroff@stonybrook.edu shroffmilvin@gmail.com 1 1 NaN NaN NaN ... NaN 2 1 NaN 40.906403 -73.131897 -1 monocultural asian asian
3 4 SP Aditi Sharma aditi.sharma@stonybrook.edu india2153@gmail.com 1 1 NaN NaN NaN ... NaN 2 1 NaN 40.906403 -73.131897 -1 bicultural asian,white asian,white
4 5 SP jeffery bailey jeffery.bailey@stonybrook.edu jeffery41@gmail.com 1 1 NaN NaN NaN ... n/a 2 1 NaN 40.819504 -73.920898 -1 monocultural black or african american black or african american

5 rows × 271 columns


In [4]:
print len(df)


351

Data Cleaning

  1. Create a new column that labels monoculturals, biculturals, and multiculturals (see the labeling sketch after this list)
  2. Categorize cultural labels into Census groups (quantify demographics)
  3. Determine subgroups of monoculturals and biculturals (based on ingroup prototypicality)
  4. Create a sub-dataset for the SPSSI poster
  5. Check for missing data
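
A minimal sketch of the labeling rule behind steps 1 and 3 (illustration only, not the notebook's code; the cells below derive the label from which IDEN_*/GEN_M_* columns are filled in):

# Illustration of the mono/bi/multi rule: count distinct identities.
def label_identity(identities):
    # identities: an already-split, stripped list of identity strings
    n = len(set(identities))
    if n == 1:
        return "monocultural"
    elif n == 2:
        return "bicultural"
    return "multicultural"

label_identity(["asian", "white"])   # -> "bicultural"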

In [5]:
df[["census10", "census12"]].head()


Out[5]:
census10 census12
0 white white
1 asian asian
2 asian asian
3 asian,white asian,white
4 black or african american black or african american

Census 10 Stats According to Gen Identity Categorization

Mono


In [6]:
df_mono = df[df.gen_identity_categorization == "monocultural"]
identity_counts = defaultdict(lambda: 0)
census10_counts = df_mono.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[6]:
[('asian', 121),
 ('white', 120),
 ('hispanic or latino', 31),
 ('black or african american', 14),
 ('american indian or alaskan native', 1)]

Bi


In [7]:
df_bi = df[df.gen_identity_categorization =="bicultural"]
identity_counts = defaultdict(lambda: 0)
census10_counts = df_bi.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[7]:
[('white', 51),
 ('hispanic or latino', 17),
 ('black or african american', 11),
 ('asian', 10),
 ('american indian or alaskan native', 1),
 ('black or african american/asian', 1),
 ('native hawaiian or pacific islander', 1)]

Multi


In [8]:
df_multi =  df[df.gen_identity_categorization =="multicultural"]
identity_counts = defaultdict(lambda: 0)
census10_counts = df_multi.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[8]:
[('white', 64),
 ('hispanic or latino', 10),
 ('black or african american', 5),
 ('asian', 5),
 ('other', 2),
 ('american indian or alaskan native', 2),
 ('hipanic/white', 1)]

Census 10 Total Stats


In [9]:
df_all =  df
identity_counts = defaultdict(lambda: 0)
census10_counts = df_all.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[9]:
[('white', 237),
 ('asian', 137),
 ('hispanic or latino', 58),
 ('black or african american', 31),
 ('n/a', 9),
 ('american indian or alaskan native', 4),
 ('other', 2),
 ('hipanic/white', 1),
 ('black or african american/asian', 1),
 ('native hawaiian or pacific islander', 1)]
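
The tally in cells [6]-[9] is the same loop run on four subsets. A hedged refactoring sketch (same logic factored into a helper; the subset frames and column are the ones defined above):

from collections import defaultdict

# Reusable version of the tally above: split the comma-separated census10
# labels, strip whitespace, and sum the value counts per label.
def census_tally(frame, column="census10"):
    counts = defaultdict(lambda: 0)
    vc = frame[column].value_counts()
    for label, c in zip(vc.index.values, vc.values):
        for part in [p.strip() for p in label.split(',')]:
            counts[part] += c
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)

# e.g. census_tally(df[df.gen_identity_categorization == "monocultural"])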

Perceived Identification


In [10]:
df["per_identity_categorization"] = np.nan
monocultural_index = (np.logical_not(df.IDEN_1A.isnull())) 
df.loc[monocultural_index, "per_identity_categorization"]="monocultural"
bicultural_index = (np.logical_not(df.IDEN_1A.isnull())) * (np.logical_not(df.IDEN_2A.isnull())) * (df.IDEN_3A.isnull())
df.loc[bicultural_index,"per_identity_categorization"]="bicultural"
multicultural_index = (np.logical_not(df.IDEN_1A.isnull())) * (np.logical_not(df.IDEN_2A.isnull())) * (np.logical_not(df.IDEN_3A.isnull()))
df.loc[multicultural_index, "per_identity_categorization"] = "multicultural"
df.per_identity_categorization.value_counts()


/usr/local/lib/python2.7/dist-packages/pandas/computation/expressions.py:190: UserWarning: evaluating in Python space because the '*' operator is not supported by numexpr for the bool dtype, use '&' instead
  unsupported[op_str]))
Out[10]:
monocultural     152
bicultural       151
multicultural     35
dtype: int64
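
The UserWarning above suggests combining boolean Series with & rather than *. An equivalent set of masks (a sketch; it should reproduce the same 152/151/35 split, because in the cell above the bicultural and multicultural assignments overwrite the initial, broader monocultural mask):

# Same categorization written with & and an explicit monocultural condition
# (IDEN_2A missing), so the labels do not rely on assignment order.
mono_mask = df.IDEN_1A.notnull() & df.IDEN_2A.isnull()
bi_mask = df.IDEN_1A.notnull() & df.IDEN_2A.notnull() & df.IDEN_3A.isnull()
multi_mask = df.IDEN_1A.notnull() & df.IDEN_2A.notnull() & df.IDEN_3A.notnull()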

In [11]:
df.per_identity_categorization.head()


Out[11]:
0    monocultural
1    monocultural
2    monocultural
3      bicultural
4    monocultural
Name: per_identity_categorization, dtype: object

Identity Mismatch Gen & Per


In [12]:
GEN_combined = df.GEN_M_1A + "," + df.GEN_M_2A + "," + df.GEN_M_3A + "," + df.GEN_M_4A
gen_identities = [set([a.strip().lower() for a in x.strip().split(',')]) for x in GEN_combined.fillna("nan")]
gen_num = np.array([len(x) for x in gen_identities])
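
Note that pandas propagates NaN through string concatenation, so any row with a missing GEN_M_2A, GEN_M_3A, or GEN_M_4A gets GEN_combined = NaN and gen_num = 1 (the single "nan" token). If every listed identity should still be counted, a per-column fillna before joining avoids this; the following is an assumption about the intended behavior, not the notebook's method (the same applies to IDEN_1A-IDEN_4A in cell [15]):

# Hypothetical alternative: fill missing identity columns before joining, so a
# respondent with three identities and a blank GEN_M_4A still gets a count of 3.
gen_cols = ["GEN_M_1A", "GEN_M_2A", "GEN_M_3A", "GEN_M_4A"]
combined = df[gen_cols].fillna("").apply(lambda row: ",".join(row), axis=1)
gen_sets = [set(a.strip().lower() for a in x.split(',') if a.strip()) for x in combined]
gen_counts = np.array([len(s) for s in gen_sets])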

In [13]:
mul_col_index = df.gen_identity_categorization == "multicultural"
mul_index = np.where(mul_col_index.values)[0]
print len(mul_index)


68

In [14]:
df["num_gen_identity"] = np.nan
df.loc[df.gen_identity_categorization == "monocultural", "num_gen_identity"] = 1
df.loc[df.gen_identity_categorization == "bicultural", "num_gen_identity"] = 2
df.loc[mul_index, "num_gen_identity"] = gen_num[mul_index]
df.num_gen_identity.head(5)


Out[14]:
0     1
1     1
2     1
3     1
4   NaN
Name: num_gen_identity, dtype: float64

In [15]:
PER_combined = df.IDEN_1A + "," + df.IDEN_2A + "," + df.IDEN_3A + "," + df.IDEN_4A
per_identities = [set([a.strip().lower() for a in x.strip().split(',')]) for x in PER_combined.fillna("nan")]
per_num = np.array([len(x) for x in per_identities])

In [16]:
mul_col_index2 = df.per_identity_categorization == "multicultural"
mul_index2 = np.where(mul_col_index2.values)[0]
print len(mul_index2)


35

In [17]:
df["num_per_identity"] = np.nan
df.loc[df.per_identity_categorization == "monocultural", "num_per_identity"] = 1
df.loc[df.per_identity_categorization == "bicultural", "num_per_identity"] = 2
df.loc[mul_index2, "num_per_identity"] = per_num[mul_index2]
df.num_per_identity.head(5)


Out[17]:
0    1
1    1
2    1
3    2
4    1
Name: num_per_identity, dtype: float64

In [18]:
df["identity_mismatch_gen_per"] = abs(df.num_gen_identity - df.num_per_identity)
df.identity_mismatch_gen_per.head()


Out[18]:
0     0
1     0
2     0
3     1
4   NaN
Name: identity_mismatch_gen_per, dtype: float64

In [19]:
df.loc[df.identity_mismatch_gen_per == 0, "dich_identity_mismatch_gen_per"] = 0 
df.loc[df.identity_mismatch_gen_per != 0, "dich_identity_mismatch_gen_per"] = 1
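
One caveat on the != 0 rule here (and in the census dichotomizations below): NaN != 0 evaluates True, so rows with a missing mismatch score (e.g. row 4 above) get coded as 1. If missing values should instead stay missing, a guarded variant could look like this sketch, written to a hypothetical new column so it does not overwrite the notebook's own:

# Hypothetical variant: only dichotomize rows where the mismatch score exists,
# leaving NaN rows as NaN in a separate illustrative column.
valid = df.identity_mismatch_gen_per.notnull()
df.loc[valid & (df.identity_mismatch_gen_per == 0), "dich_mismatch_gen_per_v2"] = 0
df.loc[valid & (df.identity_mismatch_gen_per != 0), "dich_mismatch_gen_per_v2"] = 1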

Census 10 Mono/Bi/Multi Count


In [20]:
df.census10.head()


Out[20]:
0                         white
1                         asian
2                         asian
3                   asian,white
4    black or african american 
Name: census10, dtype: object

In [21]:
df["census_10_num"] = [len(set([a.strip().lower() for a in x.strip().split(',')])) for x in df.census10]
df.loc[df.census_10_num == 1, "census_10_identity"] = "monocultural"
df.loc[df.census_10_num == 2, "census_10_identity"] = "bicultural"
df.loc[df.census_10_num >2, "census_10_identity"] = "multicultural"
#cen10_num = np.array([len(x) for x in gen_identities])

Census 20 Mono/Bi/Multi Count


In [22]:
df.rename(columns={"census12": "census20"}, inplace=True)
df["census_20_num"] = [len(set([a.strip().lower() for a in x.strip().split(',')])) for x in df.census20]
df.loc[df.census_20_num == 1, "census_20_identity"] = "monocultural"
df.loc[df.census_20_num == 2, "census_20_identity"] = "bicultural"
df.loc[df.census_20_num >2, "census_20_identity"] = "multicultural"

Identity Mismatch Gen & Census 10


In [23]:
df["identity_mismatch_gen_cen10"] = abs(df.num_gen_identity - df.census_10_num)

In [24]:
df.loc[df.identity_mismatch_gen_cen10 == 0, "dich_identity_mismatch_gen_cen10"] = 0 
df.loc[df.identity_mismatch_gen_cen10 != 0, "dich_identity_mismatch_gen_cen10"] = 1

Identity Mismatch Gen & Census 20


In [25]:
df["identity_mismatch_gen_cen20"] = abs(df.num_gen_identity - df.census_20_num)

In [26]:
df.loc[df.identity_mismatch_gen_cen20 == 0, "dich_identity_mismatch_gen_cen20"] = 0 
df.loc[df.identity_mismatch_gen_cen20 != 0, "dich_identity_mismatch_gen_cen20"] = 1

Identity Mismatch Per & Census 10


In [27]:
df["identity_mismatch_per_cen10"] = abs(df.num_per_identity - df.census_10_num)

In [28]:
df.loc[df.identity_mismatch_per_cen10 == 0, "dich_identity_mismatch_per_cen10"] = 0 
df.loc[df.identity_mismatch_per_cen10 != 0, "dich_identity_mismatch_per_cen10"] = 1

Identity Mismatch Per & Census 20


In [29]:
df["identity_mismatch_per_cen20"] = abs(df.num_per_identity - df.census_20_num)

In [30]:
df.loc[df.identity_mismatch_per_cen20 == 0, "dich_identity_mismatch_per_cen20"] = 0
df.loc[df.identity_mismatch_per_cen20 != 0, "dich_identity_mismatch_per_cen20"] = 1

Save DataFrame


In [33]:
!pwd


/data/csc/compsocial/SPSSI

In [32]:
df.to_csv("spssi_full_identity_mismatch.csv")
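
If the extra unnamed index column (like the "Unnamed: 0" column visible in Out[3]) is not wanted in the export, index=False can be passed:

# Optional: omit the row index so the saved CSV does not gain another
# unnamed index column on the next read.
df.to_csv("spssi_full_identity_mismatch.csv", index=False)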