In [1]:
import pandas as pd
import numpy as np
import os.path as p
from collections import defaultdict

In [2]:
df = pd.read_csv("/home/rmyeid/notebooks/compsocial/SPSSI_2nd year project_timepoint 1.csv",  header=0)

In [3]:
df.head()


Out[3]:
Unnamed: 0 VERSION ID_1 ID_2 ID_3 CEN_0 CEN_1_1 CEN_1_2 CEN_1_3 CEN_1_4 ... MOTH_EDU_TEXT RES NOT_0 NOT_0_TEXT LocationLatitude LocationLongitude LocationAccuracy identity_categorization census10.1 census12.1
0 1 SP Emily Marlow emily.marlow@stonybrook.edu ejmarlow@gmail.com 1 1 NaN NaN NaN ... NaN 2 1 NaN 40.906403 -73.131897 -1 monocultural white white
1 2 SP Michael Chen michael.chen.2@stonybrook.edu mchen1496@yahoo.com 1 1 NaN NaN NaN ... Associate's 2 1 NaN 40.931702 -73.114197 -1 monocultural asian asian
2 3 SP Milvin Shroff milvin.shroff@stonybrook.edu shroffmilvin@gmail.com 1 1 NaN NaN NaN ... NaN 2 1 NaN 40.906403 -73.131897 -1 monocultural asian asian
3 4 SP Aditi Sharma aditi.sharma@stonybrook.edu india2153@gmail.com 1 1 NaN NaN NaN ... NaN 2 1 NaN 40.906403 -73.131897 -1 bicultural asian,white asian,white
4 5 SP jeffery bailey jeffery.bailey@stonybrook.edu jeffery41@gmail.com 1 1 NaN NaN NaN ... n/a 2 1 NaN 40.819504 -73.920898 -1 monocultural black or african american black or african american

5 rows × 271 columns


In [4]:
print len(df)


351

Data Cleaning

  1. Create a new column that labels monoculturals, biculturals, and multiculturals (see the labeling sketch after this list)
  2. Categorize cultural labels into Census groups (quantify demographics)
  3. Determine subgroups of monoculturals and biculturals (based on ingroup prototypicality)
  4. Create a sub-dataset for the SPSSI poster
  5. Check for missing data
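
A minimal sketch of the labeling rule behind steps 1 and 3 (illustration only, not the notebook's code; the cells below derive the label from which IDEN_*/GEN_M_* columns are filled in):

# Illustration of the mono/bi/multi rule: count distinct identities.
def label_identity(identities):
    # identities: an already-split, stripped list of identity strings
    n = len(set(identities))
    if n == 1:
        return "monocultural"
    elif n == 2:
        return "bicultural"
    return "multicultural"

label_identity(["asian", "white"])   # -> "bicultural"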

In [5]:
df[["census10", "census12"]].head()


Out[5]:
census10 census12
0 white white
1 asian asian
2 asian asian
3 asian,white asian,white
4 black or african american black or african american

Census 10 Stats According to Gen Identity Categorization

Mono


In [6]:
df_mono = df[df.gen_identity_categorization == "monocultural"]
identity_counts = defaultdict(lambda: 0)
census10_counts = df_mono.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[6]:
[('asian', 121),
 ('white', 120),
 ('hispanic or latino', 31),
 ('black or african american', 14),
 ('american indian or alaskan native', 1)]

Bi


In [7]:
df_bi = df[df.gen_identity_categorization =="bicultural"]
identity_counts = defaultdict(lambda: 0)
census10_counts = df_bi.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[7]:
[('white', 51),
 ('hispanic or latino', 17),
 ('black or african american', 11),
 ('asian', 10),
 ('american indian or alaskan native', 1),
 ('black or african american/asian', 1),
 ('native hawaiian or pacific islander', 1)]

Multi


In [8]:
df_multi =  df[df.gen_identity_categorization =="multicultural"]
identity_counts = defaultdict(lambda: 0)
census10_counts = df_multi.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[8]:
[('white', 64),
 ('hispanic or latino', 10),
 ('black or african american', 5),
 ('asian', 5),
 ('other', 2),
 ('american indian or alaskan native', 2),
 ('hipanic/white', 1)]

Census 10 Total Stats


In [9]:
df_all =  df
identity_counts = defaultdict(lambda: 0)
census10_counts = df_all.census10.value_counts()
for x,c  in zip(census10_counts.index.values, census10_counts.values):
  for j in [k.strip() for k in x.split(',')]:
    identity_counts[j] += c
sorted(identity_counts.items(), key=lambda(x,y):y, reverse=True)


Out[9]:
[('white', 237),
 ('asian', 137),
 ('hispanic or latino', 58),
 ('black or african american', 31),
 ('n/a', 9),
 ('american indian or alaskan native', 4),
 ('other', 2),
 ('hipanic/white', 1),
 ('black or african american/asian', 1),
 ('native hawaiian or pacific islander', 1)]
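
The tally in cells [6]-[9] is the same loop run on four subsets. A hedged refactoring sketch (same logic factored into a helper; the subset frames and column are the ones defined above):

from collections import defaultdict

# Reusable version of the tally above: split the comma-separated census10
# labels, strip whitespace, and sum the value counts per label.
def census_tally(frame, column="census10"):
    counts = defaultdict(lambda: 0)
    vc = frame[column].value_counts()
    for label, c in zip(vc.index.values, vc.values):
        for part in [p.strip() for p in label.split(',')]:
            counts[part] += c
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)

# e.g. census_tally(df[df.gen_identity_categorization == "monocultural"])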

Perceived Identification


In [10]:
df["per_identity_categorization"] = np.nan
monocultural_index = (np.logical_not(df.IDEN_1A.isnull())) 
df.loc[monocultural_index, "per_identity_categorization"]="monocultural"
bicultural_index = (np.logical_not(df.IDEN_1A.isnull())) * (np.logical_not(df.IDEN_2A.isnull())) * (df.IDEN_3A.isnull())
df.loc[bicultural_index,"per_identity_categorization"]="bicultural"
multicultural_index = (np.logical_not(df.IDEN_1A.isnull())) * (np.logical_not(df.IDEN_2A.isnull())) * (np.logical_not(df.IDEN_3A.isnull()))
df.loc[multicultural_index, "per_identity_categorization"] = "multicultural"
df.per_identity_categorization.value_counts()


/usr/local/lib/python2.7/dist-packages/pandas/computation/expressions.py:190: UserWarning: evaluating in Python space because the '*' operator is not supported by numexpr for the bool dtype, use '&' instead
  unsupported[op_str]))
Out[10]:
monocultural     152
bicultural       151
multicultural     35
dtype: int64
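
The UserWarning above suggests combining boolean Series with & rather than *. An equivalent set of masks (a sketch; it should reproduce the same 152/151/35 split, because in the cell above the bicultural and multicultural assignments overwrite the initial, broader monocultural mask):

# Same categorization written with & and an explicit monocultural condition
# (IDEN_2A missing), so the labels do not rely on assignment order.
mono_mask = df.IDEN_1A.notnull() & df.IDEN_2A.isnull()
bi_mask = df.IDEN_1A.notnull() & df.IDEN_2A.notnull() & df.IDEN_3A.isnull()
multi_mask = df.IDEN_1A.notnull() & df.IDEN_2A.notnull() & df.IDEN_3A.notnull()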

In [11]:
df.per_identity_categorization.head()


Out[11]:
0    monocultural
1    monocultural
2    monocultural
3      bicultural
4    monocultural
Name: per_identity_categorization, dtype: object

Identity Mismatch Gen & Per


In [12]:
GEN_combined = df.GEN_M_1A + "," + df.GEN_M_2A + "," + df.GEN_M_3A + "," + df.GEN_M_4A
gen_identities = [set([a.strip().lower() for a in x.strip().split(',')]) for x in GEN_combined.fillna("nan")]
gen_num = np.array([len(x) for x in gen_identities])
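
Note that pandas propagates NaN through string concatenation, so any row with a missing GEN_M_2A, GEN_M_3A, or GEN_M_4A gets GEN_combined = NaN and gen_num = 1 (the single "nan" token). If every listed identity should still be counted, a per-column fillna before joining avoids this; the following is an assumption about the intended behavior, not the notebook's method (the same applies to IDEN_1A-IDEN_4A in cell [15]):

# Hypothetical alternative: fill missing identity columns before joining, so a
# respondent with three identities and a blank GEN_M_4A still gets a count of 3.
gen_cols = ["GEN_M_1A", "GEN_M_2A", "GEN_M_3A", "GEN_M_4A"]
combined = df[gen_cols].fillna("").apply(lambda row: ",".join(row), axis=1)
gen_sets = [set(a.strip().lower() for a in x.split(',') if a.strip()) for x in combined]
gen_counts = np.array([len(s) for s in gen_sets])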

In [13]:
mul_col_index = df.gen_identity_categorization == "multicultural"
mul_index = np.where(mul_col_index.values)[0]
print len(mul_index)


68

In [14]:
df["num_gen_identity"] = np.nan
df.loc[df.gen_identity_categorization == "monocultural", "num_gen_identity"] = 1
df.loc[df.gen_identity_categorization == "bicultural", "num_gen_identity"] = 2
df.loc[mul_index, "num_gen_identity"] = gen_num[mul_index]
df.num_gen_identity.head(5)


Out[14]:
0     1
1     1
2     1
3     1
4   NaN
Name: num_gen_identity, dtype: float64

In [15]:
PER_combined = df.IDEN_1A + "," + df.IDEN_2A + "," + df.IDEN_3A + "," + df.IDEN_4A
per_identities = [set([a.strip().lower() for a in x.strip().split(',')]) for x in PER_combined.fillna("nan")]
per_num = np.array([len(x) for x in per_identities])

In [16]:
mul_col_index2 = df.per_identity_categorization == "multicultural"
mul_index2 = np.where(mul_col_index2.values)[0]
print len(mul_index2)


35

In [17]:
df["num_per_identity"] = np.nan
df.loc[df.per_identity_categorization == "monocultural", "num_per_identity"] = 1
df.loc[df.per_identity_categorization == "bicultural", "num_per_identity"] = 2
df.loc[mul_index2, "num_per_identity"] = per_num[mul_index2]
df.num_per_identity.head(5)


Out[17]:
0    1
1    1
2    1
3    2
4    1
Name: num_per_identity, dtype: float64

In [18]:
df["identity_mismatch_gen_per"] = abs(df.num_gen_identity - df.num_per_identity)
df.identity_mismatch_gen_per.head()


Out[18]:
0     0
1     0
2     0
3     1
4   NaN
Name: identity_mismatch_gen_per, dtype: float64

In [19]:
df.loc[df.identity_mismatch_gen_per == 0, "dich_identity_mismatch_gen_per"] = 0 
df.loc[df.identity_mismatch_gen_per != 0, "dich_identity_mismatch_gen_per"] = 1
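
One caveat on the != 0 rule here (and in the census dichotomizations below): NaN != 0 evaluates True, so rows with a missing mismatch score (e.g. row 4 above) get coded as 1. If missing values should instead stay missing, a guarded variant could look like this sketch, written to a hypothetical new column so it does not overwrite the notebook's own:

# Hypothetical variant: only dichotomize rows where the mismatch score exists,
# leaving NaN rows as NaN in a separate illustrative column.
valid = df.identity_mismatch_gen_per.notnull()
df.loc[valid & (df.identity_mismatch_gen_per == 0), "dich_mismatch_gen_per_v2"] = 0
df.loc[valid & (df.identity_mismatch_gen_per != 0), "dich_mismatch_gen_per_v2"] = 1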

Census 10 Mono/Bi/Multi Count


In [20]:
df.census10.head()


Out[20]:
0                         white
1                         asian
2                         asian
3                   asian,white
4    black or african american 
Name: census10, dtype: object

In [21]:
df["census_10_num"] = [len(set([a.strip().lower() for a in x.strip().split(',')])) for x in df.census10]
df.loc[df.census_10_num == 1, "census_10_identity"] = "monocultural"
df.loc[df.census_10_num == 2, "census_10_identity"] = "bicultural"
df.loc[df.census_10_num >2, "census_10_identity"] = "multicultural"
#cen10_num = np.array([len(x) for x in gen_identities])

Census 20 Mono/Bi/Multi Count


In [22]:
df.rename(columns={"census12": "census20"}, inplace=True)
df["census_20_num"] = [len(set([a.strip().lower() for a in x.strip().split(',')])) for x in df.census20]
df.loc[df.census_20_num == 1, "census_20_identity"] = "monocultural"
df.loc[df.census_20_num == 2, "census_20_identity"] = "bicultural"
df.loc[df.census_20_num >2, "census_20_identity"] = "multicultural"

Identity Mismatch Gen & Census 10


In [23]:
df["identity_mismatch_gen_cen10"] = abs(df.num_gen_identity - df.census_10_num)

In [24]:
df.loc[df.identity_mismatch_gen_cen10 == 0, "dich_identity_mismatch_gen_cen10"] = 0 
df.loc[df.identity_mismatch_gen_cen10 != 0, "dich_identity_mismatch_gen_cen10"] = 1

Identity Mismatch Gen & Census 20


In [25]:
df["identity_mismatch_gen_cen20"] = abs(df.num_gen_identity - df.census_20_num)

In [26]:
df.loc[df.identity_mismatch_gen_cen20 == 0, "dich_identity_mismatch_gen_cen20"] = 0 
df.loc[df.identity_mismatch_gen_cen20 != 0, "dich_identity_mismatch_gen_cen20"] = 1

Identity Mismatch Per & Census 10


In [27]:
df["identity_mismatch_per_cen10"] = abs(df.num_per_identity - df.census_10_num)

In [28]:
df.loc[df.identity_mismatch_per_cen10 == 0, "dich_identity_mismatch_per_cen10"] = 0 
df.loc[df.identity_mismatch_per_cen10 != 0, "dich_identity_mismatch_per_cen10"] = 1

Identity Mismatch Per & Census 20


In [29]:
df["identity_mismatch_per_cen20"] = abs(df.num_per_identity - df.census_20_num)

In [30]:
df.loc[df.identity_mismatch_per_cen20 == 0, "dich_identity_mismatch_per_cen20"] = 0
df.loc[df.identity_mismatch_per_cen20 != 0, "dich_identity_mismatch_per_cen20"] = 1

Save DataFrame


In [33]:
!pwd


/data/csc/compsocial/SPSSI

In [32]:
df.to_csv("spssi_full_identity_mismatch.csv")
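
If the extra unnamed index column (like the "Unnamed: 0" column visible in Out[3]) is not wanted in the export, index=False can be passed:

# Optional: omit the row index so the saved CSV does not gain another
# unnamed index column on the next read.
df.to_csv("spssi_full_identity_mismatch.csv", index=False)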