In [1]:
import pandas as pd
import numpy as np
import os.path as p
from collections import defaultdict
In [2]:
# Load the "timepoint 1" CSV into `df`; header=0 takes the column names from the first row.
survey_csv_path = "/home/rmyeid/notebooks/compsocial/SPSSI_2nd year project_timepoint 1.csv"
df = pd.read_csv(survey_csv_path, header=0)
In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)
Out[3]:
In [4]:
# Report the number of rows in the frame. The call form of print is valid on
# both Python 2 and Python 3, unlike the print statement.
print(len(df))
In [5]:
# Preview the two census answer columns side by side.
census_columns = ["census10", "census12"]
df[census_columns].head()
Out[5]:
In [6]:
# Tally how often each census10 label occurs among rows whose
# gen_identity_categorization is "monocultural".
df_mono = df[df.gen_identity_categorization == "monocultural"]
identity_counts = defaultdict(int)
# value_counts() yields one entry per distinct raw answer string. An answer may
# hold several comma-separated labels; each label is credited with that count.
census10_counts = df_mono.census10.value_counts()
for answer, n_rows in zip(census10_counts.index.values, census10_counts.values):
    for label in [part.strip() for part in answer.split(',')]:
        identity_counts[label] += n_rows
# Most frequent label first. The key indexes the (label, count) pair instead of
# using a tuple-unpacking lambda, which is a SyntaxError on Python 3.
sorted(identity_counts.items(), key=lambda pair: pair[1], reverse=True)
Out[6]:
In [7]:
# Tally how often each census10 label occurs among rows whose
# gen_identity_categorization is "bicultural".
df_bi = df[df.gen_identity_categorization == "bicultural"]
identity_counts = defaultdict(int)
# value_counts() yields one entry per distinct raw answer string. An answer may
# hold several comma-separated labels; each label is credited with that count.
census10_counts = df_bi.census10.value_counts()
for answer, n_rows in zip(census10_counts.index.values, census10_counts.values):
    for label in [part.strip() for part in answer.split(',')]:
        identity_counts[label] += n_rows
# Most frequent label first. The key indexes the (label, count) pair instead of
# using a tuple-unpacking lambda, which is a SyntaxError on Python 3.
sorted(identity_counts.items(), key=lambda pair: pair[1], reverse=True)
Out[7]:
In [8]:
# Tally how often each census10 label occurs among rows whose
# gen_identity_categorization is "multicultural".
df_multi = df[df.gen_identity_categorization == "multicultural"]
identity_counts = defaultdict(int)
# value_counts() yields one entry per distinct raw answer string. An answer may
# hold several comma-separated labels; each label is credited with that count.
census10_counts = df_multi.census10.value_counts()
for answer, n_rows in zip(census10_counts.index.values, census10_counts.values):
    for label in [part.strip() for part in answer.split(',')]:
        identity_counts[label] += n_rows
# Most frequent label first. The key indexes the (label, count) pair instead of
# using a tuple-unpacking lambda, which is a SyntaxError on Python 3.
sorted(identity_counts.items(), key=lambda pair: pair[1], reverse=True)
Out[8]:
In [9]:
# Tally how often each census10 label occurs across every row of the frame.
# Note that df_all is an alias of df, not a copy.
df_all = df
identity_counts = defaultdict(int)
# value_counts() yields one entry per distinct raw answer string. An answer may
# hold several comma-separated labels; each label is credited with that count.
census10_counts = df_all.census10.value_counts()
for answer, n_rows in zip(census10_counts.index.values, census10_counts.values):
    for label in [part.strip() for part in answer.split(',')]:
        identity_counts[label] += n_rows
# Most frequent label first. The key indexes the (label, count) pair instead of
# using a tuple-unpacking lambda, which is a SyntaxError on Python 3.
sorted(identity_counts.items(), key=lambda pair: pair[1], reverse=True)
Out[9]:
In [10]:
# Derive a category from which IDEN_*A answers are filled in.
# The column is created with object dtype so that the string labels assigned
# below do not have to upcast a float64 column (deprecated in newer pandas).
df["per_identity_categorization"] = pd.Series(np.nan, index=df.index, dtype=object)

has_first = df.IDEN_1A.notnull()
has_second = df.IDEN_2A.notnull()
has_third = df.IDEN_3A.notnull()

# Assignment order matters: every row with a first answer starts out as
# "monocultural" and is then overwritten by the more specific masks below.
monocultural_index = has_first
df.loc[monocultural_index, "per_identity_categorization"] = "monocultural"

# First and second answers present, third one missing.
bicultural_index = has_first & has_second & ~has_third
df.loc[bicultural_index, "per_identity_categorization"] = "bicultural"

# First, second and third answers all present (IDEN_4A is not consulted).
multicultural_index = has_first & has_second & has_third
df.loc[multicultural_index, "per_identity_categorization"] = "multicultural"

df.per_identity_categorization.value_counts()
Out[10]:
In [11]:
# Preview the newly assigned category labels.
df["per_identity_categorization"].head()
Out[11]:
In [12]:
# Count the distinct identities each row lists across the four GEN_M_*A columns.
#
# Only the answers that are present are joined. Concatenating the columns with
# `+` would turn the whole row into NaN as soon as a single column is missing,
# so a row with three answers and an empty fourth column would be counted as 1.
gen_answer_rows = df[["GEN_M_1A", "GEN_M_2A", "GEN_M_3A", "GEN_M_4A"]].values
GEN_combined = pd.Series(
    [",".join(str(v) for v in row if pd.notnull(v)) or np.nan for row in gen_answer_rows],
    index=df.index,
    dtype=object,
)
# Rows with no answer at all stay NaN and fall back to the "nan" placeholder,
# which yields a one-element set (count of 1), as before.
# Labels are compared after stripping whitespace and lower-casing.
gen_identities = [
    set(label.strip().lower() for label in combined.strip().split(','))
    for combined in GEN_combined.fillna("nan")
]
# One count per row, in row order.
gen_num = np.array([len(labels) for labels in gen_identities])
In [13]:
# Boolean mask, then positional row numbers, of the rows whose
# gen_identity_categorization is "multicultural".
mul_col_index = df.gen_identity_categorization == "multicultural"
mul_index = np.where(mul_col_index.values)[0]
# The call form of print is valid on both Python 2 and Python 3.
print(len(mul_index))
In [14]:
# Number of identities per row according to gen_identity_categorization:
# 1 for "monocultural", 2 for "bicultural", and the per-row count from gen_num
# for "multicultural". Rows with any other category stay NaN.
df["num_gen_identity"] = np.nan
df.loc[df.gen_identity_categorization == "monocultural", "num_gen_identity"] = 1
df.loc[df.gen_identity_categorization == "bicultural", "num_gen_identity"] = 2
# Select the multicultural rows with a boolean mask. Passing positional row
# numbers to the label-based .loc is only correct on a default 0..n-1 index.
is_multicultural = df.gen_identity_categorization == "multicultural"
df.loc[is_multicultural, "num_gen_identity"] = gen_num[is_multicultural.values]
df.num_gen_identity.head(5)
Out[14]:
In [15]:
# Count the distinct identities each row lists across the four IDEN_*A columns.
#
# Only the answers that are present are joined. Concatenating the columns with
# `+` would turn the whole row into NaN as soon as a single column is missing,
# so a row with three answers and an empty IDEN_4A would be counted as 1.
per_answer_rows = df[["IDEN_1A", "IDEN_2A", "IDEN_3A", "IDEN_4A"]].values
PER_combined = pd.Series(
    [",".join(str(v) for v in row if pd.notnull(v)) or np.nan for row in per_answer_rows],
    index=df.index,
    dtype=object,
)
# Rows with no answer at all stay NaN and fall back to the "nan" placeholder,
# which yields a one-element set (count of 1), as before.
# Labels are compared after stripping whitespace and lower-casing.
per_identities = [
    set(label.strip().lower() for label in combined.strip().split(','))
    for combined in PER_combined.fillna("nan")
]
# One count per row, in row order.
per_num = np.array([len(labels) for labels in per_identities])
In [16]:
# Boolean mask, then positional row numbers, of the rows whose
# per_identity_categorization is "multicultural".
mul_col_index2 = df.per_identity_categorization == "multicultural"
mul_index2 = np.where(mul_col_index2.values)[0]
# The call form of print is valid on both Python 2 and Python 3.
print(len(mul_index2))
In [17]:
# Number of identities per row according to per_identity_categorization:
# 1 for "monocultural", 2 for "bicultural", and the per-row count from per_num
# for "multicultural". Rows with any other category stay NaN.
df["num_per_identity"] = np.nan
df.loc[df.per_identity_categorization == "monocultural", "num_per_identity"] = 1
df.loc[df.per_identity_categorization == "bicultural", "num_per_identity"] = 2
# Select the multicultural rows with a boolean mask. Passing positional row
# numbers to the label-based .loc is only correct on a default 0..n-1 index.
is_multicultural = df.per_identity_categorization == "multicultural"
df.loc[is_multicultural, "num_per_identity"] = per_num[is_multicultural.values]
df.num_per_identity.head(5)
Out[17]:
In [18]:
# Absolute difference between the two identity counts; NaN wherever either count is NaN.
gen_minus_per = df["num_gen_identity"] - df["num_per_identity"]
df["identity_mismatch_gen_per"] = gen_minus_per.abs()
df["identity_mismatch_gen_per"].head()
Out[18]:
In [19]:
# Dichotomise the mismatch: 0 when the counts agree, 1 otherwise.
# A NaN mismatch is not equal to 0, so those rows are coded 1 as well.
counts_agree = df["identity_mismatch_gen_per"] == 0
df.loc[counts_agree, "dich_identity_mismatch_gen_per"] = 0
df.loc[~counts_agree, "dich_identity_mismatch_gen_per"] = 1
In [20]:
# Preview the raw census10 answers.
df["census10"].head()
Out[20]:
In [21]:
# Count the distinct comma-separated labels in each census10 answer
# (labels are compared after stripping whitespace and lower-casing).
census_10_label_counts = []
for answer in df.census10:
    distinct_labels = set(part.strip().lower() for part in answer.strip().split(','))
    census_10_label_counts.append(len(distinct_labels))
df["census_10_num"] = census_10_label_counts

# Map the count onto a category: 1 -> mono, 2 -> bi, more than 2 -> multi.
for row_mask, category in [
    (df.census_10_num == 1, "monocultural"),
    (df.census_10_num == 2, "bicultural"),
    (df.census_10_num > 2, "multicultural"),
]:
    df.loc[row_mask, "census_10_identity"] = category
In [22]:
# The census12 column is renamed to census20 in place before it is processed.
df.rename(columns={"census12": "census20"}, inplace=True)

# Count the distinct comma-separated labels in each census20 answer
# (labels are compared after stripping whitespace and lower-casing).
census_20_label_counts = []
for answer in df.census20:
    distinct_labels = set(part.strip().lower() for part in answer.strip().split(','))
    census_20_label_counts.append(len(distinct_labels))
df["census_20_num"] = census_20_label_counts

# Map the count onto a category: 1 -> mono, 2 -> bi, more than 2 -> multi.
for row_mask, category in [
    (df.census_20_num == 1, "monocultural"),
    (df.census_20_num == 2, "bicultural"),
    (df.census_20_num > 2, "multicultural"),
]:
    df.loc[row_mask, "census_20_identity"] = category
In [23]:
# Absolute difference between num_gen_identity and the census10 label count.
df["identity_mismatch_gen_cen10"] = (df["num_gen_identity"] - df["census_10_num"]).abs()
In [24]:
# Dichotomise the mismatch: 0 when the counts agree, 1 otherwise.
# A NaN mismatch is not equal to 0, so those rows are coded 1 as well.
counts_agree = df["identity_mismatch_gen_cen10"] == 0
df.loc[counts_agree, "dich_identity_mismatch_gen_cen10"] = 0
df.loc[~counts_agree, "dich_identity_mismatch_gen_cen10"] = 1
In [25]:
# Absolute difference between num_gen_identity and the census20 label count.
df["identity_mismatch_gen_cen20"] = (df["num_gen_identity"] - df["census_20_num"]).abs()
In [26]:
# Dichotomise the mismatch: 0 when the counts agree, 1 otherwise.
# A NaN mismatch is not equal to 0, so those rows are coded 1 as well.
counts_agree = df["identity_mismatch_gen_cen20"] == 0
df.loc[counts_agree, "dich_identity_mismatch_gen_cen20"] = 0
df.loc[~counts_agree, "dich_identity_mismatch_gen_cen20"] = 1
In [27]:
# Absolute difference between num_per_identity and the census10 label count.
df["identity_mismatch_per_cen10"] = (df["num_per_identity"] - df["census_10_num"]).abs()
In [28]:
# Dichotomise the mismatch: 0 when the counts agree, 1 otherwise.
# A NaN mismatch is not equal to 0, so those rows are coded 1 as well.
counts_agree = df["identity_mismatch_per_cen10"] == 0
df.loc[counts_agree, "dich_identity_mismatch_per_cen10"] = 0
df.loc[~counts_agree, "dich_identity_mismatch_per_cen10"] = 1
In [29]:
# Absolute difference between num_per_identity and the census20 label count.
df["identity_mismatch_per_cen20"] = (df["num_per_identity"] - df["census_20_num"]).abs()
In [30]:
# Dichotomise the per-vs-census20 mismatch: 0 when the counts agree, 1 otherwise.
# This cell previously repeated the gen_cen20 dichotomisation, so the per_cen20
# column never got its 0/1 counterpart.
# As in the sibling cells, a NaN mismatch is not equal to 0 and is coded 1.
df.loc[df.identity_mismatch_per_cen20 == 0, "dich_identity_mismatch_per_cen20"] = 0
df.loc[df.identity_mismatch_per_cen20 != 0, "dich_identity_mismatch_per_cen20"] = 1
In [33]:
# IPython shell escape: print the kernel's current working directory, which is
# where the relative path passed to df.to_csv in this notebook resolves.
!pwd
In [32]:
# Write the augmented frame to a CSV in the current working directory.
# The row index is included as the first column (to_csv default).
output_csv_path = "spssi_full_identity_mismatch.csv"
df.to_csv(output_csv_path)