In [98]:
import io
import os.path as p

import numpy as np
import pandas as pd
In [99]:
# Load the raw survey export; header=0 takes the column names from the first line of the file.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs where it exists.
survey_csv_path = "/home/rmyeid/notebooks/compsocial/SPSSI_Survey_1_Complete.csv"
df_title = pd.read_csv(survey_csv_path, header=0)
# Alternate input file, kept disabled:
#df_title = pd.read_csv("/home/rmyeid/notebooks/compsocial/SPSSI_2nd year project_timepoint 1.csv", header=0)
In [100]:
# Work on a copy of the export without the row labelled 0.
# NOTE(review): presumably that row holds question text rather than a response — confirm against the CSV.
df = df_title.drop(labels=[0], axis=0)
In [101]:
# Number of rows remaining after dropping row 0.
# Written as a function call so the cell parses on both Python 2 and Python 3.
print(len(df))
In [102]:
# Label each respondent by how many of IDEN_1A / IDEN_2A / IDEN_3A are filled in.
# The three assignments run from least to most specific, so a later label
# overwrites an earlier one; rows with no IDEN_1A keep NaN.
has_first = df.IDEN_1A.notnull()
has_second = df.IDEN_2A.notnull()
has_third = df.IDEN_3A.notnull()

# Start from an object-dtype column: assigning strings into a float NaN column
# relies on silent upcasting, which newer pandas versions deprecate.
df["identity_categorization"] = pd.Series(np.nan, index=df.index, dtype=object)

monocultural_index = has_first
df.loc[monocultural_index, "identity_categorization"] = "monocultural"

bicultural_index = has_first & has_second & ~has_third
df.loc[bicultural_index, "identity_categorization"] = "bicultural"

multicultural_index = has_first & has_second & has_third
df.loc[multicultural_index, "identity_categorization"] = "multicultural"

df.identity_categorization.value_counts()
Out[102]:
In [104]:
# Show IDEN_1A and CEN_1_1 for the rows that received no category.
df.loc[df.identity_categorization.isnull(), ["IDEN_1A", "CEN_1_1"]]
Out[104]:
In [118]:
# Remove respondents for whom both IDEN_1A and CEN_1_1 are missing.
missing_both = df.IDEN_1A.isnull() & df.CEN_1_1.isnull()
df = df.drop(df.index[missing_both])
In [138]:
# Columns whose string answers are pooled into one vocabulary of identity labels.
ID_cat_columns = [
    "IDEN_1A", "IDEN_2A", "IDEN_3A", "IDEN_4A", "CEN_7",
    "GEN_M_1A", "GEN_M_2A", "GEN_M_3A", "GEN_M_4A",
]
identities = df[ID_cat_columns].values.flatten()

# Lower-case each string answer, delete the substring "and", and split on commas.
# NOTE: str.replace removes "and" wherever it occurs, including inside longer words.
normalized_idenitites = []
for answer in identities:
    if not isinstance(answer, str):
        continue  # non-string cells are skipped
    normalized_idenitites.append(answer.lower().replace("and", "").split(","))

# Whitespace-stripped set of every label seen across all answers.
unique_idens = set()
for pieces in normalized_idenitites:
    unique_idens.update(piece.strip() for piece in pieces)
In [145]:
# Write the label vocabulary to identities_map.csv, one label per row.
# The labels are sorted first: iterating a set directly gives an arbitrary order,
# so the row order (and the row numbers written as the index) could differ between runs.
idens_df = pd.DataFrame.from_records([[iden] for iden in sorted(unique_idens)])
idens_df.to_csv("identities_map.csv")
In [150]:
from StringIO import StringIO # got moved to io in python3.
import requests
In [153]:
# Download the mapping spreadsheet as CSV and load it into a DataFrame,
# using the first CSV column as the index.
r = requests.get(
    "https://docs.google.com/spreadsheets/d/1vHCDGgb8CjnBb4pA5e-htHOlhdsA55JToZWs3USvEJk/"
    "export?format=csv&id=1vHCDGgb8CjnBb4pA5e-htHOlhdsA55JToZWs3USvEJk&gid=665505531",
    timeout=30,  # fail instead of hanging indefinitely if the server does not answer
)
# Stop here on an HTTP error rather than parsing an error page as if it were CSV.
r.raise_for_status()
courses_data = r.content
# r.content is raw bytes, so wrap it in a bytes buffer (works on Python 2 and 3).
courses_df = pd.read_csv(io.BytesIO(courses_data), index_col=0)
courses_df.head()
Out[153]:
In [180]:
# Lookup tables keyed by the first data column of courses_df:
# census10_map takes its values from the second column, census12_map from the third.
mapping_rows = courses_df.values
census10_map = dict(zip(mapping_rows[:, 0], mapping_rows[:, 1]))
census12_map = dict(zip(mapping_rows[:, 0], mapping_rows[:, 2]))
In [191]:
def _census_labels(identity_names, census_map):
    """Map identity names through census_map and join the distinct results.

    The distinct mapped values are sorted and comma-joined. "n/a" is dropped
    only when at least one other value is present.
    """
    mapped = set([census_map[name] for name in identity_names])
    if len(mapped) > 1 and "n/a" in mapped:
        mapped.remove("n/a")
    return ",".join(sorted(mapped))


# Build one census10 and one census12 label string per respondent.
census10 = []
census12 = []
for person in df[ID_cat_columns].values:
    names = set()
    for answer in person:
        if isinstance(answer, str):
            pieces = answer.lower().replace("and", "").split(",")
        else:
            pieces = ["na"]  # placeholder looked up in the maps for non-string cells
        names.update(piece.strip() for piece in pieces)
    all_idens = sorted(names)

    # Resolve both schemes before appending so the two lists always stay the same length.
    label10 = _census_labels(all_idens, census10_map)
    label12 = _census_labels(all_idens, census12_map)
    census10.append(label10)
    census12.append(label12)
In [192]:
# Attach the per-respondent label strings to df as two new columns.
df["census10"], df["census12"] = census10, census12
In [195]:
# Preview the source identity columns next to the two derived census columns.
df.loc[:, ID_cat_columns + ["census10", "census12"]].head()
Out[195]:
In [219]:
# Save df, including the census10/census12 columns, to the working directory.
df.to_csv(path_or_buf="SPSSI_poster_clean.csv")
In [228]:
# df2 keeps only the respondents that received an identity_categorization label.
uncategorized = df.identity_categorization.isnull()
df2 = df.drop(df.index[uncategorized])
In [ ]:
from collections import defaultdict
In [229]:
# Tally respondents in df2 per census10 label. A respondent whose census10 value
# holds several comma-separated labels contributes to each of them.
identity_counts = defaultdict(int)
census10_counts = df2.census10.value_counts()
for x, c in zip(census10_counts.index.values, census10_counts.values):
    for j in x.split(','):
        identity_counts[j.strip()] += c
# Most frequent label first. The key indexes the (label, count) pair instead of
# unpacking it in the lambda signature, which Python 3 no longer accepts.
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)
Out[229]:
In [230]:
# Tally respondents in df2 per census12 label. A respondent whose census12 value
# holds several comma-separated labels contributes to each of them.
identity_counts = defaultdict(int)
census12_counts = df2.census12.value_counts()
for x, c in zip(census12_counts.index.values, census12_counts.values):
    for j in x.split(','):
        identity_counts[j.strip()] += c
# Most frequent label first. The key indexes the (label, count) pair instead of
# unpacking it in the lambda signature, which Python 3 no longer accepts.
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)
Out[230]:
In [237]:
# Same per-label tally of census10, restricted to respondents labelled "monocultural".
df_mono = df2[df2.identity_categorization == "monocultural"]
identity_counts = defaultdict(int)
census10_counts = df_mono.census10.value_counts()
for x, c in zip(census10_counts.index.values, census10_counts.values):
    for j in x.split(','):
        identity_counts[j.strip()] += c
# Most frequent label first. The key indexes the (label, count) pair instead of
# unpacking it in the lambda signature, which Python 3 no longer accepts.
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)
Out[237]:
In [238]:
# Same per-label tally of census10, restricted to respondents labelled "bicultural".
df_bi = df2[df2.identity_categorization == "bicultural"]
identity_counts = defaultdict(int)
census10_counts = df_bi.census10.value_counts()
for x, c in zip(census10_counts.index.values, census10_counts.values):
    for j in x.split(','):
        identity_counts[j.strip()] += c
# Most frequent label first. The key indexes the (label, count) pair instead of
# unpacking it in the lambda signature, which Python 3 no longer accepts.
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)
Out[238]:
In [239]:
# Same per-label tally of census10, restricted to respondents labelled "multicultural".
df_multi = df2[df2.identity_categorization == "multicultural"]
identity_counts = defaultdict(int)
census10_counts = df_multi.census10.value_counts()
for x, c in zip(census10_counts.index.values, census10_counts.values):
    for j in x.split(','):
        identity_counts[j.strip()] += c
# Most frequent label first. The key indexes the (label, count) pair instead of
# unpacking it in the lambda signature, which Python 3 no longer accepts.
sorted(identity_counts.items(), key=lambda item: item[1], reverse=True)
Out[239]:
In [276]:
# Monocultural respondents only: first identity answer, the GEN_M_* answers, and the census10 label.
is_monocultural = df2.identity_categorization == "monocultural"
tmp = df2.loc[
    is_monocultural,
    ["IDEN_1A", "GEN_M_1A", "GEN_M_2A", "GEN_M_3A", "GEN_M_4A", "census10"],
]
In [274]:
# Hand-picked index labels to inspect.
# NOTE(review): how these rows were chosen is not recorded here.
rows = [42, 46, 75, 99, 100, 102, 116, 137, 152, 156,
        167, 168, 170, 193, 200, 203, 263, 299, 331]
# Number of comma-separated census10 labels for each selected row
# (one more than the number of commas in the string).
[label.count(",") + 1 for label in df2.loc[rows, "census10"]]
Out[274]: