In [1]:
import feather
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# `pandas.tools.plotting` was deprecated in pandas 0.20 and removed in later releases;
# the plotting helpers now live in `pandas.plotting`. Fall back to the old location so
# the notebook still runs on the pandas version it was originally written against.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
from pprint import pprint
import scipy as sp
# Directory holding the raw feather files. For privacy reasons neither the raw data
# nor the resulting data frames are committed to the GitHub repository.
# The trailing slash matters: file names are appended by plain string concatenation.
FILE_PATH = '../data_science_case/'
In [2]:
def _read_table(file_name):
    """Read one feather file located under FILE_PATH into a DataFrame."""
    return feather.read_dataframe(FILE_PATH + file_name)


# One DataFrame per raw feather file.
# NOTE(review): metrics_df is loaded here but not referenced by the cells shown below.
anp_df = _read_table('anp.feather')
face_df = _read_table('face.feather')
image_df = _read_table('image_data.feather')
metrics_df = _read_table('image_metrics.feather')
object_labels_df = _read_table('object_labels.feather')
survey_df = _read_table('survey.feather')
In [3]:
# Inner-join the per-image tables on `image_id`, one table at a time, so that the final
# frame combines image, ANP, object-label and face columns. Each intermediate result
# keeps its own name so the individual join steps can be inspected.
image_anp_frame = image_df.merge(anp_df, how='inner', on='image_id')
im_anp_obj_frame = image_anp_frame.merge(object_labels_df, how='inner', on='image_id')
im_anp_obj_face_frame = im_anp_obj_frame.merge(face_df, how='inner', on='image_id')
In [4]:
# Remove fully duplicated rows (keeping the first occurrence) and renumber the rows.
# `drop=False` keeps the previous index as an extra `index` column, as before.
image_frame = (
    im_anp_obj_face_frame
    .drop_duplicates(keep='first')
    .reset_index(drop=False)
)
In [5]:
# Columns of the merged image frame that are kept for the rest of the notebook.
# NOTE: `selfies_columns` is redefined further down once the survey data is joined.
selfies_columns = (
    ['anp_label', 'data_amz_label', 'image_id']
    + ['face_sunglasses', 'face_beard', 'face_mustache', 'eyeglasses']
    + ['user_followed_by', 'user_follows', 'user_posted_photos', 'user_id']
    + ['face_id', 'emotion_score']
)
# Restrict the frame to those columns (this also discards the `index` column).
image_frame = image_frame[selfies_columns]
In [6]:
# Each dimension score (P, E, R, M, A, H) is the mean of its three survey items
# `<dim>_1`, `<dim>_2`, `<dim>_3`. The means are computed column-wise instead of with a
# row-by-row `apply`, which is much faster and also works on an empty frame.
# `skipna=False` keeps the behaviour of `np.mean`: a missing item yields a NaN score.
for dimension in ['P', 'E', 'R', 'M', 'A', 'H']:
    item_columns = ['{}_{}'.format(dimension, item) for item in (1, 2, 3)]
    survey_df[dimension] = survey_df[item_columns].mean(axis=1, skipna=False)

# The overall PERMA score averages the five P/E/R/M/A scores; H is not part of it.
survey_df['PERMA'] = survey_df[['P', 'E', 'R', 'M', 'A']].mean(axis=1, skipna=False)
In [7]:
# Instagram user ids can exceed the 32-bit integer range (e.g. 2248592884), and a plain
# `int` cast is only 32 bits wide on some platforms / NumPy versions, so cast to int64
# explicitly. `assign` builds a new frame instead of writing into the column subset
# taken earlier, which avoids pandas' SettingWithCopy warning.
image_frame = image_frame.assign(user_id=image_frame['user_id'].astype(np.int64))

# Attach the survey answers to every image row of the matching Instagram user.
# pd.merge defaults to an inner join, so rows without a survey match are dropped.
both = pd.merge(left=image_frame, right=survey_df, left_on='user_id', right_on='insta_user_id')
In [8]:
# Columns of the image/survey join that feed the selfie feature table.
# This replaces the earlier, image-only definition of `selfies_columns`.
selfies_columns = (
    ['PERMA', 'anp_label', 'data_amz_label', 'image_id']
    + ['face_sunglasses', 'face_beard', 'face_mustache', 'eyeglasses']
    + ['user_followed_by', 'user_follows', 'user_posted_photos', 'insta_user_id']
    + ['gender', 'born', 'emotion_score', 'education', 'income', 'face_id']
    + list('PERMAH')  # the six individual dimension scores: P, E, R, M, A, H
)
In [9]:
# Maps each income answer of the survey to a single representative amount.
# Bounded brackets map to their midpoint; the open-ended top bracket maps to its
# lower bound.
income_dict = dict([
    ('Less than $10,000', 5000),
    ('$10,000 to $19,999', 15000),
    ('$20,000 to $29,999', 25000),
    ('$30,000 to $39,999', 35000),
    ('$40,000 to $49,999', 45000),
    ('$50,000 to $59,999', 55000),
    ('$60,000 to $69,999', 65000),
    ('$70,000 to $79,999', 75000),
    ('$80,000 to $89,999', 85000),
    ('$90,000 to $99,999', 95000),
    ('$100,000 to $149,999', 125000),
    ('$150,000 or more', 150000),
])
In [10]:
# Values of `data_amz_label` that are of interest for the selfie features.
amz_labels = [
    'Afro Hairstyle',
    'Blonde',
    'Dimples',
    'Hat',
    'Tattoo',
    'Cap',
    'Headband',
]
# Values of `anp_label` that are of interest for the selfie features.
anp_labels = [
    'plastic_surgery',
    'dark_skin',
    'mixed_race',
]
In [11]:
# Build the row-level selfie feature table from the image/survey join.
# NOTE(review): presumably the year the survey was taken; it turns birth year into age.
REFERENCE_YEAR = 2017

# An image counts as a selfie when at least one of its label rows says 'Selfie';
# all rows (i.e. all labels) of such an image are kept.
selfie_ids = both.loc[both['data_amz_label'] == 'Selfie', 'image_id'].unique()
selfies = both.loc[both['image_id'].isin(selfie_ids), selfies_columns]
# Respondents who did not disclose their income are excluded. The explicit copy makes
# `selfies` an independent frame, so adding columns below does not trigger pandas'
# SettingWithCopy warning.
selfies = selfies[selfies['income'] != 'I\'d rather not disclose this information'].copy()

# Boolean indicator columns. A comparison already yields booleans, so no np.where needed.
selfies['female'] = selfies['gender'] == 'Female'
selfies['Afro_Hairstyle'] = selfies['data_amz_label'] == 'Afro Hairstyle'
# Hats, caps and headbands are pooled into a single indicator.
selfies['Hat'] = selfies['data_amz_label'].isin(['Hat', 'Cap', 'Headband'])
selfies['Tattoo'] = selfies['data_amz_label'] == 'Tattoo'
selfies['Blonde'] = selfies['data_amz_label'] == 'Blonde'
selfies['Dimples'] = selfies['data_amz_label'] == 'Dimples'
selfies['Plastic_surgery'] = selfies['anp_label'] == 'plastic_surgery'
selfies['Dark_skin'] = selfies['anp_label'] == 'dark_skin'
selfies['mixed_race'] = selfies['anp_label'] == 'mixed_race'
selfies['college'] = selfies['education'] == 'College graduate'
selfies['high_school'] = selfies['education'] == 'High school graduate'
selfies['post_graduate'] = selfies['education'] == 'Post graduate degree'

# After this line `born` holds REFERENCE_YEAR minus the birth year (an age), not a year.
# Assumes `born` has a numeric dtype -- TODO confirm.
selfies['born'] = REFERENCE_YEAR - selfies['born']
# Facial hair = beard or mustache; anything other than True counts as "no".
selfies['Facial_Hair'] = selfies['face_beard'].eq(True) | selfies['face_mustache'].eq(True)

# Translate the income bracket label into its numeric amount. Going through object
# dtype keeps the result numeric even if the label column is categorical.
income_labels = selfies['income'].astype(object)
income_values = income_labels.map(income_dict)
unmapped = income_labels[income_values.isnull()]
if len(unmapped) > 0:
    # Fail loudly, like a plain dictionary lookup would, instead of keeping NaN incomes.
    raise KeyError('Unmapped income label(s): {}'.format(list(unmapped.unique())))
selfies['income'] = income_values

# The raw gender/education columns are now encoded as indicators; drop them and then
# remove the rows that became identical.
selfies = selfies.drop(['gender', 'education'], axis=1).drop_duplicates()
In [12]:
# Instagram users left out of the analysis.
# NOTE(review): the reason for excluding these three ids is not recorded here.
_excluded_user_ids = [703978203, 1556973431, 2248592884]
_is_excluded = selfies['insta_user_id'].isin(_excluded_user_ids)
# Keep everyone else and renumber the rows (the old index becomes an `index` column).
selfies = selfies[~_is_excluded].reset_index()
In [13]:
# Collapse the label-level rows to one row per image. `emotion_score` is averaged over
# the rows of an image; every other column takes its maximum, so a boolean indicator
# is True when any row of the image has it set.
_image_columns = [
    'emotion_score',
    'Afro_Hairstyle', 'Hat', 'Tattoo', 'Blonde', 'Dimples',
    'Plastic_surgery', 'Dark_skin', 'mixed_race',
    'face_sunglasses', 'face_beard', 'face_mustache', 'Facial_Hair', 'eyeglasses',
    'PERMA',
    'user_followed_by', 'user_follows', 'user_posted_photos', 'insta_user_id',
    'born', 'income', 'female',
    'college', 'high_school', 'post_graduate',
    'P', 'E', 'R', 'M', 'A', 'H',
]
_image_aggregations = {
    column: ('mean' if column == 'emotion_score' else 'max')
    for column in _image_columns
}
selfies = selfies.groupby('image_id', as_index=False).agg(_image_aggregations)
In [14]:
# Collapse the per-image rows to one row per Instagram user. `emotion_score` is the
# mean over the user's images; every other column takes its maximum, so a boolean
# indicator is True when any of the user's images has it set.
_user_columns = [
    'face_sunglasses', 'Facial_Hair', 'eyeglasses',
    'PERMA',
    'Afro_Hairstyle', 'Hat', 'Tattoo', 'Blonde', 'Dimples',
    'Plastic_surgery', 'Dark_skin', 'mixed_race',
    'user_followed_by', 'user_follows', 'user_posted_photos',
    'born', 'income', 'female',
    'college', 'high_school', 'post_graduate',
    'emotion_score',
    'P', 'E', 'R', 'M', 'A', 'H',
]
_user_aggregations = {
    column: ('mean' if column == 'emotion_score' else 'max')
    for column in _user_columns
}
users = selfies.groupby('insta_user_id', as_index=False).agg(_user_aggregations)
In [15]:
# Store the three account counters as integers.
for count_column in ('user_follows', 'user_followed_by', 'user_posted_photos'):
    users[count_column] = users[count_column].astype(int)
In [16]:
# Export the per-image table as a semicolon-separated UTF-8 file in the working directory.
selfies.to_csv(
    path_or_buf='selfies_frame.csv',
    sep=';',
    encoding='utf-8',
)
In [17]:
# Export the per-user table as a semicolon-separated UTF-8 file in the working directory.
users.to_csv(
    path_or_buf='users_frame.csv',
    sep=';',
    encoding='utf-8',
)
In [18]:
# Export the survey table, including the computed P/E/R/M/A/H and PERMA columns, as a
# semicolon-separated UTF-8 file in the working directory.
survey_df.to_csv(
    path_or_buf='survey_frame.csv',
    sep=';',
    encoding='utf-8',
)