In [1]:
import feather
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from pprint import pprint
import scipy as sp

# For privacy reasons, neither the raw data nor the final data frames are added
# to the GitHub repository.
# Relative path (note the trailing slash) that is prefixed to every input
# .feather file name when the tables are loaded.
FILE_PATH = '../data_science_case/'

In [2]:
# Load the six input tables from FILE_PATH. The generator is consumed by the
# tuple unpacking, so the files are read in the order listed and each frame is
# bound to the matching name on the left-hand side.
(anp_df,
 face_df,
 image_df,
 metrics_df,
 object_labels_df,
 survey_df) = (
    feather.read_dataframe(FILE_PATH + file_name)
    for file_name in (
        'anp.feather',
        'face.feather',
        'image_data.feather',
        'image_metrics.feather',
        'object_labels.feather',
        'survey.feather',
    )
)

In [3]:
# Combine the image, ANP, object-label and face tables into one wide frame keyed
# on image_id. Every step is an inner join, so an image missing from any of the
# tables is dropped from the result.
# NOTE(review): presumably each table holds several rows per image, in which case
# the result has one row per combination of matching rows -- confirm.
join_on_image = dict(how='inner', on='image_id')

image_anp_frame = image_df.merge(anp_df, **join_on_image)
im_anp_obj_frame = image_anp_frame.merge(object_labels_df, **join_on_image)
im_anp_obj_face_frame = im_anp_obj_frame.merge(face_df, **join_on_image)

In [4]:
# Remove rows that are exact duplicates across all columns, then renumber the
# rows. reset_index() is called without drop=True, so the previous index is kept
# as an extra 'index' column.
image_frame = (
    im_anp_obj_face_frame
    .drop_duplicates()
    .reset_index()
)

In [5]:
# Columns kept from the merged image table: image labels and id, face
# attributes, account statistics, and the user / face identifiers.
selfies_columns = [
    'anp_label', 'data_amz_label', 'image_id',
    'face_sunglasses', 'face_beard', 'face_mustache', 'eyeglasses',
    'user_followed_by', 'user_follows', 'user_posted_photos',
    'user_id', 'face_id', 'emotion_score',
]

# .copy() makes the column subset an independent frame, so assigning to one of
# its columns afterwards does not raise pandas' SettingWithCopyWarning.
image_frame = image_frame[selfies_columns].copy()

In [6]:
# Each dimension score (P, E, R, M, A, H) is the mean of its three survey items
# <dimension>_1 .. <dimension>_3. The means are computed column-wise instead of
# with a Python lambda per row. skipna=False makes a missing item yield NaN for
# that respondent rather than averaging only the items that are present.
for dimension in ['P', 'E', 'R', 'M', 'A', 'H']:
    item_columns = ['{}_{}'.format(dimension, item) for item in (1, 2, 3)]
    survey_df[dimension] = survey_df[item_columns].mean(axis=1, skipna=False)

# Overall PERMA score: mean of the five dimension scores P, E, R, M and A.
# H is not included.
survey_df['PERMA'] = survey_df[['P', 'E', 'R', 'M', 'A']].mean(axis=1, skipna=False)

In [7]:
# Cast the image-side user id to an explicit 64-bit integer. A plain `int` is the
# platform C long under NumPy < 2 (32-bit on Windows), which cannot hold every
# Instagram user id that appears in this data.
# assign() returns a new frame instead of writing into a column subset.
# NOTE(review): assumes survey_df['insta_user_id'] is already an integer column
# so that the join keys compare equal -- confirm.
image_frame = image_frame.assign(user_id=image_frame['user_id'].astype('int64'))

# Inner join (the pandas default): keeps only image rows whose user_id matches a
# survey respondent's insta_user_id, and attaches that respondent's answers.
both = image_frame.merge(survey_df, left_on='user_id', right_on='insta_user_id')

In [8]:
# Columns selected from the merged image + survey frame when the selfie table is
# built. This rebinds the name used earlier for the image-only column list.
selfies_columns = [
    # overall score (mean of P, E, R, M, A)
    'PERMA',
    # image labels and identifier
    'anp_label', 'data_amz_label', 'image_id',
    # face attributes
    'face_sunglasses', 'face_beard', 'face_mustache', 'eyeglasses',
    # account statistics
    'user_followed_by', 'user_follows', 'user_posted_photos',
    # survey-side user id and further per-row fields
    'insta_user_id', 'gender', 'born',
    'emotion_score',
    'education', 'income',
    'face_id',
    # individual dimension scores
    'P', 'E', 'R', 'M', 'A', 'H',
]

In [9]:
# Maps each income-bracket answer string to a single representative dollar amount.
income_dict = {'Less than $10,000': 5000}

# The nine $10,000-wide brackets from '$10,000 to $19,999' up to
# '$90,000 to $99,999' map to the lower bound plus 5000.
income_dict.update({
    '${:,} to ${:,}'.format(lower_bound, lower_bound + 9999): lower_bound + 5000
    for lower_bound in range(10000, 100000, 10000)
})

income_dict['$100,000 to $149,999'] = 125000
# The open-ended top bracket is represented by its lower bound.
income_dict['$150,000 or more'] = 150000

In [10]:
# Label values of interest, one list per label column.
# NOTE(review): presumably amz_labels refers to 'data_amz_label' and anp_labels
# to 'anp_label'; neither list is referenced in the cells shown here -- confirm.
amz_labels = [
    'Afro Hairstyle',
    'Blonde',
    'Dimples',
    'Hat',
    'Tattoo',
    'Cap',
    'Headband',
]
anp_labels = [
    'plastic_surgery',
    'dark_skin',
    'mixed_race',
]

In [11]:
# --- Select the rows to work on -------------------------------------------
# An image counts as a selfie when at least one of its rows carries the
# 'Selfie' label; all rows of such an image are kept, not just the labelled one.
selfie_ids = both.loc[both['data_amz_label'] == 'Selfie', 'image_id'].unique()
is_selfie_image = both['image_id'].isin(selfie_ids)
# Respondents who declined to state their income are left out.
discloses_income = both['income'] != 'I\'d rather not disclose this information'

# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of `both` (no SettingWithCopyWarning).
selfies = both.loc[is_selfie_image & discloses_income, selfies_columns].copy()

# --- Boolean indicator columns --------------------------------------------
# (new column, source column, source values that set the flag to True)
label_flags = [
    ('female', 'gender', ['Female']),
    ('Afro_Hairstyle', 'data_amz_label', ['Afro Hairstyle']),
    ('Hat', 'data_amz_label', ['Hat', 'Cap', 'Headband']),
    ('Tattoo', 'data_amz_label', ['Tattoo']),
    ('Blonde', 'data_amz_label', ['Blonde']),
    ('Dimples', 'data_amz_label', ['Dimples']),
    ('Plastic_surgery', 'anp_label', ['plastic_surgery']),
    ('Dark_skin', 'anp_label', ['dark_skin']),
    ('mixed_race', 'anp_label', ['mixed_race']),
    ('college', 'education', ['College graduate']),
    ('high_school', 'education', ['High school graduate']),
    ('post_graduate', 'education', ['Post graduate degree']),
]
for flag_column, source_column, accepted_values in label_flags:
    selfies[flag_column] = selfies[source_column].isin(accepted_values)

# --- Derived numeric columns ----------------------------------------------
# The 'born' column is overwritten with 2017 minus its previous value; the
# column name is kept unchanged.
selfies['born'] = 2017 - selfies['born']

# True when either the beard or the moustache flag equals True.
selfies['Facial_Hair'] = selfies['face_beard'].eq(True) | selfies['face_mustache'].eq(True)

# Replace the income bracket text by its representative amount. An answer with
# no entry in income_dict is reported as a KeyError instead of silently
# becoming NaN.
income_amounts = selfies['income'].map(income_dict)
unmapped_answers = selfies.loc[income_amounts.isnull(), 'income'].unique()
if len(unmapped_answers):
    raise KeyError('No income_dict entry for: {!r}'.format(list(unmapped_answers)))
selfies['income'] = income_amounts

# --- Clean up --------------------------------------------------------------
# gender and education are now represented by the indicator columns above.
selfies = selfies.drop(['gender', 'education'], axis=1).drop_duplicates()

In [12]:
# Remove all rows belonging to three specific accounts.
# NOTE(review): the reason for excluding these users is not recorded in this
# notebook -- document it here once confirmed.
excluded_user_ids = [703978203, 1556973431, 2248592884]
selfies = selfies[~selfies['insta_user_id'].isin(excluded_user_ids)]

# drop=True discards the old row labels instead of storing them in an extra
# 'index' column, so re-running this cell does not keep adding index columns.
selfies = selfies.reset_index(drop=True)

In [13]:
# Collapse the table to one row per image_id. Only the columns listed here are
# kept in the result.
image_level_columns = [
    'emotion_score',
    'Afro_Hairstyle', 'Hat', 'Tattoo', 'Blonde', 'Dimples',
    'Plastic_surgery', 'Dark_skin', 'mixed_race',
    'face_sunglasses', 'face_beard', 'face_mustache', 'Facial_Hair', 'eyeglasses',
    'PERMA',
    'user_followed_by', 'user_follows', 'user_posted_photos',
    'insta_user_id',
    'born', 'income', 'female',
    'college', 'high_school', 'post_graduate',
    'P', 'E', 'R', 'M', 'A', 'H',
]

# Every column is reduced with 'max' (for the boolean flags: True if any row of
# the image is True), except emotion_score, which is averaged over the rows.
image_aggregations = dict.fromkeys(image_level_columns, 'max')
image_aggregations['emotion_score'] = 'mean'

selfies = selfies.groupby('image_id', as_index=False).agg(image_aggregations)

In [14]:
# Collapse the selfie table to one row per insta_user_id. Only the columns
# listed here are kept in the result.
user_level_columns = [
    'face_sunglasses', 'Facial_Hair', 'eyeglasses',
    'PERMA',
    'Afro_Hairstyle', 'Hat', 'Tattoo', 'Blonde', 'Dimples',
    'Plastic_surgery', 'Dark_skin', 'mixed_race',
    'user_followed_by', 'user_follows', 'user_posted_photos',
    'born', 'income', 'female',
    'college', 'high_school', 'post_graduate',
    'emotion_score',
    'P', 'E', 'R', 'M', 'A', 'H',
]

# Every column is reduced with 'max' (for the boolean flags: True if any of the
# user's rows is True), except emotion_score, which is averaged over the rows.
user_aggregations = dict.fromkeys(user_level_columns, 'max')
user_aggregations['emotion_score'] = 'mean'

users = selfies.groupby('insta_user_id', as_index=False).agg(user_aggregations)

In [15]:
# Store the three account statistics as integers.
users = users.astype({
    'user_follows': int,
    'user_followed_by': int,
    'user_posted_photos': int,
})

In [16]:
# Write the selfie table to 'selfies_frame.csv' (relative to the current working
# directory) as semicolon-separated, UTF-8 encoded text. `index` is left at its
# default, so the row index is written as the first column.
selfies.to_csv('selfies_frame.csv', sep=';', encoding='utf-8')

In [17]:
# Write the per-user table to 'users_frame.csv' (relative to the current working
# directory) as semicolon-separated, UTF-8 encoded text. `index` is left at its
# default, so the row index is written as the first column.
users.to_csv('users_frame.csv', sep=';', encoding='utf-8')

In [18]:
# Write the survey table, including the added score columns, to
# 'survey_frame.csv' (relative to the current working directory) as
# semicolon-separated, UTF-8 encoded text. `index` is left at its default, so
# the row index is written as the first column.
survey_df.to_csv('survey_frame.csv', sep=';', encoding='utf-8')