In [321]:
import pandas as pd
import numpy as np

In [211]:
colnames = ['No.', 'Title', 'Fullname', 'Age', 'Organization', 'Weight', 'Height', 'Heartbeat',
            'BMI', 'Systolic', 'HLA', 'Serum', 'Gluc', 'BUN', 'Creat', 'Uric', 'Chol', 'TG',
            'AST', 'AL', 'ALP', 'HDL', 'LDLD', 'Ca', 'GGT', 'Pb', 'HBsAg', 'HBsAb', 'HBcAb', 'HbA1c',
            'T3','T4', 'TSH', 'PSA', 'AFP', 'CEA', 'Others', 'Blood', 'WBC', 'N %', 'L %', 'M %', 'E %',
            'B %', 'PLT', 'Hb', 'Hct', 'RBC Morphology', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50',
            'Sample', 'pH', 'Sp.gr.', 'Protein', 'Sugar', 'WBC.1', 'RBC', 'Sq.Epi.', 'Tran.Epi.',
            'Bacteria', 'Amorp.', 'Cal.ox.', 'Uric a.', 'Others.1', 'Unnamed: 65', 'Sample.1', 'Parasite&ova',
            'IFOBT', 'Unnamed: 69', 'Chest', 'ผิดปกติ', 'Unnamed: 72']

In [213]:
def find_gender(s):
    title = s.encode('utf-8')
    if title == 'ด.ญ.':
        return 'female'
    elif title == 'น.ส.':
        return 'female'
    elif title == 'นาง':
        return 'female'
    elif title == 'นาย':
        return 'male'
    elif title == 'พระ':
        return 'male'
    elif title == 'ด.ช.':
        return 'male'
    else:
        return None

In [237]:
def find_ms(df):
    df = df[df['BMI']>=25.0]

    df['TG-s'] = df['TG']>=150.0
    df['Gluc-s'] = df['Gluc'] >= 100.0
    df['Systolic-s'] = df['Systolic']>=130.0
    df['Diastolic-s'] = df['Diastolic'] >=85.0

    df['TG-s'] = df['TG-s'].astype(int)
    df['Gluc-s'] = df['Gluc-s'].astype(int)
    df['BP-s'] = df['Systolic-s'] | df['Diastolic-s']
    df['BP-s'] = df['BP-s'].astype(int)
    
    male_df = df[df['Gender'] == 'male']
    female_df = df[df['Gender'] == 'female']
    
    male_df['HDL-s'] = male_df['HDL'] < 40.0
    female_df['HDL-s'] = female_df['HDL'] < 50.0
    female_df['HDL-s'] = female_df['HDL-s'].astype(int)
    male_df['HDL-s'] = male_df['HDL-s'].astype(int)

    male_df['MS'] = male_df[['BP-s', 'Gluc-s', 'TG-s', 'HDL-s']].sum(axis=1)
    female_df['MS'] = female_df[['BP-s', 'Gluc-s', 'TG-s', 'HDL-s']].sum(axis=1)
    return pd.concat([male_df, female_df])

In [317]:
def replace_missing_value(s):
    try:
        value = float(s)
    except:
        return np.nan
    else:
        return float(s)

In [377]:
def run(df, output_file):
    df.columns = colnames
    df['BMI'] = df['BMI'].apply(replace_missing_value)
    df['TG'] = df['TG'].apply(replace_missing_value)
    df['HDL'] = df['HDL'].apply(replace_missing_value)
    df['Gluc'] = df['Gluc'].apply(replace_missing_value)
    df['Age'] = df['Age'].apply(replace_missing_value)

    df['Gender'] = df['Title'].apply(find_gender)
    df_subset_index = df[['BMI', 'Systolic', 'TG', 'HDL', 'Gluc', 'Gender']].dropna().index
    df_subset = df.ix[df_subset_index]
    df_subset[['Systolic', 'Diastolic']] = df_subset['Systolic'].str.split('/').apply(pd.Series)
    df_subset['Systolic'] = df_subset['Systolic'].apply(replace_missing_value)
    df_subset['Diastolic'] = df_subset['Diastolic'].apply(replace_missing_value)
    d = find_ms(df_subset)
    ms_d = d[d['MS']>1]  # individuals who pass the metabolic syndrome criteria
    #ms_d[['BMI', 'Gender', 'Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'MS']].to_csv(output_file)
    #print ms_d[['BMI', 'Gender', 'Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'MS']]
    #print ms_d.describe()[['BMI', 'Gender', 'Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'MS', 'Age']]
    ms_d[['BMI','Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'Age']].describe().to_csv(output_file)

In [376]:
df = pd.read_excel('data/Klongmai/data1.xlsx', 0, skiprows=[0,1])

In [378]:
run(df, 'klongmai-2549.csv')


63
22 41

In [379]:
df = pd.read_excel('data/Klongmai/data1.xlsx', 1, skiprows=[0,1])
x = df.columns.values[-1]
df = df.drop([x], axis=1)
run(df, 'klongmai-2550.csv')


66
22 44

In [ ]: