In [321]:
import pandas as pd
import numpy as np
In [211]:
colnames = ['No.', 'Title', 'Fullname', 'Age', 'Organization', 'Weight', 'Height', 'Heartbeat',
'BMI', 'Systolic', 'HLA', 'Serum', 'Gluc', 'BUN', 'Creat', 'Uric', 'Chol', 'TG',
'AST', 'AL', 'ALP', 'HDL', 'LDLD', 'Ca', 'GGT', 'Pb', 'HBsAg', 'HBsAb', 'HBcAb', 'HbA1c',
'T3','T4', 'TSH', 'PSA', 'AFP', 'CEA', 'Others', 'Blood', 'WBC', 'N %', 'L %', 'M %', 'E %',
'B %', 'PLT', 'Hb', 'Hct', 'RBC Morphology', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50',
'Sample', 'pH', 'Sp.gr.', 'Protein', 'Sugar', 'WBC.1', 'RBC', 'Sq.Epi.', 'Tran.Epi.',
'Bacteria', 'Amorp.', 'Cal.ox.', 'Uric a.', 'Others.1', 'Unnamed: 65', 'Sample.1', 'Parasite&ova',
'IFOBT', 'Unnamed: 69', 'Chest', 'ผิดปกติ', 'Unnamed: 72']
In [213]:
def find_gender(s):
title = s.encode('utf-8')
if title == 'ด.ญ.':
return 'female'
elif title == 'น.ส.':
return 'female'
elif title == 'นาง':
return 'female'
elif title == 'นาย':
return 'male'
elif title == 'พระ':
return 'male'
elif title == 'ด.ช.':
return 'male'
else:
return None
In [237]:
def find_ms(df):
df = df[df['BMI']>=25.0]
df['TG-s'] = df['TG']>=150.0
df['Gluc-s'] = df['Gluc'] >= 100.0
df['Systolic-s'] = df['Systolic']>=130.0
df['Diastolic-s'] = df['Diastolic'] >=85.0
df['TG-s'] = df['TG-s'].astype(int)
df['Gluc-s'] = df['Gluc-s'].astype(int)
df['BP-s'] = df['Systolic-s'] | df['Diastolic-s']
df['BP-s'] = df['BP-s'].astype(int)
male_df = df[df['Gender'] == 'male']
female_df = df[df['Gender'] == 'female']
male_df['HDL-s'] = male_df['HDL'] < 40.0
female_df['HDL-s'] = female_df['HDL'] < 50.0
female_df['HDL-s'] = female_df['HDL-s'].astype(int)
male_df['HDL-s'] = male_df['HDL-s'].astype(int)
male_df['MS'] = male_df[['BP-s', 'Gluc-s', 'TG-s', 'HDL-s']].sum(axis=1)
female_df['MS'] = female_df[['BP-s', 'Gluc-s', 'TG-s', 'HDL-s']].sum(axis=1)
return pd.concat([male_df, female_df])
In [317]:
def replace_missing_value(s):
try:
value = float(s)
except:
return np.nan
else:
return float(s)
In [377]:
def run(df, output_file):
df.columns = colnames
df['BMI'] = df['BMI'].apply(replace_missing_value)
df['TG'] = df['TG'].apply(replace_missing_value)
df['HDL'] = df['HDL'].apply(replace_missing_value)
df['Gluc'] = df['Gluc'].apply(replace_missing_value)
df['Age'] = df['Age'].apply(replace_missing_value)
df['Gender'] = df['Title'].apply(find_gender)
df_subset_index = df[['BMI', 'Systolic', 'TG', 'HDL', 'Gluc', 'Gender']].dropna().index
df_subset = df.ix[df_subset_index]
df_subset[['Systolic', 'Diastolic']] = df_subset['Systolic'].str.split('/').apply(pd.Series)
df_subset['Systolic'] = df_subset['Systolic'].apply(replace_missing_value)
df_subset['Diastolic'] = df_subset['Diastolic'].apply(replace_missing_value)
d = find_ms(df_subset)
ms_d = d[d['MS']>1] # individuals who pass the metabolic syndrome criteria
#ms_d[['BMI', 'Gender', 'Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'MS']].to_csv(output_file)
#print ms_d[['BMI', 'Gender', 'Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'MS']]
#print ms_d.describe()[['BMI', 'Gender', 'Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'MS', 'Age']]
ms_d[['BMI','Systolic', 'Diastolic', 'Gluc', 'HDL', 'TG', 'Age']].describe().to_csv(output_file)
In [376]:
df = pd.read_excel('data/Klongmai/data1.xlsx', 0, skiprows=[0,1])
In [378]:
run(df, 'klongmai-2549.csv')
In [379]:
df = pd.read_excel('data/Klongmai/data1.xlsx', 1, skiprows=[0,1])
x = df.columns.values[-1]
df = df.drop([x], axis=1)
run(df, 'klongmai-2550.csv')
In [ ]: