In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from scripts import utils
In [55]:
reload(utils)
Out[55]:
In [2]:
%matplotlib inline
In [3]:
years=[]
for f in glob.glob('255*.xls'):
df = pd.read_excel(f, 0)
years.append(df)
In [4]:
def replace_missing_value(s):
try:
value = float(s)
except:
return np.nan
else:
return float(s)
In [ ]:
kpi_scores = pd.DataFrame()
new_df = []
for j in range(len(years)):
drop_percnt = []
for n, loc in enumerate(years[j].Location.unique()):
df = years[j][years[j]['Location']==loc]
subset_index = df[['BMI', 'Systolic', 'Triglyceride',
'Diastolic', 'HDL-C', 'Glucose', 'Gender',
'Creatinine', 'CBC_Hemoglobin', 'Total Cholesterol',
'LDL-C (Calculated)']].dropna().index
df_dna = df.ix[subset_index] # dna -> dropped NA
if len(df_dna) == 0:
print loc, len(df_dna)
continue
drop_percnt.append((len(df)-len(df_dna))/float(len(df))*100)
df_dna = utils.find_ms(df_dna)
utils.set_kpi(df_dna)
score = utils.calculate_kpi_score(df_dna)
if score is not None:
kpi_scores = kpi_scores.append(score, ignore_index=True)
kpi_scores['total_score'] = kpi_scores[['BMI', 'BP', 'Glucose','Creatinine',
'Total Cholesterol-a', 'Total Cholesterol-b','HDL-C',
'LDL-C (Calculated)', 'CBC_Hemoglobin']].sum(axis=1)
new_df.append(df_dna)
try:
print 'Year %d, average dropped data=%.2f' % (df_dna.year.tolist()[0], np.mean(drop_percnt))
except IndexError:
print loc, 'IndexError'
print kpi_scores.head()
#new_df = pd.concat(new_df, ignore_index=True)
In [ ]:
kpi_scores[kpi_scores['Year']==2551].sort('total_score', ascending=False)
In [11]:
kpi_scores.to_excel('kpi.xls', header=True, index=False, sheet_name='kpi')
In [ ]:
kpi_scores[kpi_scores['Location'].apply(lambda x: x.encode('utf-8'))=='หอสมุดและคลังความรู้มหาวิทยาลัยมหิดล']
In [65]:
reload(utils)
Out[65]:
In [78]:
kpi_scores = pd.DataFrame()
df = years[0][years[0]['Location'].apply(lambda x: x.encode('utf-8'))=='หอสมุดและคลังความรู้มหาวิทยาลัยมหิดล']
print('all individuals', len(df))
df = utils.find_ms(df)
print('after find_ms', len(df))
#print df[['BMI', 'Systolic', 'Diastolic', 'Glucose', 'HDL-C', 'Triglyceride', 'MS']].head()
utils.set_kpi(df)
score = utils.calculate_kpi_score(df)
if score is not None:
kpi_scores = kpi_scores.append(score, ignore_index=True)
kpi_scores['total_score'] = kpi_scores[['BMI', 'BP', 'Glucose','Creatinine',
'Total Cholesterol-a', 'Total Cholesterol-b','HDL-C',
'LDL-C (Calculated)', 'CBC_Hemoglobin']].sum(axis=1)
print df[['Age', 'Gender']].describe()
In [264]:
for location in kpi_scores['Location'].unique():
for loc in kpi_scores[kpi_scores['Location']==location]:
for year in [2551,2552,2553,2554,2555]:
plt.plot(kpi_scores[kpi_scores['Year']==year]['total_score'].tolist(),
marker='o')
plt.xticks(range(len(kpi_scores[kpi_scores['Year']==2551]['Location'])),
['Asean Institute',
'Library',
'Institute of Population\nand Social Research',
'Institute of Molecular\nBiosciences'],
rotation=80,
)
plt.ylim(0,10)
plt.legend()
In [13]:
kpi_scores[kpi_scores['Year']==2551].head()
Out[13]:
In [ ]: