In [51]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from scripts import utils



In [55]:

    
reload(utils)









    Out[55]:





<module 'scripts.utils' from 'scripts/utils.py'>



In [2]:

    
%matplotlib inline

Load data



In [3]:

    
years=[]
for f in glob.glob('255*.xls'):
    df = pd.read_excel(f, 0)
    years.append(df)



In [4]:

    
def replace_missing_value(s):
    try:
        value = float(s)
    except:
        return np.nan
    else:
        return float(s)



In [ ]:

    
kpi_scores = pd.DataFrame()
new_df = []
for j in range(len(years)):
    drop_percnt = []
    for n, loc in enumerate(years[j].Location.unique()):
        df = years[j][years[j]['Location']==loc]
        subset_index = df[['BMI', 'Systolic', 'Triglyceride',
                              'Diastolic', 'HDL-C', 'Glucose', 'Gender',
                              'Creatinine', 'CBC_Hemoglobin', 'Total Cholesterol',
                              'LDL-C (Calculated)']].dropna().index
        df_dna = df.ix[subset_index]  # dna -> dropped NA

        if len(df_dna) == 0:
            print loc, len(df_dna)
            continue

        drop_percnt.append((len(df)-len(df_dna))/float(len(df))*100)
        df_dna = utils.find_ms(df_dna)
        utils.set_kpi(df_dna)
        score = utils.calculate_kpi_score(df_dna)
        if score is not None:
            kpi_scores = kpi_scores.append(score, ignore_index=True)
            kpi_scores['total_score'] = kpi_scores[['BMI', 'BP', 'Glucose','Creatinine',
                                                    'Total Cholesterol-a', 'Total Cholesterol-b','HDL-C',
                                                    'LDL-C (Calculated)', 'CBC_Hemoglobin']].sum(axis=1)
        new_df.append(df_dna)
    try:
        print 'Year %d, average dropped data=%.2f' % (df_dna.year.tolist()[0], np.mean(drop_percnt))
    except IndexError:
        print loc, 'IndexError'
print kpi_scores.head()
#new_df = pd.concat(new_df, ignore_index=True)









    



บริษัท วิทยาศรม จำกัด 0
บริษัท อาคเนย์อุตสาหกรรมบรรจุภัณฑ์ จำกัด 0
Year 2551, average droped data=19.99
บริษัท วิทยาศรม จำกัด



In [ ]:

    
kpi_scores[kpi_scores['Year']==2551].sort('total_score', ascending=False)



In [11]:

    
kpi_scores.to_excel('kpi.xls', header=True, index=False, sheet_name='kpi')



In [ ]:

    
kpi_scores[kpi_scores['Location'].apply(lambda x: x.encode('utf-8'))=='หอสมุดและคลังความรู้มหาวิทยาลัยมหิดล']



In [65]:

    
reload(utils)









    Out[65]:





<module 'scripts.utils' from 'scripts/utils.py'>



In [78]:

    
kpi_scores = pd.DataFrame()
df = years[0][years[0]['Location'].apply(lambda x: x.encode('utf-8'))=='หอสมุดและคลังความรู้มหาวิทยาลัยมหิดล']
print('all individuals', len(df))
df = utils.find_ms(df)
print('after find_ms', len(df))
#print df[['BMI', 'Systolic', 'Diastolic', 'Glucose', 'HDL-C', 'Triglyceride', 'MS']].head()
utils.set_kpi(df)
score = utils.calculate_kpi_score(df)
if score is not None:
    kpi_scores = kpi_scores.append(score, ignore_index=True)
    kpi_scores['total_score'] = kpi_scores[['BMI', 'BP', 'Glucose','Creatinine',
                                            'Total Cholesterol-a', 'Total Cholesterol-b','HDL-C',
                                            'LDL-C (Calculated)', 'CBC_Hemoglobin']].sum(axis=1)
print df[['Age', 'Gender']].describe()









    



('all individuals', 74)
('after find_ms', 63)
             Age     Gender
count  63.000000  63.000000
mean   48.920635   1.793651
std     5.842838   0.407935
min    36.000000   1.000000
25%    44.000000   2.000000
50%    49.000000   2.000000
75%    54.000000   2.000000
max    59.000000   2.000000



In [264]:

    
for location in kpi_scores['Location'].unique():
    for loc in kpi_scores[kpi_scores['Location']==location]:
        for year in [2551,2552,2553,2554,2555]:
            plt.plot(kpi_scores[kpi_scores['Year']==year]['total_score'].tolist(),
                     marker='o')

plt.xticks(range(len(kpi_scores[kpi_scores['Year']==2551]['Location'])),
           ['Asean Institute',
            'Library',
            'Institute of Population\nand Social Research',
            'Institute of Molecular\nBiosciences'],
           rotation=80,
           )
plt.ylim(0,10)
plt.legend()



In [13]:

    
kpi_scores[kpi_scores['Year']==2551].head()









    Out[13]:






  
    
      
      BMI
      BP
      CBC_Hemoglobin
      Creatinine
      Glucose
      HDL-C
      LDL-C (Calculated)
      Location
      MS
      Total Cholesterol-a
      Total Cholesterol-b
      Year
      total_score
    
  
  
    
      0
       0
       0
       1
       1
       1
       1
       0
       โรงเรียนนวมินทราชินูทิศ บดินทรเดชา
       0
       0
       1
       2551
       5
    
    
      1
       0
       0
       1
       1
       0
       0
       0
         คณะพยาบาลศาสตร์ มหาวิทยาลัยมหิดล
       0
       0
       1
       2551
       3
    
    
      2
       0
       0
       1
       1
       0
       0
       0
         สถาบันพัฒนาสุขภาพอาเซียน ม.มหิดล
       0
       0
       1
       2551
       3
    
    
      3
       0
       0
       1
       1
       0
       0
       0
          คณะเภสัชศาสตร์ มหาวิทยาลัยมหิดล
       0
       0
       1
       2551
       3
    
    
      4
       0
       0
       1
       1
       1
       1
       0
                       มหาวิทยาลัยศิลปากร
       0
       0
       1
       2551
       5



In [ ]:

	CBC_Hemoglobin	Creatinine	Glucose	HDL-C	Location	Total Cholesterol-b	Year	total_score
0	1	1	1	1	โรงเรียนนวมินทราชินูทิศ บดินทรเดชา	1	2551	5
1	1	1	0	0	คณะพยาบาลศาสตร์ มหาวิทยาลัยมหิดล	1	2551	3
2	1	1	0	0	สถาบันพัฒนาสุขภาพอาเซียน ม.มหิดล	1	2551	3
3	1	1	0	0	คณะเภสัชศาสตร์ มหาวิทยาลัยมหิดล	1	2551	3
4	1	1	1	1	มหาวิทยาลัยศิลปากร	1	2551	5