In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from scripts import utils

In [55]:
reload(utils)


Out[55]:
<module 'scripts.utils' from 'scripts/utils.py'>

In [2]:
%matplotlib inline

Load data


In [3]:
years=[]
for f in glob.glob('255*.xls'):
    df = pd.read_excel(f, 0)
    years.append(df)

In [4]:
def replace_missing_value(s):
    try:
        value = float(s)
    except:
        return np.nan
    else:
        return float(s)

In [ ]:
kpi_scores = pd.DataFrame()
new_df = []
for j in range(len(years)):
    drop_percnt = []
    for n, loc in enumerate(years[j].Location.unique()):
        df = years[j][years[j]['Location']==loc]
        subset_index = df[['BMI', 'Systolic', 'Triglyceride',
                              'Diastolic', 'HDL-C', 'Glucose', 'Gender',
                              'Creatinine', 'CBC_Hemoglobin', 'Total Cholesterol',
                              'LDL-C (Calculated)']].dropna().index
        df_dna = df.ix[subset_index]  # dna -> dropped NA

        if len(df_dna) == 0:
            print loc, len(df_dna)
            continue

        drop_percnt.append((len(df)-len(df_dna))/float(len(df))*100)
        df_dna = utils.find_ms(df_dna)
        utils.set_kpi(df_dna)
        score = utils.calculate_kpi_score(df_dna)
        if score is not None:
            kpi_scores = kpi_scores.append(score, ignore_index=True)
            kpi_scores['total_score'] = kpi_scores[['BMI', 'BP', 'Glucose','Creatinine',
                                                    'Total Cholesterol-a', 'Total Cholesterol-b','HDL-C',
                                                    'LDL-C (Calculated)', 'CBC_Hemoglobin']].sum(axis=1)
        new_df.append(df_dna)
    try:
        print 'Year %d, average dropped data=%.2f' % (df_dna.year.tolist()[0], np.mean(drop_percnt))
    except IndexError:
        print loc, 'IndexError'
print kpi_scores.head()
#new_df = pd.concat(new_df, ignore_index=True)


บริษัท วิทยาศรม จำกัด 0
บริษัท อาคเนย์อุตสาหกรรมบรรจุภัณฑ์ จำกัด 0
Year 2551, average droped data=19.99
บริษัท วิทยาศรม จำกัด

In [ ]:
kpi_scores[kpi_scores['Year']==2551].sort('total_score', ascending=False)

In [11]:
kpi_scores.to_excel('kpi.xls', header=True, index=False, sheet_name='kpi')

In [ ]:
kpi_scores[kpi_scores['Location'].apply(lambda x: x.encode('utf-8'))=='หอสมุดและคลังความรู้มหาวิทยาลัยมหิดล']

In [65]:
reload(utils)


Out[65]:
<module 'scripts.utils' from 'scripts/utils.py'>

In [78]:
kpi_scores = pd.DataFrame()
df = years[0][years[0]['Location'].apply(lambda x: x.encode('utf-8'))=='หอสมุดและคลังความรู้มหาวิทยาลัยมหิดล']
print('all individuals', len(df))
df = utils.find_ms(df)
print('after find_ms', len(df))
#print df[['BMI', 'Systolic', 'Diastolic', 'Glucose', 'HDL-C', 'Triglyceride', 'MS']].head()
utils.set_kpi(df)
score = utils.calculate_kpi_score(df)
if score is not None:
    kpi_scores = kpi_scores.append(score, ignore_index=True)
    kpi_scores['total_score'] = kpi_scores[['BMI', 'BP', 'Glucose','Creatinine',
                                            'Total Cholesterol-a', 'Total Cholesterol-b','HDL-C',
                                            'LDL-C (Calculated)', 'CBC_Hemoglobin']].sum(axis=1)
print df[['Age', 'Gender']].describe()


('all individuals', 74)
('after find_ms', 63)
             Age     Gender
count  63.000000  63.000000
mean   48.920635   1.793651
std     5.842838   0.407935
min    36.000000   1.000000
25%    44.000000   2.000000
50%    49.000000   2.000000
75%    54.000000   2.000000
max    59.000000   2.000000

In [264]:
for location in kpi_scores['Location'].unique():
    for loc in kpi_scores[kpi_scores['Location']==location]:
        for year in [2551,2552,2553,2554,2555]:
            plt.plot(kpi_scores[kpi_scores['Year']==year]['total_score'].tolist(),
                     marker='o')

plt.xticks(range(len(kpi_scores[kpi_scores['Year']==2551]['Location'])),
           ['Asean Institute',
            'Library',
            'Institute of Population\nand Social Research',
            'Institute of Molecular\nBiosciences'],
           rotation=80,
           )
plt.ylim(0,10)
plt.legend()



In [13]:
kpi_scores[kpi_scores['Year']==2551].head()


Out[13]:
BMI BP CBC_Hemoglobin Creatinine Glucose HDL-C LDL-C (Calculated) Location MS Total Cholesterol-a Total Cholesterol-b Year total_score
0 0 0 1 1 1 1 0 โรงเรียนนวมินทราชินูทิศ บดินทรเดชา 0 0 1 2551 5
1 0 0 1 1 0 0 0 คณะพยาบาลศาสตร์ มหาวิทยาลัยมหิดล 0 0 1 2551 3
2 0 0 1 1 0 0 0 สถาบันพัฒนาสุขภาพอาเซียน ม.มหิดล 0 0 1 2551 3
3 0 0 1 1 0 0 0 คณะเภสัชศาสตร์ มหาวิทยาลัยมหิดล 0 0 1 2551 3
4 0 0 1 1 1 1 0 มหาวิทยาลัยศิลปากร 0 0 1 2551 5

In [ ]: