In [1]:
import pandas as pd
import numpy as np
import glob
import re
import matplotlib.pyplot as plt
import os
In [2]:
%matplotlib inline
In [3]:
def to_float(x):
'''returns float value of x or NaN'''
if x is np.nan:
return x
try:
x = float(x)
except (ValueError, UnicodeEncodeError, TypeError), e:
print(x), e
return np.nan
else:
return x
def replace_value(x, pattern):
# there've got to be a better way
try:
missing = re.findall(pattern, str(x))
except UnicodeEncodeError:
return np.nan # usually because of Thai words for "not tested"
else:
if missing:
return np.nan
else:
return x
def replace_missing_value(dataframe, colnames):
for pattern in ['\s+\-', '\-\s+', '\s+-\s+', '-']:
dataframe[colnames] = dataframe[colnames].applymap(lambda x: replace_value(x, pattern))
def convert_to_float(dataframe, colnames):
dataframe[colnames] = dataframe[colnames].applymap(to_float)
In [4]:
years = []
for yr in ['2551', '2552', '2553', '2554', '2555']:
months = []
for f in glob.glob('data/community/%s/*xls' % yr):
#print 'year: %s, month: %s' % (yr, f)
try:
df = pd.read_excel(f)
except:
print 'ERROR! year: %s, month: %s' % (yr, f)
else:
m = os.path.splitext(os.path.basename(f))[0]
df['month'] = m
df['year'] = yr
months.append(df)
years.append(months)
#years.append(pd.concat(months, ignore_index=True))
In [5]:
new_col_names = [unicode(s) for s in ['ServiceNo', 'Location', 'HN',
'ServiceDate', 'Title', 'Age', 'Gender',
'Weight', 'Height', 'HeartRate', 'OldSystolic',
'HeartLungAbdomen', 'Smoke', 'LongTermSmoker', 'ExSmoker',
'Drink', 'LongTermDrinker', 'Disease',
'Stress', 'Medication', 'Exercise', '35YrMenstruation',
'MenopausePeriod', 'CauseOfMenopause', 'MenopauseMedication']]
new_col_names += months[0].columns[25:]
for months in years:
for df in months:
#df.rename(columns={'Systolic': 'OldSystolic'}, inplace=True)
df.columns = new_col_names
In [6]:
for y, months in enumerate(years):
for m, df in enumerate(months):
#print(y, m)
replace_missing_value(df, ['Glucose', 'Triglyceride', 'HDL-C',
'OldSystolic', 'Weight', 'Height'])
#some values have an extra forward slash, which is needed to be removed.
df[['Systolic', 'Diastolic']] = df['OldSystolic'].str.replace('//', '/').str.split('/').apply(pd.Series)
convert_to_float(df, ['Glucose', 'Triglyceride', 'HDL-C',
'Systolic', 'Diastolic', 'Weight', 'Height'])
In [7]:
for months in years:
for df in months:
df['BMI'] = df['Weight']/(df['Height']/100.0)**2.0
In [8]:
new_years = []
for months in years:
df = pd.concat(months, ignore_index=True)
new_years.append(df)
In [9]:
bmis = []
gluc = []
trig = []
hdlc = []
systolic = []
diastolic = []
for yr in new_years:
bmis.append(getattr(yr, 'BMI').tolist())
gluc.append(getattr(yr, 'Glucose').tolist())
trig.append(getattr(yr, 'Triglyceride').tolist())
hdlc.append(getattr(yr, 'HDL-C').tolist())
systolic.append(getattr(yr, 'Systolic').tolist())
diastolic.append(getattr(yr, 'Diastolic').tolist())
In [10]:
def plot(data, title, ylim, outfile):
fig = plt.figure(figsize=(8,6))
plt.boxplot(data)
plt.title(title, size=16)
plt.ylim(ylim)
plt.xticks(np.arange(len(new_years))+1, ['2551', '2552', '2553', '2554', '2555'])
plt.savefig(outfile)
In [11]:
plot(bmis, 'BMI', (0,60), 'bmis.png')
In [12]:
plot(trig, 'Triglyceride', (0,800), 'trig.png')
In [13]:
plot(gluc, 'Glucose', (0,500), 'gluc.png')
In [14]:
plot(hdlc, 'HDL-C', (0,200), 'hdlc.png')
In [15]:
plot(systolic, 'Systolic', (0,300), 'systolic.png')
In [16]:
plot(diastolic, 'Diastolic', (0,200), 'diastolic.png')
In [17]:
klongmai = set()
for y, months in enumerate(years):
for m, df in enumerate(months):
for e in df['Location']:
if isinstance(e, unicode):
if 'คลอง' in e.encode('utf-8'):
klongmai.add((y,m))
In [18]:
klongmai
Out[18]:
In [19]:
year_list = ['2551', '2552', '2553', '2554', '2555']
concat_data = {}
for i in range(len(years)):
concat_data[year_list[i]] = pd.concat(years[i])
In [20]:
num_row_months = []
for yr in years:
num_row = []
for m in yr:
num_row.append(len(m))
num_row_months.append(num_row)
In [21]:
import datetime
fig = plt.figure(figsize=(8,6))
xticks = []
for i in range(len(num_row_months)):
plt.plot(range(1,13), num_row_months[i], label=year_list[i], marker='o')
for i in range(1,13):
month = datetime.date(1900, i, 1).strftime('%B')
xticks.append(month)
plt.legend(loc='upper left')
plt.title('Number of Individuals Per Months', size=16)
plt.xticks(range(1,13), xticks, rotation=70, size=10)
plt.savefig('num-per-month.png')
In [22]:
num_per_year = []
for yr in year_list:
df = concat_data[yr]
num_per_year.append(len(df))
In [23]:
fig = plt.figure(figsize=(8,6))
plt.bar(range(len(num_per_year)), num_per_year, color='grey')
plt.xticks(np.arange(len(num_per_year))+0.5, year_list, size=16)
plt.title('Number of Individuals Per Year', size=16)
plt.savefig('num-per-year.png')
In [24]:
from collections import Counter
from __future__ import unicode_literals
In [25]:
locations = []
for y, months in enumerate(years):
for m, df in enumerate(months):
for loc in df['Location']:
try:
locations.append(loc.encode('utf-8'))
#if 'ชุมชน' in loc.encode('utf-8'):
#print y, m, '-'*45
except AttributeError, err:
pass
#print err, e, type(e)
locations = Counter(locations)
In [26]:
for e in years[0][0]['Location'].head():
print e.encode('utf-8')
In [32]:
n=20
sorted_locations = sorted(locations, key=lambda x: locations[x], reverse=True)
#fig = plt.figure(figsize=(8,6))
#plt.title('Top Institutions by The Number of Individuals', size=15)
#plt.bar(range(len(sorted_locations[:n])),
# [locations[loc] for loc in sorted_locations[:10]],
# color='grey')
#plt.xticks(np.arange(n)+0.5, range(n), size=16)
#plt.savefig('top-institutions.png')
for n, loc in enumerate(sorted_locations[:10]):
print n, loc, locations[loc]
In [28]:
from collections import defaultdict
loc_by_year = defaultdict(set) # locations by year
for y, months in enumerate(years):
for m, df in enumerate(months):
for loc in df['Location']:
try:
loc_by_year[loc.encode('utf-8')].add(year_list[y])
except AttributeError, err:
pass
In [29]:
sorted_loc_by_year = sorted(loc_by_year, key=lambda x: len(loc_by_year[x]), reverse=True)
num_per_loc = []
for n, k in enumerate(sorted_loc_by_year):
num_per_loc.append(len(loc_by_year[k]))
num_per_loc = Counter(num_per_loc)
total_loc = sum([num_per_loc[i] for i in num_per_loc])
In [30]:
fig = plt.figure(figsize=(8,6))
plt.bar(range(1,6), [num_per_loc[i]/float(total_loc)*100.0 for i in num_per_loc], color='grey')
plt.xticks(np.arange(1,6)+0.4, range(1,6), size=16)
plt.ylabel('Percentage of Institutions', size=15)
plt.xlabel('Number of Years', size=15)
plt.title('Number of Institutions Categorized\nby the Number of Years', size=16)
plt.savefig('num-loc-years.png')
In [40]:
for k in loc_by_year:
if len(loc_by_year[k]) >= 4:
print k, 'year:', len(loc_by_year[k]), 'number of cases:', locations[k]
In [31]:
for yr in concat_data:
concat_data[yr].to_excel(yr + '.xls')
In [ ]:
pwd
In [ ]: