In [375]:
import pandas as pd, numpy as np, json
import kendo_romania
Read data
In [376]:
members=pd.read_excel('rawdata/Evidenta membrilor.xlsm',header=[1])
In [377]:
members.head()
Out[377]:
In [378]:
members.columns
Out[378]:
In [379]:
members=members[[231, 'Nr. EKF',
'Club', 'Unnamed: 3',
'Numele', 'Prenumele',
'Gen', 'Data naşterii',
'1 kyu','practică',
'1 dan', '2 dan',
'3 dan', '4 dan',
'5 dan', '6 dan',
'7 dan', '8 dan']]
In [380]:
members.head()
Out[380]:
In [381]:
def get_transfer(name,tf,verbose=False):
if tf==[]:
return tf
else:
to_blank=[' ','(',')','Transfer:','?','FDS','/']
to_replace={'Hungary':'HUN'}
to_year={'Gușu Rebeca':'2010'}
def get_tf_clubs(z):
for t in range(len(to_blank)):
z=z.replace(to_blank[t],'')
for t in to_replace:
z=z.replace(t,to_replace[t])
if ('=>') in z:
from_to=z.find('=>')
to_return={'from':z[from_to-3:from_to],'to':z[from_to+2:from_to+5],'time':z[-4:]}
if verbose:
to_return['orig']=z
else:
print('error with transfer',z)
to_return=z
##check years
#infer year from wrong note order
if '20' not in to_return['time']:
if '20' in z:
to_return['time']=z[z.find('20'):z.find('20')+4]
#if still not inferred, then manual fix
if '20' not in to_return['time']:
to_return['time']=to_year[name]
to_return['time']=int(to_return['time'])
return to_return
transfers=str(tf).split('\n')
tfr=[]
for i in transfers:
if not i in ('','nan'):
tfr.append(get_tf_clubs(i))
return sorted(tfr, key=lambda k: k['time'])
In [382]:
def get_club_by_year(d,club,year):
if d==[]:
return [club]
else:
years={}
transfer_years=[mingrade]+[i['time'] for i in d]+[maxyear]
transfer_clubs=[d[0]['from']]+[i['to'] for i in d]
for i in range(1,len(transfer_years)):
for y in range(transfer_years[i-1],transfer_years[i]+1):
if y not in years:years[y]=[]
years[y].append(transfer_clubs[i-1])
return years[year]
In [350]:
def add_to_club(data,club,year,d):
if club not in data: data[club]={}
if year not in data[club]:data[club][year]=[]
data[club][year].append(d)
return data
In [370]:
pretty_clubs={'ARA':'Arashi', 'BDS':'Budoshin', 'BSD':'Bushido', 'BTK':'Bushi Tokukai', 'BG':'Bulgaria',
'CDO':'Coroan de Oțel', 'CRK':'Clubul Român de Kendo', 'HAR':'Hargita',
'ICH':'Ichimon', 'IKA':'Ikada','ISH':'Ishhin', 'IT':'Italy','HU':'Hungary','HUN':'Hungary',
'KAS':'Kashi', 'KNS':'Kenshin', 'KYO':'Kyobukan', 'MC':'Macedonia',
'SR':'Serbia', 'MN':'Montenegro', 'MOL':'Moldova', 'MUS':'Museido',
'RON':'Ronin-do', 'SAK':'Sakura', 'SAM':'Sam-sho','SAN':'Sankon', 'SBK':'Sobukan',
'SON':'Sonkei', 'SR':'Serbia', 'TAI':'Taiken', 'TR':'Turkey','ACKIJ':'ACKIJ',
'YUK':'Yu-kai','KAY':'Kaybukan'}
club_replacer={'':'KYO'}
In [371]:
data={}
for i in members.T.iteritems():
grades=i[1][['1 kyu','1 dan','2 dan','3 dan','4 dan','5 dan','6 dan','7 dan','8 dan']].dropna()
grades0=i[1][['1 dan','2 dan','3 dan','4 dan','5 dan','6 dan','7 dan','8 dan']].dropna()
df=pd.DataFrame(grades0)
df.columns=['dan']
df=df.reset_index().set_index('dan').sort_index()
#dummy={j:str(grades[j])[:10] for j in grades.index}
dummy={}
grades=pd.to_datetime(grades.astype(str))
if len(grades)>0:
mingrade=grades.min().year
maxgrade=grades.max().year
else:
mingrade=np.nan
maxgrade=np.nan
dummy['name']=i[1]['Numele']+' '+i[1]['Prenumele']
dummy['birth']=str(i[1]['Data naşterii'])[:10]
dummy['gen']=i[1]['Gen']
dummy['ekf']=i[1]['Nr. EKF']
dummy['active']=i[1][231]
club=i[1]['Club']
dummy['transfer']=get_transfer(dummy['name'],i[1]['Unnamed: 3'])
if np.isnan(mingrade):
mingrade=2016 #default starting year
maxyear=2019 #default max year
for year in range(mingrade,maxyear):
#get year from exams
dummy['dan']=len(df[:str
(year)])
#get club from transfers
clubs=get_club_by_year(dummy['transfer'],club,year)
clubs=clubs[:1] #remove this step to double count. this limits to first club in transfer years
for j in range(len(clubs)):
iclub=clubs[j]
if iclub in club_replacer: iclub=club_replacer[iclub]
dummy['club']=iclub
dummy['pretty_club']=pretty_clubs[iclub]
dummy['age']=year-1-pd.to_datetime(dummy['birth']).year
data=add_to_club(data,iclub,year,dummy.copy())
In [372]:
all_data=[]
for club in data:
for year in data[club]:
df=pd.DataFrame(data[club][year])
df['year']=year
df['club']=club
df=df.drop('transfer',axis=1)
all_data.append(df)
all_data=pd.concat(all_data)
In [374]:
all_data.to_csv('data/members_base.csv')
In [ ]: