In [77]:
engine = create_engine('mysql+mysqlconnector://zhoub:zhoub-ses@easygaokao.com:3306/scores_v2', echo=False)
In [20]:
from pandas import DataFrame
In [31]:
all_province = pd.read_excel('../2016zhiyuanguanjia6.18.xlsx',converters = {'最低位次':str})
In [32]:
all_province['招生批次'].unique()
Out[32]:
In [33]:
all_province.info()
In [34]:
an_hui=all_province[all_province['省份']=='安徽']
In [35]:
an_hui[:3]
Out[35]:
In [36]:
SCHOOL_EXCEL=an_hui
In [46]:
major_df=DataFrame({
'category':SCHOOL_EXCEL['招生批次'].apply(distinguish),
'school_name':SCHOOL_EXCEL['院校名称'],
'district':SCHOOL_EXCEL['省份'],
'round':SCHOOL_EXCEL['招生批次'].apply(distinguish_round),
'highest_score':SCHOOL_EXCEL['最高分'],
# 'highest_score_rank':SCHOOL_EXCEL['最高分位次'],
'highest_score_rank':None,
'average_score':SCHOOL_EXCEL['平均分'],
# 'average_score_rank':SCHOOL_EXCEL['平均分位次'],
'average_score_rank':None,
'lowest_score':SCHOOL_EXCEL['最低分'],
'lowest_score_rank':SCHOOL_EXCEL['最低位次'],
# 'lowest_score_rank':'--',
'year':SCHOOL_EXCEL['年份'],
'tot_rc':SCHOOL_EXCEL['录取数'],
'discipline':SCHOOL_EXCEL['专业名称']
})
In [ ]:
In [47]:
major_df=major_df.replace('-',0)
In [48]:
major_df=major_df.replace('--',0)
In [54]:
major_df['lowest_score']
Out[54]:
In [58]:
ndict=dict()
df=major_df
for k,v in df.iterrows():
key=v['school_name']+v['round']+v['category']
if key in ndict:
ndict[key]['average_score']=\
(v['tot_rc']*v['average_score']+
ndict[key]['tot_rc']
*ndict[key]['average_score'])/(
v['tot_rc']+ndict[key]['tot_rc']
)
ndict[key]['tot_rc']+=v['tot_rc']
if int(ndict[key]['lowest_score']) > int(v['lowest_score']) and int(v['lowest_score'])>0:
ndict[key]['lowest_score'] = v['lowest_score']
ndict[key]['lowest_score_rank'] = v['lowest_score_rank']
if ndict[key]['highest_score'] < v['highest_score']:
ndict[key]['highest_score']=v['highest_score']
else:
ndict[key]=dict()
ndict[key]['highest_score']=int(v['highest_score'])
ndict[key]['highest_score_rank']=None
ndict[key]['lowest_score'] = int(v['lowest_score'])
ndict[key]['lowest_score_rank'] = int(v['lowest_score_rank'])
ndict[key]['tot_rc']=int(v['tot_rc'])
ndict[key]['average_score']=int(v['average_score'])
ndict[key]['average_score_rank']=None
ndict[key]['round']=v['round']
ndict[key]['year']=v['year']
ndict[key]['district']=v['district']
ndict[key]['school_name']=v['school_name']
ndict[key]['category']=v['category']
In [72]:
df = DataFrame.from_dict(ndict,orient='index')
In [78]:
if 'tot_rc' in df.columns:
df=df.drop('tot_rc',1)
df.to_sql(name='school_cutoff_scores_2013_2015', chunksize=1000, con=engine, if_exists = 'append', index=False)
# add to ali yun db for 大学详情|大学分数线
if 'tot_rc' in df.columns:
df=df.drop('tot_rc',1)
df.to_sql(name='school_cutoff_scores_2008_2015', chunksize=1000, con=engine, if_exists = 'append', index=False)
In [75]:
df['average_score']
Out[75]:
In [ ]:
In [73]:
df['average_score']=df['average_score'].map('{:,.1f}'.format)
In [ ]:
In [ ]:
In [14]:
import re
def distinguish(x):
## for nei_meng_gu
if re.search('普通理科', str(x)):
return '理科'
if re.search('普通文科', str(x)):
return '文科'
if re.search('理', str(x)):
return '理科'
if re.search('文', str(x)):
return '文科'
return x
def distinguish_round(x):
## for nei_meng_gu
if re.search('本科二批C(三本)', str(x)):
return '本科三批'
if re.search('本科第三批', str(x)):
return '本科三批'
if re.search('本科第二批', str(x)):
return '本科二批'
if re.search('本科第一批', str(x)):
return '本科一批'
if re.search('本科提前', str(x)):
return '本科二批'
if re.search('本科提前批', str(x)):
return '提前批次'
if re.search('本科二批', str(x)):
return '本科二批'
if re.search('本科一批', str(x)):
return '本科一批'
if re.search('本科三批', str(x)):
return '本科三批'
if re.search('本一批', str(x)):
return '本科一批'
if re.search('本二批', str(x)):
return '本科二批'
if re.search('本三批', str(x)):
return '本科三批'
if re.search('专科', str(x)):
return '专科'
return str(x)