In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
In [2]:
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,Integer
engine = create_engine('mysql+mysqlconnector://dongyh:dongyh-ses@192.168.2.224:3306/scores_dongyh', echo=False)
In [3]:
District_PinYin_Map = {
'上海':'shang_hai','云南':'yun_nan','内蒙古':'nei_meng_gu','北京':'bei_jing',
'吉林':'ji_lin','四川':'si_chuan','宁夏':'ning_xia','安徽':'an_hui',
'山东':'shan_dong','山西':'shan_xi','广东':'guang_dong','广西':'guang_xi','新疆':'xin_jiang',
'江苏':'jiang_su', '江西':'jiang_xi','河北':'he_bei','河南':'he_nan',
'海南':'hai_nan', '湖北':'hu_bei','湖南':'hu_nan','甘肃':'gan_su','福建':'fu_jian',
'贵州':'gui_zhou','辽宁':'liao_ning','重庆':'chong_qing','陕西':'shann_xi',
'青海':'qing_hai', '黑龙江':'hei_long_jiang'}
import re
def strim_round(text):
if text==None:
return 'No_round'
match = re.search('(?<=\()(\S+)(?=\))',str(text))
if match:
return match.group(0)
else:
return text
In [7]:
# replace 已有的数据表
sql = "SELECT * FROM tian_jin WHERE 1;"
df = pd.read_sql(sql,con=engine)
df['round_trim']=df['round'].apply(strim_round)
ndict=dict()
for k,v in df.iterrows():
key=v['school_name']+str(v['year'])+v['round_trim']+v['category']
if key in ndict:
ndict[key]['average_score']=\
(v['recruit_num_of_rank']*v['score']+
ndict[key]['tot_rc']
*ndict[key]['average_score'])/(
v['recruit_num_of_rank']+ndict[key]['tot_rc']
)
ndict[key]['tot_rc']+=v['recruit_num_of_rank']
if ndict[key]['lowest_score'] > v['score']:
ndict[key]['lowest_score'] = v['score']
ndict[key]['lowest_score_rank'] = v['rank']
if ndict[key]['highest_score'] < v['score']:
ndict[key]['highest_score']=v['score']
ndict[key]['highest_score_rank']=v['rank']
else:
ndict[key]=dict()
ndict[key]['highest_score']=v['score']
ndict[key]['highest_score_rank']=v['rank']
ndict[key]['lowest_score'] = v['score']
ndict[key]['lowest_score_rank'] = v['rank']
ndict[key]['tot_rc']=v['recruit_num_of_rank']
ndict[key]['average_score']=v['score']
ndict[key]['round']=v['round']
ndict[key]['round_trim']=v['round_trim']
ndict[key]['degree']=v['degree']
ndict[key]['year']=v['year']
ndict[key]['district']=v['district']
ndict[key]['school_name']=v['school_name']
ndict[key]['category']=v['category']
df = DataFrame.from_dict(ndict,orient='index')
df.to_sql(name='college_analytic', con=engine, if_exists = 'replace', index=False,
dtype={'round':VARCHAR(255),
'round_trim':VARCHAR(255),
'degree':VARCHAR(255),
'highest_score':Integer,
'school_name':VARCHAR(255),
'district':VARCHAR(255),
'category':VARCHAR(255),
})
In [8]:
for index, item in District_PinYin_Map.items():
sql = "SELECT * FROM "+ item + " WHERE 1;"
df = pd.read_sql(sql,con=engine)
df['round_trim']=df['round'].apply(strim_round)
ndict=dict()
for k,v in df.iterrows():
key=v['school_name']+str(v['year'])+v['round_trim']+v['category']
if key in ndict:
ndict[key]['average_score']=\
(v['recruit_num_of_rank']*v['score']+
ndict[key]['tot_rc']
*ndict[key]['average_score'])/(
v['recruit_num_of_rank']+ndict[key]['tot_rc']
)
ndict[key]['tot_rc']+=v['recruit_num_of_rank']
if ndict[key]['lowest_score'] > v['score']:
ndict[key]['lowest_score'] = v['score']
ndict[key]['lowest_score_rank'] = v['rank']
if ndict[key]['highest_score'] < v['score']:
ndict[key]['highest_score']=v['score']
ndict[key]['highest_score_rank']=v['rank']
else:
ndict[key]=dict()
ndict[key]['highest_score']=v['score']
ndict[key]['highest_score_rank']=v['rank']
ndict[key]['lowest_score'] = v['score']
ndict[key]['lowest_score_rank'] = v['rank']
ndict[key]['tot_rc']=v['recruit_num_of_rank']
ndict[key]['average_score']=v['score']
ndict[key]['round']=v['round']
ndict[key]['round_trim']=v['round_trim']
ndict[key]['degree']=v['degree']
ndict[key]['year']=v['year']
ndict[key]['district']=v['district']
ndict[key]['school_name']=v['school_name']
ndict[key]['category']=v['category']
df = DataFrame.from_dict(ndict,orient='index')
df.to_sql(name='college_analytic', con=engine, if_exists = 'append', index=False, chunksize=15000)
In [ ]: