notebook.community

Edit and run



In [1]:

    
import pandas as pd
from pandas import DataFrame
import numpy as np



In [2]:

    
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,Integer
engine = create_engine('mysql+mysqlconnector://dongyh:dongyh-ses@192.168.2.224:3306/scores_dongyh', echo=False)



In [3]:

    
District_PinYin_Map =  {
        '上海':'shang_hai','云南':'yun_nan','内蒙古':'nei_meng_gu','北京':'bei_jing',
        '吉林':'ji_lin','四川':'si_chuan','宁夏':'ning_xia','安徽':'an_hui',
        '山东':'shan_dong','山西':'shan_xi','广东':'guang_dong','广西':'guang_xi','新疆':'xin_jiang',
        '江苏':'jiang_su', '江西':'jiang_xi','河北':'he_bei','河南':'he_nan',
        '海南':'hai_nan',  '湖北':'hu_bei','湖南':'hu_nan','甘肃':'gan_su','福建':'fu_jian',
        '贵州':'gui_zhou','辽宁':'liao_ning','重庆':'chong_qing','陕西':'shann_xi',
        '青海':'qing_hai', '黑龙江':'hei_long_jiang'}

import re
def strim_round(text):
    if text==None:
        return 'No_round'
    match = re.search('(?<=\()(\S+)(?=\))',str(text))
    if match:
        return match.group(0)
    else:
        return text



In [7]:

    
# replace 已有的数据表
sql = "SELECT * FROM tian_jin WHERE 1;"
df = pd.read_sql(sql,con=engine)
df['round_trim']=df['round'].apply(strim_round)
ndict=dict()
for k,v in df.iterrows():
    key=v['school_name']+str(v['year'])+v['round_trim']+v['category']
    if key in ndict:
        ndict[key]['average_score']=\
        (v['recruit_num_of_rank']*v['score']+
         ndict[key]['tot_rc']
         *ndict[key]['average_score'])/(
            v['recruit_num_of_rank']+ndict[key]['tot_rc']
        )
        ndict[key]['tot_rc']+=v['recruit_num_of_rank']
        
        if ndict[key]['lowest_score'] > v['score']:
            ndict[key]['lowest_score'] = v['score']
            ndict[key]['lowest_score_rank'] = v['rank']
        if ndict[key]['highest_score'] < v['score']:
            ndict[key]['highest_score']=v['score']
            ndict[key]['highest_score_rank']=v['rank']
    else:
        ndict[key]=dict()
        ndict[key]['highest_score']=v['score']
        ndict[key]['highest_score_rank']=v['rank']
        ndict[key]['lowest_score'] = v['score']
        ndict[key]['lowest_score_rank'] = v['rank']
        ndict[key]['tot_rc']=v['recruit_num_of_rank']
        ndict[key]['average_score']=v['score']
        ndict[key]['round']=v['round']
        ndict[key]['round_trim']=v['round_trim']
        ndict[key]['degree']=v['degree']
        ndict[key]['year']=v['year']
        ndict[key]['district']=v['district']
        ndict[key]['school_name']=v['school_name']
        ndict[key]['category']=v['category']
df = DataFrame.from_dict(ndict,orient='index')
df.to_sql(name='college_analytic', con=engine, if_exists = 'replace', index=False,
           dtype={'round':VARCHAR(255),
                  'round_trim':VARCHAR(255),
                  'degree':VARCHAR(255),
                  'highest_score':Integer,
                  'school_name':VARCHAR(255),
                  'district':VARCHAR(255),
                  'category':VARCHAR(255),
                 })



In [8]:

    
for index, item in District_PinYin_Map.items():
    sql = "SELECT * FROM "+ item + " WHERE 1;"
    df = pd.read_sql(sql,con=engine)
    df['round_trim']=df['round'].apply(strim_round)
    ndict=dict()
    for k,v in df.iterrows():
        key=v['school_name']+str(v['year'])+v['round_trim']+v['category']
        if key in ndict:
            ndict[key]['average_score']=\
            (v['recruit_num_of_rank']*v['score']+
             ndict[key]['tot_rc']
             *ndict[key]['average_score'])/(
                v['recruit_num_of_rank']+ndict[key]['tot_rc']
            )
            ndict[key]['tot_rc']+=v['recruit_num_of_rank']

            if ndict[key]['lowest_score'] > v['score']:
                ndict[key]['lowest_score'] = v['score']
                ndict[key]['lowest_score_rank'] = v['rank']
            if ndict[key]['highest_score'] < v['score']:
                ndict[key]['highest_score']=v['score']
                ndict[key]['highest_score_rank']=v['rank']
        else:
            ndict[key]=dict()
            ndict[key]['highest_score']=v['score']
            ndict[key]['highest_score_rank']=v['rank']
            ndict[key]['lowest_score'] = v['score']
            ndict[key]['lowest_score_rank'] = v['rank']
            ndict[key]['tot_rc']=v['recruit_num_of_rank']
            ndict[key]['average_score']=v['score']
            ndict[key]['round']=v['round']
            ndict[key]['round_trim']=v['round_trim']
            ndict[key]['degree']=v['degree']
            ndict[key]['year']=v['year']
            ndict[key]['district']=v['district']
            ndict[key]['school_name']=v['school_name']
            ndict[key]['category']=v['category']
    df = DataFrame.from_dict(ndict,orient='index')
    df.to_sql(name='college_analytic', con=engine, if_exists = 'append', index=False, chunksize=15000)



In [ ]: