In [12]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import mysql.connector
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,Integer
# 阿里云数据库
DB_ALI = create_engine('mysql+mysqlconnector://zhoub:zhoub-ses@www.easygaokao.com:3306/scores_v2', echo=False)
# 本地数据库
DB_LOCAL = create_engine('mysql+mysqlconnector://zhoub:zhoub-ses@192.168.2.224:3306/scores_v2', echo=False)
In [13]:
try:
CHANGETIME_SCHEDULE
except:
CHANGETIME_SCHEDULE=os.path.getmtime('/data1/public/02数据/2016年数据状态跟进表.xlsx')
EXCEL_SCHEDULE=pd.read_excel('/data1/public/02数据/2016年数据状态跟进表.xlsx')
print(EXCEL_SCHEDULE)
else:
if CHANGETIME_SCHEDULE != os.path.getmtime('/data1/public/02数据/2016年数据状态跟进表.xlsx'):
CHANGETIME_SCHEDULE=os.path.getmtime('/data1/public/02数据/2016年数据状态跟进表.xlsx')
EXCEL_SCHEDULE=pd.read_excel('/data1/public/02数据/2016年数据状态跟进表.xlsx')
print(EXCEL_SCHEDULE)
In [14]:
# DEFINE some constant
ROOT_DIR='/data1/public/购买2016年最详细各省份数据(for周博)/'
DISTRICT='四川'
TBL_NAME='si_chuan'
BASE_PATH=ROOT_DIR+DISTRICT+'/'
print(os.listdir(BASE_PATH))
S_FILE_PATH=BASE_PATH + DISTRICT + '2016年院校.xlsx'
M_FILE_PATH=BASE_PATH + DISTRICT + '2016年专业.xlsx'
# 导入原始学校数据
if os.path.exists(S_FILE_PATH):
try:
CHANGETIME_S_FILE_PATH
except:
CHANGETIME_S_FILE_PATH=os.path.getctime(S_FILE_PATH)
SCHOOL_EXCEL = pd.read_excel(S_FILE_PATH,converters = {'最高分': str,'平均分': str,})
else:
if CHANGETIME_S_FILE_PATH != os.path.getctime(S_FILE_PATH):
SCHOOL_EXCEL = pd.read_excel(S_FILE_PATH,converters = {'最高分': str,'平均分': str,})
else:
print('院校文件不存在!!')
# 导入原始专业数据
if os.path.exists(M_FILE_PATH):
try:
CHANGETIME_M_FILE_PATH
except:
CHANGETIME_M_FILE_PATH=os.path.getctime(M_FILE_PATH)
MAJOR_EXCEL = pd.read_excel(M_FILE_PATH,converters = {'最高分': str,'平均分': str,})
else:
if CHANGETIME_M_FILE_PATH != os.path.getctime(M_FILE_PATH):
MAJOR_EXCEL = pd.read_excel(M_FILE_PATH,converters = {'最高分': str,'平均分': str,})
else:
print('专业数据文件不存在!!')
In [15]:
# 查看各columns信息
for clm in SCHOOL_EXCEL.columns:
print(clm)
if clm not in ['院校名称', '最高分', '最低分位次', '最高分位次', '最低分', '平均分','平均分位次', '录取人数']:
print(SCHOOL_EXCEL[clm].unique())
# pass
In [16]:
# 查看各columns信息
for clm in MAJOR_EXCEL.columns:
print(clm)
if clm not in ['院校名称','院校代码', '专业名称','专业代码', '最高分', '最低分位次','平均分位次','平均位次', '最高分位次', '最低分', '平均分', '录取人数', '录取人', '院校代码','专业代码','该专业总录取数']:
print(MAJOR_EXCEL[clm].unique())
# pass
In [17]:
# some utility
import re
def distinguish(x):
## for nei_meng_gu
if re.search('普通理科', str(x)):
return '理科'
if re.search('普通文科', str(x)):
return '文科'
if re.search('理', str(x)):
return '理科'
if re.search('文', str(x)):
return '文科'
return x
def distinguish_round(x):
## for nei_meng_gu
if re.search('本科二批C(三本)', str(x)):
return '本科三批'
if re.search('本科第三批', str(x)):
return '本科三批'
if re.search('本科第二批', str(x)):
return '本科二批'
if re.search('本科第一批', str(x)):
return '本科一批'
if re.search('本科提前', str(x)):
return '本科二批'
if re.search('本科提前批', str(x)):
return '提前批次'
if re.search('本科二批', str(x)):
return '本科二批'
if re.search('本科一批', str(x)):
return '本科一批'
if re.search('本科三批', str(x)):
return '本科三批'
if re.search('本一批', str(x)):
return '本科一批'
if re.search('本二批', str(x)):
return '本科二批'
if re.search('本三批', str(x)):
return '本科三批'
if re.search('专科', str(x)):
return '专科'
return str(x)
In [18]:
# 读取数据到DataFrame
SCHOOL_DF = DataFrame({
'category':SCHOOL_EXCEL['科类'].apply(distinguish),
'school_name':SCHOOL_EXCEL['院校名称'],
'district':DISTRICT,
'round':SCHOOL_EXCEL['批次'].apply(distinguish_round),
'highest_score':SCHOOL_EXCEL['最高分'],
'highest_score_rank':SCHOOL_EXCEL['最高分位次'],
# 'highest_score_rank':'--',
'average_score':SCHOOL_EXCEL['平均分'],
'average_score_rank':SCHOOL_EXCEL['平均分位次'],
# 'average_score_rank':'--',
'lowest_score':SCHOOL_EXCEL['最低分'],
'lowest_score_rank':SCHOOL_EXCEL['最低分位次'],
# 'lowest_score_rank':'--',
'year':2016,
'tot_rc':SCHOOL_EXCEL['录取人数']
})
# add to local db for every province
# SCHOOL_DF.to_sql(name='school_cutoff_scores_'+TBL_NAME+'_2016', con=DB_LOCAL, if_exists = 'replace', index=False)
In [9]:
SCHOOL_DF=SCHOOL_DF[SCHOOL_DF['year']==2016]
# add to local db for every province
SCHOOL_DF.to_sql(name='school_cutoff_scores_'+TBL_NAME+'_2016', con=DB_LOCAL, if_exists = 'replace', index=False)
sql='ALTER TABLE `school_cutoff_scores_'+TBL_NAME+'_2016` ADD `id` INT NOT NULL AUTO_INCREMENT FIRST, ADD PRIMARY KEY (`id`);'
with DB_LOCAL.begin() as conn: # TRANSACTION
conn.execute(sql)
sql='ALTER TABLE `school_cutoff_scores_'+TBL_NAME+'_2016` CHANGE `category` `category` TEXT CHARACTER SET utf8 COLLATE utf8_unicode_ci NULL DEFAULT NULL AFTER `id`, CHANGE `district` `district` TEXT CHARACTER SET utf8 COLLATE utf8_unicode_ci NULL DEFAULT NULL AFTER `id`, CHANGE `school_name` `school_name` TEXT CHARACTER SET utf8 COLLATE utf8_unicode_ci NULL DEFAULT NULL AFTER `id`;'
with DB_LOCAL.begin() as conn: # TRANSACTION
conn.execute(sql)
In [19]:
# 只取2016年数据
SCHOOL_DF=SCHOOL_DF[SCHOOL_DF['year']==2016]
SCHOOL_DF.to_sql(name='school_cutoff_scores_2016', con=DB_ALI, if_exists = 'append', index=False)
SCHOOL_DF.to_sql(name='school_cutoff_scores_2016', con=DB_LOCAL, if_exists = 'append', index=False)
# add to ali yun db for 按分推荐大学
if 'tot_rc' in SCHOOL_DF.columns:
SCHOOL_DF=SCHOOL_DF.drop('tot_rc',1)
SCHOOL_DF.to_sql(name='school_cutoff_scores_2013_2015', chunksize=1000, con=DB_ALI, if_exists = 'append', index=False)
# add to ali yun db for 大学详情|大学分数线
if 'tot_rc' in SCHOOL_DF.columns:
SCHOOL_DF=SCHOOL_DF.drop('tot_rc',1)
SCHOOL_DF.to_sql(name='school_cutoff_scores_2008_2015', chunksize=1000, con=DB_ALI, if_exists = 'append', index=False)
In [15]:
# add to 大学分数线
# if 'tot_rc' in SCHOOL_DF.columns:
# SCHOOL_DF=SCHOOL_DF.drop('tot_rc',1)
# SCHOOL_DF.to_sql(name='school_cutoff_scores_2016', con=DB_ALI, if_exists = 'append', index=False)
In [10]:
MAJOR_DF = DataFrame({
'category':MAJOR_EXCEL['科类'].apply(distinguish),
'school_name':MAJOR_EXCEL['院校名称'],
'district':DISTRICT,
'round':MAJOR_EXCEL['批次名称'].apply(distinguish_round),
'highest_score':MAJOR_EXCEL['最高分'],
'highest_score_rank':MAJOR_EXCEL['最高分位次'],
# 'highest_score_rank':'--',
'average_score':MAJOR_EXCEL['平均分'],
'average_score_rank':MAJOR_EXCEL['平均分位次'],
# 'average_score_rank':'--',
'lowest_score':MAJOR_EXCEL['最低分'],
'lowest_score_rank':MAJOR_EXCEL['最低分位次'],
# 'lowest_score_rank':'--',
'year':2016,
'discipline':MAJOR_EXCEL['专业名称'],
'tot_rc':MAJOR_EXCEL['录取人'],
})
In [11]:
MAJOR_DF=MAJOR_DF[MAJOR_DF['year']==2016]
# add to local db for every province
MAJOR_DF.to_sql(name='school_cutoff_scores_'+TBL_NAME+'_M_2016', chunksize=1000,con=DB_LOCAL, if_exists = 'replace', index=False)
sql='ALTER TABLE `school_cutoff_scores_'+TBL_NAME+'_M_2016` ADD `id` INT NOT NULL AUTO_INCREMENT FIRST, ADD PRIMARY KEY (`id`);'
with DB_LOCAL.begin() as conn: # TRANSACTION
conn.execute(sql)
sql='ALTER TABLE `school_cutoff_scores_'+TBL_NAME+'_M_2016` CHANGE `category` `category` TEXT CHARACTER SET utf8 COLLATE utf8_unicode_ci NULL DEFAULT NULL AFTER `id`, CHANGE `district` `district` TEXT CHARACTER SET utf8 COLLATE utf8_unicode_ci NULL DEFAULT NULL AFTER `id`, CHANGE `school_name` `school_name` TEXT CHARACTER SET utf8 COLLATE utf8_unicode_ci NULL DEFAULT NULL AFTER `id`;'
with DB_LOCAL.begin() as conn: # TRANSACTION
conn.execute(sql)
In [19]:
# 只取2016年数据
MAJOR_DF=MAJOR_DF[MAJOR_DF.year==2016]
MAJOR_DF[:3]
Out[19]:
In [20]:
if 'tot_rc' in MAJOR_DF.columns:
MAJOR_DF=MAJOR_DF.drop('tot_rc',1)
if 'lowest_score_rank' in MAJOR_DF.columns:
MAJOR_DF=MAJOR_DF.drop('lowest_score_rank',1)
if 'highest_score_rank' in MAJOR_DF.columns:
MAJOR_DF=MAJOR_DF.drop('highest_score_rank',1)
if 'average_score_rank' in MAJOR_DF.columns:
MAJOR_DF=MAJOR_DF.drop('average_score_rank',1)
MAJOR_DF.to_sql(name='school_cutoff_scores_by_y_m_src2_uniq2', chunksize=500, con=DB_ALI, if_exists = 'append', index=False)
In [ ]: