In [1]:
# Set up the required environment: imports and logging configuration
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import logging,os,sys,time
logging.basicConfig(level=logging.WARNING, format='levelname:%(levelname)s filename: %(filename)s ,outputNumber: [%(lineno)d] thread: %(threadName)s output msg: %(message)s - %(asctime)s', datefmt='[%d/%b/%Y %H:%M:%S]')
logging.warning("gogogo开工干活")
from selenium import webdriver
import selenium.common.exceptions
def start(
    chromedriver="/Users/laiyao/Documents/GitHub/MyApp/PythonApplication1/爬虫练习/chromedriver/chromedriver-2",
    user_data_dir="/Users/laiyao/Library/Application Support/Google/Chrome/",
):
    """Launch a Chrome session through chromedriver and return the driver.

    Args:
        chromedriver: Filesystem path of the chromedriver executable.
        user_data_dir: Chrome user data directory passed via
            ``--user-data-dir`` so the session uses that profile. Per the
            original author's note, any Chrome instance already running must
            be closed first (on macOS the profile lives under
            ``~/Library/Application Support/Google/Chrome/Default``).

    Returns:
        The ``webdriver.Chrome`` instance, after it has loaded
        ``about:version``.
    """
    # Function-scope import keeps this block usable on its own.
    from selenium import webdriver

    # NOTE(review): the driver path is also handed to the constructor below;
    # this environment variable is presumably a leftover - confirm before
    # removing it.
    os.environ["webdriver.chrome.driver"] = chromedriver

    option = webdriver.ChromeOptions()
    option.add_argument('--user-data-dir=' + user_data_dir)
    # Override the browser's User-Agent string.
    option.add_argument('--user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36')
    # One process per site.
    option.add_argument('--process-per-site')
    # Simplified Chinese UI language.
    option.add_argument('--lang=zh-CN')

    # Start the browser.
    driver = webdriver.Chrome(chromedriver, options=option)
    driver.get('about:version')
    return driver
def openNewTab(driver, url='https://www.sogou.com'):
    """Open *url* in a new browser tab by running ``window.open`` in the page.

    Args:
        driver: Object exposing ``execute_script(js)``.
        url: Address to open in the new tab.
    """
    # json.dumps produces a quoted, fully escaped string literal, so quotes or
    # backslashes inside the URL cannot terminate the literal and alter the
    # script (the previous raw concatenation could).
    js = 'window.open(' + json.dumps(url) + ');'
    driver.execute_script(js)
def quit(driver):
    """Close the browser session held by *driver*, then log the outcome.

    Note: this name shadows the interactive ``quit`` builtin within this
    module. Always returns ``None``.
    """
    driver.quit()
    outcome = ['QUIT success']
    logging.info(outcome)
In [2]:
# Initialise the data storage.
# index=[0] creates the table with one all-NaN placeholder row at position 0.
df = pd.DataFrame(columns=['日期', 'ID', '账户名称', '总支出', '现金支出', '赠款支出'], index=[0])
# Insert one seed row, ignoring the index. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead. The seed row's
# account-name field carries a reminder that amounts must be divided by 100000.
df = pd.concat(
    [
        df,
        pd.DataFrame([{'日期': '2000-01-01', 'ID': 0, '账户名称': '涉及金额的账户需要除以100000', '总支出': 0, '现金支出': 0, '赠款支出': 0}]),
    ],
    ignore_index=True,
)
# Accounts whose data could not be collected, keyed by account id -> name.
FailSubAccount = {}
def loginToutiao():
    """Open the browser, load the Toutiao login page in a new tab, return the driver.

    The login itself has to be completed by hand in the opened tab.

    Returns:
        The driver, switched back to its first window handle.

    Raises:
        selenium.common.exceptions.InvalidArgumentException: if ``start()``
            fails with it and no module-level ``driver`` exists to reuse.
    """
    # Open the browser (ends up with two windows: the initial one plus the
    # login tab opened below).
    try:
        driver = start()
    except selenium.common.exceptions.InvalidArgumentException:
        print("浏览器窗口已打开啦~")
        # Previously `driver` stayed unbound here and the next statement
        # failed with UnboundLocalError. Reuse the session left at module
        # level by an earlier run, if there is one.
        driver = globals().get("driver")
        if driver is None:
            raise
    openNewTab(driver, url='https://ad.toutiao.com/pages/login/index.html?redirect_uri=/pages/')
    # openNewTab(driver,url='https://ad.toutiao.com/pages/login/index.html?advertiser_id=')
    # Business step: log in (manual).
    driver.switch_to.window(driver.window_handles[0])
    return driver
def taskClear(driver, subAccountID):
    """Open the account notification page and click two controls on it.

    Best effort: any exception raised while navigating or clicking is reported
    on stdout and swallowed, so the caller always continues.

    Args:
        driver: Selenium driver used for navigation and clicking.
        subAccountID: Appended (as text) to the notification URL.

    Returns:
        None.
    """
    time.sleep(1)
    try:
        subAccountURL = 'https://ad.toutiao.com/overture/account/notification/?'
        time.sleep(0.2)
        driver.get(subAccountURL + str(subAccountID))
        # NOTE(review): presumably the "select all" control followed by the
        # "mark as read" button - confirm against the page.
        driver.find_element_by_xpath('//*[@id="account-notification"]/div/div[2]/div[1]/span/div/div').click()
        time.sleep(0.1)
        driver.find_element_by_xpath('//*[@id="account-notification"]/div/div[2]/div[1]/button').click()
    except Exception as err:
        print("没点击成功,小问题", end=" ")
        # Print the caught error itself; the old code printed the Exception
        # class, which said nothing about what went wrong.
        print(type(err).__name__, err)
    return None
def loginSubAccount(driver, subAccountID=0, name='默认名字'):
    """Switch the session to a sub-account, then clear its notifications.

    Args:
        driver: Selenium driver used for navigation.
        subAccountID: Advertiser id appended to the account-cutover URL.
        name: Accepted for signature symmetry with the other helpers; not
            used in the body.

    Returns:
        None.
    """
    # Business step: log in to the sub-account (automatic).
    cutover_url = 'https://ad.toutiao.com/marco/account/account_cutover/?advertiser_id=' + str(subAccountID)
    driver.get(cutover_url)
    time.sleep(0.2)
    # Mark all unread messages as read - mainly a guard against the redirect
    # above not having gone through.
    taskClear(driver, subAccountID)
def saveData(SubAccountData=None, subAccountID=0, name='默认名字'):
    """Append one account's cash-flow records to the module-level ``df``.

    Args:
        SubAccountData: List of record dicts; each is read for the keys
            ``date``, ``advertiser_id``, ``cost``, ``cash_cost`` and
            ``reward_cost``. Anything that is not a list/tuple (``None``, an
            error string, an error dict) is treated as a failed fetch.
        subAccountID: Id the records are expected to belong to.
        name: Account name stored alongside each row.

    Returns:
        None. Failures are recorded in the module-level ``FailSubAccount``
        (id -> name) and logged at ERROR level.
    """
    global df, FailSubAccount
    time.sleep(0.1)

    # Iterating a str or dict payload used to crash with TypeError on
    # data["advertiser_id"]; treat every non-sequence payload as a failure.
    if not isinstance(SubAccountData, (list, tuple)):
        FailSubAccount[subAccountID] = name
        logging.error('当前失败账户:%s', name)
        return None

    rows = []
    for data in SubAccountData:
        if subAccountID != data["advertiser_id"]:
            # The old code logged an undefined name here, raising NameError
            # before the failure could be recorded; log the offending record.
            logging.error(data)
            logging.error('账户ID错误,无法访问账户')
            FailSubAccount[subAccountID] = name
        else:
            rows.append({
                '日期': data["date"],
                'ID': data["advertiser_id"],
                '账户名称': name,
                '总支出': data["cost"],
                '现金支出': data["cash_cost"],
                '赠款支出': data["reward_cost"],
            })

    if rows:
        # DataFrame.append was removed in pandas 2.0; one concat for the whole
        # batch also avoids re-copying the table for every row.
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
    return None
def getSubAccountData(driver, start_date="2019-03-20", end_date=None):
    """Fetch the cash-flow records of the currently selected account.

    Args:
        driver: Selenium driver; it is navigated to the cash-flow JSON
            endpoint and its ``page_source`` is parsed.
        start_date: First day of the range, ``YYYY-MM-DD``.
        end_date: Last day of the range, ``YYYY-MM-DD``. ``None`` means today,
            evaluated at call time (a default computed in the signature would
            be frozen at definition time and go stale in a long-lived kernel).

    Returns:
        The ``data.items`` value of the response on success, otherwise
        ``None``. Failures are logged at ERROR level. ``None`` is what
        ``saveData`` treats as a failed account; the raw error payload that
        used to be returned made it crash.
    """
    if end_date is None:
        end_date = time.strftime("%Y-%m-%d", time.localtime())
    dataURL = 'https://ad.toutiao.com/overture/cash/get_cash_flow/?page=1&start_date=' + start_date + '&end_date=' + end_date
    driver.get(dataURL)
    time.sleep(0.6)

    # Strip the HTML wrapper around the response (exact-match strings) so only
    # the JSON text is left.
    htmlText = driver.page_source.replace('<html xmlns="http://www.w3.org/1999/xhtml"><head></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">', '').replace('</pre></body></html>', '').strip()

    # startswith also covers an empty page, where indexing [0] used to raise.
    if not htmlText.startswith('{'):
        logging.error(htmlText)
        logging.error('浏览器获取数据失败')
        return None
    try:
        jsonData = json.loads(htmlText)
    except ValueError:
        logging.error(htmlText)
        logging.error('浏览器获取数据失败')
        return None

    # The response must report "status": "success".
    if jsonData.get("status") != "success":
        logging.error(jsonData)
        logging.error('头条系统获取数据失败')
        return None
    try:
        return jsonData['data']["items"]
    except (KeyError, TypeError):
        logging.error(jsonData)
        logging.error('头条系统获取数据失败')
        return None
# Code for taking screenshots of the March data, as required by the audit:
def takePhoto(driver, v='默认名字'):
    """Screenshot the cash-flow report page, plus its next page when reachable.

    For each captured page two PNG files are written: ``<v>-<n>-up.png`` after
    sending PAGE_UP and ``<v>-<n>-down.png`` after sending PAGE_DOWN.

    Args:
        driver: Selenium driver used for navigation, clicks and screenshots.
        v: Prefix for the screenshot file names; also printed when done.
    """

    def open_cash_flow():
        # Load the report, then click three controls inside the date widget.
        # NOTE(review): presumably open picker -> choose preset -> confirm;
        # verify against the page.
        driver.get("https://ad.toutiao.com/overture/cash/flow/")
        time.sleep(0.5)
        click_then_wait = (
            ('//*[@id="cash-cashflow-date"]/div[1]/span', 0.1),
            ('//*[@id="cash-cashflow-date"]/div[2]/div[1]/div/div[5]', 0.1),
            ('//*[@id="cash-cashflow-date"]/div[2]/div[5]/button/span', 0.3),
        )
        for xpath, pause in click_then_wait:
            driver.find_element_by_xpath(xpath).click()
            time.sleep(pause)

    def snapshot(prefix):
        keys = webdriver.common.keys.Keys
        # NOTE(review): the original comments describe these as scrolling to
        # the very top / very bottom; the keys sent are PAGE_UP / PAGE_DOWN.
        driver.find_element_by_css_selector('body').send_keys(keys.PAGE_UP)
        driver.get_screenshot_as_file(prefix + "-up.png")
        driver.find_element_by_css_selector('body').send_keys(keys.PAGE_DOWN)
        driver.save_screenshot(prefix + "-down.png")

    open_cash_flow()
    snapshot(v + "-1")
    try:
        # Advance to the next page of the report.
        driver.find_element_by_xpath('//*[@id="cash-cashflow-pagination"]/ul/li[4]').click()
        time.sleep(1)
    except selenium.common.exceptions.NoSuchElementException as err:
        print('没找到下一页', err)
    except selenium.common.exceptions.StaleElementReferenceException as err:
        print('没找到下一页.StaleElementReferenceException ', err)
    else:
        snapshot(v + "-2")
    print(v, "截图完成")
In [3]:
# Run the workflow: log in to Toutiao. Remember to download the summary data as well.
driver=loginToutiao()
# Pause for 22 seconds.
# NOTE(review): presumably the time window for finishing the manual login - confirm.
time.sleep(22)
In [ ]:
# The account data comes from this file:
# import '/Users/laiyao/Documents/成本/投放成本录入准确性核对/今日头条数据/ID-Name-Python.py'
# Core data - must not be passed around carelessly.
sys.path.append('/Users/laiyao/Documents/成本/头条-投放成本核对/今日头条数据/')
import id_Name_Python
# Downloading the summary data - this approach does not manage to download it:
# driver.switch_to.window(driver.window_handles[0])
# dataURL='https://ad.toutiao.com/marco/account/get_majordomo_binded_account_stat_info/?start_time=2019-01-01&end_time='+time.strftime("%Y-%m-%d", time.localtime())+'&action=download&single=1'
# driver.get(dataURL)
# Run the workflow (fetch data). Available datasets: 鱼元, 大众, 量元
rundata = id_Name_Python.大众
driver.switch_to.window(driver.window_handles[0])
for k, v in rundata.items():
    print("正在抓取:", k, v, end=" ")
    loginSubAccount(driver, k, v)
    # Toutiao backend limit: a date range of 15-20 days works best.
    saveData(getSubAccountData(driver, start_date="2019-04-25"), k, v)
    print(" 抓取完成", time.asctime())
    # If the audit asks for screenshots:
    # takePhoto(driver,v)
# Output label: first four characters of the last processed account name.
# Guarded so an empty rundata no longer fails with NameError on the loop
# variable.
filePrefix = v[0:4] if rundata else ''
driver.get('https://baijiahao.baidu.com/')
print('抓取数据完成:', filePrefix, FailSubAccount, "抓取过程貌似无异常" if not FailSubAccount else "抓取过程中异常数据为:" + str(FailSubAccount))
# Save the collected data.
df.to_csv(path_or_buf=filePrefix + time.strftime("%Y-%m-%d", time.localtime()) + '导出数据.csv', encoding='utf-8-sig')
df[0:2]
In [ ]:
# Check whether the account ids are correct.
driver.switch_to.window(driver.window_handles[0])
for k, v in rundata.items():
    try:
        # Only ids divisible by 10 are processed; everything else is skipped.
        if k % 10 != 0:
            continue
        print("正在抓取:", k, v, end=" ")
        loginSubAccount(driver, k, v)
        # Toutiao backend limit: a date range of 15-20 days works best.
        saveData(getSubAccountData(driver, start_date="2019-03-30"), k, v)
        print(" 抓取完成")
    except NameError:
        # NOTE(review): a NameError is treated as the signal for a bad account
        # id - confirm which call is expected to raise it.
        print("id 异常账户")
In [5]:
# Run this cell to quit the browser.
quit(driver)
In [ ]: