notebook.community



In [1]:

    
# def 各种必要环境
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import logging,os,sys,time
# Configure the root logger once for the whole notebook: WARNING and above,
# with a verbose record layout (level, file, line number, thread, message, time).
logging.basicConfig(
    level=logging.WARNING,
    format=(
        'levelname:%(levelname)s filename: %(filename)s ，outputNumber: [%(lineno)d]'
        '  thread: %(threadName)s output msg:  %(message)s - %(asctime)s'
    ),
    datefmt='[%d/%b/%Y %H:%M:%S]',
)
# Emit one record immediately so the configuration is visibly active.
logging.warning("gogogo开工干活")

from selenium import webdriver
import selenium.common.exceptions

def start(chromedriver="/Users/laiyao/Documents/GitHub/MyApp/PythonApplication1/爬虫练习/chromedriver/chromedriver-2",
          user_data_dir='/Users/laiyao/Library/Application Support/Google/Chrome/'):
    """Launch Chrome through Selenium and return the driver.

    Args:
        chromedriver: Path to the chromedriver executable. Defaults to the
            location the notebook was originally written against.
        user_data_dir: Chrome user-data directory passed as
            ``--user-data-dir``. It should be the user's own data directory;
            quit the currently running Chrome first. (Original author's note:
            on macOS the profile lives under
            ``~/Library/Application Support/Google/Chrome/Default``.)

    Returns:
        The ``webdriver.Chrome`` instance, already navigated to
        ``about:version``.
    """
    # Local import keeps this function usable even if the cell is run alone.
    from selenium import webdriver

    os.environ["webdriver.chrome.driver"] = chromedriver

    # Custom Chrome settings.
    option = webdriver.ChromeOptions()
    option.add_argument('--user-data-dir=' + user_data_dir)
    # Override the browser's User-Agent to disguise the automated browser.
    option.add_argument('--user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36')
    # One process per site.
    option.add_argument('--process-per-site')
    # Simplified Chinese UI language.
    option.add_argument('--lang=zh-CN')

    # Start the browser.
    driver = webdriver.Chrome(chromedriver, options=option)
    driver.get('about:version')
    return driver

def openNewTab(driver,url='https://www.sogou.com'):
    """Open *url* in a new browser tab/window using JavaScript ``window.open``.

    The URL is handed to the script as ``arguments[0]`` instead of being
    concatenated into the JavaScript source, so quotes or backslashes in the
    URL can no longer break (or inject into) the executed script.

    Args:
        driver: Selenium WebDriver (anything exposing ``execute_script``).
        url: Address to open; defaults to the Sogou home page.
    """
    driver.execute_script('window.open(arguments[0]);', url)

def quit(driver):
    """Close the browser session owned by *driver*, then log the outcome.

    Note: this name shadows the interactive ``quit`` builtin; it is kept
    because other cells call it.
    """
    driver.quit()
    logging.info(['QUIT success'])









    



levelname:WARNING filename: <ipython-input-1-84842def8fea> ，outputNumber: [8]  thread: MainThread output msg:  gogogo开工干活 - [05/May/2019 10:32:29]



In [2]:

    
# Initialise the result store.
# NOTE: index=[0] creates one all-NaN placeholder row; it is kept so the
# exported CSV keeps the same layout as before.
df = pd.DataFrame(columns=['日期','ID','账户名称','总支出','现金支出','赠款支出'], index=[0])
# Insert a note row (ignoring the index). DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat instead.
df = pd.concat(
    [
        df,
        pd.DataFrame([{'日期':'2000-01-01','ID':0,'账户名称':'涉及金额的账户需要除以100000','总支出':0,'现金支出':0,'赠款支出':0}]),
    ],
    ignore_index=True,
)
# Sub-account ID -> account name for every account whose fetch failed.
FailSubAccount = {}
def loginToutiao():
    """Start (or reuse) the browser and open the Toutiao ads login page.

    The login itself has to be completed by hand in the opened tab.

    Returns:
        The Selenium driver, switched back to the first window.

    Raises:
        selenium.common.exceptions.InvalidArgumentException: if Chrome could
            not be started and no earlier ``driver`` exists to fall back on.
    """
    # Open the browser (this ends up with two windows/tabs).
    try:
        driver = start()
    except selenium.common.exceptions.InvalidArgumentException:
        # Presumably raised when a browser from an earlier run is still open.
        # Previously this path left ``driver`` unbound and crashed with
        # UnboundLocalError on the next line; reuse the existing session if
        # the notebook already has one, otherwise propagate the error.
        driver = globals().get('driver')
        if driver is None:
            raise
        print("浏览器窗口已打开啦~")
    openNewTab(driver,url='https://ad.toutiao.com/pages/login/index.html?redirect_uri=/pages/')
    # openNewTab(driver,url='https://ad.toutiao.com/pages/login/index.html?advertiser_id=')
    # Business step: login (must be done manually).
    driver.switch_to.window(driver.window_handles[0])
    return driver

def taskClear(driver,subAccountID):
    """Open the account notification page and click through its two controls.

    According to the caller's note this marks all unread messages as read.
    The step is best-effort: any failure is printed and swallowed so that the
    scraping loop keeps going.

    Args:
        driver: Selenium WebDriver.
        subAccountID: Sub-account ID, appended to the notification URL.

    Returns:
        None in every case.
    """
    time.sleep(1)
    try:
        # NOTE(review): the ID is appended right after '?' with no parameter
        # name -- confirm this is the URL shape the site expects.
        subAccountURL='https://ad.toutiao.com/overture/account/notification/?'
        time.sleep(0.2)
        driver.get(subAccountURL+str(subAccountID))
        driver.find_element_by_xpath('//*[@id="account-notification"]/div/div[2]/div[1]/span/div/div').click()
        time.sleep(0.1)
        driver.find_element_by_xpath('//*[@id="account-notification"]/div/div[2]/div[1]/button').click()
    except Exception as err:
        # Deliberately broad: a failed click is a minor problem here.
        print("没点击成功，小问题", end="  ")
        # Print the actual error instance (the old code printed the
        # ``Exception`` class itself, which carried no information).
        print(err)
        return None

def loginSubAccount(driver,subAccountID=0,name='默认名字'):
    """Switch the logged-in session to one sub-account (automatic step).

    Args:
        driver: Selenium WebDriver.
        subAccountID: Advertiser ID of the sub-account to switch to.
        name: Account name; accepted for call-site symmetry, not used here.

    Returns:
        None.
    """
    cutover_url = 'https://ad.toutiao.com/marco/account/account_cutover/?advertiser_id=' + str(subAccountID)
    driver.get(cutover_url)
    time.sleep(0.2)
    # Mark all unread messages as read -- mainly a safeguard for the case
    # where the redirect above did not go through.
    taskClear(driver, subAccountID)

def saveData(SubAccountData=None,subAccountID=0,name='默认名字'):
    """Append fetched cash-flow rows for one sub-account to the global ``df``.

    Args:
        SubAccountData: List of row dicts (keys ``date``, ``advertiser_id``,
            ``cost``, ``cash_cost``, ``reward_cost``). ``None`` -- or any
            non-list value such as a raw error payload -- means the fetch
            failed.
        subAccountID: ID the rows are expected to belong to.
        name: Account name stored with each row.

    Returns:
        None. Failures are recorded in the global ``FailSubAccount``.
    """
    global df
    time.sleep(0.1)
    # A failed fetch may arrive as None or as a raw str/dict payload; iterating
    # those used to raise TypeError, so treat everything non-list as a failure.
    if not isinstance(SubAccountData, (list, tuple)):
        FailSubAccount[subAccountID]=name
        logging.error('当前失败账户：'+name)
        return None
    new_rows = []
    for data in SubAccountData:
        if subAccountID != data["advertiser_id"]:
            # The old code logged an undefined ``jsonData`` here, which raised
            # NameError before the failure could be recorded.
            logging.error(data)
            logging.error('账户ID错误，无法访问账户')
            FailSubAccount[subAccountID]=name
        else:
            new_rows.append({'日期':data["date"],'ID':data["advertiser_id"],'账户名称':name,'总支出':data["cost"],'现金支出':data["cash_cost"] ,'赠款支出':data["reward_cost"]})
    if new_rows:
        # DataFrame.append was removed in pandas 2.0; concatenate once,
        # ignoring the index, instead of appending row by row.
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return None

def getSubAccountData(driver,start_date="2019-03-20",end_date=None,wait=0.6):
    """Fetch the cash-flow rows of the currently selected sub-account.

    Args:
        driver: Selenium WebDriver already switched to the sub-account.
        start_date: First day of the range, ``YYYY-MM-DD``.
        end_date: Last day of the range, ``YYYY-MM-DD``. ``None`` means
            "today", evaluated on every call (the previous default was
            computed once at definition time and went stale in a long-lived
            notebook session).
        wait: Seconds to sleep after loading the URL before reading the page.

    Returns:
        The list found at ``data.items`` in the JSON response, or ``None``
        when the page is empty, is not JSON, reports a non-success status, or
        lacks the expected keys. Failures are logged.
    """
    if end_date is None:
        end_date = time.strftime("%Y-%m-%d", time.localtime())
    dataURL='https://ad.toutiao.com/overture/cash/get_cash_flow/?page=1&start_date='+start_date+'&end_date='+end_date
    # driver.switch_to.window(driver.window_handles[2])
    driver.get(dataURL)
    time.sleep(wait)
    # Strip the HTML wrapper the browser puts around the raw JSON body.
    htmlText=driver.page_source.replace('<html xmlns="http://www.w3.org/1999/xhtml"><head></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">','').replace('</pre></body></html>','')
    # startswith() is safe on an empty page, unlike indexing htmlText[0].
    if not htmlText.startswith('{'):
        logging.error(htmlText)
        logging.error('浏览器获取数据失败')
        return None
    try:
        jsonData=json.loads(htmlText)
    except ValueError:
        # Looked like JSON but did not parse.
        logging.error(htmlText)
        logging.error('浏览器获取数据失败')
        return None
    # Verify "status": "success".
    if jsonData.get("status") != "success":
        logging.error(jsonData)
        logging.error('头条系统获取数据失败')
        return None
    try:
        return jsonData['data']["items"]
    except (KeyError, TypeError):
        # Success status but no data/items payload.
        logging.error(jsonData)
        logging.error('头条系统获取数据失败')
        return None

# Code needed to screenshot the March data, as requested by the audit:
def takePhoto(driver,v='默认名字'):
    """Screenshot the cash-flow page (and its second page, if any).

    Files are written as ``<v>-1-up.png`` / ``<v>-1-down.png`` and, when a
    next page could be opened, ``<v>-2-up.png`` / ``<v>-2-down.png``.

    Args:
        driver: Selenium WebDriver for the current sub-account.
        v: File-name prefix (the account name).
    """
    def _open_cash_flow(browser):
        # Load the cash-flow page, then click through the date picker:
        # each entry is (xpath to click, seconds to pause afterwards).
        browser.get("https://ad.toutiao.com/overture/cash/flow/")
        time.sleep(0.5)
        click_steps = (
            ('//*[@id="cash-cashflow-date"]/div[1]/span', 0.1),
            ('//*[@id="cash-cashflow-date"]/div[2]/div[1]/div/div[5]', 0.1),
            ('//*[@id="cash-cashflow-date"]/div[2]/div[5]/button/span', 0.3),
        )
        for xpath, pause in click_steps:
            browser.find_element_by_xpath(xpath).click()
            time.sleep(pause)

    def _go_next_page(browser):
        # Click the pagination control, then give the page a second to load.
        browser.find_element_by_xpath('//*[@id="cash-cashflow-pagination"]/ul/li[4]').click()
        time.sleep(1)

    def _snap(browser, prefix):
        # Send PAGE_UP to move towards the top of the page, then capture.
        browser.find_element_by_css_selector('body').send_keys(webdriver.common.keys.Keys.PAGE_UP)
        browser.get_screenshot_as_file(prefix + "-up.png")
        # Send PAGE_DOWN to move towards the bottom of the page, then capture.
        browser.find_element_by_css_selector('body').send_keys(webdriver.common.keys.Keys.PAGE_DOWN)
        browser.save_screenshot(prefix + "-down.png")

    _open_cash_flow(driver)
    _snap(driver, v + "-1")
    try:
        _go_next_page(driver)
    except selenium.common.exceptions.NoSuchElementException as err:
        print('没找到下一页', err)
    except selenium.common.exceptions.StaleElementReferenceException as err:
        print('没找到下一页.StaleElementReferenceException ', err)
    else:
        # Only reached when the next-page click succeeded.
        _snap(driver, v + "-2")
    print(v, "截图完成")



In [3]:

    
# Run the job (log in to Toutiao); remember to download the summary data too.
driver=loginToutiao()
# Pause for 22 seconds -- presumably to leave time for the manual login; confirm.
time.sleep(22)



In [ ]:

    
# The data source is this file:
# import '/Users/laiyao/Documents/成本/投放成本录入准确性核对/今日头条数据/ID-Name-Python.py'
# Core data -- must not be passed around carelessly.

sys.path.append('/Users/laiyao/Documents/成本/头条-投放成本核对/今日头条数据/')
import id_Name_Python

# Download the summary data -- this download does not work.
# driver.switch_to.window(driver.window_handles[0])
# dataURL='https://ad.toutiao.com/marco/account/get_majordomo_binded_account_stat_info/?start_time=2019-01-01&end_time='+time.strftime("%Y-%m-%d", time.localtime())+'&action=download&single=1'
# driver.get(dataURL)



# Run the job (fetch data). Available groups: 鱼元, 大众, 量元
rundata=id_Name_Python.大众
driver.switch_to.window(driver.window_handles[0])
# Label used in the summary line and the CSV file name: the first four
# characters of the last processed account name. Tracked explicitly so an
# empty ``rundata`` no longer raises NameError on ``v`` after the loop.
label = ''
for k,v in rundata.items():
    print("正在抓取：",k,v, end="  ")
    loginSubAccount(driver,k,v)
    # Toutiao backend limitation: a date range of 15-20 days works best.
    saveData(getSubAccountData(driver,start_date="2019-04-25"),k,v)
    print(" 抓取完成",time.asctime())
    label = v[0:4]
    # If the audit requires screenshots:
    # takePhoto(driver,v)

driver.get('https://baijiahao.baidu.com/')

print('抓取数据完成：' , label, FailSubAccount,"抓取过程貌似无异常" if not FailSubAccount else "抓取过程中异常数据为：" + str(FailSubAccount))

# Run this to save the data.
df.to_csv(path_or_buf=label + time.strftime("%Y-%m-%d", time.localtime()) +'导出数据.csv' ,encoding='utf-8-sig')
# Show the first two rows as the cell output.
df[0:2]

# Needed data: jsonData['data']["items"][0]["cost"]  -- total spend

# Needed data: jsonData['data']["items"][0]["cash_cost"]  -- cash spend

# Needed data: jsonData['data']["items"][0]["reward_cost"]  -- grant (bonus) spend

# Needed data: jsonData['data']["items"][0]["date"]  -- date

# Validation field: jsonData['data']["items"][0]["advertiser_id"]  -- sub-account ID



In [ ]:

    
# Check whether the account IDs are correct.

driver.switch_to.window(driver.window_handles[0])

for k,v in rundata.items():
    try:
        # Only IDs divisible by 10 are processed here; everything else is
        # skipped. NOTE(review): the reason for this filter is not visible
        # in this notebook -- confirm.
        if k % 10 != 0:
            continue
        print("正在抓取：",k,v, end="  ")
        loginSubAccount(driver,k,v)
        # Toutiao backend limitation: a date range of 15-20 days works best.
        saveData(getSubAccountData(driver,start_date="2019-03-30"),k,v)
        print(" 抓取完成")
    except NameError:
        print("id 异常账户")



In [5]:

    
# Run this cell to quit the browser.
quit(driver)



In [ ]: