In [ ]:
# Smoke test: check that selenium and PhantomJS are installed and usable.
# Loads a page, prints the text of the first link found under the element
# with id "u1", then shuts the browser down.
from selenium import webdriver

driver_detail = webdriver.PhantomJS()
try:
    driver_detail.get('https://www.baidu.com')
    news = driver_detail.find_element_by_xpath("//div[@id='u1']/a")
    print(news.text)
finally:
    # Always terminate the headless browser, even when the page load or the
    # element lookup raises; otherwise the PhantomJS process is left running.
    driver_detail.quit()


新闻

In [ ]:
# Scratch cell: prints a fixed greeting.
print("hello")

In [ ]:
import traceback
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import random


# Helper that appends one record to a text file
def write2txt(data, path, encoding=None):
    """Append ``data`` followed by a newline to the file at ``path``.

    The file is created if it does not exist and is always closed, even if
    a write fails.

    Args:
        data: Text to append (one record, without trailing newline).
        path: Path of the file to append to.
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform default, exactly as before; pass
            ``"utf-8"`` for a platform-independent file.
    """
    # The context manager guarantees the handle is released on any error.
    with open(path, "a", encoding=encoding) as f:
        f.write(data)
        f.write("\n")


# Fetch the list of songs a user likes and append them to a text file
def _format_song_line(user, row_text):
    """Turn the visible text of one table row into a '|'-separated record."""
    info_ = row_text.strip().split("\n")
    if len(info_) == 5:
        # Rows without an MV entry have one field less; insert a 'None'
        # placeholder so every record has the same number of fields.
        info_.insert(2, 'None')
    return '%s|' % user + '|'.join(info_)


def _scrape_song_rows(driver, user, max_misses):
    """Read table rows one by one and persist each as a record for ``user``."""
    wait = ui.WebDriverWait(driver, 15)
    # Block until the song table has been rendered inside the frame.
    wait.until(lambda d: d.find_element_by_xpath('//*[@class="j-flag"]/table/tbody'))

    song_key = 1
    misses = 0
    # Keep reading rows; after `max_misses` failed attempts assume that there
    # is nothing left to read and stop.
    while misses < max_misses:
        try:
            rows = driver.find_elements_by_xpath(
                '//*[@class="j-flag"]/table/tbody/tr[%s]' % song_key)
            if not rows:
                # No such row (normally: past the end of the table).
                misses += 1
                continue
            new_line = _format_song_line(user, rows[0].text)
            song_key += 1

            print(new_line)

            # The output file is named after the user id and is created in
            # the current working directory.
            write2txt(new_line, user)
        except Exception:
            # Unexpected failure for this row: count it against the retry
            # budget, but report it instead of silently discarding it.
            misses += 1
            traceback.print_exc()


def catchSongs(url_id, url, max_misses=5):
    """Scrape the song table of a playlist page and store it on disk.

    Each table row is printed and appended (via ``write2txt``) to a file
    named after the user id, as ``user|field|field|...``.

    Args:
        url_id: User home URL; the text after the last '=' is used as the
            user id.
        url: URL of the playlist page to scrape. If falsy, nothing is
            scraped.
        max_misses: Number of failed row reads tolerated before the scrape
            stops (default 5, the previously hard-coded value).

    Errors raised while waiting for or reading the table are printed with
    ``traceback.print_exc`` and not re-raised. Errors raised while loading
    the page or switching frames propagate to the caller.
    """
    user = url_id.split('=')[-1].strip()
    print('excute user:', user)

    if not url:
        # Nothing to load; avoid starting a browser for a missing URL.
        print('no playlist url for user:', user)
        return

    # Pass executable_path=... to PhantomJS() if the binary is not on PATH.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)

        # The music elements of the page live inside a frame; switch to it
        # before looking anything up.
        driver.switch_to.frame('g_iframe')

        try:
            _scrape_song_rows(driver, user, max_misses)
        except Exception:
            traceback.print_exc()
    finally:
        # Covers page load and frame switch as well, so the browser process
        # is never leaked.
        driver.quit()



# Resolve the URL of the playlist that holds the user's favourite music
def catchPlaylist(url):
    """Return the link of the first playlist shown on a user's home page.

    Args:
        url: URL of the user's home page.

    Returns:
        The ``href`` of the first entry in the playlist grid, or ``None``
        when the entry could not be located within 15 seconds (the error is
        printed with ``traceback.print_exc``).

    Errors raised while loading the page or switching frames propagate to
    the caller.
    """
    # Defined up front so the function returns None, rather than failing
    # with UnboundLocalError, when the lookup below does not succeed.
    favourite_url = None

    # Pass executable_path=... to PhantomJS() if the binary is not on PATH.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)

        # The music elements of the page live inside a frame; switch to it
        # before looking anything up.
        driver.switch_to.frame('g_iframe')

        try:
            wait = ui.WebDriverWait(driver, 15)
            # `until` returns the located element once it is present, so no
            # second lookup is needed.
            link = wait.until(
                lambda d: d.find_element_by_xpath('//*[@class="m-cvrlst f-cb"]/li[1]/div/a'))
            favourite_url = link.get_attribute("href")
        except Exception:
            traceback.print_exc()
    finally:
        # Covers page load and frame switch as well, so the browser process
        # is never leaked.
        driver.quit()
    return favourite_url



if __name__ == '__main__':
    # Replace the id below with your own; any user's playlist can be
    # scraped as long as you know their id.
    for url in ['http://music.163.com/user/home?id=78120034']:
        # Sleep for a random 2-4 seconds before each user
        time.sleep(random.randint(2, 4))
        url_playlist = catchPlaylist(url)
        if not url_playlist:
            # No playlist link available for this user; move on instead of
            # handing an empty URL to the song scraper.
            print('no playlist found, skip:', url)
            continue
        time.sleep(random.randint(1, 2))
        catchSongs(url, url_playlist)


excute user: 78120034
78120034|1|I Am You|None|04:34|Kim Taylor|I Am You
78120034|2|儿时|None|04:21|刘昊霖|鱼干铺里
78120034|3|The truth that you leave - (你离开的事实)|MV|03:43|Pianoboy|The truth that you leave
78120034|4|一如年少模样|None|04:22|陈鸿宇|一如年少模样
78120034|5|也罢|None|05:08|鲁向卉|也罢
78120034|6|凉城|None|03:52|任然|后继者
78120034|7|还想听你的故事|None|04:43|谢春花/王碧浪|算云烟
78120034|8|多得他|None|04:49|王菲|情.菲.得意
78120034|9|永远の嘘をついてくれ - live|None|07:58|中島みゆき/吉田拓郎|最新热歌慢摇110
78120034|10|青春再见 - (电影《怒放》首支概念单曲)|MV|03:47|水木年华/老狼//|青春再见
78120034|11|去大理 - (电影《心花路放》插曲)|MV|02:49|黄渤|去大理
78120034|12|一生所爱 (Cover 卢冠廷)|None|04:16|暗杠|一生所爱 (Cover 卢冠廷)
78120034|13|落日谣 - (电视剧《余罪》宣传曲)|None|04:36|张一山/张承/葛铮|落日谣
78120034|14|Some Dreams|MV|04:30|Allan Taylor|Hotels & Dreamers
78120034|15|一个人|None|04:27|韩红|一个人
78120034|16|孤独的和弦|None|04:30|萧煌奇|孤独的和弦
78120034|17|A Time For Us|None|04:04|David Davidson|Silver Screen Classics
78120034|18|September Remembered|None|03:34|Johannes Linstead|September Remembered
78120034|19|7 Years|MV|03:57|Lukas Graham|Lukas Graham
78120034|20|¡Viva La Gloria! (Album Version)|None|03:30|Green Day|21st Century Breakdown
78120034|21|星の在り処 Full Ver. (Less Vocal)|None|04:45|Falcom Sound Team jdk|英雄伝説VI-空の轨迹 OST
78120034|22|I Don't Want To Change You|MV|05:26|Damien Rice|I Don't Want To Change You
78120034|23|Like Sunday, Like Rain|None|07:04|Ed Harcourt|Like Sunday, Like Rain
78120034|24|在水一方|None|03:47|邓丽君|在水一方
78120034|25|悟空|None|03:20|戴荃|悟空 (Live版)
78120034|26|陽だまりにて和む猫|None|03:33|Falcom Sound Team jdk|英雄伝説 空の軌跡FC Evolution オリジナルサウンドトラック
78120034|27|岁月神偷|None|02:42|金玟岐|金玟岐作品集
78120034|28|春风十里|None|06:15|鹿先森乐队|所有的酒,都不如你
78120034|29|逝年|None|03:16|夏小虎|逝年
78120034|30|逝去的歌|None|04:54|旅行团|10 DAY’S
78120034|31|浪费 - (原唱 : 林宥嘉)|None|05:28|徐佳莹|我是歌手第四季 第10期
78120034|32|Sleepyhead|MV|03:44|Galen Crew|Acoustic Daydreams
78120034|33|夜机|None|04:30|陈慧娴|永远是你的朋友
78120034|34|可惜没如果 - (If Only…)|MV|04:58|林俊杰|新地球
78120034|35|清平调|MV|04:34|王菲/邓丽君|清平调
78120034|36|罗生门|MV|04:22|麦浚龙/谢安琪|Addendum
78120034|37|月半小夜曲(Live) - live|None|03:45|陈慧娴|活出生命 II 演唱会

In [ ]:
import numpy as np
import PIL.Image as Image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt

# Count how often each item occurs
def statistics(lst):
    """Tally the occurrences of every item in ``lst``.

    Args:
        lst: Iterable of hashable items.

    Returns:
        A dict mapping each distinct item to the number of times it was
        seen, with keys in order of first appearance.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    return counts

# Replace with your own user id (the file of scraped records is named after it)
path = '78120034'
list_ = []
with open(path,'r') as f:
    for line in f:
        fields = line.strip().split('|')
        # Skip blank or truncated lines instead of failing on the index below
        if len(fields) < 2:
            continue
        # The artist is the second-to-last field of each record
        list_.append(fields[-2].strip())

dict_ = statistics(list_)

# the font from github: https://github.com/adobe-fonts
font = r'SimHei.ttf'
# Mask layer: point this at an image of your own. It is decoded once and
# reused for both the cloud shape and the colouring.
coloring = np.array(Image.open("/Users/Lea/Documents/CloudDisk/Sync/Workshop/python-training/shot.png"))

# generate_from_frequencies(dict_) takes a mapping such as
# {'artist1': 5, 'artist2': 8}, i.e. each artist and how often it occurs.
wc = WordCloud(background_color="white", collocations=False, font_path=font,
               width=1400, height=1400, margin=2,
               mask=coloring).generate_from_frequencies(dict_)

# Colour the words after the mask image
image_colors = ImageColorGenerator(coloring)
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.show()

# Save the word cloud to disk
wc.to_file('mymusic2.png')


Out[ ]:
<wordcloud.wordcloud.WordCloud at 0x1110e5c50>