In [ ]:
# Smoke test: check that Selenium and PhantomJS are installed correctly by
# loading a page and printing the text of one link.
from selenium import webdriver

driver_detail = webdriver.PhantomJS()
try:
    driver_detail.get('https://www.baidu.com')
    news = driver_detail.find_element_by_xpath("//div[@id='u1']/a")
    print(news.text)
finally:
    # Always shut the PhantomJS process down, even when the page load or the
    # element lookup fails.
    driver_detail.quit()
In [ ]:
# Print a fixed greeting; presumably a quick check that the kernel runs cells.
print ("hello")
In [ ]:
import traceback
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import random
# Helper that appends one record to a text file.
def write2txt(data, path, encoding=None):
    """Append ``data`` followed by a newline to the file at ``path``.

    Args:
        data: Text to append.
        path: Destination file. It is opened in append mode, so it is
            created when it does not exist yet.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.
    """
    # The context manager closes the handle even if one of the writes raises.
    with open(path, "a", encoding=encoding) as f:
        f.write(data)
        f.write("\n")
# Fetch the songs listed on a user's playlist page.
def catchSongs(url_id, url):
    """Scrape the song rows of a playlist page and append them to a file.

    Every row is printed and appended, through ``write2txt``, as one
    ``|``-separated line that starts with the user id. The output file is
    named after the user id and is created relative to the current working
    directory.

    Errors raised while waiting for the song table are printed with
    ``traceback.print_exc`` and not re-raised. Errors raised while loading
    the page or switching into the iframe propagate to the caller. The
    browser is shut down in every case.

    Args:
        url_id: User home URL; the text after its last ``=`` is the user id.
        url: Playlist URL to open.
    """
    user = url_id.split('=')[-1].strip()
    print('excute user:', user)
    # Pass executable_path=... here if PhantomJS is not on your PATH.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # The song elements live inside an iframe, so switch into it before
        # querying anything.
        driver.switch_to_frame('g_iframe')
        try:
            wait = ui.WebDriverWait(driver, 15)
            # Block until the song table has been rendered.
            wait.until(lambda d: d.find_element_by_xpath('//*[@class="j-flag"]/table/tbody'))
            song_key = 1
            wrong_time = 0
            # Read the table row by row. After 5 failed attempts assume there
            # is nothing left to read and stop.
            while wrong_time < 5:
                try:
                    songs = driver.find_elements_by_xpath(
                        '//*[@class="j-flag"]/table/tbody/tr[%s]' % song_key)
                    info_ = songs[0].text.strip().split("\n")
                    if len(info_) == 5:
                        # Rows without an MV entry get a placeholder so every
                        # line has the same number of fields.
                        info_.insert(2, 'None')
                    new_line = '|'.join([user] + info_)
                    song_key += 1
                    print(new_line)
                    write2txt(new_line, user)
                except Exception:
                    # Deliberately broad: a missing row (IndexError on an
                    # empty result) is how the end of the table is detected.
                    wrong_time += 1
        except Exception:
            traceback.print_exc()
    finally:
        driver.quit()
# Resolve the URL of a user's favourite-songs playlist.
def catchPlaylist(url):
    """Return the link of the first playlist entry on a user's home page.

    Args:
        url: User home page URL.

    Returns:
        The ``href`` of the first ``a`` under ``li[1]`` of the
        ``m-cvrlst f-cb`` list (presumably the "liked songs" playlist), or
        ``None`` when the element cannot be found within 15 seconds or any
        other error occurs during the lookup. In that case the traceback is
        printed.
    """
    # Bound up front so the final return is valid when the lookup fails.
    favourite_url = None
    # Pass executable_path=... here if PhantomJS is not on your PATH.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # The playlist elements live inside an iframe, so switch into it
        # before querying anything.
        driver.switch_to_frame('g_iframe')
        try:
            wait = ui.WebDriverWait(driver, 15)
            # until() hands back the element found by the callable, so no
            # second query is needed.
            link = wait.until(
                lambda d: d.find_element_by_xpath('//*[@class="m-cvrlst f-cb"]/li[1]/div/a'))
            favourite_url = link.get_attribute("href")
        except Exception:
            traceback.print_exc()
    finally:
        driver.quit()
    return favourite_url
if __name__ == '__main__':
    # Put your own id here; any user's playlist can be crawled as long as
    # you know their id.
    for url in ['http://music.163.com/user/home?id=78120034']:
        # Sleep a random 2-4 seconds before each user.
        time.sleep(random.randint(2, 4))
        url_playlist = catchPlaylist(url)
        if not url_playlist:
            # Nothing to crawl without a playlist URL; go on to the next user.
            continue
        time.sleep(random.randint(1, 2))
        catchSongs(url, url_playlist)
In [ ]:
import numpy as np
import PIL.Image as Image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
# Count how often each item occurs.
def statistics(lst):
    """Return a dict mapping each item of ``lst`` to its occurrence count.

    Keys appear in first-seen order; an empty input gives ``{}``.

    Args:
        lst: Iterable of hashable items.
    """
    # Imported locally so this cell does not depend on a file-level import.
    from collections import Counter

    # Convert back to a plain dict so the return type stays exactly ``dict``.
    return dict(Counter(lst))
# Replace with your own user id: the crawler cell writes each user's songs to
# a file named after the id.
path = '78120034'
list_ = []
with open(path,'r') as f:
    for line in f:
        fields = line.strip().split('|')
        # Blank or malformed lines have no second-to-last field; skip them
        # instead of failing with IndexError.
        if len(fields) < 2:
            continue
        # The second-to-last field is what gets counted (presumably the
        # artist, given the frequency note below).
        list_.append(fields[-2].strip())
dict_ = statistics(list_)
# the font from github: https://github.com/adobe-fonts
font = r'SimHei.ttf'
# The mask image is user-defined; point this at a picture of your own.
# It is loaded once and reused for both the mask and the colour generator.
with Image.open("/Users/Lea/Documents/CloudDisk/Sync/Workshop/python-training/shot.png") as mask_image:
    coloring = np.array(mask_image)
# generate_from_frequencies(dict_) takes a mapping such as
# {'artist 1': 5, 'artist 2': 8}, i.e. name -> number of occurrences. The
# result is equivalent to calling generate(text) on segmented text, except
# that the counting has already been done here.
wc = WordCloud(
    background_color="white",
    collocations=False,
    font_path=font,
    width=1400,
    height=1400,
    margin=2,
    mask=coloring,
).generate_from_frequencies(dict_)
image_colors = ImageColorGenerator(coloring)
# recolor() returns the same WordCloud object, so a single imshow is enough.
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.show()
# Save the word cloud to disk.
wc.to_file('mymusic2.png')
Out[ ]: