In [1]:
import math
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Read the fan page posts
page_id = "appledaily.tw"
path = 'post/'+page_id+'_post.csv'
In [3]:
df = pd.read_csv(path, encoding = 'utf8')
Let's look at the first 5 rows of the data.
In [4]:
df.head()
Out[4]:
How do we find this post back on Facebook?
We can use status_link to trace back to the original post.
In [5]:
df['status_link'][0]
Out[5]:
Before filtering there are 5,234 rows in total.
In [6]:
len(df)
Out[6]:
Filter these out and renumber the rows with reset_index, because when you filter a DataFrame pandas keeps the original row labels; the index does not change by itself.
In [7]:
df = df[(df['num_reactions']!=0) & (df['status_message'].notnull())].reset_index(drop=True)
After filtering, 5,061 rows remain; 173 rows were removed in total.
In [8]:
len(df)
Out[8]:
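A quick aside on why the reset is needed. A minimal sketch with a toy frame (made-up numbers, not the actual data): boolean filtering keeps the original row labels, and reset_index(drop=True) renumbers them.
toy = pd.DataFrame({'num_reactions': [0, 5, 0, 7]})
filtered = toy[toy['num_reactions'] != 0]
print(filtered.index.tolist())                          # [1, 3] -- original labels survive the filter
print(filtered.reset_index(drop=True).index.tolist())   # [0, 1] -- renumbered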
In [9]:
df['datetime'] = pd.to_datetime(df['status_published'], format='%Y-%m-%d %H:%M:%S')
df['weekday'] = df['datetime'].dt.day_name()  # weekday_name was removed in newer pandas; day_name() is the current API
df['hour'] = df['datetime'].dt.hour
In [10]:
df.plot(x='datetime', y=['num_likes', 'num_loves', 'num_wows', 'num_hahas', 'num_sads', 'num_angrys'] ,
figsize=(12,8))
Out[10]:
In [11]:
df.plot(x='datetime', y=['num_reactions', 'num_comments', 'num_shares'],
figsize=(12,8))
Out[11]:
In [12]:
# Time gap between consecutive posts
delta_datetime = df['datetime'].shift(1) - df['datetime']
delta_datetime_df = delta_datetime.describe().apply(str)
delta_datetime_df = delta_datetime_df.to_frame(name='posting frequency')
delta_datetime_df
Out[12]:
In [13]:
def weekday(d):
    # Map weekday-name counts onto a fixed Monday..Sunday order, filling missing days with 0
    list_key = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    list_value = []
    for one in list_key:
        if one in d:
            list_value.append(d[one])
        else:
            list_value.append(0)
    df = pd.DataFrame(index=list_key, data={'weekday': list_value}).reset_index()
    return df
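As an aside, the same fill-missing-with-zero behavior can be had in one chained expression with pandas' reindex; a sketch that should produce the same frame as the function above (and list(range(24)) covers the hour case further down):
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts = (df['weekday'].value_counts()
                  .reindex(days, fill_value=0)       # missing days become 0
                  .rename_axis('index').rename('weekday')
                  .reset_index())                    # columns: 'index', 'weekday'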
In [14]:
df_weekday = weekday(dict(df['weekday'].value_counts()))
df_weekday
Out[14]:
In [15]:
sns.barplot(x='index', y='weekday', data = df_weekday)
Out[15]:
In [16]:
def hour(d):
    # Map hour-of-day counts onto 0..23, filling missing hours with 0
    list_key = list(range(24))
    list_value = []
    for one in list_key:
        if one in d:
            list_value.append(d[one])
        else:
            list_value.append(0)
    df = pd.DataFrame(index=list_key, data={'hour': list_value}).reset_index()
    return df
In [17]:
df_hour = hour(dict(df['hour'].value_counts()))
df_hour
Out[17]:
In [18]:
ax = sns.barplot(x='index', y='hour', data = df_hour)
In [19]:
df_status_type = df['status_type'].value_counts().to_frame(name='status_type')
df_status_type
Out[19]:
In [20]:
sns.barplot(x='index', y='status_type', data = df_status_type.reset_index())
Out[20]:
In [21]:
sns.stripplot(x="status_type", y="num_reactions", data=df, jitter=True)
Out[21]:
In [22]:
sns.stripplot(x="weekday", y="num_reactions", data=df, jitter=True)
Out[22]:
In [23]:
sns.stripplot(x="hour", y="num_reactions", data=df, jitter=True)
Out[23]:
In [24]:
g = sns.FacetGrid(df, col="status_type")
g.map(plt.hist, "num_reactions")
Out[24]:
If we want to see how the different reactions relate to each other, we need the Pearson correlation, which measures the strength of the linear association between two variables.
From the heatmap below we can see that the correlations between reactions are all low.
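As a quick sanity check on the definition (a hand-rolled sketch on made-up numbers, not the post data): Pearson's r is the covariance of x and y divided by the product of their standard deviations, and numpy's corrcoef returns the same value.
import numpy as np
x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.0, 4.0, 5.0, 9.0])
num = ((x - x.mean()) * (y - y.mean())).sum()
den = np.sqrt(((x - x.mean())**2).sum()) * np.sqrt(((y - y.mean())**2).sum())
print(num / den)                 # hand-rolled Pearson r
print(np.corrcoef(x, y)[0, 1])   # same value from numpy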
In [25]:
df_reaction = df[['num_likes', 'num_loves', 'num_wows', 'num_hahas', 'num_sads', 'num_angrys']]
colormap = plt.cm.viridis
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(df_reaction.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
Out[25]:
Shares and comments are more strongly correlated: the number of people commenting correlates more with the number sharing than with the number reacting.
In [26]:
df_tmp = df[['num_reactions', 'num_comments', 'num_shares']]
colormap = plt.cm.viridis
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(df_tmp.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
Out[26]:
In [27]:
import jieba
import jieba.analyse
import operator
from wordcloud import WordCloud
# The traditional Chinese dictionary ships with the jieba package
jieba.set_dictionary('/home/wy/anaconda3/envs/python3/lib/python3.6/site-packages/jieba/extra_dict/dict.txt.big')
In [28]:
list(df['status_message'])[99]
Out[28]:
In [29]:
for one in jieba.cut(list(df['status_message'])[99]):
print (one)
In [30]:
jieba.analyse.extract_tags(list(df['status_message'])[99], topK=120)
Out[30]:
So we can run every post through jieba's keyword extraction to pull out the important words,
then count how often those words appear and draw them as a word cloud with WordCloud.
In [31]:
def jieba_extract(message_list):
    word_count = {}
    for message in message_list:
        # Keyword extraction can fail on some messages; if it does, collect the
        # offending messages first to see what is going on
        seg_list = jieba.analyse.extract_tags(message, topK=120)
        for seg in seg_list:
            if seg not in word_count:
                word_count[seg] = 1
            else:
                word_count[seg] += 1
    sorted_word_count = sorted(word_count.items(), key=operator.itemgetter(1))
    sorted_word_count.reverse()
    return sorted_word_count
sorted_word_count = jieba_extract(list(df['status_message']))
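As a design note, the manual dict bookkeeping above can be replaced with collections.Counter; a sketch (the name jieba_extract_counter is hypothetical) that should return the same (word, count) pairs, most frequent first:
from collections import Counter

def jieba_extract_counter(message_list):
    counter = Counter()
    for message in message_list:
        counter.update(jieba.analyse.extract_tags(message, topK=120))
    return counter.most_common()   # list of (word, count), descending by count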
Let's see what the ten most frequent words across the posts are.
In [32]:
print (sorted_word_count[:10])
And the word cloud appears! There is actually a lot you can tune in a word cloud; you can even fill an image shape with words, see wordcloud.
http, com, and www show up because every post carries a link, so the link gets tokenized along with the text.
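If you would rather not have those link tokens at all, one option is to strip the URLs before extraction; a sketch using a deliberately simple regex (an approximation, not a full URL parser):
import re
url_pattern = re.compile(r'https?://\S+')
clean_messages = [url_pattern.sub('', m) for m in df['status_message']]
# feeding clean_messages to jieba_extract should drop the http/com/www tokens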
In [33]:
tpath = '/home/wy/font/NotoSansCJKtc-Black.otf'
# newer versions of wordcloud's fit_words expect a dict of word -> frequency
wordcloud = WordCloud(max_font_size=120, relative_scaling=.1, width=900, height=600, font_path=tpath).fit_words(dict(sorted_word_count))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
So we drop the 30 most frequent words and draw the word cloud again with sorted_word_count[30:].
In [34]:
tpath = '/home/wy/font/NotoSansCJKtc-Black.otf'
wordcloud = WordCloud(max_font_size=120, relative_scaling=.1, width=900, height=600, font_path=tpath).fit_words(dict(sorted_word_count[30:]))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [35]:
# Read the comment CSV
c_path = 'comment/'+page_id+'_comment.csv'
c_df = pd.read_csv(c_path)
In [36]:
c_df.head()
Out[36]:
In [37]:
c_df = c_df[c_df['comment_message'].notnull()].reset_index(drop=True)
In [38]:
sorted_comment_message = jieba_extract(list(c_df['comment_message']))
print (sorted_comment_message[:10])
tpath = '/home/wy/font/NotoSansCJKtc-Black.otf'
wordcloud = WordCloud(max_font_size=120, relative_scaling=.1, width=900, height=600, font_path=tpath).fit_words(dict(sorted_comment_message))
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [39]:
c_df = c_df[c_df['comment_author'].notnull()].reset_index(drop=True)
In [40]:
def word_count(data_list):
    # Count how many times each item appears (the same job collections.Counter does)
    d = {}
    for one in data_list:
        if one not in d:
            d[one] = 1
        else:
            d[one] += 1
    return d
In [41]:
d = word_count(list(c_df['comment_author']))
comment_authors = [(k, d[k]) for k in sorted(d, key=d.get, reverse=True)]
print (comment_authors[:10])
tpath = '/home/wy/font/NotoSansCJKtc-Black.otf'
wordcloud = WordCloud(max_font_size=120, relative_scaling=.1, width=900, height=600, font_path=tpath).fit_words(dict(comment_authors))
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Now that we have done all this analysis, can we export it as an Excel report?!
For writing Excel from Python, xlsxwriter is the tool!
In [42]:
import xlsxwriter
First we use pandas' describe (generate various summary statistics) to get a column's mean, median, standard deviation, quartiles, and so on.
In [43]:
df_num_reactions = df['num_reactions'].describe().to_frame(name='reactions')
df_num_reactions
Out[43]:
In [44]:
df_num_comments = df['num_comments'].describe().to_frame(name='comments')
df_num_comments
Out[44]:
In [45]:
df_num_shares = df['num_shares'].describe().to_frame(name='shares')
df_num_shares
Out[45]:
Then we write these DataFrames into an xlsx file.
Official docs: Working with Python Pandas and XlsxWriter
When writing to Excel, you need to specify where each piece goes:
First, writing a DataFrame into the xlsx:
df_num_reactions.to_excel(writer, sheet_name=page_id, startcol=0, startrow=0)
startcol, startrow : coordinate-style placement
sheet_name : the worksheet to write to
Second, drawing a native chart:
chart1.add_series({
'categories': '='+page_id+'!$A$13:$A$18',
'values': '='+page_id+'!$B$13:$B$18',
})
categories : the names come from cells A13-A18
values : the values come from cells B13-B18
Third, inserting an image:
worksheet.insert_image('L12', 'image/image1.png')
'L12' : the insertion position
'image/image1.png' : the image path
In [46]:
# Set the output path
excel_path = 'excel/'+page_id+'_analysis.xlsx'
writer = pd.ExcelWriter(excel_path, engine='xlsxwriter')
# Write the DataFrames into the xlsx
df_num_reactions.to_excel(writer, sheet_name=page_id, startcol=0, startrow=0)
df_num_comments.to_excel(writer, sheet_name=page_id, startcol=3, startrow=0)
df_num_shares.to_excel(writer, sheet_name=page_id, startcol=6, startrow=0)
delta_datetime_df.to_excel(writer, sheet_name=page_id, startcol=9, startrow=0)
df_status_type.to_excel(writer, sheet_name=page_id, startcol=0, startrow=11)
df_weekday.set_index('index').to_excel(writer, sheet_name=page_id, startcol=0, startrow=25)
df_hour.set_index('index').to_excel(writer, sheet_name=page_id, startcol=0, startrow=39)
# Draw native bar charts
workbook = writer.book
# Bar chart of post types
chart1 = workbook.add_chart({'type': 'column'})
chart1.add_series({
'categories': '='+page_id+'!$A$13:$A$18',
'values': '='+page_id+'!$B$13:$B$18',
})
chart1.set_title({'name': 'Post type counts'})
chart1.set_x_axis({'name': 'status_type'})
chart1.set_y_axis({'name': 'count'})
worksheet = writer.sheets[page_id]
worksheet.insert_chart('D12', chart1)
# Bar chart of posts by weekday
chart2 = workbook.add_chart({'type': 'column'})
chart2.add_series({
'categories': '='+page_id+'!$A$27:$A$33',
'values': '='+page_id+'!$B$27:$B$33',
})
chart2.set_title({'name': 'Posts by weekday'})
chart2.set_x_axis({'name': 'weekday'})
chart2.set_y_axis({'name': 'count'})
worksheet = writer.sheets[page_id]
worksheet.insert_chart('D26', chart2)
# Bar chart of posts by hour of day
chart3 = workbook.add_chart({'type': 'column'})
chart3.add_series({
'categories': '='+page_id+'!$A$41:$A$64',
'values': '='+page_id+'!$B$41:$B$64',
})
chart3.set_title({'name': 'Posts by hour of day'})
chart3.set_x_axis({'name': 'hour'})
chart3.set_y_axis({'name': 'count'})
worksheet = writer.sheets[page_id]
worksheet.insert_chart('D40', chart3)
# Demonstrate inserting an image: draw the figure, save it to disk, then insert it into the xlsx
df.plot(x='datetime', y=['num_likes', 'num_loves', 'num_wows', 'num_hahas', 'num_sads', 'num_angrys'])
plt.savefig('image/image1.png')
worksheet.insert_image('L12', 'image/image1.png')
# Close the writer so the xlsx is actually written to disk
writer.close()
Since dumping every minor piece of data into Excel would get cluttered, we don't write all of the analysis in.
To learn more ways of drawing native charts, go check out xlsxwriter!