In [4]:
import matplotlib
# Select the Tk-based GUI backend; this is done before pyplot is imported below.
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np
# Use the SimHei font for sans-serif text -- presumably so the Chinese titles and
# labels used in the plots render correctly (requires SimHei to be installed).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
# Project-local module; its datajsons() supplies the user records read below.
import datawash
In [7]:
# Walk over every user record and collect the counters needed for plotting.
voteupCountList = list()             # upvotes received
thankedCountList = list()            # thanks received
followingCountList = list()          # accounts the user follows
followerCountList = list()           # followers of the user
favoriteCountList = list()           # items the user has favourited
favoritedCountList = list()          # times the user's content was favourited
answerCountList = list()             # answers written
articlesCountList = list()           # articles written
questionCountList = list()           # questions asked
followingQuestionCountList = list()  # questions followed
followingTopicCountList = list()     # topics followed
followingFavlistsCountList = list()  # favourite-lists followed
followingColumnsCountList = list()   # columns followed

# Record key -> list that receives its value.
countFields = [
    ('voteupCount', voteupCountList),
    ('thankedCount', thankedCountList),
    ('followingCount', followingCountList),
    ('followerCount', followerCountList),
    ('favoriteCount', favoriteCountList),
    ('favoritedCount', favoritedCountList),
    ('answerCount', answerCountList),
    ('articlesCount', articlesCountList),
    ('questionCount', questionCountList),
    ('followingQuestionCount', followingQuestionCountList),
    ('followingTopicCount', followingTopicCountList),
    ('followingFavlistsCount', followingFavlistsCountList),
    ('followingColumnsCount', followingColumnsCountList),
]

# Number of records skipped because a field was missing or the record was
# not subscriptable; inspect it after the loop to judge data quality.
skippedUserCount = 0

jsons = datawash.datajsons()
for user in jsons:
    try:
        # Read every field before appending anything: a record that lacks a
        # key is dropped as a whole, so all lists keep the same length and
        # describe the same set of users.
        values = [user[key] for key, _ in countFields]
    except (KeyError, TypeError):
        # Only malformed records are ignored; unrelated errors (and Ctrl-C)
        # are no longer silently swallowed as with a bare `except`.
        skippedUserCount += 1
        continue
    for (_, target), value in zip(countFields, values):
        target.append(value)
In [5]:
# Colours used by the plots (hex RGB strings).
black = '#212121'
gray = '#727272'
red = '#D32F2F'  # titles, axis labels and the first histogram series
orange = '#FF9500'  # second histogram series
orange2 = '#FFF1DE'  # axes background colour
green = '#99cc33'  # third histogram series
brown = '#cc6600'  # fourth histogram series
In [6]:
# Distribution of upvotes and thanks received by the ~720k Zhihu users.
#   voteupCount   upvotes received
#   thankedCount  thanks received
fig, axes = plt.subplots(3, 2)
fig.set_size_inches(18, 10)
fig.suptitle('72万知乎用户获得赞同数和感谢数分布', fontsize=16, color=red)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)

# Flatten the 2-D grid of axes into a plain list, row by row.
subplotlist = list(axes.flat)

# Half-open value range [low, high) shown by each histogram.
edge = [[0, 100],
        [100, 1000],
        [1000, 10000],
        [10000, 100000],
        [100000, 1000000],
        [1000000, 4000000],
        ]
# Bin width of each histogram.
widthlist = [1, 5, 50, 500, 5000, 50000]

for ax, (low, high), width in zip(subplotlist, edge, widthlist):
    voteupCountArray = np.array([x for x in voteupCountList if low <= x < high])
    thankedCountArray = np.array([x for x in thankedCountList if low <= x < high])
    ax.hist(
        [voteupCountArray, thankedCountArray],
        # `normed` was removed from Axes.hist; density=False keeps raw counts.
        density=False,
        histtype='barstacked',
        label=['赞同数', '感谢数'],
        bins=int((high - low) / width),
        # Pin the bins to the declared interval; without `range` they would
        # span the data's min/max and the real bin width would not match the
        # width printed in the title.
        range=(low, high),
        color=[red, orange],
        alpha=0.7,
    )
    ax.legend(loc='best')
    ax.set_xlim(low, high)
    ax.set_title('%d-%d赞同数和感谢数分布(组距:%d)' % (low, high, width), color=red)
    ax.set_xlabel('赞同数和感谢数', color=red)
    ax.set_ylabel('用户数量(人)', color=red)
    ax.set_facecolor(orange2)
    ax.grid(True, linestyle='--')

# Fine-tuning: clip the dominant first panel and zoom in on the sparse last one.
axes[0, 0].set_ylim(0, 300000)
subplotlist[5].set_ylim(0, 10)
fig.show()
In [42]:
# Distribution of following and follower counts of the ~720k Zhihu users.
#   followingCount  accounts the user follows
#   followerCount   followers of the user
fig, axes = plt.subplots(3, 2)
fig.set_size_inches(18, 10)
fig.suptitle('72万知乎用户关注数和关注者数分布', fontsize=16, color=red)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)

# Flatten the 2-D grid of axes into a plain list, row by row.
subplotlist = list(axes.flat)

# Half-open value range [low, high) shown by each histogram.
edge = [[0, 100],
        [100, 1000],
        [1000, 10000],
        [10000, 100000],
        [100000, 1000000],
        [1000000, 3000000],
        ]
# Bin width of each histogram.
widthlist = [1, 5, 50, 500, 5000, 50000]

for ax, (low, high), width in zip(subplotlist, edge, widthlist):
    followingCountArray = np.array([x for x in followingCountList if low <= x < high])
    followerCountArray = np.array([x for x in followerCountList if low <= x < high])
    ax.hist(
        [followingCountArray, followerCountArray],
        # `normed` was removed from Axes.hist; density=False keeps raw counts.
        density=False,
        histtype='barstacked',
        label=['关注数', '关注者数'],
        bins=int((high - low) / width),
        # Pin the bins to the declared interval; without `range` they would
        # span the data's min/max and the real bin width would not match the
        # width printed in the title.
        range=(low, high),
        color=[red, orange],
        alpha=0.7,
    )
    ax.legend(loc='best')
    ax.set_xlim(low, high)
    ax.set_title('%d-%d关注数和关注者数分布(组距:%d)' % (low, high, width), color=red)
    ax.set_xlabel('关注数和关注者数', color=red)
    ax.set_ylabel('用户数量(人)', color=red)
    ax.set_facecolor(orange2)
    ax.grid(True, linestyle='--')

# Fine-tuning: zoom in on the sparse last panel.
subplotlist[5].set_ylim(0, 10)
fig.show()
In [43]:
# Distribution of favourited-answer counts of the ~720k Zhihu users.
#   favoriteCount   answers the user has favourited
#   favoritedCount  times the user's answers were favourited
fig, axes = plt.subplots(3, 2)
fig.set_size_inches(18, 10)
fig.suptitle('72万知乎用户答案收藏数和答案被收藏数分布', fontsize=16, color=red)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)

# Flatten the 2-D grid of axes into a plain list, row by row.
subplotlist = list(axes.flat)

# Half-open value range [low, high) shown by each histogram.
edge = [[0, 100],
        [100, 1000],
        [1000, 10000],
        [10000, 100000],
        [100000, 1000000],
        [1000000, 3000000],
        ]
# Bin width of each histogram.
widthlist = [1, 5, 50, 500, 5000, 50000]

for ax, (low, high), width in zip(subplotlist, edge, widthlist):
    favoriteCountArray = np.array([x for x in favoriteCountList if low <= x < high])
    favoritedCountArray = np.array([x for x in favoritedCountList if low <= x < high])
    ax.hist(
        [favoriteCountArray, favoritedCountArray],
        # `normed` was removed from Axes.hist; density=False keeps raw counts.
        density=False,
        histtype='barstacked',
        label=['收藏答案数', '答案被收藏数'],
        bins=int((high - low) / width),
        # Pin the bins to the declared interval; without `range` they would
        # span the data's min/max and the real bin width would not match the
        # width printed in the title.
        range=(low, high),
        color=[red, orange],
        alpha=0.7,
    )
    ax.legend(loc='best')
    ax.set_xlim(low, high)
    ax.set_title('%d-%d收藏答案数和答案被收藏数分布(组距:%d)' % (low, high, width), color=red)
    ax.set_xlabel('收藏答案数和答案被收藏数', color=red)
    ax.set_ylabel('用户数量(人)', color=red)
    ax.set_facecolor(orange2)
    ax.grid(True, linestyle='--')

# Fine-tuning: zoom in on the sparse last panel.
subplotlist[5].set_ylim(0, 10)
fig.show()
In [44]:
# Distribution of answer, article and question counts of the ~720k Zhihu users.
#   answerCount    answers written
#   articlesCount  articles written
#   questionCount  questions asked
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(18, 10)
fig.suptitle('72万知乎用户回答数,文章数和提问数分布', fontsize=16, color=red)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)

# Flatten the 2-D grid of axes into a plain list, row by row.
subplotlist = list(axes.flat)

# Half-open value range [low, high) shown by each histogram.
edge = [[0, 100],
        [100, 1000],
        [1000, 10000],
        [10000, 40000],
        ]
# Bin width of each histogram.
widthlist = [1, 5, 50, 500]

for ax, (low, high), width in zip(subplotlist, edge, widthlist):
    answerCountArray = np.array([x for x in answerCountList if low <= x < high])
    articlesCountArray = np.array([x for x in articlesCountList if low <= x < high])
    questionCountArray = np.array([x for x in questionCountList if low <= x < high])
    ax.hist(
        [answerCountArray, articlesCountArray, questionCountArray],
        # `normed` was removed from Axes.hist; density=False keeps raw counts.
        density=False,
        histtype='barstacked',
        label=['回答数', '文章数', '提问数'],
        bins=int((high - low) / width),
        # Pin the bins to the declared interval; without `range` they would
        # span the data's min/max and the real bin width would not match the
        # width printed in the title.
        range=(low, high),
        color=[red, orange, green],
        alpha=0.7,
    )
    ax.legend(loc='best')
    ax.set_xlim(low, high)
    ax.set_title('%d-%d回答数,文章数和提问数分布(组距:%d)' % (low, high, width), color=red)
    ax.set_xlabel('回答数,文章数和提问数', color=red)
    ax.set_ylabel('用户数量(人)', color=red)
    ax.set_facecolor(orange2)
    ax.grid(True, linestyle='--')

# Fine-tuning: zoom in on the sparse last panel.
subplotlist[3].set_ylim(0, 10)
fig.show()
In [45]:
# Distribution of followed questions, topics, favourite-lists and columns
# of the ~720k Zhihu users.
#   followingQuestionCount  questions followed
#   followingTopicCount     topics followed
#   followingFavlistsCount  favourite-lists followed
#   followingColumnsCount   columns followed
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(18, 10)
fig.suptitle('72万知乎用户关注问题数,关注话题数,关注收藏夹数和关注专栏数分布', fontsize=16, color=red)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)

# Flatten the 2-D grid of axes into a plain list, row by row.
subplotlist = list(axes.flat)

# Half-open value range [low, high) shown by each histogram.
edge = [[0, 100],
        [100, 1000],
        [1000, 10000],
        [10000, 40000],
        ]
# Bin width of each histogram.
widthlist = [1, 5, 50, 500]

for ax, (low, high), width in zip(subplotlist, edge, widthlist):
    followingQuestionCountArray = np.array(
        [x for x in followingQuestionCountList if low <= x < high])
    followingTopicCountArray = np.array(
        [x for x in followingTopicCountList if low <= x < high])
    followingFavlistsCountArray = np.array(
        [x for x in followingFavlistsCountList if low <= x < high])
    followingColumnsCountArray = np.array(
        [x for x in followingColumnsCountList if low <= x < high])
    ax.hist(
        [followingQuestionCountArray, followingTopicCountArray,
         followingFavlistsCountArray, followingColumnsCountArray],
        # `normed` was removed from Axes.hist; density=False keeps raw counts.
        density=False,
        histtype='barstacked',
        label=['关注问题数', '关注话题数', '关注收藏夹数', '关注专栏数'],
        bins=int((high - low) / width),
        # Pin the bins to the declared interval; without `range` they would
        # span the data's min/max and the real bin width would not match the
        # width printed in the title.
        range=(low, high),
        color=[red, orange, green, brown],
        alpha=0.7,
    )
    ax.legend(loc='best')
    ax.set_xlim(low, high)
    ax.set_title('%d-%d关注问题数,关注话题数,关注收藏夹数和关注专栏数(组距:%d)' % (low, high, width),
                 color=red)
    ax.set_xlabel('关注问题数,关注话题数,关注收藏夹数和关注专栏数', color=red)
    ax.set_ylabel('用户数量(人)', color=red)
    ax.set_facecolor(orange2)
    ax.grid(True, linestyle='--')

# Fine-tuning (currently disabled):
# subplotlist[3].set_ylim(0,10)
fig.show()
In [ ]: