In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
# Read the reply records: every line carries six tab-separated fields
# (page number, thread link, timestamp, author id, author name, reply text).
dtt = []
with open('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt', 'r') as f:
    for line in f:
        # maxsplit=5 keeps any tab characters inside the reply text in the
        # last field instead of failing the six-way unpacking below.
        pnum, link, time, author_id, author, content = line.replace('\n', '').split('\t', 5)
        dtt.append([pnum, link, time, author_id, author, content])
len(dtt)
Out[3]:
In [4]:
import pandas as pd

# Positional column -> descriptive name for the reply table.
# Note that 'author' holds the author id and 'author_name' the display name.
reply_columns = {
    0: 'page_num',
    1: 'link',
    2: 'time',
    3: 'author',
    4: 'author_name',
    5: 'reply',
}
dt = pd.DataFrame(dtt).rename(columns=reply_columns)
dt.head(5)
Out[4]:
In [5]:
# Extract the date part (first 10 characters) of every timestamp string.
# A list comprehension is used instead of map(): on Python 3 map() returns a
# lazy iterator, which pd.to_datetime cannot parse as a sequence of dates.
date = [stamp[:10] for stamp in dt.time]
dt['date'] = pd.to_datetime(date)
In [19]:
# Preview the first five rows of the reply table.
dt.head(5)
Out[19]:
In [7]:
import pandas as pd

# Thread list: a header-less, tab-separated file with one row per thread.
thread_columns = {
    0: 'title',
    1: 'link',
    2: 'author',
    3: 'author_page',
    4: 'click',
    5: 'reply',
    6: 'time',
}
df = pd.read_csv(
    '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt',
    sep="\t",
    header=None,
).rename(columns=thread_columns)
df.head(2)
Out[7]:
In [8]:
from collections import defaultdict

# Map every thread link to the list of author ids that posted in it
# (one entry per row of `dt`, so repeated posters appear repeatedly).
link_user_dict = defaultdict(list)
for thread_link, author_id in zip(dt.link, dt.author):
    link_user_dict[thread_link].append(author_id)
In [12]:
# Attach to every thread the length of its author list.
# NOTE(review): this counts posts (repeat posters included), not distinct
# authors; use len(set(...)) if unique participants are intended - confirm.
df['user'] = df.link.map(lambda thread_link: len(link_user_dict[thread_link]))
df.head(2)
Out[12]:
In [18]:
import statsmodels.api as sm
import numpy as np

# Log-log OLS of replies on users; +1 keeps zero counts finite under log.
x = np.log(df.user + 1)
y = np.log(df.reply + 1)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
r2 = res.rsquared

# Slope and goodness of fit, shown as an annotation inside the plot.
fit_summary = (r'$\beta$ = ' + str(round(beta, 2)) + '\n'
               + r'$R^2$ = ' + str(round(r2, 2)))

fig = plt.figure(figsize=(8, 4), facecolor='white')
plt.plot(df.user, df.reply, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x*beta), "-", label='Fit')
plt.yscale('log')
plt.xscale('log')
plt.xlabel(r'$Users$', fontsize=20)
plt.ylabel(r'$Replies$', fontsize=20)
plt.text(max(df.user)/300, max(df.reply)/20, fit_summary)
plt.legend(loc=2, fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
In [14]:
# Log-log OLS of clicks on users; +1 keeps zero counts finite under log.
x = np.log(df.user+1)
y = np.log(df.click+1)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
r2 = res.rsquared

fig = plt.figure(figsize=(8, 4), facecolor='white')
plt.plot(df.user, df.click, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x*beta), "-", label='Fit')
plt.yscale('log')
plt.xscale('log')
plt.xlabel(r'$Users$', fontsize=20)
# This figure plots df.click, so the y axis is labelled accordingly
# (the label was previously copied from the replies figure).
plt.ylabel(r'$Clicks$', fontsize=20)
plt.text(max(df.user)/300, max(df.click)/20,
         r'$\beta$ = ' + str(round(beta, 2)) + '\n' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2, fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
In [9]:
# Parse the timestamp strings into datetimes, then split out calendar parts.
dt['time'] = pd.to_datetime(dt['time'])
posted_at = dt['time'].dt
dt['month'] = posted_at.month
dt['year'] = posted_at.year
dt['day'] = posted_at.day
type(dt['time'][0])
Out[9]:
In [80]:
# Rows of `dt` per year (dd) and their running total (ds), ordered by year.
d = dt.year.value_counts()
# Name the column explicitly: the name pandas gives a value_counts() result
# differs between releases ('count' from 2.0 on), while later cells read
# `dd.year` / `ds.year`.
dd = d.to_frame('year')
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()
In [81]:
def getDate(dat):
    """Return January 1st of every year label in ``dat.index``.

    Parameters
    ----------
    dat : object with an ``index`` attribute (e.g. a DataFrame)
        Its index values are years; each is rendered with ``str()``.

    Returns
    -------
    The result of ``pd.to_datetime`` on the ``'<year>-01-01'`` strings,
    in index order.
    """
    # Build a real list: on Python 3 map() yields a lazy iterator, which
    # pd.to_datetime does not accept as a sequence of date strings.
    dat_date_str = [str(year) + '-01-01' for year in dat.index]
    dat_date = pd.to_datetime(dat_date_str)
    return dat_date
# Store the dates as a real 'date' column. Assigning to a new attribute name
# (`ds.date = ...`) does not create a column in pandas; it only sets an
# instance attribute that is dropped by copies and slices.
ds['date'] = getDate(ds)
dd['date'] = getDate(dd)
In [82]:
# Yearly counts (red) against their cumulative total (green).
# The labels are raw strings: '\:' is mathtext spacing, not a Python escape,
# and an unrecognised escape in a normal string triggers a syntax warning.
fig = plt.figure(figsize=(12, 5))
plt.plot(ds.date, ds.year, 'g-s', label=r'$Cumulative\: Number\:of\: Threads$')
plt.plot(dd.date, dd.year, 'r-o', label=r'$Yearly\:Number\:of\:Threads$')
#plt.yscale('log')
plt.legend(loc=2, numpoints=1, fontsize=13)
plt.show()
In [98]:
# Preview the first 55 reply texts.
dt.reply.head(55)
Out[98]:
@贾也2012-10-297:59:00 导语:人人宁波,面朝大海,春暖花开 ........
@兰质薰心2012-10-2908:55:52 楼主好文! 相信政府一定有能力解决好这些...
回复第20楼,@rual_f “我相信官场中,许多官员应该葆有社会正能量” 通篇好文,顶...
In [96]:
import re

# Toy example: pull the user names out of a chain of '//@name:' retweet marks.
tweet = (u"//@lilei: dd //@Bob: cc//@Girl: dd//@魏武: "
         u"利益所致 自然念念不忘// @诺什: 吸引优质 客户,摆脱屌丝男!!!//@MarkGreene: 转发微博")
# One or two slashes directly followed by '@' and a run of word characters;
# re.UNICODE lets \w match non-ASCII names as well.
RTpattern = r'''//?@(\w+)'''
for word in re.findall(RTpattern, tweet, re.UNICODE):
    # print() with a single argument behaves the same on Python 2 and 3.
    print(word)
In [152]:
# '@name' followed by whitespace marks a user addressed inside a reply.
RTpattern = r'''@(\w+)\s'''
reply_text = dt.reply[11]
# Decode only byte strings, so the cell also works when the replies are
# already unicode text (e.g. on Python 3, where str has no .decode()).
if isinstance(reply_text, bytes):
    reply_text = reply_text.decode('utf8')
re.findall(RTpattern, reply_text, re.UNICODE)
Out[152]:
In [154]:
# Does the first reply address anybody with '@name '?
first_reply = dt.reply[0]
# Decode only byte strings so already-unicode text is accepted as well.
if isinstance(first_reply, bytes):
    first_reply = first_reply.decode('utf8')
# bool() of the match list prints the same True/False as the if/else did;
# print() with one argument is valid on both Python 2 and 3.
print(bool(re.findall(RTpattern, first_reply, re.UNICODE)))
In [121]:
# List who addresses whom in the first 100 replies:
# row number, author name, addressed user, start of the reply text.
RTpattern = r'''@(\w+)\s'''
for k, tweet in enumerate(dt.reply[:100]):
    author_name = dt.author_name[k]
    # Decode only byte strings so already-unicode text is accepted as well.
    if isinstance(tweet, bytes):
        tweet = tweet.decode('utf8')
    if isinstance(author_name, bytes):
        author_name = author_name.decode('utf8')
    for person in re.findall(RTpattern, tweet, re.UNICODE):
        # Single-argument print() works on Python 2 and 3; the template
        # reproduces the spacing the Python 2 print statement emitted.
        print(u'{0} \t{1} \t{2} \t\t{3}'.format(k, author_name, person, tweet[:30]))
In [109]:
# Show one full reply; print() with one argument runs on Python 2 and 3.
print(dt.reply[80])
In [158]:
# Thread link -> author of the thread; for a repeated link the last row wins.
link_author_dict = dict(zip(df.link, df.author))
In [176]:
def _as_text(value):
    """Return `value` as unicode text, decoding UTF-8 byte strings only."""
    return value.decode('utf8') if isinstance(value, bytes) else value


# Edge list of the reply network as [replier, target] pairs: a reply points to
# every '@name ' it contains, or to the author of its thread when it has none.
RTpattern = r'''@(\w+)\s'''
mention_re = re.compile(RTpattern, re.UNICODE)  # compiled once, not per row
graph = []
# zip() walks the three columns in step, avoiding a label lookup per row.
for author_name, url, tweet in zip(dt.author_name, dt.link, dt.reply):
    author_name = _as_text(author_name)
    persons = mention_re.findall(_as_text(tweet))
    if persons:
        for person in persons:
            graph.append([author_name, person])
    else:
        # A link missing from link_author_dict raises KeyError, as before.
        graph.append([author_name, _as_text(link_author_dict[url])])
In [177]:
# Number of [replier, target] pairs collected in `graph`.
len(graph)
Out[177]:
In [178]:
# Show the first three edges. Dedicated loop names keep the regression
# variables `x` / `y` from being overwritten, and the single-argument
# print() runs on both Python 2 and 3.
for source, target in graph[:3]:
    print(u'{0} {1}'.format(source, target))
In [179]:
import networkx as nx
In [180]:
# Directed reply network; pairs where a user addresses themselves are skipped.
G = nx.DiGraph()
G.add_edges_from(
    (source, target) for source, target in graph if source != target
)
In [181]:
# Summary of the directed graph. nx.info() was removed in NetworkX 3.0, so
# the text is assembled from accessors that exist in every release.
'Type: {0}\nNumber of nodes: {1}\nNumber of edges: {2}'.format(
    type(G).__name__, G.number_of_nodes(), G.number_of_edges())
Out[181]:
In [182]:
# Keep only mutual (two-way) ties, then split the result into components.
GU = G.to_undirected(reciprocal=True)
# nx.connected_component_subgraphs() was removed in NetworkX 2.4; building
# the subgraphs from connected_components() works on old and new releases.
# Sorting by size (largest first) makes the order deterministic.
graphs = [
    GU.subgraph(nodes).copy()
    for nodes in sorted(nx.connected_components(GU), key=len, reverse=True)
]
In [185]:
import numpy as np

# Node count of every component; report how many there are and the largest.
size = [len(component.nodes()) for component in graphs]
len(size), np.max(size)
Out[185]:
In [190]:
# Keep the components that have more than five nodes.
gs = [component for component in graphs if len(component.nodes()) > 5]
len(gs)
Out[190]:
In [191]:
# Node count of each retained component; print() with a single argument is
# valid on both Python 2 and 3.
for g in gs:
    print(len(g.nodes()))
In [192]:
# Select the largest retained component explicitly: the order in which
# NetworkX yields components is not guaranteed, so gs[0] need not be the
# biggest one. len(graph) is its number of nodes; ties keep the first.
g_max = max(gs, key=len)
len(g_max.nodes())
Out[192]:
In [198]:
# Compute node positions with the spring layout (other layout functions
# would arrange the same graph differently).
pos = nx.spring_layout(g_max)
# Draw the component: with_labels toggles the node labels and node_size
# sets how large each node marker is drawn.
nx.draw(g_max, pos=pos, with_labels=False, node_size=30)
# Display the figure.
plt.show()
In [203]:
import io

# Write the edges of the selected component as 'source,target' lines.
# io.open with an explicit encoding takes unicode text on Python 2 and 3 and
# produces the same UTF-8 bytes as encoding each name by hand.
# NOTE(review): mode 'a' appends, so re-running this cell duplicates the
# rows already in the file - confirm whether overwriting ('w') is intended.
with io.open('/Users/chengjun/github/cjc2016/data/tianya_network_120.csv', 'a',
             encoding='utf8') as f:
    for x, y in g_max.edges():
        f.write(u'{0},{1}\n'.format(x, y))
In [ ]: