In [23]:
# Read roughly the first 1 MB worth of lines from the raw OWS tweet dump
# and write them out as a smaller sample file for the rest of the notebook.
# NOTE(review): absolute local paths — consider a configurable DATA_DIR.
chunkSize = 1000000
# FIX: the original left the input file handle open; use `with` so it is
# closed deterministically.
with open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-raw.txt', 'r') as bigfile:
    chunk = bigfile.readlines(chunkSize)  # readlines(hint) stops after ~hint bytes
print(len(chunk))
with open("/Users/chengjun/GitHub/cjc/data/ows_tweets_sample.txt", 'w') as f:
    for line in chunk:
        f.write(line)
In [6]:
# https://stackoverflow.com/questions/519633/lazy-method-for-reading-big-file-in-python?lq=1
import csv
# Stream the raw file in ~10**8-byte chunks and count CSV records without
# loading the entire file into memory.
# FIX: the original never closed the file; use `with`.
chunkSize = 10**8
num, num_lines = 0, 0
with open('/Users/datalab/bigdata/cjc/ows-raw.txt', 'r') as bigfile:
    chunk = bigfile.readlines(chunkSize)
    while chunk:
        # Strip NUL bytes — csv.reader raises on lines containing '\x00'.
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        # do sth.
        num_lines += len(list(lines))
        print(num, num_lines)
        num += 1
        chunk = bigfile.readlines(chunkSize)  # read another chunk
In [ ]:
import pandas as pd
# Read the raw file in chunks via pandas' iterator interface, clean each
# chunk, and concatenate the cleaned pieces into one DataFrame.
# FIX: the original never closed the file; use `with`.
chunkSize = 100000
data = []
with open('../bigdata/OWS/ows-raw.txt', encoding='utf-8') as f:
    # error_bad_lines=False skips malformed rows.
    # NOTE(review): deprecated since pandas 1.3 — newer pandas uses
    # on_bad_lines='skip' instead.
    reader = pd.read_table(f, sep=',', iterator=True, error_bad_lines=False)
    loop = True
    while loop:
        try:
            chunk = reader.get_chunk(chunkSize)
            # Placeholder cleaning step — defined elsewhere (name kept,
            # including its original spelling).
            dat = data_cleaning_funtion(chunk)
            data.append(dat)
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
df = pd.concat(data, ignore_index=True)
In [7]:
# Load the sample file created above into a list of raw text lines.
with open("../data/ows_tweets_sample.txt", 'r') as f:
    lines = f.readlines()
In [8]:
# Total number of lines read from the sample file.
len(lines)
Out[8]:
In [9]:
# Inspect one raw record (index 15 — not the first line, despite the
# original comment) to see the CSV format.
lines[15]
Out[9]:
In [40]:
# Show the documentation for str.split, used below to tokenize records.
help(lines[1].split)
In [10]:
# Header row: strip the trailing newline and split on commas to get the
# column names.
varNames = lines[0].replace('\n', '').split(',')
varNames
Out[10]:
In [11]:
# Number of columns in the header.
len(varNames)
Out[11]:
In [12]:
# Inspect raw line 1344 — presumably a record broken across physical
# lines; verify against the reassembly below.
lines[1344]
Out[12]:
In [7]:
# Rebuild records that were split across physical lines: keep appending
# raw lines to a buffer until it contains >= 14 comma-separated fields,
# then accept the buffer as one complete record and write it out.
with open("../data/ows_tweets_sample_clean.txt", 'w') as f:
    right_line = ''   # accumulates a (possibly multi-line) record
    blocks = []       # completed records
    for raw in lines:
        right_line += raw.replace('\n', ' ')
        if len(right_line.split(',')) >= 14:
            blocks.append(right_line)
            right_line = ''
    for record in blocks:
        f.write(record + '\n')
In [8]:
# Number of reassembled records.
len(blocks)
Out[8]:
In [9]:
# Inspect reassembled record 1344 (compare with the raw lines[1344] above).
blocks[1344]
Out[9]:
In [13]:
import re
# Quote-aware split: break only on `,"` or `",` so commas inside quoted
# text fields do not fragment the record.
re.split(',"|",', lines[15])
Out[13]:
In [14]:
import re

# Re-read the sample and report, for lines 35-49, how many fields the
# quote-aware split yields — a quick diagnostic for broken records.
with open("../data/ows_tweets_sample.txt", 'r') as f:
    lines = f.readlines()
for idx in range(35, 50):
    fields = re.split(',"|",', lines[idx])
    print('line =', idx, ' length =', len(fields))
In [15]:
# Same record-reassembly idea as before, but split with the quote-aware
# regex and require >= 6 fields; carriage returns are stripped too.
with open("../data/ows_tweets_sample_clean4.txt", 'w') as f:
    right_line = ''   # accumulates a (possibly multi-line) record
    blocks = []       # completed records
    for raw in lines:
        right_line += raw.replace('\n', ' ').replace('\r', ' ')
        #line_length = len(right_line.split(','))
        fields = re.split(',"|",', right_line)
        if len(fields) >= 6:
            blocks.append(right_line)
            right_line = ''
    # Writing was disabled in the original cell; kept commented out.
    # for i in blocks:
    #     f.write(i + '\n')
In [16]:
# Number of records reassembled with the regex-based splitter.
len(blocks)
Out[16]:
In [18]:
# Hint: you may need to adjust the path below.
with open("../data/ows_tweets_sample.txt", 'r') as f:
    chunk = f.readlines()
In [19]:
# Number of raw lines read.
len(chunk)
Out[19]:
In [20]:
# Peek at the first three raw lines.
chunk[:3]
Out[20]:
In [21]:
import csv

# Parse the chunk with a proper CSV reader (handles commas inside quoted
# fields) and count the resulting records.
lines_csv = csv.reader(chunk, delimiter=',', quotechar='"')
print(len(list(lines_csv)))
# next(lines_csv)
# next(lines_csv)
In [27]:
import re
import csv
from collections import defaultdict


def extract_rt_user(tweet):
    """Return the mention text credited by the first 'RT @...'/'via @...'
    in `tweet` (case-insensitive), stripped of leading spaces and '@';
    None when no retweet marker is found.

    Note: the second capture group may span several mentions, e.g.
    'RT @a: @b' yields 'a: @b'.
    """
    pattern = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    hits = pattern.findall(tweet)
    if not hits:
        return None
    return hits[0][1].strip(' @')
# Build a weighted retweet-edge counter (from_user, retweeted_user) -> count,
# streaming the file in ~100 KB chunks.
# FIX: the original left `f` open forever; use `with` so it is closed.
rt_network = defaultdict(int)
with open("../data/ows_tweets_sample.txt", 'r') as f:
    chunk = f.readlines(100000)
    while chunk:
        #lines = csv.reader(chunk, delimiter=',', quotechar='"')
        # Strip NUL bytes so csv.reader does not raise on them.
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        for line in lines:
            tweet = line[1]      # column 1 — presumably the Text field; verify against header
            from_user = line[8]  # column 8 — presumably 'From User'; verify against header
            rt_user = extract_rt_user(tweet)
            # Note: edges with rt_user == None are counted too (original behavior).
            rt_network[(from_user, rt_user)] += 1
        chunk = f.readlines(100000)
In [22]:
import pandas as pd

# Let pandas parse the sample directly; it handles quoted commas natively.
df = pd.read_csv("../data/ows_tweets_sample.txt", sep=',', quotechar='"')
df[:3]
Out[22]:
In [23]:
# Number of parsed rows.
len(df)
Out[23]:
In [24]:
# Text of the first tweet.
df.Text[0]
Out[24]:
In [25]:
# First ten senders.
df['From User'][:10]
Out[25]:
In [26]:
from collections import defaultdict

# Count how many tweets each user posted.
data_dict = defaultdict(int)
for user in df['From User']:
    data_dict[user] += 1
In [27]:
# Peek at a few (user, tweet_count) pairs.
list(data_dict.items())[:5]
#data_dict
Out[27]:
In [28]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
In [29]:
# Histogram of tweets-per-user counts.
plt.hist(data_dict.values())
#plt.yscale('log')
#plt.xscale('log')
plt.xlabel(u'发帖数', fontsize = 20)
plt.ylabel(u'人数', fontsize = 20)
plt.show()
In [30]:
# Frequency-of-frequencies: how many users posted exactly k tweets,
# shown on log-log axes as an informal power-law check.
tweet_dict = defaultdict(int)
for count in data_dict.values():
    tweet_dict[count] += 1
plt.loglog(tweet_dict.keys(), tweet_dict.values(), 'ro')  # linewidth=2)
plt.xlabel(u'推特数', fontsize=20)
plt.ylabel(u'人数', fontsize=20 )
plt.show()
In [31]:
import numpy as np
import statsmodels.api as sm


def powerPlot(d_value, d_freq, color, marker):
    """Plot an empirical distribution on log-log axes with an OLS
    power-law fit: log(prob) ~ constant + beta * log(value).

    d_value: values (e.g. bin centers); d_freq: raw frequencies per value.
    Annotates the figure with the fitted beta and R^2.
    """
    smoothed = [f + 1 for f in d_freq]          # +1 so log() is always defined
    total = sum(smoothed)
    d_prob = [float(f) / total for f in smoothed]
    #d_rank = ss.rankdata(d_value).astype(int)
    log_v = np.log(d_value)
    log_p = np.log(d_prob)
    design = sm.add_constant(log_v, prepend=True)
    fit = sm.OLS(log_p, design).fit()
    constant, beta = fit.params
    r2 = fit.rsquared
    plt.plot(d_value, d_prob, linestyle='', color=color, marker=marker)
    plt.plot(d_value, np.exp(constant + log_v * beta), "red")
    plt.xscale('log')
    plt.yscale('log')
    plt.text(max(d_value) / 2, max(d_prob) / 10,
             r'$\beta$ = ' + str(round(beta, 2)) + '\n' +
             r'$R^2$ = ' + str(round(r2, 2)), fontsize=20)
In [31]:
# Bin the per-user tweet counts into 15 bins and run the power-law
# fit/plot over the binned distribution.
histo, bin_edges = np.histogram(list(data_dict.values()), 15)
bin_center = 0.5*(bin_edges[1:] + bin_edges[:-1])
powerPlot(bin_center,histo, 'r', '^')
#lg=plt.legend(labels = [u'Tweets', u'Fit'], loc=3, fontsize=20)
plt.ylabel(u'概率', fontsize=20)
plt.xlabel(u'推特数', fontsize=20)
plt.show()
In [32]:
import statsmodels.api as sm
from collections import defaultdict
import numpy as np


def powerPlot2(data):
    """Fit and plot a power law over a sample of counts.

    Builds the frequency table of `data`, applies +1 smoothing to both
    values and frequencies (so log() is defined), fits
    log(prob) = constant + beta * log(value) by OLS, and plots the
    empirical points with the fitted line on log-log axes.
    """
    d = sorted(data, reverse=True)
    d_table = defaultdict(int)
    for k in d:
        d_table[k] += 1
    keys = sorted(d_table)
    # BUG FIX: look up frequencies with the *original* keys. The old code
    # shifted the values by +1 first and then indexed d_table with the
    # shifted values, silently fetching the wrong (often zero, via the
    # defaultdict) frequencies.
    d_freq = [d_table[k] + 1 for k in keys]
    d_value = [k + 1 for k in keys]
    d_prob = [float(i) / sum(d_freq) for i in d_freq]
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, 'ro')
    plt.plot(d_value, np.exp(constant + x * beta), "red")
    plt.xscale('log'); plt.yscale('log')
    plt.text(max(d_value) / 2, max(d_prob) / 5,
             'Beta = ' + str(round(beta, 2)) + '\n' +
             'R squared = ' + str(round(r2, 2)))
    plt.title('Distribution')
    plt.ylabel('P(K)')
    plt.xlabel('K')
    plt.show()
In [33]:
# Power-law fit over the raw (unbinned) per-user counts.
powerPlot2(data_dict.values())
In [34]:
import powerlaw


def plotPowerlaw(data, ax, col, xlab):
    """Fit a discrete power law (xmin fixed at 2) with the `powerlaw`
    package and draw both the empirical PDF and the fitted PDF on `ax`.

    NOTE(review): the legend formats alpha with %d, truncating the fitted
    exponent to an integer — confirm that is intended.
    """
    fit = powerlaw.Fit(data, xmin=2)
    #fit = powerlaw.Fit(data)
    fit.plot_pdf(color=col, linewidth=2)
    a, x = fit.power_law.alpha, fit.power_law.xmin
    fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax,
                           label=r"$\alpha = %d \:\:, x_{min} = %d$" % (a, x))
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$Probability$', fontsize=20)
    plt.legend(loc=0, frameon=False)
In [35]:
from collections import defaultdict

# (Re)build the tweets-per-user counter — duplicates the earlier cell so
# this section can run independently.
data_dict = defaultdict(int)
for user in df['From User']:
    data_dict[user] += 1
In [36]:
import matplotlib.cm as cm

# Power-law fit of the tweets-per-user distribution, drawn on one axes.
cmap = cm.get_cmap('rainbow_r', 6)
fig = plt.figure(figsize=(6, 4), facecolor='white')
ax = fig.add_subplot(1, 1, 1)
plotPowerlaw(list(data_dict.values()), ax, cmap(1), '$Tweets$')
In [1]:
# Example tweet used below to exercise mention/URL/hashtag extraction.
tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
In [32]:
import re
import twitter_text
# https://github.com/dryan/twitter-text-py/issues/21
# Macintosh HD ▸ 用户 ▸ datalab ▸ 应用程序 ▸ anaconda ▸ lib ▸ python3.5 ▸ site-packages
# FIX: the lines below were bare prose in the original cell, which is a
# SyntaxError — kept here as comments.
# twitter-text-py could not be used for python 3:
#   pip install twitter-text
# Glyph debugged the problem and made a new repo, twitter-text-py3:
#   pip install twitter-text
In [35]:
import re

# Retweet-pattern demo: capture 'RT'/'via' plus the mention block that
# follows it. Note the mention block can span several names.
tweet = '''RT @AnonKitsu: @who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')  # .split(':')[0] omitted here
rt_user_name
Out[35]:
In [28]:
import re

# Same demo as above, but truncating at the first ':' so only the first
# mentioned screen name remains.
tweet = '''RT @AnonKitsu: @who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @').split(':')[0]
rt_user_name
Out[28]:
In [36]:
import re

# Negative case: there is no 'RT @...'/'via @...' marker, so findall
# returns an empty list.
tweet = '''@chengjun:@who ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
rt_user_name = rt_patterns.findall(tweet)
print(rt_user_name)
if rt_user_name:
    print('it exists.')  # FIX: message typo — was 'it exits.'
else:
    print('None')
In [37]:
import re


def extract_rt_user(tweet):
    """Return the first retweeted screen name in `tweet`, or None.

    Matches 'RT @name' / 'via @name' (case-insensitive) and truncates at
    the first ':' so trailing mentions are dropped. Redefines the earlier
    version of this function, adding the ':' truncation.
    """
    pattern = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    hits = pattern.findall(tweet)
    if not hits:
        return None
    return hits[0][1].strip(' @').split(':')[0]
In [38]:
# Positive case: leading 'RT @chengjun:' should yield 'chengjun'.
tweet = '''RT @chengjun: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
extract_rt_user(tweet)
Out[38]:
In [39]:
# Negative case: no 'RT'/'via' before the mention, so the result is None.
tweet = '''@chengjun: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
print(extract_rt_user(tweet) )
In [40]:
import csv

# Re-read the sample and pull (tweet_text, from_user) pairs from every
# row, skipping the header line.
with open("../data/ows_tweets_sample.txt", 'r') as f:
    chunk = f.readlines()
rt_network = []
lines = csv.reader(chunk[1:], delimiter=',', quotechar='"')
# Columns 1 and 8 — presumably Text and 'From User', matching the
# df.Text / df['From User'] usage above; verify against the header.
tweet_user_data = [(row[1], row[8]) for row in lines]
tweet_user_data[:3]
Out[40]:
In [41]:
from collections import defaultdict

# Build the retweet edge list and a weighted (user, rt_user) counter,
# keeping only tweets where a retweeted user was detected.
rt_network = []
rt_dict = defaultdict(int)
for tweet, user in tweet_user_data:
    rt_user = extract_rt_user(tweet)
    if rt_user:
        rt_network.append((user, rt_user))  # (rt_user, ' ', user, end='\n')
        rt_dict[(user, rt_user)] += 1
#rt_network[:5]
list(rt_dict.items())[:3]
Out[41]:
In [42]:
def extract_tweet_text(tweet, at_names, urls):
    """Strip mentions, URLs and markup characters from `tweet`.

    Removals are plain substring replacements, so every occurrence of
    each name/url is deleted — and the final pass removes ALL spaces.
    """
    for name in at_names:
        tweet = tweet.replace(name, '')
    for url in urls:
        tweet = tweet.replace(url, '')
    # Order matters: 'RT @' must be removed before the bare '@'.
    for mark in ('RT @', '@', '"', '#', '\n', '\t', ' '):
        tweet = tweet.replace(mark, '')
    return tweet
In [43]:
import twitter_text

# Demo: pull mentions, URLs and hashtags from one tweet with
# twitter_text, plus the retweeted user via our regex helper.
tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
ex = twitter_text.Extractor(tweet)
at_names = ex.extract_mentioned_screen_names()
urls = ex.extract_urls()
hashtags = ex.extract_hashtags()
rt_user = extract_rt_user(tweet)
#tweet_text = extract_tweet_text(tweet, at_names, urls)
print(at_names, urls, hashtags, rt_user, '-------->')  # , tweet_text)
In [44]:
import csv

# Extract just the tweet-text column from every row (header included).
lines = csv.reader(chunk, delimiter=',', quotechar='"')
tweets = [row[1] for row in lines]
In [45]:
# Run the extractor over the first few tweets and print what it finds.
for tweet in tweets[:5]:
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    rt_user = extract_rt_user(tweet)
    #tweet_text = extract_tweet_text(tweet, at_names, urls)
    print(at_names, urls, hashtags, rt_user)
    #print(tweet_text)
In [ ]: