We want to identify interesting hashtags that match the following characteristics:
We will use D3.js (through mpld3) to allow interactive exploration of the results.
In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import font_manager
import pandas as pd
import mpld3, mpld3.plugins
# Switch notebook figure display over to mpld3, pointing it at D3 v3.4.1 on the cdnjs CDN.
# NOTE(review): the URL is passed positionally; which parameter it binds to depends on the
# installed mpld3 version — confirm against the enable_notebook signature in use.
mpld3.enable_notebook("//cdnjs.cloudflare.com/ajax/libs/d3/3.4.1/d3.min.js")
Let's parse the data
In [15]:
# --- Input location ---------------------------------------------------------
# NOTE(review): absolute, machine-specific path — consider making it configurable.
root_path="/home/clemsos/Dev/mitras/out/hashtags/"
data_path=root_path+"data/"  # not used in this cell; kept in case other cells rely on it

# --- Filtering thresholds (a value of 0 disables the corresponding filter) ---
min_tweets=1000        # keep hashtags with strictly more tweets than this
min_conversation=1000  # keep hashtags with strictly more conversation than this
max_conversation=0     # keep hashtags with strictly less conversation than this
skip_hashtags=30       # number of leading rows of the CSV to discard

# read data with pandas
df=pd.read_csv(root_path+"hashtags_stats.csv")

# Discard the first `skip_hashtags` rows to suppress noise in the data.
# Assumes the CSV lists the biggest hashtags first — TODO confirm the sort order.
if skip_hashtags!=0:
    df=df.drop(df.index[:skip_hashtags])

# keep only hashtags with more than `min_tweets` tweets
if min_tweets!=0:
    df=df[df['tweets'] > min_tweets]

# keep only hashtags whose conversation count lies strictly between
# `min_conversation` and `max_conversation` (each bound is optional)
if min_conversation!=0:
    df=df[df['conversation'] > min_conversation]
if max_conversation!=0:
    df=df[df['conversation'] < max_conversation]

# remove a specific value (for graph purposes)
df = df[df.label != "失独母亲"]

# Show the filtered frame as the cell output.
df
Out[15]:
Next, we draw a graph to better observe the memes that are best suited to our study.
In [3]:
# Font properties used to draw the Chinese hashtag labels in matplotlib text.
# The font file is loaded from an explicit path and rendered at size 16.
font_path = '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc'
fontP = font_manager.FontProperties(fname=font_path, size=16)
In [17]:
# Scatter plot of the filtered hashtags: x = tweet volume, y = conversation activity.
fig, ax = plt.subplots(figsize=(20, 10))

# Light grey plot background. The `axisbg` Axes keyword was replaced by
# `facecolor` in matplotlib 2.0 and later removed, so call whichever setter
# the installed matplotlib provides instead of passing `axisbg`.
if hasattr(ax, 'set_facecolor'):
    ax.set_facecolor('#EEEEEE')
else:
    ax.set_axis_bgcolor('#EEEEEE')

# Marker area grows with the tweet count; marker colour follows the conversation count.
scatter = ax.scatter(df.tweets,
                     df.conversation,
                     s=df["tweets"]/50,
                     c=df.conversation,
                     alpha=0.5,
                     cmap=plt.cm.jet)

# legends and title
title = "Hashtags distribution (more than %s tweets and %s actions)" % (min_tweets, min_conversation)
ax.set_title(title, size=20)
ax.set_xlabel('Tweets volume')
ax.set_ylabel('Conversation activity')
ax.grid(color='lightgray', alpha=0.7)
plt.subplots_adjust(bottom=0.1)

# One text label per plotted point, in the same order as the rows of `df`.
# Byte strings are decoded from UTF-8; values that are already text are kept
# as they are, so the cell no longer fails when the labels are not bytes.
labels = [raw.decode('utf-8') if isinstance(raw, bytes) else raw
          for raw in df.label]

# interactive chart: attach the labels to the scatter points as mpld3 tooltips
fig.plugins = [mpld3.plugins.PointLabelTooltip(scatter, labels)]
In [ ]:
# Attach a boxed, arrowed text annotation carrying the hashtag label to every point.
# The annotations are added through `ax` rather than `plt.annotate`: pyplot
# functions act on the *current* axes, and the inline backend closes figures at
# the end of each cell, so `plt.annotate` in this cell would draw on a new,
# empty figure instead of the scatter plot.
for label, x, y in zip(labels, df.tweets, df.conversation):
    ax.annotate(
        label,
        xy=(x, y), xytext=(-20, 20),
        fontproperties=fontP,
        textcoords='offset points', ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

# Re-display the annotated figure as this cell's output.
fig
In [ ]:
# Debug helper: print every label, and print the position of the hashtag
# "失独母亲" just before its label if it is present in `labels`.
# Written with print() calls and a unicode literal so the cell runs on both
# Python 2 and Python 3 (the original used Python-2-only print statements
# and str.decode).
# NOTE(review): the original indentation was lost; printing each label is
# assumed to happen inside the loop — confirm.
target = u"失独母亲"
for idx, label in enumerate(labels):
    if label == target:
        print(idx)
    print(label)
In [ ]:
In [ ]: