To detect memes in a large corpus, some recent research proposes to use the concept of protomemes as the basis of a clustering algorithm (Ferrara, 2013). "Protomeme" is short for "prototype meme", a meme under construction (Gabora, 1997). In the context of social media, we can describe protomemes as the minimal units contained in tweets: hashtags, mentions and URLs.
By looking at the evolution of protomemes, we can identify memes in formation.
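To make this concrete, here is a minimal, purely illustrative sketch of what these units look like for a single post. It is not the lib.tweetminer implementation used below (which also returns the cleaned text); the regexes and the sample post are just toy examples.
In [ ]:
import re

# Purely illustrative stand-in for lib.tweetminer.extract_tweet_entities
def extract_protomeme_units(text):
    hashtags = re.findall(r"#(\w+)", text)
    mentions = re.findall(r"@(\w+)", text)
    urls = re.findall(r"https?://\S+", text)
    return mentions, urls, hashtags

sample = "RT @alice look at this http://example.com/x #meme"
print(extract_protomeme_units(sample))
# (['alice'], ['http://example.com/x'], ['meme'])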
In [ ]:
import os
import csv
from time import time
from bson.errors import InvalidStringData
from lib.mongo import MongoDB
from models.tweet import Tweet # mongo data model
from lib.nlpminer import NLPMiner # NLP helpers (module path assumed)
import lib.tweetminer as minetweet

# Connect to Mongo
db=MongoDB("weibodata").db

# Download files (56GB, may take some time)
# os.system('wget -r "http://147.8.142.179/datazip/"')

# Where the downloaded raw files are stored
raw_path=os.path.dirname(os.path.abspath(__file__))+"/data/datazip/"

# Init libraries
nlp=NLPMiner()

# Init counters
tweets_count=0
unvalid_tweets=0

# Scan all downloaded files
csv_files=[os.path.join(raw_path,f) for f in os.listdir(raw_path) if os.path.isfile(os.path.join(raw_path,f))]

for csv_file in csv_files:
    # extract_and_store_tweets(csv_file,nlp,minetweet)
    t0=time() # measure time
    with open(csv_file, 'r') as f:
        next(f) # skip csv header
        data=csv.reader(f)
        # one row at a time
        for row in data:
            # create Tweet object
            t=Tweet()

            # Populate Tweet
            t.mid=row[0]
            t.retweetFromPostId=row[1]
            t.userId=row[2]
            t.retweetFromUserId=row[3]
            t.source=row[4]
            t.hasImage=row[5]
            t.txt=row[6]
            t.geo=row[7]
            t.created_at=row[8]
            t.deleted_last_seen=row[9]
            t.permission_denied=row[10]

            # Extract tweet entities
            t.mentions,t.urls,t.hashtags,clean=minetweet.extract_tweet_entities(t.txt)

            # Extract keywords
            dico=nlp.extract_dictionary(clean)

            # Remove stopwords and store clean dico
            t.dico=nlp.remove_stopwords(dico)

            # Extract named entities (NER server should be started - see lib/ner-server)
            # t.entities=nlp.extract_named_entities_from_dico(t.dico)

            # Check encoding problems
            valid_utf8=True
            try:
                t.txt.decode('utf-8')
            except UnicodeDecodeError:
                unvalid_tweets+=1
                valid_utf8=False
                print ' bad encoding : tweet ',t.mid

            # pprint(t)

            # Save tweet
            if valid_utf8 is True:
                try:
                    t.save() # save to mongo
                    tweets_count+=1
                except InvalidStringData:
                    print ' bad encoding : tweet ',t.mid

    print " done in %fs" % (time() - t0)
Once all the tweets are stored in the database, we use Mongo's native map-reduce (not fast) to group tweets and create protomeme-specific datasets, one collection each for hashtags, mentions and URLs. See the extract_protomemes_using_multiple_processes function in lib/protomemes.py for an optimized version using multiprocessing; a rough sketch of that idea follows the cell below.
In [ ]:
import os
from time import time
from bson.code import Code
from lib.mongo import MongoDB

# Define the Mongo collection where the raw data is stored
source_collection="tweets"

# Connect to Mongo
db=MongoDB("weibodata").db

# Start
t0=time()

# Get corpus length
tweets_count=db[source_collection].count()
print str(tweets_count)+" tweets in the db"

# Define collections to be created
pm_collections=["hashtags", "mentions", "urls"]

# Import JS code to use Mongo native map-reduce
mapjs=open(os.path.dirname(os.path.abspath(__file__))+"/lib/mapreduce/map.js", "r").read()
reducejs=open(os.path.dirname(os.path.abspath(__file__))+"/lib/mapreduce/reduce.js", "r").read()

for collection_name in pm_collections:
    # compile JS code to BSON
    mapper=Code(mapjs.replace("TO_BE_CHANGED", collection_name)) # set the protomeme type within the js code
    reducer=Code(reducejs)

    # Process Mongo map-reduce
    result=db[source_collection].map_reduce(mapper, reducer, collection_name, limit=tweets_count)
    print " %d new protomemes extracted" % result.count()
    print " stored in collection : %s" % collection_name

print " done in %fs" % (time() - t0)
Now that we have a specific dataset for each protomeme, we want to identify memes within them. To detect clusters, we compute different similarities between each pair of protomemes, so we can compare them and find the most similar ones. Following Ferrara's paper, four similarity values are defined: text (content) similarity, diffusion similarity, tweet similarity and user similarity.
We use a linear combination to merge those similarities into a single index. Scalar weights are applied to each similarity value, following Ferrara's original paper:
wt = 0.0 , wc = 0.7 , wu = 0.1 , wd = 0.2
combined_index = wc*txt_sim + wd*diff_sim + wt*tweets_sim + wu*users_sim
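As an illustration of how this combined index could be computed for a pair of protomemes, here is a small sketch. It assumes Jaccard similarity over the tweet, user and diffusion sets and cosine similarity over bag-of-words token counts for the text; the exact similarity definitions are those of Ferrara's paper, so treat this only as a toy version. The input dicts mirror the "value" documents produced by the map-reduce step (txt, tweets, users, diffusion).
In [ ]:
from math import sqrt
from collections import Counter

def jaccard(a, b):
    # set overlap, used here for tweets, users and diffusion
    a, b = set(a), set(b)
    return len(a & b) / float(len(a | b)) if (a or b) else 0.0

def cosine(tokens_a, tokens_b):
    # bag-of-words cosine similarity over token lists
    ca, cb = Counter(tokens_a), Counter(tokens_b)
    dot = sum(ca[w] * cb[w] for w in ca)
    na = sqrt(sum(v * v for v in ca.values()))
    nb = sqrt(sum(v * v for v in cb.values()))
    return dot / (na * nb) if na and nb else 0.0

def combined_index(pm_a, pm_b, wt=0.0, wc=0.7, wu=0.1, wd=0.2):
    # "txt" is expected as a token list, e.g. value["txt"].split()
    txt_sim    = cosine(pm_a["txt"], pm_b["txt"])
    tweets_sim = jaccard(pm_a["tweets"], pm_b["tweets"])
    users_sim  = jaccard(pm_a["users"], pm_b["users"])
    diff_sim   = jaccard(pm_a["diffusion"], pm_b["diffusion"])
    return wc*txt_sim + wd*diff_sim + wt*tweets_sim + wu*users_sim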
We can also store the corpora as files to keep the process memory-friendly.
In [ ]:
import codecs
from time import time
from lib.mongo import MongoDB

# Reference the different types of corpus to be extracted
types=["txt","diffusion","tweets","users"]

# Temporary directory to store all indexes
tmp_path="/tmp"

# Collections to be used
pm_collections=["hashtags", "mentions", "urls"]

# Connect to Mongo
db=MongoDB("weibodata").db

# Start
t0=time()

for collection in pm_collections:
    # Get corpus length
    pm_count=db[collection].count()
    print str(pm_count)+" protomemes in collection "+collection

    # Create a raw corpus file for each protomeme type
    for _type in types:
        filename=tmp_path+"/"+collection+".protomemes."+_type

        # apply threshold : keep protomemes with at least 5 tweets and 5 users
        query1={
            "value.tweets.5": { "$exists": True },
            "value.users.5": { "$exists": True }
        }

        # get only the specific type
        query2={ "value."+_type: 1 }

        data=db[collection].find(query1, query2).limit(pm_count)
        print ' got %d records' % data.count()

        # write one protomeme per line
        with codecs.open(filename, "w", "utf-8") as outfile:
            for item in data:
                value=item["value"][_type]
                # txt is stored as a string, the other types as lists
                tokens=value.split() if isinstance(value, basestring) else value
                outfile.write(str(tokens)[1:-1]+"\n")

        print ' %s done' % _type

print " done in %fs" % (time() - t0)