In [1]:
import timeit
import importlib
import relevancer as rlv
import pandas as pd
from sklearn.naive_bayes import MultinomialNB 
from bson.objectid import ObjectId
from collections import Counter
import numpy as np

pd.set_option("display.max_colwidth",200)


/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/sklearn/utils/fixes.py:64: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  if 'order' in inspect.getargspec(np.copy)[0]:

In [2]:
# Reload the latest version of the Relevancer module after the code is edited.
importlib.reload(rlv)
active_col = "active_text"
rlv.set_active_column(active_col)

In [3]:
#importlib.reload(rlv)
my_token_pattern=r"[#@]?\w+\b|[\U00010000-\U0010ffff]"
rlvdb, rlvcl = rlv.connect_mongodb(configfile='elifdb.ini',coll_name="testcl")
# set the active column
active_col = "active_text"
rlv.set_active_column(active_col)
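
A quick sanity check of the token pattern can be useful: it keeps optional #/@ prefixes on word tokens and matches astral-plane characters such as emoji as single tokens. The sample tweet below is made up for illustration.

In [ ]:
import re

re.findall(my_token_pattern, "@user check #genocide news 😢")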

In [4]:
importlib.reload(rlv)

# After the first iteration, the annotated clusters should be excluded from the clustering.
# Read the annotated tweet ids from the tagged collection.
annotated_tw_ids = ['563657483395530753', '563662532326330370', '563654330041909248', '563654944927281152', '563657924233289728', '563661021559390208', '563651950386757632', '563657164317667328', '563660271810383872', '563662538949160960']  # You should get the actual annotated tweet ids from the annotated tweets collection.
#annotated_tw_ids = []
# mongo_query=({'_id': {'$gte': begin, '$lte': end},'lang':'en'})
tweetlist = rlv.read_json_tweet_fields_database(rlvcl, mongo_query={}, read_fields={'text': 1, 'id_str': 1, '_id': 0, 'user_id': 1}, tweet_count=-1, annotated_ids=annotated_tw_ids)

rlv.logging.info("Number of tweets:" + str(len(tweetlist)))
print("Number of tweets:",len(tweetlist))


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-4-27e6df74d4e4> in <module>()
      6 #annotated_tw_ids = []
      7 # mongo_query=({'_id': {'$gte': begin, '$lte': end},'lang':'en'})
----> 8 tweetlist = rlv.read_json_tweet_fields_database(rlvcl, mongo_query=({}), read_fields={'text': 1, 'id_str': 1, '_id': 0, 'user_id': 1}, tweet_count=-1, annotated_ids=annotated_tw_ids)#=tweetsDF)
      9 
     10 rlv.logging.info("Number of tweets:" + str(len(tweetlist)))

/Users/alihurriyetoglu/Dropbox/Projects/Relevancer/relevancer.py in read_json_tweet_fields_database(rlvcl, mongo_query, read_fields, tweet_count, annotated_ids, annotated_users)
    269 
    270         ftwits = []
--> 271         for i, t in enumerate(rlvcl.find(mongo_query, read_fields)):
    272                 if (i != tweet_count) and (t['id_str'] not in annotated_ids) and ((("user" in t) and (t["user"]["screen_name"] not in annotated_users)) or (("user_id" in t) and (t["user_id"] not in annotated_users))):  # restrict line numbers for test
    273                         # break

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/cursor.py in next(self)
    975             raise StopIteration
    976         _db = self.__collection.database
--> 977         if len(self.__data) or self._refresh():
    978             if self.__manipulate:
    979                 return _db._fix_outgoing(self.__data.popleft(),

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/cursor.py in _refresh(self)
    917                 self.__send_message(_GetMore(self.__collection.full_name,
    918                                              limit,
--> 919                                              self.__id))
    920 
    921         else:  # Cursor id is zero nothing else to return

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/cursor.py in __send_message(self, operation)
    811             try:
    812                 response = client._send_message_with_response(operation,
--> 813                                                               **kwargs)
    814                 self.__address = response.address
    815                 if self.__exhaust:

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/mongo_client.py in _send_message_with_response(self, operation, read_preference, exhaust, address)
    742             set_slave_ok,
    743             self.__all_credentials,
--> 744             exhaust)
    745 
    746     def _reset_on_error(self, server, func, *args, **kwargs):

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/mongo_client.py in _reset_on_error(self, server, func, *args, **kwargs)
    753         """
    754         try:
--> 755             return func(*args, **kwargs)
    756         except NetworkTimeout:
    757             # The socket has been closed. Don't reset the server.

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/server.py in send_message_with_response(self, operation, set_slave_okay, all_credentials, exhaust)
     86             request_id, data, max_doc_size = self._split_message(message)
     87             sock_info.send_message(data, max_doc_size)
---> 88             response_data = sock_info.receive_message(1, request_id)
     89             if exhaust:
     90                 return ExhaustResponse(

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/pool.py in receive_message(self, operation, request_id)
    214             return receive_message(self.sock, operation, request_id)
    215         except BaseException as error:
--> 216             self._raise_connection_failure(error)
    217 
    218     def legacy_write(self, request_id, msg, max_doc_size, with_last_error):

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/pool.py in _raise_connection_failure(self, error)
    314             _raise_connection_failure(self.address, error)
    315         else:
--> 316             raise error
    317 
    318     def __eq__(self, other):

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/pool.py in receive_message(self, operation, request_id)
    212         """
    213         try:
--> 214             return receive_message(self.sock, operation, request_id)
    215         except BaseException as error:
    216             self._raise_connection_failure(error)

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/network.py in receive_message(sock, operation, request_id)
     58 def receive_message(sock, operation, request_id):
     59     """Receive a raw BSON message or raise socket.error."""
---> 60     header = _receive_data_on_socket(sock, 16)
     61     length = _UNPACK_INT(header[:4])[0]
     62 

/Users/alihurriyetoglu/anaconda3/lib/python3.5/site-packages/pymongo/network.py in _receive_data_on_socket(sock, length)
     80     msg = b""
     81     while length:
---> 82         chunk = sock.recv(length)
     83         if chunk == b"":
     84             raise AutoReconnect("connection closed")

KeyboardInterrupt: 
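
The full-collection read was interrupted by hand. A hedged sketch of a bounded read: the tweet_count parameter (visible in the function signature in the traceback) together with a narrower mongo_query, such as the language filter from the commented query above, should keep the cursor from walking the whole collection. The cap of 100000 is arbitrary.

In [ ]:
tweetlist = rlv.read_json_tweet_fields_database(rlvcl, mongo_query={'lang': 'en'}, read_fields={'text': 1, 'id_str': 1, '_id': 0, 'user_id': 1}, tweet_count=100000, annotated_ids=annotated_tw_ids)
print("Number of tweets:", len(tweetlist))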

In [ ]:
tweetsDF = rlv.create_dataframe(tweetlist)
tweetsDF.head()

In [ ]:
len(tweetsDF)

In [ ]:
#tweetsDF.to_pickle("20151005_tweetsDF_Genocide")

In [ ]:
# Be careful: do not overwrite tweetsDFBackUP with a modified tweetsDF.
#tweetsDFBackUP = tweetsDF.copy()
#len(tweetsDFBackUP)

In [ ]:
# Get a clean copy 
#importlib.reload(rlv)
tweetsDF = tweetsDFBackUP.copy()
len(tweetsDF)

In [ ]:
tweetsDF[active_col] = tweetsDF["text"].copy()
tweetsDF = rlv.tok_results(tweetsDF, elimrt = True)

In [ ]:
len(tweetsDF)

In [ ]:
tweetsDF = rlv.normalize_text(tweetsDF)
len(tweetsDF), tweetsDF.columns

In [ ]:
#importlib.reload(rlv)
#rlv.set_active_column(active_col)

In [ ]:
def eliminate_duplicates_recursively(df, duplicate_elim_func, step=10000):
    """
    The near-duplicate detection algorithm is not memory-efficient enough to run on
    the whole data set at once. Therefore, we divide the data into buckets, eliminate
    duplicates per bucket, merge the results, shuffle them, and repeat the cycle until
    no duplicate is detected in any bucket. That may take long for big data sets. The
    conditions can be relaxed to run quicker at the cost of leaving a few duplicates.
    """
    print("starting, length:", len(df))
    # Shuffle so that duplicates land in different buckets on each pass.
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace=True, drop=True)

    tmp_df2 = pd.DataFrame()
    for i in range(0, len(df), step):
        tmp_unique = duplicate_elim_func(df[i:i+step], similarity_threshold=0.20, debug=True, debug_threshold=10000)
        tmp_df2 = pd.concat([tmp_df2, tmp_unique], ignore_index=True)

    if len(df) > len(tmp_df2):
        print(str(len(df) - len(tmp_df2)) + " tweets were eliminated!")
        return eliminate_duplicates_recursively(tmp_df2, duplicate_elim_func, step=step)

    return df
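
Before running on the full data, the bucketing logic can be sanity-checked with a stand-in eliminator. A minimal sketch, assuming only that duplicate_elim_func returns the unique subset of the frame it receives; exact-duplicate removal stands in for rlv.get_and_eliminate_near_duplicate_tweets, and the extra keyword arguments mirror its signature.

In [ ]:
def exact_dup_elim(df, similarity_threshold=0.20, debug=True, debug_threshold=10000):
    # Keep only the first occurrence of each text.
    return df.drop_duplicates(subset="text")

toy = pd.DataFrame({"text": ["a", "b", "a", "c", "b", "a"]})
print(len(eliminate_duplicates_recursively(toy, exact_dup_elim, step=2)))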

In [ ]:
tweetsDF_uniq = eliminate_duplicates_recursively(tweetsDF.copy(), rlv.get_and_eliminate_near_duplicate_tweets)

In [ ]:
len(tweetsDF_uniq)

In [ ]:
tweetsDF_uniq.to_pickle("20151005_unique_genocide_tweets.pickle")

In [ ]:
tweetsDF_uniq[["active_text"]][:10]

In [224]:
importlib.reload(rlv)
rlv.set_active_column(active_col)

In [225]:
cluster_list = rlv.create_clusters(tweetsDF_uniq, my_token_pattern, min_dist_thres=0.725, max_dist_thres=0.875, min_max_diff_thres=0.4, nameprefix='1-', min_clusters=100, user_identifier='user_id')

In [226]:
print("Number of clusters:",len(cluster_list))
print("available cluster information:", cluster_list[0].keys())


Number of clusters: 145
available cluster information: dict_keys(['ctweettuplelist', 'twids', 'cnoprefix', 'rif', 'cno', 'cstr', 'user_entropy'])
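
Before stepping through clusters one by one, a quick overview can help. A hedged sketch, assuming 'user_entropy' is a numeric diversity score (the field names come from the dict_keys output above): rank the clusters by user diversity.

In [ ]:
# Show the five clusters with the most diverse user base.
for c in sorted(cluster_list, key=lambda c: c.get('user_entropy', 0), reverse=True)[:5]:
    print(c['cno'], c['user_entropy'], len(c['twids']))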

In [227]:
i = 0

In [237]:
#i = i-2
print("No:",cluster_list[i]['cno'])
print("CStr:",cluster_list[i]['cstr'])
print("Cluster Tuple List:")
print(*[(c[0],c[2]) for c in cluster_list[i]['ctweettuplelist']], sep='\n')
# Add any field you want to observe.
i+=1


No: 324
CStr: cluster number and size are: 324    62

Cluster Tuple List:
(0.78476101036957813, 'yes nothing spreads hate like opposing racism &amp; black genocide i agree. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.79291036153521355, 'so ending black genocide insights violence? oh really? no black genocide is violent. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.79291834637943648, 'yes racists think opposing racism &amp; black genocide makes me a racist. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.8036062851138589, 'yes racists call me a monster for opposing black genocide its absurd. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.80379103769140281, "mor'on(n) !)someone black destroys the arguments of dumb whites. 2)opposes black genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr")
(0.8129036502182011, 'yes black genocide is anything meant to kill 1.1b blacks. you damn right. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.81320378072796051, 'yes racists lie in order to deflect &amp; demean to justify black genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.81678614503876179, "no my premise is if we're all criminals you get to commit black genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr")
(0.82089592977935222, "racists think if they don't spell out black genocide then its ok. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr")
(0.82162976179221181, 'how blacks feel about racists is not an excuse for black genocide its absurd &amp; dumb. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.83561711871271105, 'the present facts when it advances black genocide. there are 1.1b blacks in africa. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.83824228506973408, 'yes the klan supports black genocide but how does that help you? tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.83824616691607023, "yes i'm sure he thinks i'm a bigot for opposing racism &amp; black genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr")
(0.84155828300305391, 'all whites who accepted your racist lies are complicit in black genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.84265470647592755, 'yes nothing is more racist then opposing racism and black genocide makes sense. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.84355214497971465, 'yes racists deny the existence of blacks makes genocide easier. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.84366956570579488, 'yes the mentally ill blacks think black genocide is best but its not. tk usrusrusr usrusrusr usrusrusr')
(0.84657802617254119, 'he thinks if he labels me a liar that justifies black genocide its absurd. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.84990341081479226, 'whites killing off the black race is "white suicide" not genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.8511630098640669, 'you don\'t get to call opposition to racism &amp; black genocide "hate" that\'s absurd. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.85129584160934846, 'racists think blacks oppose racism &amp; black genocide because we hate whites, its absurd. tk usrusrusr usrusrusr #blackmediamatters')
(0.85197261793972756, "i will never denounce blacks so you can kill us that's genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr")
(0.85215057484694834, 'yes blacks who oppose black genocide are responsible for racists committing black genocide makes sense.  tk usrusrusr usrusrusr')
(0.85231342673872057, 'whites think if they call black genocide "love" that makes it ok. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.85445551233328076, "populations don't decrease by 200 million over time unless its genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr")
(0.85539943774957561, 'lester knows its genocide but he thinks killing all blacks brings joy. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr')
(0.85652412449705773, "what argument are racists making that blacks shouldn't take money to fight genocide? tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr")
(0.85833426932701307, 'no black genocide is killing over 200 million blacks since slavery. tk usrusrusr usrusrusr usrusrusr')
(0.85997611491020909, 'we reject the lie that opposing black genocide equates to black supremacy. usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr tk')
(0.86456741652732738, 'most whites have no problem with black genocide through miscegenation. tk usrusrusr usrusrusr usrusrusr')
(0.8661217270236462, 'racists w/the lie that its hate to oppose racism &amp; black genocide how stupid. tk usrusrusr usrusrusr usrusrusr')
(0.87195684224298742, "bikers are bikers you're simply lying to sell your black genocide narrative. tk usrusrusr usrusrusr usrusrusr")
(0.87325614715706046, 'we reject the lie that to oppose black genocide means we hate whites. tk usrusrusr usrusrusr usrusrusr')
(0.87366498706309459, "it doesn't but you need an excuse to justify black genocide so any dumb lie will do. tk usrusrusr usrusrusr usrusrusr usrusrusr")
(0.87492330169019039, "or are you suggesting to make opposing black genocide fair we can't be funded only you can? tk usrusrusr usrusrusr usrusrusr")
(0.87742905615410893, "that's stupid you make up a word to pin on ppl oppose black genocide? its stupid. tk usrusrusr usrusrusr usrusrusr usrusrusr")
(0.87972159686685714, 'yes black genocide masked as love is no less damning than the nazi death camps. tk usrusrusr usrusrusr usrusrusr')
(0.88037461481674173, 'should there be a black race lester? should we start one? ok then its genocide. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.88351868895364083, 'i reject the notion that opposing racism &amp; black genocide means i hate your massa. its absurd. tk usrusrusr usrusrusr usrusrusr')
(0.88746446399118895, 'not all whites well where the hell are the 2 whites not down for black genocide? step up. tk usrusrusr usrusrusr usrusrusr usrusrusr')
(0.88767361802772149, "to argue, that blacks shouldn't react to racist murder as an excuse for genocide? tk usrusrusr usrusrusr usrusrusr usrusrusr")
(0.88775379715600466, "so because racists can't make their case for black genocide its my fault? no your argument is dumb. tk usrusrusr usrusrusr usrusrusr")
(0.88791599488266404, "i disagree w/racists killing blacks because we won't date inter-racially. tk usrusrusr usrusrusr usrusrusr usrusrusr #genocide")
(0.88828879021774465, "no we don't wait until the genocide is complete to object to it. tk usrusrusr usrusrusr usrusrusr usrusrusr usrusrusr")
(0.89316692021776323, "so blacks fighting black genocide shouldnt' be compensated poor oprah &amp; lebron should get all the money, really? tk usrusrusr usrusrusr")
(0.89815298851299119, "there's no such thing as less than 100% black your mental illness doesn't' make black genocide ok. tk usrusrusr usrusrusr usrusrusr")
(0.90637528662282396, "no all whites envy blacks because we don't feel white guilt but genocide is not the answer. tk usrusrusr usrusrusr usrusrusr")
(0.90859507213359925, 'no, black men selling out is still black genocide cause its self-hate &amp; white supremacy myth. tk usrusrusr usrusrusr')
(0.91112986673645935, 'no i don\'t call my opposition to black genocide "hate". genocide is wrong we should all oppose it. tk usrusrusr usrusrusr')
(0.91414368178362648, 'what your fellow racist did in dallas was what genocide or suicide? come on racists tell me? tk usrusrusr usrusrusr usrusrusr')
(0.91758015122539349, "you'd think murdering ppl cause the pres black begs to be prosecuted, maybe not.  tk usrusrusr usrusrusr usrusrusr #genocide")
(0.92204605940248408, 'wrong. the black pop will resist genocide as we have throughout the first 8m years. sorry. tk usrusrusr usrusrusr')
(0.92349808835039071, 'so the racist tea party nuts supported obama and opposed racism &amp; black genocide? you dumb liar you need the help. tk usrusrusr usrusrusr')
(0.929570156965562, "yes its called a pattern of racist genocide that's part of your dna. #racistwhitedna tk usrusrusr usrusrusr usrusrusr")
(0.93031761530957457, 'but you see she gets along better with a retarded racist hell bent on black genocide than with a black man? tk usrusrusr usrusrusr')
(0.93531352946846558, "i'm looking for your dumb argument that blks who oppose genocide are paid for it? tk usrusrusr usrusrusr usrusrusr")
(0.93747803823598752, 'yes retarded white racists are given favor long as they hate blacks promote genocide good point joe. tk usrusrusr usrusrusr')
(0.94054207970026527, "you wont' guilt me into supporting black genocide. without blacks humanity would end. sorry racist lunatic. usrusrusr usrusrusr tk")
(0.94159801766118467, 'yes racists think opposing black genocide is an affront to whites. that argument is absurd and they know it. usrusrusr usrusrusr tk')
(0.9416179367402091, "if blacks denounce blacks reacting to black genocide that's a license to exterminate the entire race. usrusrusr usrusrusr usrusrusr tk")
(0.94403327349938071, 'genocide is not love. the who has labeled interracial rape as genocide. i destroyed that racist joe. tk usrusrusr usrusrusr')
(0.95386870249370681, 'six generations over 450yrs would be 600 million blacks outside of africa. 200 million are missing. #genocide tk usrusrusr usrusrusr')

In [215]:
print("Reverse Frequency of the terms:")
print(i)
for k, v in Counter({int(k):v for k,v in cluster_list[i-1]['rif'].items()}).items():
    if k>1:
        print(k,":",v)


Reverse Frequency of the terms:
1
2 : ['news the', 'indonesia', 'abortion', 'house of', 'oslo', 'language', 'urlurlurl myanmar', 'commemoration', 'rwanda', 'happening', 'genocide against', 'urlurlurl pakistan', 'daily', 'n', 'day', 'in the', 'cry', 'a rohingya', 'don t', 'the rohingya', 'breaking', '4', 'to end', 'the armenian', 'wanted for', 's genocide', 'nazi', 'with', 'part 2', '1', 'breaking news', 'ethnic cleansing', 'voice', 'cleansing of', 'urlurlurl rwanda', 'commemoration of', 'obama s', 'in arakan', 'pacific', 'anniversary', 'now', 'play', 'don', 'urlurlurl obama', 'rohingya language', 'fukushima breaking', 'cultural', 'oslo conference', 'cultural genocide', 'is genocide', 'about', 'armenia', 'urlurlurl what', 'film', 'wanted', 'pakistan', 'who', 'bangladesh', 'ethnic', 'house', '100', 'address', 'conference', 'rohingyas', 'fukushima news', 'myanmar to', 'cover', 'times', 'tv', 'plight', 'mark', 'speaks', 'of genocide', 'burma times', 'srebrenica genocide', 'their', 'genocide and', 'urlurlurl turkey', 'pacific genocide']
3 : ['against', 'on myanmar', 'urlurlurl a', 'of the', 'armenian', 'armenian genocide', 'amp', 'genocide the', 'genocide of', 'full', '2', 'from the', 'srebrenica', 'rohingya in', 't', 'obama', 'dr', 'urlurlurl rohingya', 'end', 'cleansing', 'vienna', 'at', 'urlurlurl the', 'not', 'new', 'what', 'by']
4 : ['myanmar s', 'people', 'fukushima', 'rohingya muslims', 'part', 'urlurlurl fukushima', 'arakan', 'the genocide', 'is', 'and the', 'urlurlurl genocide', 'turkey', 'documentary']
5 : ['muslims', 'war', 'for']
6 : ['news', 'of rohingya', 'burma']
8 : ['2015']
9 : ['on']
10 : ['and']
11 : ['myanmar']
76 : ['i liked', 'liked a', 'liked']
77 : ['video']
78 : ['i']
16 : ['to']
17 : ['usrusrusr urlurlurl', 'from usrusrusr']
18 : ['video from']
14 : ['in', 's']
22 : ['from']
87 : ['a']
75 : ['urlurlurl', 'a usrusrusr', 'usrusrusr video']
92 : ['usrusrusr']
31 : ['the']
32 : ['rohingya']
28 : ['of']
50 : ['genocide']
58 : ['video urlurlurl']
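
The frequency groups above come out in arbitrary dictionary order. A small sketch that prints the same 'rif' field sorted from most to least frequent:

In [ ]:
for k, v in sorted(((int(k), v) for k, v in cluster_list[i-1]['rif'].items()), reverse=True):
    if k > 1:
        print(k, ":", v)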

In [252]:
rlvdb2, rlvcl2 = rlv.connect_mongodb(configfile='ebasar2.ini',coll_name="testcl")

collection_name = 'genocide_clusters_20151005'
rlvdb2[collection_name].insert(cluster_list)  # Each iteration results in its own candidate cluster list, so clusters from different iterations are not mixed.
print("Clusters were written to the collection:", collection_name)


---------------------------------------------------------------------------
OperationFailure                          Traceback (most recent call last)
<ipython-input-252-6a28de1f66f4> in <module>()
      2 
      3 collection_name = 'genocide_clusters_20151005'
----> 4 rlvdb2[collection_name].insert(cluster_list) #Each iteration results with a candidate cluster list. Each iteration will have its own list. Therefore they are not mixed.
      5 print("Clusters were written to the collection:", collection_name)

/Users/alihurriyetoglu/anaconda3/lib/python3.4/site-packages/pymongo/collection.py in insert(self, doc_or_docs, manipulate, safe, check_keys, continue_on_error, **kwargs)
    408                     self.database.name + ".$cmd", _INSERT, command,
    409                     gen(), check_keys, self.uuid_subtype, client)
--> 410             _check_write_command_response(results)
    411         else:
    412             # Legacy batched OP_INSERT

/Users/alihurriyetoglu/anaconda3/lib/python3.4/site-packages/pymongo/helpers.py in _check_write_command_response(results)
    207                 raise WTimeoutError(error.get("errmsg"),
    208                                     error.get("code"), error)
--> 209         raise OperationFailure(error.get("errmsg"), error.get("code"), error)
    210 
    211 

OperationFailure: quota exceeded

In [ ]:
import json

# Fall back to a local JSON dump, since the database quota is exceeded.
# default=str stringifies the ObjectId values that pymongo added to the documents.
with open('genocide_clusters.json', 'w') as outfile:
    json.dump(cluster_list, outfile, default=str)

In [ ]:
# import pickle
# with open("genocide_clusters_20151005", 'wb') as f:
#     pickle.dump(cluster_list, f, pickle.HIGHEST_PROTOCOL)
#     print("wrote km user to pickle")

In [3]:
import pickle
with open("genocide_clusters_20151005", 'rb') as f:
    cluster_list = pickle.load(f)
    print("read km_user from pickle")


read km_user from pickle

In [4]:
cluster_list[0].keys()


Out[4]:
dict_keys(['cno', 'twids', 'cnoprefix', 'cstr', 'user_entropy', 'ctweettuplelist', '_id', 'rif'])

In [249]:
cluster_list_small = []
for c in cluster_list:
    cluster_list_small.append({k:v for k,v in c.items() if k in ["cno","cstr","user_entropy","rif","_id","cnoprefix",'twids']})
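
Before retrying the insert, it may help to measure how large the reduced documents actually are. A sketch, assuming bson.BSON.encode (shipped with pymongo) reports the encoded size in bytes:

In [ ]:
import bson

sizes = [len(bson.BSON.encode(c)) for c in cluster_list_small]
print("max doc size (bytes):", max(sizes), "total (bytes):", sum(sizes))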

In [250]:
collection_name = 'genocide_clusters_small_20151005'
rlvdb[collection_name].insert(cluster_list_small)  # Each iteration results in its own candidate cluster list, so clusters from different iterations are not mixed.
print("Clusters were written to the collection:", collection_name)


---------------------------------------------------------------------------
OperationFailure                          Traceback (most recent call last)
<ipython-input-250-287de3a25cf6> in <module>()
      1 collection_name = 'genocide_clusters_small_20151005'
----> 2 rlvdb[collection_name].insert(cluster_list_small) #Each iteration results with a candidate cluster list. Each iteration will have its own list. Therefore they are not mixed.
      3 print("Clusters were written to the collection:", collection_name)

/Users/alihurriyetoglu/anaconda3/lib/python3.4/site-packages/pymongo/collection.py in insert(self, doc_or_docs, manipulate, safe, check_keys, continue_on_error, **kwargs)
    408                     self.database.name + ".$cmd", _INSERT, command,
    409                     gen(), check_keys, self.uuid_subtype, client)
--> 410             _check_write_command_response(results)
    411         else:
    412             # Legacy batched OP_INSERT

/Users/alihurriyetoglu/anaconda3/lib/python3.4/site-packages/pymongo/helpers.py in _check_write_command_response(results)
    207                 raise WTimeoutError(error.get("errmsg"),
    208                                     error.get("code"), error)
--> 209         raise OperationFailure(error.get("errmsg"), error.get("code"), error)
    210 
    211 

OperationFailure: quota exceeded
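
The quota error persists even for the reduced documents. As a fallback, the commented pickle pattern from earlier can keep the reduced clusters available locally for the next iteration; the filename is illustrative.

In [ ]:
import pickle

with open("genocide_clusters_small_20151005.pickle", "wb") as f:
    pickle.dump(cluster_list_small, f, pickle.HIGHEST_PROTOCOL)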

In [ ]: