In [18]:
    
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import cPickle
import pymongo
import datetime
# Site names to process; each one is used verbatim as a MongoDB database name
# by the cells below.
dbBases = [
    'Biology',
    'English',
    'Finance',
    'MathOverflow',
    'Physics',
    'Statistics',
]
# Directory for tree results (not referenced by the cells visible in this notebook).
dataDir = "/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/Hierarchies/Results/Trees/"
%matplotlib inline
    
In [19]:
    
dbIndex = 1
siteName = dbBases[dbIndex]
print "Working With ",siteName
dbString = siteName
date0 = datetime.datetime(2006,1,1)
datef = datetime.datetime(2016,1,1)
client = pymongo.MongoClient("129.26.78.128",27017)
dataBase = client[dbString]
postCollections = dataBase["Posts{0}".format(2013)]
    
    
In [20]:
    
# Accumulator for the per-site results, keyed by database name.
dataPerDataBase = dict()
    
In [22]:
    
for dbIndex,dbString in enumerate(dbBases):
    
    siteName = dbBases[dbIndex]
    print "Site Name ",siteName
    client[dbString].authenticate("Cesar","gaussianprocess")
    dataBase = client[dbString]
    #====================================================
    Tags = []
    Filter = False
    for year in range(2006,2016): 
        postCollections = dataBase["Posts{0}".format(year)]
        tags = postCollections.aggregate([{"$match":{"PostTypeId":1,
                                                     "CreationDate":{"$gt":date0,"$lt":datef}}
                                          },
                                          {"$project":{"_id":0,"Tags":1}}])
        Tags.extend([t for t in tags])
    AllTags = np.array(list(set(np.concatenate([t["Tags"] for t in Tags]))))
    AllTags.sort()
    print "Number of tags found: ",len(AllTags)
    
    #======================================================
    tupleSets = []
    differentTuples = []
    fullTuples = []
    
    tuplesPerTag = {}
    differentTuplesPerTag = {}
    for i, mytag in enumerate(AllTags):
        alpha = []
        for year in range(2006,2016):
            postCollections = dataBase["Posts{0}".format(year)]
            alphaTags = postCollections.aggregate([{"$match":{"PostTypeId":1,"Tags":{"$in":[mytag]}}},
                                          {"$project":{"_id":0,"Tags":1}}])
            alpha.extend([np.array(t["Tags"]) for t in alphaTags])
        for a in alpha:
            a.sort()
        differentStrings = ["|".join(map(str,np.where(np.in1d(AllTags,a) == True)[0])) 
                        for a in alpha]    
        tuplesArray = list(set(differentStrings))
        fullTuples.extend(tuplesArray)
        tuplesPerTag[mytag] = len(differentStrings)
        differentTuplesPerTag[mytag] = len(tuplesArray)
    
    dataPerDataBase[dbString] = (AllTags,tuplesPerTag,
                                 differentTuplesPerTag,len(list(set(fullTuples))))
    #==========================================================
    
    
In [23]:
    
# Directory that receives the pickled co-occurrence statistics.
resultsDir = (
    "/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/"
    "Hierarchies/Results/BasicCooccurraceStats/"
)
    
In [24]:
    
# Persist the per-site statistics. The file is opened in binary mode, as the
# pickle documentation recommends, and the `with` block guarantees the handle
# is flushed and closed even if dump() raises.
with open(resultsDir+"coOccurranceStats.cpickle","wb") as statsFile:
    cPickle.dump(dataPerDataBase, statsFile)
    
In [ ]: