In [18]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import cPickle
import pymongo
import datetime

# Names of the per-site MongoDB databases processed by this notebook.
dbBases = [
    'Biology',
    'English',
    'Finance',
    'MathOverflow',
    'Physics',
    'Statistics',
]

# Directory for tree results (absolute path on the author's machine).
dataDir = ("/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/"
           "Hierarchies/Results/Trees/")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [19]:
dbIndex = 1
siteName = dbBases[dbIndex]
print "Working With ",siteName

dbString = siteName

date0 = datetime.datetime(2006,1,1)
datef = datetime.datetime(2016,1,1)

client = pymongo.MongoClient("129.26.78.128",27017)
dataBase = client[dbString]

postCollections = dataBase["Posts{0}".format(2013)]


Working With  English

In [20]:
# Accumulator for the per-site results; filled in by the processing cell below,
# which stores one entry per database name.
dataPerDataBase = dict()

Preprocessing of Tags For Tree


In [22]:
# For every site database, compute tag co-occurrence statistics and store them
# in dataPerDataBase[dbString] as the tuple
#   (AllTags, tuplesPerTag, differentTuplesPerTag, numberOfDistinctTuples)
# where
#   AllTags               sorted array of the distinct tags found on the site,
#   tuplesPerTag          tag -> number of matching posts carrying that tag,
#   differentTuplesPerTag tag -> number of distinct tag combinations among them,
#   numberOfDistinctTuples  count of distinct tag combinations over all tags.
# A tag combination is encoded as the "|"-joined indices (into AllTags) of the
# tags of one post.
#
# NOTE(review): the MongoDB user name and password are hard-coded below and are
# therefore stored with the notebook; they should be moved out of the source
# (environment variable / getpass) and rotated.
# NOTE(review): Database.authenticate() only exists in older pymongo releases;
# confirm the installed version before upgrading the driver.

# Suffixes of the per-year "Posts<year>" collections scanned for every site.
yearRange = range(2006,2016)

for dbIndex,dbString in enumerate(dbBases):

    siteName = dbString
    # Single pre-formatted argument: prints the same text under Python 2 and 3.
    print("Site Name  {0}".format(siteName))
    dataBase = client[dbString]
    dataBase.authenticate("Cesar","gaussianprocess")

    #====================================================
    # Pass 1: collect the tag lists of all posts with PostTypeId 1
    # (presumably questions -- TODO confirm) created inside (date0, datef).
    Tags = []
    for year in yearRange:
        postCollections = dataBase["Posts{0}".format(year)]
        tags = postCollections.aggregate([{"$match":{"PostTypeId":1,
                                                     "CreationDate":{"$gt":date0,"$lt":datef}}
                                          },
                                          {"$project":{"_id":0,"Tags":1}}])
        Tags.extend(tags)

    # Distinct tags of the site. Built through a set so that a site without any
    # matching post (or a document without a "Tags" field) yields an empty
    # vocabulary instead of raising.
    uniqueTags = set()
    for t in Tags:
        uniqueTags.update(t.get("Tags") or [])
    AllTags = np.array(list(uniqueTags))
    AllTags.sort()
    print("Number of tags found:  {0}".format(len(AllTags)))

    #======================================================
    # Pass 2: for every tag, fetch the posts carrying it and encode each post's
    # tag list as a string of AllTags indices.
    # NOTE(review): unlike pass 1, this query has no CreationDate filter; tags
    # of such posts that are missing from AllTags are dropped by np.in1d.
    fullTuples = set()          # distinct encoded combinations over all tags

    tuplesPerTag = {}
    differentTuplesPerTag = {}
    for mytag in AllTags:
        differentStrings = []
        for year in yearRange:
            postCollections = dataBase["Posts{0}".format(year)]
            alphaTags = postCollections.aggregate([{"$match":{"PostTypeId":1,"Tags":{"$in":[mytag]}}},
                                          {"$project":{"_id":0,"Tags":1}}])
            for t in alphaTags:
                # The mask is indexed by AllTags, so the resulting indices are
                # already ascending whatever the order of the post's own tags.
                tagIndices = np.flatnonzero(np.in1d(AllTags,t["Tags"]))
                differentStrings.append("|".join(map(str,tagIndices)))
        tuplesArray = set(differentStrings)
        fullTuples.update(tuplesArray)
        tuplesPerTag[mytag] = len(differentStrings)
        differentTuplesPerTag[mytag] = len(tuplesArray)

    dataPerDataBase[dbString] = (AllTags,tuplesPerTag,
                                 differentTuplesPerTag,len(fullTuples))
    #==========================================================


Site Name  Biology
Number of tags found:  642
Site Name  English
Number of tags found:  947
Site Name  Finance
Number of tags found:  495
Site Name  MathOverflow
Number of tags found:  1301
Site Name  Physics
Number of tags found:  824
Site Name  Statistics
Number of tags found:  1032

Record Statistics


In [23]:
# Output directory for the pickled statistics (absolute path on the author's
# machine; the trailing slash matters because file names are appended with +).
resultsDir = ("/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/"
              "Hierarchies/Results/BasicCooccurraceStats/")

In [24]:
# Persist the per-site statistics. The file is opened in binary mode, as pickle
# data should be, and inside a context manager so the handle is flushed and
# closed even if dumping fails. The pickle protocol is left at its default so
# the file format is unchanged.
with open(resultsDir+"coOccurranceStats.cpickle","wb") as statsFile:
    cPickle.dump(dataPerDataBase,statsFile)

In [ ]: