In [18]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import cPickle
import pymongo
import datetime

# Names of the sites to analyse. Each entry is used both as a display name and
# as the MongoDB database name (see `client[dbString]` in the cells below).
dbBases = ['Biology','English','Finance','MathOverflow','Physics','Statistics']
# Output directory for tree results.
# NOTE(review): absolute, machine-specific path; not referenced by any visible cell.
dataDir = "/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/Hierarchies/Results/Trees/"

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

In [19]:
# Select a single site by its position in dbBases and report which one it is.
dbIndex = 1
siteName = dbBases[dbIndex]
print "Working With ",siteName

# The site name doubles as the MongoDB database name.
dbString = siteName

# Date bounds; not referenced by any other visible cell.
date0 = datetime.datetime(2006,1,1)
datef = datetime.datetime(2016,1,1)

# NOTE(review): the host argument is an empty string -- presumably this resolves
# to a local mongod on port 27017; confirm before running elsewhere.
client = pymongo.MongoClient("",27017)
dataBase = client[dbString]

# Posts live in one collection per year, named "Posts<year>"; only 2013 is
# selected here. The preprocessing loop further down rebinds dbIndex, dbString,
# siteName, dataBase and postCollections.
postCollections = dataBase["Posts{0}".format(2013)]

Working With  English

In [20]:
# Per-site results, keyed by database name and filled by the tag-preprocessing
# loop. Re-running this cell discards anything already stored.
dataPerDataBase = {}

Preprocessing of Tags For Tree

In [22]:
# For every site database: gather the tag lists of matching posts from the
# yearly "Posts<year>" collections (2006..2015), build the array of distinct
# tags, and then, per tag, count how many posts carry it and how many distinct
# tag combinations it appears in.
# NOTE(review): this cell is damaged in the exported text -- the two
# aggregate(...) calls and the final assignment are cut off mid-expression and
# one `for` header has no body, so it does not parse as shown. Restore it from
# the original .ipynb before running.
# The loop rebinds the globals dbIndex, dbString, siteName, dataBase and
# postCollections set by the earlier connection cell.
for dbIndex,dbString in enumerate(dbBases):
    siteName = dbBases[dbIndex]
    print "Site Name ",siteName
    dataBase = client[dbString]

    # Documents returned by the per-year aggregations; each is read via t["Tags"].
    Tags = []
    # Not referenced again in the visible code.
    Filter = False
    for year in range(2006,2016): 
        postCollections = dataBase["Posts{0}".format(year)]
        # Match stage filters on PostTypeId == 1 (presumably question posts -- confirm).
        # NOTE(review): the remainder of this pipeline is truncated in the export.
        tags = postCollections.aggregate([{"$match":{"PostTypeId":1,
        Tags.extend([t for t in tags])
    # Distinct tags over all collected documents. The order comes from a set, so
    # it is arbitrary; indices into AllTags are only meaningful for this array.
    AllTags = np.array(list(set(np.concatenate([t["Tags"] for t in Tags]))))
    print "Number of tags found: ",len(AllTags)
    # The next three lists are initialised but not used in the visible code.
    tupleSets = []
    differentTuples = []
    fullTuples = []
    # tag -> number of matching posts (duplicate tag combinations included).
    tuplesPerTag = {}
    # tag -> number of distinct tag combinations among those posts.
    differentTuplesPerTag = {}
    # NOTE(review): issues one aggregation per tag per year (len(AllTags) * 10
    # queries per site); `i` is not used in the visible code.
    for i, mytag in enumerate(AllTags):
        # Tag arrays of every post (all years) whose Tags contain mytag.
        alpha = []
        for year in range(2006,2016):
            postCollections = dataBase["Posts{0}".format(year)]
            # NOTE(review): the stages after $match are truncated in the export.
            alphaTags = postCollections.aggregate([{"$match":{"PostTypeId":1,"Tags":{"$in":[mytag]}}},
            alpha.extend([np.array(t["Tags"]) for t in alphaTags])
        # NOTE(review): this `for` has no body in the export -- leftover or lost lines.
        for a in alpha:
        # Encode each post's tag set as the "|"-joined indices of its tags in
        # AllTags. np.where yields the indices in ascending order, so the key
        # does not depend on the order of tags within the post.
        differentStrings = ["|".join(map(str,np.where(np.in1d(AllTags,a) == True)[0])) 
                        for a in alpha]    
        tuplesArray = list(set(differentStrings))
        tuplesPerTag[mytag] = len(differentStrings)
        differentTuplesPerTag[mytag] = len(tuplesArray)
    # Store the per-site results under the database name.
    # NOTE(review): the tuple is truncated in the export; only its first two
    # members (AllTags, tuplesPerTag) are visible.
    dataPerDataBase[dbString] = (AllTags,tuplesPerTag,

Site Name  Biology
Number of tags found:  642
Site Name  English
Number of tags found:  947
Site Name  Finance
Number of tags found:  495
Site Name  MathOverflow
Number of tags found:  1301
Site Name  Physics
Number of tags found:  824
Site Name  Statistics
Number of tags found:  1032

Record Statistics

In [23]:
# Output directory for the co-occurrence statistics; no visible cell writes to it yet.
# NOTE(review): absolute, machine-specific path. The directory name is spelled
# "Cooccurrace"; it must match the on-disk name, so rename the directory before
# correcting the spelling here.
resultsDir = "/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/Hierarchies/Results/BasicCooccurraceStats/"

In [24]:

In [ ]: