In [18]:
import cPickle
import datetime
import os

import numpy as np
import pandas as pd
import pymongo
from matplotlib import pyplot as plt
# Names of the per-site databases that the notebook iterates over.
dbBases = [
    'Biology',
    'English',
    'Finance',
    'MathOverflow',
    'Physics',
    'Statistics',
]

# Absolute, machine-specific directory for tree results.
dataDir = "/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/Hierarchies/Results/Trees/"
%matplotlib inline
In [19]:
dbIndex = 1
siteName = dbBases[dbIndex]
print "Working With ",siteName
dbString = siteName
date0 = datetime.datetime(2006,1,1)
datef = datetime.datetime(2016,1,1)
client = pymongo.MongoClient("129.26.78.128",27017)
dataBase = client[dbString]
postCollections = dataBase["Posts{0}".format(2013)]
In [20]:
# Accumulator for the per-site statistics, keyed by database name.
dataPerDataBase = dict()
In [22]:
for dbIndex,dbString in enumerate(dbBases):
siteName = dbBases[dbIndex]
print "Site Name ",siteName
client[dbString].authenticate("Cesar","gaussianprocess")
dataBase = client[dbString]
#====================================================
Tags = []
Filter = False
for year in range(2006,2016):
postCollections = dataBase["Posts{0}".format(year)]
tags = postCollections.aggregate([{"$match":{"PostTypeId":1,
"CreationDate":{"$gt":date0,"$lt":datef}}
},
{"$project":{"_id":0,"Tags":1}}])
Tags.extend([t for t in tags])
AllTags = np.array(list(set(np.concatenate([t["Tags"] for t in Tags]))))
AllTags.sort()
print "Number of tags found: ",len(AllTags)
#======================================================
tupleSets = []
differentTuples = []
fullTuples = []
tuplesPerTag = {}
differentTuplesPerTag = {}
for i, mytag in enumerate(AllTags):
alpha = []
for year in range(2006,2016):
postCollections = dataBase["Posts{0}".format(year)]
alphaTags = postCollections.aggregate([{"$match":{"PostTypeId":1,"Tags":{"$in":[mytag]}}},
{"$project":{"_id":0,"Tags":1}}])
alpha.extend([np.array(t["Tags"]) for t in alphaTags])
for a in alpha:
a.sort()
differentStrings = ["|".join(map(str,np.where(np.in1d(AllTags,a) == True)[0]))
for a in alpha]
tuplesArray = list(set(differentStrings))
fullTuples.extend(tuplesArray)
tuplesPerTag[mytag] = len(differentStrings)
differentTuplesPerTag[mytag] = len(tuplesArray)
dataPerDataBase[dbString] = (AllTags,tuplesPerTag,
differentTuplesPerTag,len(list(set(fullTuples))))
#==========================================================
In [23]:
# Directory that receives the pickled co-occurrence statistics.
# Absolute, machine-specific path.
resultsDir = "/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/Hierarchies/Results/BasicCooccurraceStats/"
In [24]:
# Persist the per-site statistics collected in dataPerDataBase.
# The file is opened in binary mode and inside a `with` block so the handle
# is flushed and closed even if pickling fails part-way.
with open(resultsDir+"coOccurranceStats.cpickle","wb") as statsFile:
    cPickle.dump(dataPerDataBase,statsFile)
In [ ]: