In [5]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import cPickle
import pymongo
import datetime


dbBases = ['Biology','English','Finance','MathOverflow','Physics','Statistics']
dataDir = "/home/cesar/Desktop/Doctorado/Projects/PopulationDynamics/Hierarchies/Results/Trees/"

%matplotlib inline

In [9]:
dbIndex = 1
siteName = dbBases[dbIndex]
print "Working With ",siteName

dbString = siteName
date0 = datetime.datetime(2006,1,1)
datef = datetime.datetime(2016,1,1)

client = pymongo.MongoClient("129.26.78.128",27017)

dbIndex = 1
siteName = dbBases[dbIndex]
client[dbString].authenticate("Cesar","gaussianprocess")
dataBase = client[dbString]
postCollections = dataBase["Posts{0}".format(2013)]


Working With  English

In [10]:
postCollections.find_one({'PostTypeId':1})['Tags']


Out[10]:
[u'prefixes']

In [11]:
specificTags = ['classical-mechanics']
Filter = False
if Filter:
    tags = postCollections.aggregate([{"$match":{"PostTypeId":1,
                                                 "CreationDate":{"$gt":date0,"$lt":datef},
                                                 "Tags":{"$in":specificTags}}
                                      },
                                      {"$project":{"_id":0,"Tags":1}}])
else:
    tags = postCollections.aggregate([{"$match":{"PostTypeId":1,
                                                 "CreationDate":{"$gt":date0,"$lt":datef}}
                                      },
                                      {"$project":{"_id":0,"Tags":1}}])

AllTags = np.array(list(set(np.concatenate([t["Tags"] for t in tags]))))
AllTags.sort()
print "Number of tags found: ",len(AllTags)
AllTags[:100]


Number of tags found:  661
Out[11]:
array([u'aave', u'abbreviations', u'ablaut', u'able-eable', u'above-below',
       u'abstract-nouns', u'accent', u'acronyms', u'active-voice',
       u'adjectives', u'adjunct', u'adverb-position', u'adverbs',
       u'affixes', u'agent-noun-suffix', u'agent-nouns', u'agreement',
       u'aint', u'all-of', u'all-the', u'alphabet', u'also-too',
       u'alternative', u'ambiguity', u'american-english', u'among-between',
       u'analogy', u'and-and', u'and-or', u'anglicization', u'animal',
       u'antecedents', u'antimeria', u'antipodean-english', u'antonyms',
       u'any-every', u'aphorism', u'apostrophe', u'appositives',
       u'appropriate', u'archaic', u'argument-structure', u'articles',
       u'as-like', u'aspect', u'aspiration', u'at-by', u'at-in', u'at-on',
       u'australian-english', u'auto-antonyms', u'auxiliary-verbs',
       u'backshifting', u'bare-conditional', u'bare-infinitive', u'be',
       u'be-deletion', u'best-most', u'blending', u'book-title', u'books',
       u'braces', u'brands', u'british-dialect', u'british-english',
       u'business-language', u'buzzword', u'can-could', u'can-may',
       u'can-vs-be-able', u'canadian-english', u'capitalization',
       u'caribbean-english', u'case', u'catachresis', u'catch-phrases',
       u'category', u'christmas', u'cinema', u'citation',
       u'class-based-usage', u'clauses', u'cliche', u'collective-nouns',
       u'collocation', u'colloquialisms', u'colon', u'colors', u'comma',
       u'comma-before-and', u'comma-splices', u'commands',
       u'common-pronunciation', u'commonweath-english', u'comparative',
       u'comparison', u'complement', u'complex-sentences', u'compliment',
       u'compound-adjectives'], 
      dtype='<U25')

In [22]:
AllTags[0]


Out[22]:
u'aave'

In [23]:
#OBTAIN TUPLE SET
alphaTags = postCollections.aggregate([{"$match":{"PostTypeId":1,"Tags":{"$in":['comparison']}}},
                                      {"$project":{"_id":0,"Tags":1}}])

In [24]:
alpha = [np.array(t["Tags"]) for t in alphaTags]
for a in alpha:
    a.sort()
tuplesArray = ["|".join(map(str,np.where(np.in1d(AllTags,a)== True)[0])) for a in alpha]
print tuplesArray


['95|320', '95|222|547|648', '95', '95|226', '85|95|112|199|547', '95|226|359', '24|95|262', '95|137', '95|151|486', '95', '95|184|334', '9|95|653', '94|95', '95|184', '22|95|521|621', '95|179', '95|226', '95|132', '95', '95|151|653', '95', '95', '95|112|222|335', '95|226|423|497|648', '95|226', '65|95|335|592', '51|95|400', '26|95|188|339|507', '95', '95|335', '95|142', '12|95|222']

Preprocessing of Tags For Tree


In [28]:
tupleSets = []
differentTuples = []
for i, mytag in enumerate(AllTags):
    alphaTags = postCollections.aggregate([{"$match":{"PostTypeId":1,"Tags":{"$in":[mytag]}}},
                                  {"$project":{"_id":0,"Tags":1}}])
    alpha = [np.array(t["Tags"]) for t in alphaTags]
    for a in alpha:
        a.sort()
    differentStrings = ["|".join(map(str,np.where(np.in1d(AllTags,a) == True)[0])) 
                        for a in alpha]
    
    differentTuples.append(differentStrings)
    tuplesArray = list(set(differentStrings))
    tuplesArray = np.array([np.array(map(int,t.split("|"))) for t in tuplesArray])
    #tuplesArray = np.array(map(int,list(tuplesArray)[0].split("|")))
    tupleSets.append(tuplesArray)
    
tupleSets = np.array(tupleSets) ## Each entry is a list with the arrays of different tuples which appear for tags
numberOfTuples = np.array(map(len,tupleSets)) ## How many different n-tuples appear
tagsInBranch = np.array(map(set,map(np.concatenate,tupleSets))) #which are the different tags which appear with each tag
numberOfTags = np.array(map(len,tagsInBranch)) #number of different tags encountered

In [32]:
numberOfTuples


Out[32]:
array([  1,  56,   1,   2,   1,   1,  12,  20,   8, 234,   2,   7, 127,
         4,   2,   6,  11,   1,   4,   1,   4,   1,  16,  87, 157,   1,
        23,   3,   1,   1,   4,   9,   1,   3,  58,   2,   3,  40,   5,
         6,  16,   5,  69,   1,   6,   2,   1,   3,   2,  24,   2,  26,
         1,   3,  13,   4,   3,   1,   1,   3,   8,   2,   1,   2, 138,
        23,   1,   3,   3,   5,   4,  61,   2,  17,   1,   8,   3,   8,
         1,  16,   3,  31,   4,   8,  50,  43,  20,   8,  83,   5,   3,
         2,   2,   4,  18,  25,   9,  10,   3,   2,   4,   3,  37,  20,
         5,   5,   3,  35,   3,   8,   9,   6,  90,  38,  11,   7,  11,
        13,  32,   1,  13,   4,  12,   6,   3,   2,   8,   1,   6,   8,
        18,   1,  21,   1,  50,   1,   9,  10,   1,   1,   3,   2,  18,
         6,   4,   2,  37,   4,  10,   2,  19, 231,   1,   9,   1,   5,
        11,   2,   1,  10,   4,   6,   9,   3,   8,  15,   3,   6,  24,
        28,   1,  12,   1,   2,   1,   1,  24,   3,   1, 293,   7,   3,
         6,  49, 222,   1,   3,   5,  14,   3,   6,   2,   1,   3,   2,
         1,   4,   3,   1,  71,   3,   6,  21,   8,   3,  24,   1,  16,
         9,   1,   4,   2,   1,   7,   7,   3,  52,   1,   5,   1,   2,
         2, 397,   5, 136,  43, 471,   6,   2,  12,   2,  14,   1,   2,
         4,   1,   1,   2,   2,   2,   4,  17,  68,   9,   1,   4,   9,
         8,   1,   7,   1,   1,   3,  25,  42,   1,   1,   5,   1,   3,
        25,   8, 213,   1,   2,  11,   1,   5,   3,   5,  31,   3,  20,
         7,   2,  15,  56,   3,   9,   8,   6,   1,   2,   2,   6,   7,
         6,  13,  15,  17,   1,   7,   1,  15,   1,   8,   1,   2,   5,
         1,  40,   3,   1,   4,   7,  22,   6,   6,   1,   4,   3,   5,
        38,   4,  20,  13,   1,   1,   2,  11,  32,   2,   2,  13,   9,
        21,   1,  21,   4,  16,   1,  10,   2,   5,  27, 643, 152,  12,
         8,  28,   4,   4,   8,   8,   6,   4,   4,   6,  52,   3,   1,
        12,   2,   1,  14,   7,   9,  47,   2,  57,   1,   3,   5,  21,
         1,   3,   5,   4,  13,   1,   5,   2,   3,  16,  15,  10, 293,
        13,  49,  15,   3,   1,   1,   3,  50,  12,   2,   3,   1,   5,
         4,   1,   2,   4,   6,   3, 108,   3,   3,   9,  25,   2,   2,
        20,   5,   3,  35,   1,   4,  10,   6,   1,   8,  22,  45,  30,
        70,   5,  38,  17,  18,   2,  19,  18,   3,   3,   4,  15,  21,
         2,  14,   2,  50,  33, 128, 263,   9,   2,   1,   4,   1,   3,
         2,   1,   1,  19,  32,   6,   3,   2,   1,   2,  68,   1,   2,
         1,   9,   3,   1,   4,  24,   1,  20, 188,  28,  50,   1,  35,
         1,   3,  38,   4,  21,   3, 105, 106,  40,  34,  25,   2,   3,
         6, 158,   6,  17,   8,  64,  16,  17,  17,   7,   1,   4,   2,
         1,   5,  11,  14,   5,   4,  19,  10,   9,  22,   4,   7,   2,
        20,   1,   6,   7,   1,  12,   1,   1,   6,  10,  24,  10,   3,
         1,  30,  21,  12,   3, 104,  13,  29,  21,  80,   9,   3,   2,
         2,   1,   6,   4,   5,   9,   5,  22, 363,  23,   9, 109,   4,
         1,   2,   1,   1,   1,  11,   3,   2,   4,   2,   2,   9,  18,
         2,  21,   9,   4,  11,   6,   2,  10,  18,  20,   7,  27,   3,
         1,  20,   4,  37,  14,   5,   1,   7,   7,   1, 103,  16,  34,
         2,   1,   1,  22,  11,   3, 117, 146,   1,   5,   3,   1,   8,
         7,   5,   1,   2,   4,   1,  46,  19,   4,   6,   1,   3,  12,
         4,  12,  18,  64,   5,  17,   1,   1,  18,   2, 165,   7,   7,
        69,   2,   5,   6, 304,  89,   2,   1,  26,   3,   5,   2,   2,
         7,   4,   3,   5,   1,   3,  12,   1,   5,   1,   2, 538,   6,
         2,  92,   4, 285,   1,   3,  31,  69,   3,   1,  17])

In [27]:
len(set(np.concatenate(differentTuples)))


Out[27]:
4663

In [10]:
len(tupleSets)


Out[10]:
1015

In [11]:
len(tagsInBranch)


Out[11]:
1015

In [43]:
sizesOfTuples = []
for tupleList in tupleSets:
    for t in tupleList:
        sizesOfTuples.append(len(t))

In [44]:
plt.hist(sizesOfTuples)
plt.show()

In [12]:
tupleSets


Out[12]:
array([array([[  0, 438, 462, 538]]),
       array([array([  1, 438, 477]), array([  1, 477, 946]),
       array([  1, 355, 438]), array([  1, 380, 438, 884]),
       array([  1, 438, 451]), array([  1,   2,  18, 670]),
       array([  1,   2, 438, 538]), array([1]), array([  1,  61, 438]),
       array([  1,  61, 438, 877, 940]), array([  1,  77, 438]),
       array([  1, 587, 723, 977]), array([  1, 812]),
       array([  1, 164, 866, 901]), array([  1, 125, 191, 255]),
       array([  1,  61, 415, 438]), array([  1, 266]),
       array([  1, 434, 438]), array([  1, 255, 438, 636]),
       array([  1, 473]), array([  1, 477]), array([  1, 538]),
       array([  1, 438, 680, 812]), array([  1, 380]), array([  1, 438]),
       array([  1, 924]), array([  1, 284, 438, 477])], dtype=object),
       array([array([  2,  61, 438]), array([  2,  61, 414, 587]),
       array([  2, 438, 812]), array([  1,   2,  18, 670]),
       array([  2, 255, 854, 882]), array([  2, 266, 438]),
       array([  2, 438, 924]), array([  2,  61, 511]),
       array([  2,  61, 266, 438, 940]), array([  2, 438, 877]),
       array([  2, 438, 854]), array([  2, 432, 894]), array([  2, 409]),
       array([  2, 352, 642]), array([  2,  61, 438, 812]),
       array([  2, 391, 438, 812]), array([  2, 438, 980]),
       array([  1,   2, 438, 538])], dtype=object),
       ..., array([[ 222,  950, 1012]]),
       array([array([ 142, 1013]), array([  60,  568,  834, 1013]),
       array([ 142,  717, 1013]), array([ 142,  782,  834, 1013]),
       array([ 142,  812,  831, 1013]),
       array([ 101,  142,  738,  831, 1013]),
       array([ 142,  705,  950, 1013]),
       array([  21,  142,  165,  717, 1013])], dtype=object),
       array([array([  18,  423,  825, 1014]), array([  86,  670, 1014]),
       array([ 364,  670, 1014]), array([  41,  228,  670,  825, 1014]),
       array([ 670, 1014]), array([ 293,  622,  670, 1014]),
       array([ 670,  823, 1014]), array([ 390,  670,  849, 1014]),
       array([  38,  670, 1014]), array([  61,  185,  284,  868, 1014]),
       array([ 104,  367,  888, 1014]), array([ 228,  749, 1014]),
       array([ 228, 1014]), array([  18,  310, 1014]),
       array([ 104,  670,  888, 1014]), array([1014]),
       array([ 462,  670, 1014]), array([ 670,  812, 1014]),
       array([ 104, 1014]), array([  18,   51,  344, 1014]),
       array([ 142,  423,  734,  849, 1014])], dtype=object)], dtype=object)

In [ ]: