Literals

  1. Num
  2. Str
  3. FormattedValue
  4. JoinedStr

Literals -- Data Structures

  1. List
  2. Tuple
  3. Set
  4. Dict

Variables

  1. Name
  2. NameConstant
  3. Starred

Unary Ops

  1. UnaryOp
  2. UAdd
  3. USub
  4. Not
  5. Invert

Math

  1. Add
  2. Sub
  3. Mult
  4. Div
  5. FloorDiv
  6. Mod
  7. Pow

Binary Ops

  1. LShift
  2. RShift
  3. BitOr
  4. BitXor
  5. BitAnd

BoolOp

  1. BoolOp
  2. And
  3. Or

Compare

  1. Compare
  2. Eq
  3. NotEq
  4. Lt
  5. LtE
  6. Gt
  7. GtE
  8. Is
  9. IsNot
  10. In
  11. NotIn

Subscripting

  1. Subscript
  2. Index
  3. Slice
  4. ExtSlice

In [1]:
import json
import ast
def get_node_dictionary():
    infile = open('/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/Posts_FullInfoSmall.xml')
    count = 0
    node_counter = {}
    total = 0 

    for line in infile:
        count += 1
        try:
            line_obj = json.loads(line)
        except:
            continue
        for code_block in line_obj['CodeBlocks']:
            if code_block['Guesslang'].lower().strip() == 'python' and code_block['Parsable'] == 'True':
                code = code_block['code']
                try:
                    a = ast.parse(code)
                except:
                    continue
                total+=1
                for l in ast.walk(a):
                    node_name = str(l).split()[0][2:] 
                    if node_name not in node_counter:
                        node_counter[node_name] = 0
                    node_counter[node_name] += 1
        if count % 10000 == 0:
            print(count)
            print('\t',total)
    return node_counter

In [2]:
nc = get_node_dictionary()


10000
	 2838
20000
	 5698
30000
	 8673
40000
	 11652
50000
	 14546
60000
	 17471
70000
	 20461
80000
	 23699
90000
	 26746
100000
	 29719
110000
	 32588
120000
	 35526
130000
	 38487
140000
	 41561
150000
	 44664
160000
	 47623
170000
	 50556
180000
	 53531
190000
	 56314
200000
	 58941
210000
	 61707
220000
	 64524
230000
	 67397
240000
	 70131
250000
	 73007
260000
	 75713
270000
	 78520
280000
	 81193
290000
	 83978
300000
	 86796
310000
	 89505
320000
	 92266
330000
	 94879
340000
	 97615
350000
	 100323
360000
	 102890
370000
	 105581
380000
	 108155
390000
	 110984
400000
	 113616
410000
	 116244
420000
	 118767
430000
	 121419
440000
	 124165
450000
	 127001
460000
	 129712
470000
	 132380
480000
	 135140
490000
	 137823
500000
	 140451
510000
	 143203
520000
	 145929
530000
	 148731
540000
	 151412
550000
	 154128
560000
	 156938
570000
	 159595
580000
	 162304
590000
	 165250
600000
	 168034
610000
	 170806
620000
	 173648
630000
	 176532
640000
	 179436
650000
	 182260
660000
	 185145
670000
	 188007
680000
	 190838
690000
	 193736
700000
	 196462
710000
	 199261
720000
	 202000
730000
	 204809
740000
	 207712
750000
	 210696
760000
	 213652
770000
	 216494
780000
	 219336
790000
	 222246
800000
	 225079
810000
	 228068
820000
	 230832
830000
	 233812
840000
	 236806
850000
	 239698
860000
	 242593
870000
	 245583
880000
	 248621
890000
	 251664
900000
	 254674
910000
	 257641
920000
	 260722
930000
	 263764
940000
	 266849
950000
	 269901
960000
	 272981
970000
	 276070
980000
	 279128
990000
	 282218
1000000
	 285196
1010000
	 288214
1020000
	 291363
1030000
	 294377
1040000
	 297310
1050000
	 300309
1060000
	 303288
1070000
	 306313
1080000
	 309303
1090000
	 312385
1100000
	 315496
1110000
	 318470
1120000
	 321379
1130000
	 324302
1140000
	 327378
1150000
	 330399
1160000
	 333465
1170000
	 336632
1180000
	 339731
1190000
	 342769
1200000
	 345894
1210000
	 349011
1220000
	 352297
1230000
	 355529
1240000
	 358767
1250000
	 361924
1260000
	 365126
1270000
	 368445
1280000
	 371502
1290000
	 374670
1300000
	 377705
1310000
	 380717
1320000
	 383872
1330000
	 386937
1340000
	 390080
1350000
	 393062
1360000
	 396177
1370000
	 399363
1380000
	 402514
1390000
	 405722
1400000
	 408945
1410000
	 412088
1420000
	 415206
1430000
	 418326
1440000
	 421504
1450000
	 424633
1460000
	 427737
1470000
	 430841
1480000
	 433979
1490000
	 437159
1500000
	 440321
1510000
	 443378
1520000
	 446608
1530000
	 449709
1540000
	 452785
1550000
	 455836
1560000
	 459074
1570000
	 462101
1580000
	 465135
1590000
	 468253
1600000
	 471392
1610000
	 474627
1620000
	 477874
1630000
	 481111
1640000
	 484314
1650000
	 487601
1660000
	 490787
1670000
	 493933
1680000
	 497040
1690000
	 500150
1700000
	 503281
1710000
	 506367
1720000
	 509345
1730000
	 512404
1740000
	 515532
1750000
	 518706
1760000
	 522021
1770000
	 525251
1780000
	 528528
1790000
	 531696
1800000
	 535146
1810000
	 538447
1820000
	 541770
1830000
	 545037
1840000
	 548484
1850000
	 551866
1860000
	 555398
1870000
	 558639
1880000
	 561896
1890000
	 565358
1900000
	 568916
1910000
	 572478
1920000
	 575940
1930000
	 579520
1940000
	 583010
1950000
	 586457
1960000
	 589910
1970000
	 593301
1980000
	 596646
1990000
	 600199
2000000
	 603603
2010000
	 607055
2020000
	 610365
2030000
	 613675
2040000
	 617136
2050000
	 620520
2060000
	 624057
2070000
	 627581
2080000
	 631011
2090000
	 634672
2100000
	 638102
2110000
	 641575
2120000
	 644824
2130000
	 648284
2140000
	 651715
2150000
	 655218
2160000
	 658621
2170000
	 661936
2180000
	 665392
2190000
	 668906
2200000
	 672501
2210000
	 676028
2220000
	 679556
2230000
	 683044
2240000
	 686491
2250000
	 689854
2260000
	 693169
2270000
	 696719
2280000
	 700170
2290000
	 703669
2300000
	 707216
2310000
	 710546
2320000
	 714160
2330000
	 717682
2340000
	 721228
2350000
	 724844
2360000
	 728556
2370000
	 732123
2380000
	 735745
2390000
	 739303
2400000
	 742956
2410000
	 746803
2420000
	 750365
2430000
	 754148
2440000
	 757856
2450000
	 761670
2460000
	 765382
2470000
	 769056
2480000
	 772604
2490000
	 776376
2500000
	 779897
2510000
	 783455
2520000
	 787160
2530000
	 790897
2540000
	 794542
2550000
	 798151
2560000
	 802044
2570000
	 805969
2580000
	 809862
2590000
	 813609
2600000
	 817316
2610000
	 821235
2620000
	 825023
2630000
	 828955
2640000
	 832837
2650000
	 836781
2660000
	 840710
2670000
	 844674
2680000
	 848442
2690000
	 852112
2700000
	 855868
2710000
	 859644
2720000
	 863380
2730000
	 867092
2740000
	 870877
2750000
	 874698
2760000
	 878483
2770000
	 882173
2780000
	 885994
2790000
	 889874
2800000
	 893565
2810000
	 897363
2820000
	 901124
2830000
	 904843
2840000
	 908686
2850000
	 912480
2860000
	 916238
2870000
	 920062
2880000
	 923886
2890000
	 927730
2900000
	 931496
2910000
	 935349
2920000
	 939177
2930000
	 943173
2940000
	 946736
2950000
	 950522
2960000
	 954478
2970000
	 958473
2980000
	 962445
2990000
	 966430
3000000
	 970296
3010000
	 974414
3020000
	 978472
3030000
	 982518
3040000
	 986321
3050000
	 990318
3060000
	 994398
3070000
	 998551
3080000
	 1002671
3090000
	 1006823
3100000
	 1011031
3110000
	 1015099
3120000
	 1019232
3130000
	 1023469
3140000
	 1027552
3150000
	 1031632
3160000
	 1035920
3170000
	 1040300
3180000
	 1044383
3190000
	 1048566
3200000
	 1052751
3210000
	 1057083
3220000
	 1061309
3230000
	 1065571
3240000
	 1069715
3250000
	 1073775
3260000
	 1077858
3270000
	 1082101
3280000
	 1086285
3290000
	 1090557
3300000
	 1094753
3310000
	 1098815
3320000
	 1102981
3330000
	 1107180
3340000
	 1111456
3350000
	 1115760
3360000
	 1119869
3370000
	 1124241
3380000
	 1128565
3390000
	 1132786
3400000
	 1137290
3410000
	 1141582
3420000
	 1145914
3430000
	 1150256
3440000
	 1154545
3450000
	 1158750
3460000
	 1163066
3470000
	 1167316
3480000
	 1171613
3490000
	 1176030
3500000
	 1180307
3510000
	 1184719
3520000
	 1189027
3530000
	 1193350
3540000
	 1198080
3550000
	 1202579
3560000
	 1207063
3570000
	 1211523
3580000
	 1216049
3590000
	 1220370
3600000
	 1224953
3610000
	 1229472
3620000
	 1234032
3630000
	 1238570
3640000
	 1242884
3650000
	 1247495
3660000
	 1251970
3670000
	 1256447
3680000
	 1260888
3690000
	 1265534
3700000
	 1270044
3710000
	 1274828
3720000
	 1279570
3730000
	 1284045
3740000
	 1288712
3750000
	 1293527
3760000
	 1297934
3770000
	 1302607
3780000
	 1307337
3790000
	 1311818
3800000
	 1316615
3810000
	 1321538
3820000
	 1326003
3830000
	 1330764
3840000
	 1335662
3850000
	 1340308
3860000
	 1345163
3870000
	 1349966
3880000
	 1354737
3890000
	 1359542
3900000
	 1364339
3910000
	 1369151
3920000
	 1374068
3930000
	 1378726
3940000
	 1383747
3950000
	 1388645
3960000
	 1393732
3970000
	 1398621
3980000
	 1403327
3990000
	 1408109
4000000
	 1412829
4010000
	 1417482
4020000
	 1422202
4030000
	 1427018
4040000
	 1431624
4050000
	 1436601
4060000
	 1441537
4070000
	 1446249
4080000
	 1451062
4090000
	 1455813
4100000
	 1460731

In [3]:
nc.keys()


Out[3]:
dict_keys(['ast.Break', 'ast.Call', 'ast.ImportFrom', 'ast.withitem', 'ast.UAdd', 'ast.Slice', 'ast.Return', 'ast.Module', 'ast.ExtSlice', 'ast.IfExp', 'ast.Tuple', 'ast.MatMult', 'ast.Name', 'ast.Lt', 'ast.AugAssign', 'ast.Add', 'ast.arg', 'ast.Continue', 'ast.Num', 'ast.USub', 'ast.Raise', 'ast.If', 'ast.Assign', 'ast.UnaryOp', 'ast.ClassDef', 'ast.Set', 'ast.Dict', 'ast.Delete', 'ast.BitAnd', 'ast.List', 'ast.And', 'ast.Yield', 'ast.NameConstant', 'ast.YieldFrom', 'ast.Pass', 'ast.arguments', 'ast.Lambda', 'ast.Global', 'ast.FloorDiv', 'ast.With', 'ast.Import', 'ast.NotIn', 'ast.Invert', 'ast.Attribute', 'ast.Store', 'ast.Mod', 'ast.Ellipsis', 'ast.GeneratorExp', 'ast.DictComp', 'ast.keyword', 'ast.RShift', 'ast.AsyncFunctionDef', 'ast.AsyncWith', 'ast.In', 'ast.Expr', 'ast.ListComp', 'ast.BitOr', 'ast.Sub', 'ast.Div', 'ast.Not', 'ast.comprehension', 'ast.Subscript', 'ast.Is', 'ast.For', 'ast.FunctionDef', 'ast.Str', 'ast.BoolOp', 'ast.GtE', 'ast.Bytes', 'ast.NotEq', 'ast.Load', 'ast.Try', 'ast.Compare', 'ast.Del', 'ast.BinOp', 'ast.alias', 'ast.SetComp', 'ast.LShift', 'ast.Index', 'ast.Assert', 'ast.Mult', 'ast.Nonlocal', 'ast.ExceptHandler', 'ast.LtE', 'ast.Await', 'ast.Or', 'ast.IsNot', 'ast.Gt', 'ast.Eq', 'ast.Pow', 'ast.Starred', 'ast.While', 'ast.AsyncFor', 'ast.BitXor'])

In [6]:
new_nc = {}
for key in nc:
    new_nc[key[4:]] = nc[key]

In [9]:
total = 0
for el in new_nc:
    total += new_nc[el]
for el in new_nc:
    new_nc[el] = new_nc[el]/total

In [24]:
print('         Node name               Percentage of total nodes')
cat_set = set()
for category in categories:
    print(category[0])
    total_num = 0
    for el in category[1]:
        if el not in new_nc:
            cat_set.add(el)
            extended_el = el + ' '*(15-len(el))
            print('\t',extended_el,':\t',0)
            print('\t' + '-'*30)
            continue
        total_num += new_nc[el]
        extended_el = el + ' '*(15-len(el))
        print('\t',extended_el,':\t',"%.3f" % new_nc[el])
        print('\t' + '-'*30)

        cat_set.add(el)
    print('\tTOTAL:                  ',"%.3f" % total_num)
    print('*'*50)

print('Misc')
total_num = 0
for el in new_nc.keys():
    if el not in cat_set:
        extended_el = el + ' '*(15-len(el))
        print('\t',extended_el,':\t',"%.3f" % new_nc[el])
        print('\t' + '-'*30)
        total_num += new_nc[el]
print('\tTOTAL:                  ',"%.3f" % total_num)


         Node name               Percentage of total nodes
Literals
	 Num             :	 0.036
	------------------------------
	 Str             :	 0.048
	------------------------------
	 FormattedValue  :	 0
	------------------------------
	 JoinedStr       :	 0
	------------------------------
	TOTAL:                   0.083
**************************************************
Variables
	 Name            :	 0.201
	------------------------------
	 NameConstant    :	 0.006
	------------------------------
	 Starred         :	 0.000
	------------------------------
	TOTAL:                   0.208
**************************************************
UnaryOps
	 UnaryOp         :	 0.002
	------------------------------
	 UAdd            :	 0.000
	------------------------------
	 USub            :	 0.002
	------------------------------
	 Not             :	 0.001
	------------------------------
	 Invert          :	 0.000
	------------------------------
	TOTAL:                   0.004
**************************************************
Math
	 Add             :	 0.006
	------------------------------
	 Sub             :	 0.002
	------------------------------
	 Mult            :	 0.002
	------------------------------
	 Div             :	 0.001
	------------------------------
	 FloorDiv        :	 0.000
	------------------------------
	 Mod             :	 0.001
	------------------------------
	 Pow             :	 0.000
	------------------------------
	TOTAL:                   0.012
**************************************************
Binary Ops
	 LShift          :	 0.000
	------------------------------
	 RShift          :	 0.000
	------------------------------
	 BitOr           :	 0.000
	------------------------------
	 BitXor          :	 0.000
	------------------------------
	 BitAnd          :	 0.000
	------------------------------
	TOTAL:                   0.000
**************************************************
BoolOp
	 BoolOp          :	 0.001
	------------------------------
	 And             :	 0.001
	------------------------------
	 Or              :	 0.000
	------------------------------
	TOTAL:                   0.002
**************************************************
Compare
	 Compare         :	 0.007
	------------------------------
	 Eq              :	 0.003
	------------------------------
	 NotEq           :	 0.000
	------------------------------
	 Lt              :	 0.001
	------------------------------
	 LtE             :	 0.000
	------------------------------
	 Gt              :	 0.001
	------------------------------
	 GtE             :	 0.000
	------------------------------
	 Is              :	 0.000
	------------------------------
	 IsNot           :	 0.000
	------------------------------
	 In              :	 0.001
	------------------------------
	 NotIn           :	 0.000
	------------------------------
	TOTAL:                   0.013
**************************************************
Subscripting
	 Subscript       :	 0.012
	------------------------------
	 Index           :	 0.011
	------------------------------
	 Slice           :	 0.002
	------------------------------
	 ExtSlice        :	 0.000
	------------------------------
	TOTAL:                   0.025
**************************************************
Misc
	 AugAssign       :	 0.001
	------------------------------
	 Call            :	 0.073
	------------------------------
	 Load            :	 0.246
	------------------------------
	 Import          :	 0.004
	------------------------------
	 Pass            :	 0.000
	------------------------------
	 comprehension   :	 0.002
	------------------------------
	 DictComp        :	 0.000
	------------------------------
	 keyword         :	 0.018
	------------------------------
	 Store           :	 0.053
	------------------------------
	 List            :	 0.007
	------------------------------
	 Continue        :	 0.000
	------------------------------
	 Module          :	 0.014
	------------------------------
	 AsyncFunctionDef :	 0.000
	------------------------------
	 Lambda          :	 0.001
	------------------------------
	 Bytes           :	 0.000
	------------------------------
	 alias           :	 0.010
	------------------------------
	 IfExp           :	 0.000
	------------------------------
	 ClassDef        :	 0.003
	------------------------------
	 Break           :	 0.000
	------------------------------
	 Delete          :	 0.000
	------------------------------
	 ExceptHandler   :	 0.001
	------------------------------
	 MatMult         :	 0.000
	------------------------------
	 AsyncFor        :	 0.000
	------------------------------
	 Ellipsis        :	 0.001
	------------------------------
	 ImportFrom      :	 0.004
	------------------------------
	 arg             :	 0.013
	------------------------------
	 BinOp           :	 0.011
	------------------------------
	 FunctionDef     :	 0.008
	------------------------------
	 If              :	 0.006
	------------------------------
	 Await           :	 0.000
	------------------------------
	 While           :	 0.001
	------------------------------
	 With            :	 0.001
	------------------------------
	 Raise           :	 0.000
	------------------------------
	 Try             :	 0.001
	------------------------------
	 For             :	 0.004
	------------------------------
	 Tuple           :	 0.007
	------------------------------
	 Return          :	 0.005
	------------------------------
	 ListComp        :	 0.001
	------------------------------
	 Attribute       :	 0.073
	------------------------------
	 Del             :	 0.000
	------------------------------
	 Nonlocal        :	 0.000
	------------------------------
	 Yield           :	 0.000
	------------------------------
	 arguments       :	 0.009
	------------------------------
	 withitem        :	 0.001
	------------------------------
	 Assert          :	 0.000
	------------------------------
	 Set             :	 0.000
	------------------------------
	 Global          :	 0.000
	------------------------------
	 YieldFrom       :	 0.000
	------------------------------
	 Dict            :	 0.002
	------------------------------
	 AsyncWith       :	 0.000
	------------------------------
	 SetComp         :	 0.000
	------------------------------
	 Expr            :	 0.028
	------------------------------
	 Assign          :	 0.041
	------------------------------
	 GeneratorExp    :	 0.000
	------------------------------
	TOTAL:                   0.652

In [4]:
categories = [
    ('Literals', [
    'Num',
    'Str',
    'FormattedValue',
    'JoinedStr',
    
    ]),
    
    ('Variables', [
        'Name',
        'NameConstant',
        'Starred',
    ]),
    
    ('UnaryOps', [
        'UnaryOp',
        'UAdd',
        'USub',
        'Not',
        'Invert'
    ]),
    
    ('Math', [
        'Add',
        'Sub',
        'Mult',
        'Div',
        'FloorDiv',
        'Mod',
        'Pow'
    ]),
    
    ('Binary Ops', [
        'LShift',
        'RShift',
        'BitOr',
        'BitXor',
        'BitAnd'
    ]),
    ('BoolOp', [
        'BoolOp',
        'And',
        'Or'
    ]),
    ('Compare', [
    'Compare',
    'Eq',
    'NotEq',
    'Lt',
    'LtE',
    'Gt',
    'GtE',
    'Is',
    'IsNot',
    'In',
    'NotIn',
    ]),
    ('Subscripting', [
        'Subscript',
        'Index',
        'Slice',
        'ExtSlice',
    ]),

]

In [ ]: