Here we will compare how much code exists in each stack exchange


In [1]:
import time
import xml.etree.cElementTree
import numpy as np
import ast
from html.parser import HTMLParser

# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.code_start = 0
        self.code = []
    
    def reset_vars(self):
        self.code = []
        self.code_start = 0

    def handle_starttag(self, tag, attrs):
        if self.code_start == 1:
            pass
        else:
            if tag == 'code':
                self.code_start = 1
                self.code.append('')

    def handle_endtag(self, tag):
        if tag == 'code':
            self.code_start = 0

    def handle_data(self, data):
        if self.code_start == 1:
            self.code[-1] += data

In [20]:
import tqdm
import datetime
def is_valid_python(code):
    try:
        ast.parse(code)
    except SyntaxError:
        return False
        if 'import ' in code:
            return True
        else:
            return False
    return True

def parse_file(filename):
    # instantiate the parser and fed it some HTML
    parser = MyHTMLParser()
    
    first_day = datetime.datetime(2008,1,1)
    all_code = []
    python_code = []
    code_time = []
    total = 0
    for event, elem in xml.etree.ElementTree.iterparse(filename):
        total += 1
        if total == 10000*300:
            break
        cur_code = []
        for neighbor in elem.iter('row'):
            if 'Body' not in neighbor.attrib:
                continue
            bod = (neighbor.attrib['Body'])
            if total % 10000 == 0:
                creation_date = neighbor.attrib['CreationDate']
                print(creation_date)
            parser.feed(bod)
            for el in parser.code:
                cur_code.append(el)
                if is_valid_python(el):
                    cd = neighbor.attrib['CreationDate']
                    #print(cd)
                    #print(cd[:4],cd[5:7],cd[9:10])
                    d = datetime.datetime(year=int(cd[:4]),month=int(cd[5:7]), day=int(cd[8:10]))
                    diff_time = d-first_day
                    python_code.append(el)
            parser.reset_vars()
        elem.clear()
        all_code.append(cur_code)


    return all_code, python_code

%matplotlib inline
import matplotlib.pyplot as plt
def print_summary(all_code, python_code):
    print('We have looked through',len(all_code),'posts.')
    zero_posts = len([el for el in all_code if len(el) > 0])
    print('We have',zero_posts,'posts with code')
    print('Our average code blocks per code post is:',np.mean([len(el) for el in all_code if len(el) > 0]))
    print('Number of python code blocks is:',len(python_code),'out of',np.sum([len(el) for el in all_code]),'total blocks')
    larger_than_1 = len([el for el in python_code if len(el.split('\n')) > 1])
    print('Number of python code blocks that are greater than one line of code is:',larger_than_1)
    plt.hist([len(code) for code in python_code if len(code) < 2000],bins=100)

In [18]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/iot/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for iot:')
print_summary(all_code, python_code)


Summary for iot:
We have looked through 2345 posts.
We have 217 posts with no code
Our average code blocks per code post is: 3.161290322580645
Number of python code blocks is: 307 out of 686 total blocks
Number of python code blocks that are greater than one line of code is: 28

In [21]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/cs/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for cs:')
print_summary(all_code, python_code)


2013-10-07T13:29:53.487
2014-12-19T16:06:15.420
2016-01-28T16:31:28.337
2017-01-06T17:47:24.807
2017-11-07T06:41:58.393
Summary for iot:
We have looked through 54374 posts.
We have 7320 posts with no code
Our average code blocks per code post is: 4.3740437158469945
Number of python code blocks is: 18808 out of 32018 total blocks
Number of python code blocks that are greater than one line of code is: 879

In [22]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/datascience/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for datascience:')
print_summary(all_code, python_code)


2017-01-18T00:44:44.920
Summary for datascience:
We have looked through 19214 posts.
We have 5295 posts with no code
Our average code blocks per code post is: 2.965816808309726
Number of python code blocks is: 10243 out of 15704 total blocks
Number of python code blocks that are greater than one line of code is: 2579

In [23]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/opensource/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for opensource:')
print_summary(all_code, python_code)


Summary for opensource:
We have looked through 5000 posts.
We have 507 posts with no code
Our average code blocks per code post is: 2.5996055226824457
Number of python code blocks is: 713 out of 1318 total blocks
Number of python code blocks that are greater than one line of code is: 16

In [24]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/ai/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for ai:')
print_summary(all_code, python_code)


Summary for ai:
We have looked through 3901 posts.
We have 311 posts with no code
Our average code blocks per code post is: 3.559485530546624
Number of python code blocks is: 736 out of 1107 total blocks
Number of python code blocks that are greater than one line of code is: 91

In [25]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/crypto/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for crypto:')
print_summary(all_code, python_code)


2014-02-26T21:10:29.470
2015-12-30T09:50:21.823
2017-05-18T18:13:13.897
Summary for crypto:
We have looked through 36244 posts.
We have 4858 posts with no code
Our average code blocks per code post is: 3.821737340469329
Number of python code blocks is: 11373 out of 18566 total blocks
Number of python code blocks that are greater than one line of code is: 776

In [26]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/devops/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for devops:')
print_summary(all_code, python_code)


Summary for devops:
We have looked through 3327 posts.
We have 1187 posts with no code
Our average code blocks per code post is: 3.486099410278012
Number of python code blocks is: 1742 out of 4138 total blocks
Number of python code blocks that are greater than one line of code is: 90

In [27]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/engineering/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for engineering:')
print_summary(all_code, python_code)


2017-06-06T10:18:39.477
Summary for engineering:
We have looked through 13388 posts.
We have 317 posts with no code
Our average code blocks per code post is: 2.675078864353312
Number of python code blocks is: 368 out of 848 total blocks
Number of python code blocks that are greater than one line of code is: 36

In [28]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/opendata/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for opendata:')
print_summary(all_code, python_code)


Summary for opendata:
We have looked through 9693 posts.
We have 1138 posts with no code
Our average code blocks per code post is: 2.393673110720562
Number of python code blocks is: 1219 out of 2724 total blocks
Number of python code blocks that are greater than one line of code is: 168

In [37]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/softwareengineering/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for softwareengineering:')
print_summary(all_code, python_code)


2010-12-07T13:23:14.763
2011-02-12T05:47:17.937
2011-04-15T05:21:17.263
2011-07-02T22:36:22.167
2011-09-12T18:22:11.480
2011-12-09T19:42:01.447
2012-03-19T07:33:15.280
2012-07-06T16:28:16.270
2012-11-02T04:45:33.800
2013-03-20T13:50:09.767
2013-08-14T15:13:59.240
2014-02-03T21:49:41.087
2014-07-17T00:13:04.383
2015-01-15T17:20:40.477
2015-07-10T15:43:06.047
2016-02-11T12:08:10.067
2016-08-05T18:07:56.790
2017-02-15T12:57:47.620
2017-09-22T15:13:56.980
Summary for softwareengineering:
We have looked through 196987 posts.
We have 54370 posts with no code
Our average code blocks per code post is: 4.495696155968365
Number of python code blocks is: 149471 out of 244431 total blocks
Number of python code blocks that are greater than one line of code is: 6173

In [40]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/webapps/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for webapps:')
print_summary(all_code, python_code)


2011-06-02T09:26:04.963
2012-09-21T04:08:27.747
2013-12-10T04:28:10.343
2015-05-14T03:34:06.127
2016-10-10T12:55:12.650
Summary for webapps:
We have looked through 59962 posts.
We have 13132 posts with no code
Our average code blocks per code post is: 2.7130673164788304
Number of python code blocks is: 14066 out of 35628 total blocks
Number of python code blocks that are greater than one line of code is: 612

In [31]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/italian/Posts.xml'
all_code, python_code = parse_file(filename)
print('Summary for italian:')
print_summary(all_code, python_code)


Summary for italian:
We have looked through 5673 posts.
We have 202 posts with no code
Our average code blocks per code post is: 3.4257425742574257
Number of python code blocks is: 348 out of 692 total blocks
Number of python code blocks that are greater than one line of code is: 4

In [32]:
print(python_code)


['ó', 'O', 'l', '1,898,196', '1,135,672', 'lemma/inflections', 'lemma/inflections', 'a', 'b', 'a·b', 'dal', 'di', 'dal', 'tintìn', 'tin-tin', '"voi"', '"vous"', '"tu"', '"ti raccomando"', '"mi raccomando"', '"I recommend you"', '"I recommend to myself"', 'schifo', 'amelioration', 'Io', 'X', 'X-1', 'X', 'X+1', 'X', 'X-1', 'X', 'X-1', 'b = 2', 'randomatlabuser', 'Evasivo', 'Elusivo', 'mai', 'ever', 'giammai', 'giammai', 'mai', 'affatto', 'VB.NET', 'alcuno/a', 'qualche', 'dentro', 'dentro', 'sentirsi', 'sembrarsi', 'decreto', 'Tienes', 'Tener', 'Tienes', 'Hai', 'Avete', 'Avere', 'Hai', 'Avete', 'Avete', 'h', 'Agenzia', 'Indirizzo', 'Shift', 'Maiusc', 'Ctrl', 'Alt', 'Italiano', 'Italiano (142)', 'AltGr+e', 'AltGr+5', 'è', 'È', 'SHIFT+F3', 'AltGr', 'AltGr', 'qualità', 'Alt', 'è', 'perché', 'why or because', 'o', 'però', 'but', 'ho', 'o', 'o', 'ò', 'ó', 'e', 'o', 'perché', 'perchè', 'perché', 'e', 'PIÙ', 'Piú', 'e', 'o', 'Da', 'Dà', 'Dai', 'poco', 'Po', 'la', 'È', 'É', 'À', 'Alt+125', 'charmap', 'E', 'È', 'É', 'è', 'È', 'é', 'Shift+è', 'É', 'gli', 'li', 'gni', 'ni', 'gni', 'ni', '...', 'è', 'é', 'eh', 'eh', 'cartelloni', 'avresti', 'acca', 'che', 'chi', 'ghe', 'ghi', 'ahó', 'ohimè', 'avere', 'ho', 'hai', 'ha', 'hanno', 'ah', 'òh', 'beh', 'boh', '"mettere loro a disposizione"', '"mettere a loro disposizione"', '"mettere alla loro disposizione"', '"loro"', '"to them"', '"mettere"', 'î', '"ignorante"', '"OK"\n"Capito"\n"Chiaro"\n"Va bene"\n', 'ne', 'Un', 'Uno', 'Una', 'macchina', 'automobile', 'obiezione', 'obbiezione', 'macchina', 'a', 'XX', 'nella', 'dalla', 'dalla', 'nella', 'bravo', 'buono', 'Piccolo', 'Basso', 'Small', 'essere', 'gente', 'ricaricare', 'insegnante', 'maestro', 'donne', 'incinte', 'a', 'e', 'dargli\nribellarsi\n', 'affatto', 'le', 'gli', 'le', 'le', 'gli', 'fatto', '"Nave <nome_della_nave>"', '"<nome_della_nave>"', '"Nave Giuseppe Garibaldi"', '"Giuseppe Garibaldi"', 'cardine', 'cardinale', 'decumano', 'tanto', 'che', 'poco', 'talmente', 'tanto', 'che', 'tanto', 'talmente', 'talmente', 'che', 'da', 'tra', 'di/...', 'di', 'tra', 'di', 'tra', 'di', 'tra', 'mi', 'mi', 'manifattura', 'manufactus', 'manifattura', 'finire', 'a', 'andare', 'a', 'albergo', 'teatro', 'scuola', 'stadio', 'lavoro', 'molestatore', 'persecutore', 'Artefatto', 'figlio/figli', 'Costruzione', 'figlia/figlie', 'symbol+number', 'cucina', 'cucina', 'questo', 'addato', 'giusto', 'nuovo', 'niente', 'gg/MM/aaaa\n', 'dirimere', 'risolvere', '" "', 'o', 'vel', 'scunire', 'Scunire', '"l\'obbiettivo erano questi quadri"', '"Il ladro aveva un solo obbiettivo, l\'obbiettivo era questi quadri"', 'pesto', 'fate', 'Loro', 'loro', 'voi', 'voi', 'list', 'directory', 'spaventevole', 'spaventoso', 'spaventevole', 'enorme', 'ingente', 'vertiginoso', 'molla', 'molle', 'mollare', 'allentare', 'mollare', 'lasciare', 'spendere', 'spendibile', 'falcidiare', 'falce', 'falciare', 'falcidia', 'calice', 'bicchiere', 'calice', 'calice', 'minestra', 'minestra', 'minestrone', 'zuppa', 'parlare', 'raspare', 'la', 'km/ora', 'Mangio', 'Mangi', 'Mangia', 'Mangiamo', 'Mangiate', 'Mangiano', 'Il, lo, la, i, gli, le', 'Un, uno, una', 'il', 'la', 'dello', 'il', 'la', 'effetto', 'annunci', 'Vigàta', 'nirbùso', 'programma', 'programma', 'programma', 'programma', 'a', 'o/u', 'la', 'il', 'a', 'o', 'avere', 'troon', 'Lalla', 's', 's', 's', 'a', 'da', 'da + infinitive', 'to + infinitive', 'a', 'da', 'fratello', 'apparecchiatura', 'apparecchio', 'Apparecchiatura', 'pescaiolo', 'da', 'benché =      bene + che\nfinché =      fino + che\nfintantoché = fintanto + che\nperché =      per + che\ntalché =      tal + che\ntantoché =    tanto + che\n', '"I love you to death"', '"you mean something very profound to me"', 'amo', '"ti voglio bene da morire"', 'i', 'dileggiatorio', 'dileggiatore']

In [35]:
print([el for el in python_code if len(el.split('\n')) > 1][0])


"OK"
"Capito"
"Chiaro"
"Va bene"


In [41]:
parsed_python_code = {
    'stackoverflow':20000000,
    'iot':307,
    'cs':18808,
    'datascience':10243,
    'opensource':713,
    'ai':736,
    'crypto':11373,
    'devops':1732,
    'engineering':368,
    'opendata':1219,
    'softwareengineering':149471,
    'webapps':14066,
    'italian':348,
}

In [ ]: