In [1]:
import numpy as np
from nbminer.notebook_miner import NotebookMiner

# Load the per-homework lists of notebook file names.
# allow_pickle=True is required by NumPy >= 1.16.3 when the saved array has
# object dtype (e.g. per-homework lists of differing length); it is harmless
# for plain string arrays.
# NOTE(review): unpickling executes arbitrary code -- only load a file you
# produced yourself or otherwise trust.
hw_filenames = np.load('homework_file_names.npy', allow_pickle=True)

# One NotebookMiner per file, grouped by homework in the same order as the file.
hw_notebooks = [
    [NotebookMiner(notebook_path) for notebook_path in homework_files]
    for homework_files in hw_filenames
]

In [2]:
from nbminer.stats.multiple_summary import MultipleSummary
# Build one MultipleSummary for each of the first five homework groups.
summaries = []
for hw_index in range(5):
    summaries.append(MultipleSummary(hw_notebooks[hw_index]))

In [3]:
# Report the average number of cells for each of the five homework summaries.
for hw_index in range(5):
    mean_cells = summaries[hw_index].average_number_of_cells()
    print("Average number of cells, hw ", str(hw_index), ": ", mean_cells)


Average number of cells, hw  0 :  281.81451612903226
Average number of cells, hw  1 :  60.75
Average number of cells, hw  2 :  49.4375
Average number of cells, hw  3 :  75.34782608695652
Average number of cells, hw  4 :  54.36974789915966

In [4]:
# Number of notebooks in the smallest homework group.
min_length = np.min(list(map(len, hw_notebooks)))

In [5]:
# Keep only the first min_length notebooks of every homework group so that
# all groups end up with the same number of notebooks.
even_notebooks = []
for homework_group in hw_notebooks:
    even_notebooks.append(homework_group[:min_length])

In [6]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
# Seed the corpus with the first homework, then append homeworks 2-5, each
# under its own label ('hw1' ... 'hw5').
a = Features(even_notebooks[0], 'hw1')
for hw_number in range(2, 6):
    a.add_notebooks(even_notebooks[hw_number - 1], 'hw' + str(hw_number))

# Pipeline stages. Each keeps its own name because later cells read `fe`
# (feature dataframe) and `ke` (template examples) directly.
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters=50)
ci = CorpusIdentifier()
# NOTE(review): ASTGraphReducer is imported above but is not a stage here.

# Run the stages in order; `a` is rebound to the value returned by transform.
pipeline_stages = [gastf, rbn, gi, fe, ke, ci]
pipe = Pipeline(pipeline_stages)
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x7f7dbe482160>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x7f7dd3e673c8>
<nbminer.preprocess.get_imports.GetImports object at 0x7f7dbe47cc50>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x7f7dbe47c9b0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x7f7dbe47c2b0>
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x7f7dbe47ce80>

In [22]:
# Print the length of the 'dataframe' feature held by the FeatureEncoding
# stage's corpus. The feature is fetched once; the earlier bare `.columns`
# expression was dropped because only a cell's last expression is displayed,
# so its value was silently discarded.
feature_frame = fe.corpus.get_feature('dataframe')
print(len(feature_frame))


163839

In [7]:
# Count how often every template occurs in each homework group.
#   template_counter[hw_label][template] -> number of segments with that template
#   all_templates                        -> every template seen in any group
key_names = ['hw1','hw2','hw3','hw4','hw5']
template_counter = {key: {} for key in key_names}
all_templates = set()
for i in range(a.get_number_notebooks()):
    # The notebook's 'import_name' feature is used as the group key, so it is
    # expected to be one of key_names (anything else raises KeyError below).
    group = a.get_notebook(i).get_feature('import_name')
    for seg in a.get_notebook_segments(i):
        templ = seg.get_feature('template')
        # Segments without a template are skipped. Identity comparison is the
        # correct test for None (PEP 8) and cannot be hijacked by a custom
        # __eq__/__ne__ on the feature value.
        if templ is None:
            continue
        all_templates.add(templ)
        group_counts = template_counter[group]
        group_counts[templ] = group_counts.get(templ, 0) + 1

In [8]:
# For every template build a tuple
#   (total occurrences, per-homework share of that total, template name)
# with the shares ordered like key_names.
percentages = []
for template in all_templates:
    # A template that a homework never used counts as 0 for that homework.
    counts = np.array([template_counter[hw].get(template, 0) for hw in key_names])
    total = np.sum(counts)
    percentages.append((total, counts / total, template))

In [9]:
# Print one (total, per-homework shares, template name) tuple per line.
for total, shares, template in percentages:
    print((total, shares, template))


(5417, array([0.05279675, 0.13033044, 0.17998892, 0.27856747, 0.35831641]), 'template_10')
(4589, array([0.05251689, 0.11810852, 0.16234474, 0.31183264, 0.35519721]), 'template_28')
(1314, array([0.13622527, 0.17808219, 0.20624049, 0.21765601, 0.26179604]), 'template_26')
(746, array([0.05093834, 0.08042895, 0.13270777, 0.29088472, 0.44504021]), 'template_44')
(21, array([0.        , 0.14285714, 0.19047619, 0.33333333, 0.33333333]), 'template_24')
(12, array([0. , 0. , 0. , 0.5, 0.5]), 'template_7')
(1170, array([0.14102564, 0.17094017, 0.1982906 , 0.23333333, 0.25641026]), 'template_27')
(393, array([0.02035623, 0.16793893, 0.21882952, 0.2697201 , 0.32315522]), 'template_41')
(1501, array([0.14257162, 0.18720853, 0.20652898, 0.22984677, 0.2338441 ]), 'template_42')
(2056, array([0.12645914, 0.14688716, 0.18774319, 0.25875486, 0.28015564]), 'template_29')
(640, array([0.1328125, 0.1546875, 0.209375 , 0.228125 , 0.275    ]), 'template_46')
(626, array([0.03194888, 0.12300319, 0.13578275, 0.31948882, 0.38977636]), 'template_35')
(1793, array([0.14445064, 0.17177914, 0.19743447, 0.23870608, 0.24762967]), 'template_14')
(1033, array([0.15876089, 0.1732817 , 0.18683446, 0.22362052, 0.25750242]), 'template_6')
(12, array([0. , 0. , 0. , 0.5, 0.5]), 'template_40')
(15082, array([0.15813553, 0.18107678, 0.20143217, 0.22092561, 0.23842992]), 'template_8')
(1098, array([0.10200364, 0.16848816, 0.23224044, 0.24499089, 0.25227687]), 'template_9')
(1905, array([0.12965879, 0.15170604, 0.1832021 , 0.24776903, 0.28766404]), 'template_19')
(1327, array([0.1507159 , 0.16880181, 0.19065561, 0.23662396, 0.25320271]), 'template_43')
(926, array([0.0712743 , 0.14794816, 0.20302376, 0.28293737, 0.29481641]), 'template_33')
(770, array([0.1974026, 0.1974026, 0.1974026, 0.2038961, 0.2038961]), 'template_30')
(210, array([0.17619048, 0.19047619, 0.19047619, 0.21904762, 0.22380952]), 'template_21')
(784, array([0.00127551, 0.19005102, 0.25892857, 0.25892857, 0.29081633]), 'template_38')
(24, array([0.  , 0.25, 0.25, 0.25, 0.25]), 'template_4')
(2255, array([0.11086475, 0.17560976, 0.21197339, 0.23813747, 0.26341463]), 'template_20')
(620, array([0.00322581, 0.05322581, 0.07580645, 0.32419355, 0.54354839]), 'template_39')
(3798, array([0.12295945, 0.18404423, 0.1866772 , 0.25144813, 0.25487098]), 'template_5')
(2280, array([0.10877193, 0.15219298, 0.2       , 0.2377193 , 0.30131579]), 'template_36')
(1919, array([0.07347577, 0.12402293, 0.16102137, 0.31683168, 0.32464825]), 'template_15')
(750, array([0.2, 0.2, 0.2, 0.2, 0.2]), 'template_2')
(1061, array([0.10273327, 0.19227144, 0.19792648, 0.25259189, 0.25447691]), 'template_37')
(498, array([0.03012048, 0.12650602, 0.16465863, 0.29317269, 0.38554217]), 'template_18')
(2014, array([0.17378352, 0.19414101, 0.20258193, 0.21151936, 0.21797418]), 'template_13')
(55822, array([0.11377235, 0.15785891, 0.19859554, 0.2461216 , 0.28365161]), 'template_1')
(29840, array([0.07868633, 0.13223861, 0.17412869, 0.26893432, 0.34601206]), 'template_0')
(3192, array([0.15162907, 0.1566416 , 0.19548872, 0.22431078, 0.27192982]), 'template_11')
(563, array([0.06927176, 0.19893428, 0.22912966, 0.24866785, 0.25399645]), 'template_45')
(123, array([0.02439024, 0.24390244, 0.24390244, 0.24390244, 0.24390244]), 'template_22')
(1441, array([0.13046495, 0.15891742, 0.19639139, 0.24843858, 0.26578765]), 'template_17')
(110, array([0.02727273, 0.21818182, 0.23636364, 0.24545455, 0.27272727]), 'template_48')
(1750, array([0.09371429, 0.11942857, 0.22057143, 0.28114286, 0.28514286]), 'template_49')
(2565, array([0.05302144, 0.10877193, 0.16608187, 0.27563353, 0.39649123]), 'template_25')
(1225, array([0.0922449 , 0.1755102 , 0.22122449, 0.24571429, 0.26530612]), 'template_23')
(1162, array([0.10843373, 0.18244406, 0.22547332, 0.23407917, 0.24956971]), 'template_47')
(555, array([0.2, 0.2, 0.2, 0.2, 0.2]), 'template_31')
(1780, array([0.13988764, 0.19438202, 0.21123596, 0.2252809 , 0.22921348]), 'template_34')
(3275, array([0.09618321, 0.16977099, 0.21435115, 0.25648855, 0.26320611]), 'template_32')
(1357, array([0.19675755, 0.19749447, 0.20191599, 0.20191599, 0.20191599]), 'template_16')
(4, array([0.  , 0.25, 0.25, 0.25, 0.25]), 'template_12')
(431, array([0.12064965, 0.18561485, 0.19953596, 0.23201856, 0.26218097]), 'template_3')

In [15]:
import astor
# Show templates that occur more than 20 times and whose largest
# per-homework share exceeds 0.3, followed by three random example
# snippets of each, rendered back to source with astor.
# NOTE(review): the previous comment here referred to "group 2 (aka, final)",
# which nothing in this notebook defines -- confirm the intended reading.
for total, shares, template_name in percentages:
    if not (total > 20 and np.max(shares) > .3):
        continue
    print((total, shares, template_name))
    for _ in range(3):
        example_tree = ke.templates.get_random_example(template_name)
        print(astor.to_source(example_tree))


(5417, array([0.05279675, 0.13033044, 0.17998892, 0.27856747, 0.35831641]), 'template_10')
def construct_search_keys(row):
    var = []
    if var.University != None and pd.notnull(var.University):
        var += var(var.University)
    if var.Institution != None and pd.notnull(var.Institution):
        var += var(var.Institution)
    if var.Short != None and pd.notnull(var.Short):
        var += var(var.Short)
    return var

var = var('2007', 'B1')

for var in var:
    var.append(var(var))

(4589, array([0.05251689, 0.11810852, 0.16234474, 0.31183264, 0.35519721]), 'template_28')
print(100 * (var.size() / var.shape[0]))

print('-' * 100)

for var, var in var:
    print('patient', var)
    print('group', var)

(746, array([0.05093834, 0.08042895, 0.13270777, 0.29088472, 0.44504021]), 'template_44')
var = dict(list(var))

var = [var.replace(',', '') for var in list(var) if isinstance(var, str)]

var = list(var.canton.values)

(21, array([0.        , 0.14285714, 0.19047619, 0.33333333, 0.33333333]), 'template_24')
def get_key_and_value(url):
    """ Scrape the key and value
        @param url : initial url
        @return : the values and key
    """
    var = requests.get(var)
    var = BeautifulSoup(var.text, 'lxml')
    var = var.find_all('frame')
    var = var[1].attrs['src']
    var = 'http://isa.epfl.ch/imoniteur_ISAP/' + var
    var = requests.get(var)
    var = BeautifulSoup(var.text, 'lxml')
    var = var.contents[0]
    var = var.body
    var = var.find_all(['div'])
    var = var[1].find_all('option')
    var = var[1].find_all('input')
    var = var[1].find_all('select')
    var = []
    var = []
    var = []
    var = []
    var = []
    var = 0
    var = []
    var = []
    for var in var:
        if var['value']:
            var.append(var['name'])
            var.append(var['value'])
        elif var['name'].startswith('z'):
            continue
    for var in var:
        for var in var:
            var = int(var['value'])
            if var[0].startswith('2'):
                var.append(var)
            elif 'Bachelor' in var:
                var.append(var)
            elif 'Master semestre' in var:
                var.append(var)
            elif 'Projet Master' in var:
                var.append(var)
            elif "d'automne" in var or 'de printemps' in var:
                var.append(var)
            elif 'Informatique' in var:
                var = var
            else:
                continue
    var = [var[0], var[6]]
    var = [var[0], var[1], var[2]]
    for var in var:
        if var['name']:
            var.append(var['name'])
    var.append(var)
    var.append(var[::-1])
    var.append(var)
    var.append(var)
    var.append(var)
    var.append(var)
    return var, var, var

def get_key_and_value(url):
    """ Scrape the key and value
        @param url : initial url
        @return : the values and key
    """
    var = requests.get(var)
    var = BeautifulSoup(var.text, 'lxml')
    var = var.find_all('frame')
    var = var[1].attrs['src']
    var = 'http://isa.epfl.ch/imoniteur_ISAP/' + var
    var = requests.get(var)
    var = BeautifulSoup(var.text, 'lxml')
    var = var.contents[0]
    var = var.body
    var = var.find_all(['div'])
    var = var[1].find_all('option')
    var = var[1].find_all('input')
    var = var[1].find_all('select')
    var = []
    var = []
    var = []
    var = []
    var = []
    var = 0
    var = []
    var = []
    for var in var:
        if var['value']:
            var.append(var['name'])
            var.append(var['value'])
        elif var['name'].startswith('z'):
            continue
    for var in var:
        for var in var:
            var = int(var['value'])
            if var[0].startswith('2'):
                var.append(var)
            elif 'Bachelor' in var:
                var.append(var)
            elif 'Master semestre' in var:
                var.append(var)
            elif 'Projet Master' in var:
                var.append(var)
            elif "d'automne" in var or 'de printemps' in var:
                var.append(var)
            elif 'Informatique' in var:
                var = var
            else:
                continue
    var = [var[0], var[6]]
    var = [var[0], var[1], var[2]]
    for var in var:
        if var['name']:
            var.append(var['name'])
    var.append(var)
    var.append(var[::-1])
    var.append(var)
    var.append(var)
    var.append(var)
    var.append(var)
    return var, var, var

for var in var:
    var['ww_x_PERIODE_ACAD'] = var
    for var in var:
        var['ww_x_PERIODE_PEDAGO'] = var
        var = requests.get(var, params=var)
        var = var.text
        var = BeautifulSoup(var, 'html.parser')
        var = var.findAll('table')[0]
        var = var.findAll('tr')[2:]
        var = re.search(
            'Informatique, (\\d+-\\d+), Bachelor semestre (\\d)\n\xa0\\(.*\\)',
            var.findAll('tr')[0].findAll('th')[0].get_text())
        var = var.group(1)
        var = var.group(2)
        for var in var:
            var = var.findAll('td')
            var['Sciper'].append(int(var[10].get_text()))
            var['Civilite'].append(var[0].get_text())
            var['Nom'].append(var[1].get_text())
            var['BA' + var].append(var)
            if var is '1':
                var['BA5'].append(None)
                var['BA6'].append(None)
            elif var is '5':
                var['BA1'].append(None)
                var['BA6'].append(None)
            elif var is '6':
                var['BA1'].append(None)
                var['BA5'].append(None)

(393, array([0.02035623, 0.16793893, 0.21882952, 0.2697201 , 0.32315522]), 'template_41')
for var in var.keys():
    var = int(var[var])
    folium.Marker([var[var].latitude, var[var].longitude], popup=str(var +
        ' Total granted amount in CHF : ' + str(var))).add_to(var)

for var in range(2007, 2017):
    var.update(var(var[str(var) + '-' + str(var + 1)]))
    var.update(var(var[str(var) + '-' + str(var + 1)]))
    var.update(var(var[str(var) + '-' + str(var + 1)]))

def lastB6(sci):
    var = 2007
    var = 0
    while var < 2017:
        var = str(var) + '-' + str(var + 1)
        if str(var) in var(var[var]).keys():
            var = var
        var = var + 1
    return var

(626, array([0.03194888, 0.12300319, 0.13578275, 0.31948882, 0.38977636]), 'template_35')
plt.ylabel('Mean skin color')

plt.ylabel('Months')

plt.ylabel('Frequency', fontsize=18)

(620, array([0.00322581, 0.05322581, 0.07580645, 0.32419355, 0.54354839]), 'template_39')
def plot_wordcloud(cloud, figsize=(16, 12)):
    plt.figure(figsize=var)
    plt.imshow(var)
    plt.axis('off')
    plt.show()

plt.show()

plt.show()

(2280, array([0.10877193, 0.15219298, 0.2       , 0.2377193 , 0.30131579]), 'template_36')
var = var.apply(pd.to_numeric)

var.st_time.apply(lambda d: datetime.strptime(var, '%m/%d/%y %H:%M'))

var.seg_length.apply(np.log).hist(bins=500)

(1919, array([0.07347577, 0.12402293, 0.16102137, 0.31683168, 0.32464825]), 'template_15')
var.drop([89525, 89526])

var.drop(['Description'], axis=1, inplace=True)

var = var.drop('index', axis=1)

(498, array([0.03012048, 0.12650602, 0.16465863, 0.29317269, 0.38554217]), 'template_18')
def get_topics_from_models(list_of_LdaModels, num_words):
    var = list([])
    for var in var:
        var = list([])
        var = var.show_topics(num_topics=-1, num_words=var, formatted=False)
        for var in range(0, len(var)):
            var = ''
            for var, var in var[var][1]:
                var = var + ' ' + str(var)
            var.append(var)
        var.append(var)
    return var

def fit_kmeans_on_features(sets, data_in, data_out, iterations=20):
    var = []
    for var in var:
        if var:
            var = []
            for _ in range(var):
                var = list(var)
                var = preprocessing.scale(var[var])
                var = cluster.KMeans(n_clusters=2, init='k-means++', n_init=10)
                var = var.fit(var)
                var = metrics.silhouette_score(var, var.labels_)
                var = max(metrics.accuracy_score(var, var.labels_), metrics
                    .accuracy_score(~var, var.labels_))
                var.append((var, var))
            var.append((var, var))
    return var

for var in range(var.shape[1]):
    var = var.iloc[:, 0:var + 1]
    var = KMeans(n_clusters=2, init='k-means++', random_state=0).fit(var)
    var.append(silhouette_score(var, var.labels_))
    var = np.mean([int(np.abs(var) < 0.5) for var in var.labels_ - var])
    if var > 0.5:
        var = var.labels_
    else:
        var = 1 - var.labels_
    var = var[var == 0]
    var = var[var == 1]
    var.append(np.abs(var.mean() - var.mean()))
    var = np.mean([int(np.abs(var) < 0.5) for var in var - var])
    var.append(var)

(29840, array([0.07868633, 0.13223861, 0.17412869, 0.26893432, 0.34601206]), 'template_0')
stats.ks_2samp(var, var)

var = var.rename(columns={(0): 'Polarity'})

plt.tick_params(axis='x', labelsize=18)

(2565, array([0.05302144, 0.10877193, 0.16608187, 0.27563353, 0.39649123]), 'template_25')
var['month'] = ['Jan'] * len(var)

var['month'] = ['Jan'] * len(var)

print('Total number of referees : {}.'.format(len(var)))


In [ ]: