In [24]:
import numpy as np
from nbminer.notebook_miner import NotebookMiner

hw_filenames = np.load('homework_file_names.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp] for temp in hw_filenames]

In [25]:
hw_python = []
for corp in hw_notebooks:
    hw_python.append([nb.get_all_python() for nb in corp])

In [26]:
all_nbs = []
for corp in hw_python:
    all_nbs.extend(corp)

Tf-idf conversion


In [27]:
import sklearn
from sklearn.feature_extraction import text
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
tfidf.max_features = 1000
tfidf.fit(all_nbs)


Out[27]:
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
hw_featurized = [tfidf.transform(corp) for corp in hw_python]
min_val = min([el.shape[0] for el in hw_featurized])
X = np.array([el[:min_val,:].todense() for el in hw_featurized])
y = np.array([[i for num in range(min_val)] for i, el in enumerate(hw_featurized)])
X = np.concatenate(X,axis=0)
y = y.reshape((-1))

Cross Validation


In [29]:
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
p = np.random.permutation(len(X))
X = X[p]
y = y[p]
clf = sklearn.ensemble.RandomForestClassifier()
scores = cross_val_score(clf, X, y, cv=10)
print(scores)
print('Mean', np.mean(scores))


[1.    0.975 0.95  0.925 1.    1.    1.    1.    1.    0.975]
Mean 0.9824999999999999

Train and test set, with feature importances for random forest


In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
clf.fit(X_train, y_train)


Out[31]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [32]:
for pair in reversed(sorted([el for el in zip(clf.feature_importances_, tfidf.get_feature_names())])):
    print (pair)


(0.06557051385716282, 'hillary')
(0.06515493733409342, 'month')
(0.049741059713574606, 'imoniteur_isap')
(0.04624591304551501, 'microbiome')
(0.034466225800557845, 'students')
(0.030517614560347777, 'canton')
(0.026666274625110252, 'emails')
(0.02401429357686349, 'nltk')
(0.022578819055862048, 'university')
(0.02219599033544997, 'topojson')
(0.021432091486187216, 'crowdstormingdatajuly1st')
(0.020933974696863318, 'sklearn')
(0.020653459141712936, 'in')
(0.020629464659728883, 'geo_path')
(0.02038092953511923, 'words')
(0.01954363822809963, 'rater1')
(0.019253132191487183, 'read_excel')
(0.018536199281182865, 'ww_i_reportmodelxsl')
(0.01809492567889883, 'fit')
(0.01784736606326528, 'bs4')
(0.017427468373461097, 'player')
(0.016442979545612926, 'student')
(0.016350514092939104, 'isin')
(0.015619583058925076, 'cluster')
(0.015533195127383432, 'date')
(0.01483233663939031, '166')
(0.014636036605065319, 'madame')
(0.01378564496747436, 'objects')
(0.012554389782998355, 'metrics')
(0.011308148340293662, 'vd')
(0.011011843145514635, 'location')
(0.010631536783427471, 'import')
(0.00972009533611413, 'fr')
(0.009485892914760239, 'predict')
(0.008520384373312147, 'approved')
(0.008497564522687137, 'xls')
(0.008285305950440806, 'foo')
(0.007891663348809705, 'drop')
(0.007253622281940568, 'wordcloud')
(0.006762545717474139, '1e')
(0.0064301346112195625, 'leaguecountry')
(0.005457144370796609, 'choropleth')
(0.004657405017209589, 'matplotlib')
(0.004619923646383419, 'pd')
(0.004156781579204336, 'guinea')
(0.004118137762889683, 'amount')
(0.004003145133960935, 'ttest_ind')
(0.003944778989171257, 'for')
(0.003919655462284865, 'year')
(0.0035434238695811566, 'width')
(0.0035006414634748775, 'array')
(0.003486691167480565, 'folium')
(0.003472601539994056, 'fill_color')
(0.003429639280496588, 'sep')
(0.0033214192100705963, 'master')
(0.003317616160066591, 'gedpublicreports')
(0.0031230280671294013, '86')
(0.0030992810377604998, 'so')
(0.0029935929803956984, '133685247')
(0.002974994647043324, 'id')
(0.002958218625801195, 'label')
(0.00285309884427207, 'mb')
(0.0028133416344691643, 'save')
(0.00272887952626172, 'refnum')
(0.0027018935443314804, 'query')
(0.002679130940967613, 'birthday')
(0.0026684214438117823, 'scipy')
(0.00262566432017275, 'true')
(0.0023205214587416296, 'coding')
(0.002286068859659866, '46')
(0.0022313727092733225, 'classes')
(0.0022118165421057033, 'ascending')
(0.0021683818526847575, 'clinton')
(0.0020866446590862785, 'rater2')
(0.0020424020424020437, '87')
(0.0020347331117465477, 'monsieur')
(0.0020193663755374005, 'drop_duplicates')
(0.001960068827201707, 'utf')
(0.00187798019317765, '58')
(0.0016999713947121087, 'stopwords')
(0.0016731961434190228, 'beautifulsoup')
(0.0015195648210568258, 'text')
(0.001518360515724952, '44')
(0.001518055142449444, 'imshow')
(0.0015160797575579348, 'ge')
(0.0014986535534480742, 'scores')
(0.0014526234199590444, '82')
(0.0014510629035768702, 'one')
(0.0014429658576067136, 'train_test_split')
(0.001441906735203849, '28')
(0.0014262882833322274, 'cases')
(0.0014085021705644786, 'matrix')
(0.001370432909967452, 'data')
(0.001355133434849782, 'playershort')
(0.0012616992696283962, 'token')
(0.0012596931535677103, 'female')
(0.0011667545428098623, 'csv')
(0.0011563245906086247, '107')
(0.0011208602602497417, 'lambda')
(0.001008210520375821, 'than')
(0.0010058702982503405, 'def')
(0.0009443009959424567, 'address')
(0.0009386228642663341, '71')
(0.0008799589352496884, 'plot')
(0.0008401919838683141, 'use')
(0.0007840831128099579, 'means')
(0.0007693630034969943, 'print')
(0.0007688031224938831, 'titanic')
(0.0007630435901200908, 'pycountry')
(0.0007594772840056672, 'on')
(0.0007581685014543729, '25')
(0.0007565806756685756, 'to_numeric')
(0.0007523959107282258, 'how')
(0.0007445448420938664, 'liberia')
(0.0007433671597994863, '43')
(0.0007406452896983344, 'place')
(0.0007238577091788548, 'sum')
(0.0006978005653274891, '100')
(0.0006969627669421846, 'validation')
(0.0006720712395513926, 'notnull')
(0.0006270360300781345, 'unique')
(0.0006049382283851863, 'file')
(0.0005929331834987532, '91')
(0.0005884516366311144, 'xticks')
(0.0005884516366311144, 'sciper')
(0.0005884516366311144, 'first')
(0.0005878462781982512, '47')
(0.0005873283899860509, '24')
(0.0005866392901664588, 'display')
(0.0005780198624405958, 'confirmed')
(0.0005563542746330544, 'split')
(0.0005227861232958398, '21')
(0.0005225300250651122, 'gensim')
(0.0005216611647715695, 'cell')
(0.0004966468171310477, '13')
(0.0004929126305360775, 'hist')
(0.00039345874830935704, 'estimator')
(0.00039208959247187986, 'epfl')
(0.00039208959247187986, 'enumerate')
(0.0003915522599907006, 'row')
(0.00039124587357867714, 'subplots')
(0.0001445122100570702, 'groupby')
(0.0, 'zoom_start')
(0.0, 'zip')
(0.0, 'zh')
(0.0, 'your')
(0.0, 'ylim')
(0.0, 'ylabel')
(0.0, 'yellowreds')
(0.0, 'yellowcards')
(0.0, 'years')
(0.0, 'y_train')
(0.0, 'y_test')
(0.0, 'y_pred')
(0.0, 'xlabel')
(0.0, 'x_train')
(0.0, 'x_test')
(0.0, 'x3b')
(0.0, 'x2b')
(0.0, 'ww_x_unite_acad')
(0.0, 'ww_x_periode_pedago')
(0.0, 'ww_x_periode_acad')
(0.0, 'ww_x_hiverete')
(0.0, 'ww_x_gps')
(0.0, 'ww_i_reportmodel')
(0.0, 'write')
(0.0, 'wordnetlemmatizer')
(0.0, 'word_tokenize')
(0.0, 'word')
(0.0, 'women')
(0.0, 'womacto01chn2006')
(0.0, 'without')
(0.0, 'with')
(0.0, 'will')
(0.0, 'white')
(0.0, 'which')
(0.0, 'where')
(0.0, 'when')
(0.0, 'weight')
(0.0, 'week')
(0.0, 'we')
(0.0, 'wc')
(0.0, 'was')
(0.0, 'warn')
(0.0, 'want')
(0.0, 'vs')
(0.0, 'victories')
(0.0, 'vessels5')
(0.0, 'vessels')
(0.0, 'variable')
(0.0, 'var_name')
(0.0, 'values')
(0.0, 'value_counts')
(0.0, 'value')
(0.0, 'vals')
(0.0, 'val')
(0.0, 'vader')
(0.0, 'utils')
(0.0, 'utc')
(0.0, 'using')
(0.0, 'used')
(0.0, 'usecols')
(0.0, 'us')
(0.0, 'url')
(0.0, 'update')
(0.0, 'unstack')
(0.0, 'universities')
(0.0, 'uni')
(0.0, 'tz_localize')
(0.0, 'type')
(0.0, 'txt')
(0.0, 'twstrs_wide')
(0.0, 'twstrs')
(0.0, 'two')
(0.0, 'try')
(0.0, 'tree')
(0.0, 'treatment_map')
(0.0, 'treatment')
(0.0, 'treat')
(0.0, 'transit_segments')
(0.0, 'transform')
(0.0, 'training')
(0.0, 'train_sizes')
(0.0, 'train_scores_mean')
(0.0, 'train_scores')
(0.0, 'train')
(0.0, 'tr')
(0.0, 'totals')
(0.0, 'total')
(0.0, 'topics')
(0.0, 'topic')
(0.0, 'top5')
(0.0, 'top3segments')
(0.0, 'top')
(0.0, 'tolist')
(0.0, 'tokens')
(0.0, 'tokenizer')
(0.0, 'tokenize')
(0.0, 'to_datetime')
(0.0, 'to_csv')
(0.0, 'to')
(0.0, 'tmp')
(0.0, 'title')
(0.0, 'time')
(0.0, 'ties')
(0.0, 'tick_params')
(0.0, 'ti')
(0.0, 'thresh')
(0.0, 'this')
(0.0, 'there')
(0.0, 'then')
(0.0, 'them')
(0.0, 'their')
(0.0, 'the')
(0.0, 'that')
(0.0, 'th')
(0.0, 'texts')
(0.0, 'test_size')
(0.0, 'test_scores_mean')
(0.0, 'test_scores')
(0.0, 'test')
(0.0, 'temp_df')
(0.0, 'temp')
(0.0, 'team')
(0.0, 'td')
(0.0, 'taxon')
(0.0, 'target')
(0.0, 'take')
(0.0, 'tail')
(0.0, 'tag')
(0.0, 'table')
(0.0, 'system')
(0.0, 'switzerland')
(0.0, 'swiss_map')
(0.0, 'swiss')
(0.0, 'survived')
(0.0, 'subset')
(0.0, 'subject')
(0.0, 'sub')
(0.0, 'strptime')
(0.0, 'string')
(0.0, 'str')
(0.0, 'stop_words')
(0.0, 'stop')
(0.0, 'stem')
(0.0, 'std')
(0.0, 'status')
(0.0, 'stats')
(0.0, 'state')
(0.0, 'stat_range')
(0.0, 'start')
(0.0, 'stacked')
(0.0, 'st_time')
(0.0, 'spécialisation')
(0.0, 'specialization')
(0.0, 'specialisation')
(0.0, 'spec')
(0.0, 'spe')
(0.0, 'soup')
(0.0, 'sorted')
(0.0, 'sort_values')
(0.0, 'sort_index')
(0.0, 'sort')
(0.0, 'some')
(0.0, 'sns')
(0.0, 'slugging')
(0.0, 'sl_data')
(0.0, 'sl')
(0.0, 'skintone')
(0.0, 'skin')
(0.0, 'size')
(0.0, 'site')
(0.0, 'since')
(0.0, 'silhouette_score')
(0.0, 'silhouette')
(0.0, 'show')
(0.0, 'sheetname')
(0.0, 'sheet')
(0.0, 'shape')
(0.0, 'sfn')
(0.0, 'sex')
(0.0, 'set_ylabel')
(0.0, 'set_xlabel')
(0.0, 'set_value')
(0.0, 'set_title')
(0.0, 'set_index')
(0.0, 'set_context')
(0.0, 'set_agree2')
(0.0, 'set')
(0.0, 'series')
(0.0, 'sentiments')
(0.0, 'sentimentintensityanalyzer')
(0.0, 'sentiment')
(0.0, 'sentence')
(0.0, 'senderpersonid')
(0.0, 'semestre')
(0.0, 'semesters')
(0.0, 'semester')
(0.0, 'sem')
(0.0, 'select')
(0.0, 'seiat')
(0.0, 'segments_merged')
(0.0, 'segments')
(0.0, 'seg_length')
(0.0, 'seexp')
(0.0, 'see')
(0.0, 'section')
(0.0, 'season')
(0.0, 'search')
(0.0, 'seaborn')
(0.0, 'scoring')
(0.0, 'score')
(0.0, 'scatter')
(0.0, 'scale')
(0.0, 'sb')
(0.0, 'sample')
(0.0, 'same')
(0.0, 'run_line_magic')
(0.0, 'rows')
(0.0, 'round')
(0.0, 'rotation')
(0.0, 'right_on')
(0.0, 'right_index')
(0.0, 'right')
(0.0, 'rfc')
(0.0, 'rf')
(0.0, 'returns')
(0.0, 'return')
(0.0, 'results')
(0.0, 'result')
(0.0, 'response')
(0.0, 'reset_index')
(0.0, 'res')
(0.0, 'requests')
(0.0, 'request')
(0.0, 'replace')
(0.0, 'rename')
(0.0, 'remove')
(0.0, 'reindex')
(0.0, 'registered')
(0.0, 'refcountry')
(0.0, 'redcards')
(0.0, 'red')
(0.0, 'receivers')
(0.0, 'read_pickle')
(0.0, 'read_csv')
(0.0, 'read')
(0.0, 're')
(0.0, 'rawtext')
(0.0, 'raw')
(0.0, 'rater')
(0.0, 'rank')
(0.0, 'range')
(0.0, 'randomforestclassifier')
(0.0, 'random_state')
(0.0, 'random')
(0.0, 'quantiles')
(0.0, 'qcut')
(0.0, 'pyplot')
(0.0, 'prénom')
(0.0, 'proteobacteria')
(0.0, 'projet')
(0.0, 'project')
(0.0, 'printemps')
(0.0, 'print_topics')
(0.0, 'preprocessing')
(0.0, 'positive')
(0.0, 'position')
(0.0, 'pos_words')
(0.0, 'pos')
(0.0, 'polarity_scores')
(0.0, 'polarity')
(0.0, 'pm')
(0.0, 'plt')
(0.0, 'players')
(0.0, 'player_unique')
(0.0, 'player_id')
(0.0, 'placebo')
(0.0, 'pivot')
(0.0, 'pickle')
(0.0, 'phylum')
(0.0, 'photoid')
(0.0, 'persons')
(0.0, 'personid')
(0.0, 'person')
(0.0, 'periode_pedago')
(0.0, 'period')
(0.0, 'percentage')
(0.0, 'per')
(0.0, 'people')
(0.0, 'pclass')
(0.0, 'payload')
(0.0, 'patient2')
(0.0, 'patient1')
(0.0, 'patient')
(0.0, 'path')
(0.0, 'passengers')
(0.0, 'partition')
(0.0, 'part')
(0.0, 'parser')
(0.0, 'parse_dates')
(0.0, 'parse')
(0.0, 'parch')
(0.0, 'params')
(0.0, 'parameters')
(0.0, 'parameter')
(0.0, 'param')
(0.0, 'pandas')
(0.0, 'page')
(0.0, 'p3')
(0.0, 'over')
(0.0, 'output')
(0.0, 'outer')
(0.0, 'out')
(0.0, 'our')
(0.0, 'os')
(0.0, 'order')
(0.0, 'or')
(0.0, 'options')
(0.0, 'option')
(0.0, 'opinion_lexicon')
(0.0, 'open')
(0.0, 'only')
(0.0, 'old')
(0.0, 'ohio')
(0.0, 'off')
(0.0, 'of')
(0.0, 'obs')
(0.0, 'obp')
(0.0, 'object')
(0.0, 'nx')
(0.0, 'numpy')
(0.0, 'number')
(0.0, 'num_topics')
(0.0, 'num')
(0.0, 'null')
(0.0, 'npo')
(0.0, 'np')
(0.0, 'now')
(0.0, 'notebook')
(0.0, 'not')
(0.0, 'normalize')
(0.0, 'none')
(0.0, 'nom')
(0.0, 'nodes')
(0.0, 'no')
(0.0, 'niat')
(0.0, 'nexp')
(0.0, 'new_order')
(0.0, 'new')
(0.0, 'negative')
(0.0, 'neg_words')
(0.0, 'neg')
(0.0, 'need')
(0.0, 'ne')
(0.0, 'national')
(0.0, 'nan')
(0.0, 'names')
(0.0, 'name')
(0.0, 'na_values')
(0.0, 'na')
(0.0, 'n_jobs')
(0.0, 'n_estimators')
(0.0, 'n_clusters')
(0.0, 'my_age')
(0.0, 'most')
(0.0, 'more')
(0.0, 'months')
(0.0, 'models')
(0.0, 'model_selection')
(0.0, 'model')
(0.0, 'mode')
(0.0, 'mmsi')
(0.0, 'missing')
(0.0, 'minor')
(0.0, 'mineur')
(0.0, 'min_ab')
(0.0, 'min')
(0.0, 'mid2')
(0.0, 'mid1')
(0.0, 'microbiome_missing')
(0.0, 'method')
(0.0, 'metadata')
(0.0, 'merge')
(0.0, 'men')
(0.0, 'median')
(0.0, 'meaniat')
(0.0, 'meanexp')
(0.0, 'mean_tissue')
(0.0, 'mean')
(0.0, 'mb_class')
(0.0, 'mb2')
(0.0, 'mb1')
(0.0, 'max_features')
(0.0, 'max_depth')
(0.0, 'max')
(0.0, 'marker')
(0.0, 'maps')
(0.0, 'map')
(0.0, 'many')
(0.0, 'male')
(0.0, 'make')
(0.0, 'mails')
(0.0, 'mail')
(0.0, 'lower')
(0.0, 'log')
(0.0, 'loc')
(0.0, 'load')
(0.0, 'list')
(0.0, 'line')
(0.0, 'like')
(0.0, 'liberia_data')
(0.0, 'level')
(0.0, 'length')
(0.0, 'len')
(0.0, 'lemmatize')
(0.0, 'legend')
(0.0, 'left_on')
(0.0, 'left_index')
(0.0, 'left')
(0.0, 'le')
(0.0, 'ldamodel')
(0.0, 'lda')
(0.0, 'lat')
(0.0, 'last')
(0.0, 'language')
(0.0, 'lan')
(0.0, 'labelsize')
(0.0, 'labels_')
(0.0, 'labels')
(0.0, 'kmeans')
(0.0, 'kind')
(0.0, 'keys')
(0.0, 'key_on')
(0.0, 'key')
(0.0, 'keep')
(0.0, 'just')
(0.0, 'json')
(0.0, 'join')
(0.0, 'jan')
(0.0, 'ix')
(0.0, 'itertools')
(0.0, 'iterrows')
(0.0, 'items')
(0.0, 'item')
(0.0, 'it')
(0.0, 'isnull')
(0.0, 'isa')
(0.0, 'is_unique')
(0.0, 'is')
(0.0, 'ipython')
(0.0, 'into')
(0.0, 'int')
(0.0, 'institution')
(0.0, 'inplace')
(0.0, 'inner')
(0.0, 'inline')
(0.0, 'informatique')
(0.0, 'information')
(0.0, 'info')
(0.0, 'indices')
(0.0, 'indexing')
(0.0, 'index_col')
(0.0, 'index')
(0.0, 'ind')
(0.0, 'importances')
(0.0, 'image')
(0.0, 'iloc')
(0.0, 'ignore_index')
(0.0, 'ignore')
(0.0, 'if')
(0.0, 'idx')
(0.0, 'id_range')
(0.0, 'id2word')
(0.0, 'http')
(0.0, 'html')
(0.0, 'hr_total')
(0.0, 'hr2007')
(0.0, 'hr2006')
(0.0, 'hr')
(0.0, 'hits')
(0.0, 'here')
(0.0, 'height')
(0.0, 'header')
(0.0, 'head')
(0.0, 'hbp')
(0.0, 'have')
(0.0, 'has')
(0.0, 'guinea_data')
(0.0, 'groups')
(0.0, 'grouped_by_type')
(0.0, 'grouped')
(0.0, 'group')
(0.0, 'green')
(0.0, 'graph')
(0.0, 'grants')
(0.0, 'grant_data_refine')
(0.0, 'grant_data')
(0.0, 'gr')
(0.0, 'gonzalu01ari2006')
(0.0, 'goals')
(0.0, 'glob')
(0.0, 'gives')
(0.0, 'given')
(0.0, 'get_level_values')
(0.0, 'get_ipython')
(0.0, 'get_group')
(0.0, 'get_dummies')
(0.0, 'get')
(0.0, 'german')
(0.0, 'geonames')
(0.0, 'geo_all_copy')
(0.0, 'generate')
(0.0, 'gender')
(0.0, 'games')
(0.0, 'function')
(0.0, 'from')
(0.0, 'frequency')
(0.0, 'french')
(0.0, 'frame')
(0.0, 'found')
(0.0, 'format')
(0.0, 'forest')
(0.0, 'fontsize')
(0.0, 'float')
(0.0, 'firmicutes')
(0.0, 'findall')
(0.0, 'find_all')
(0.0, 'find')
(0.0, 'final')
(0.0, 'filtered')
(0.0, 'filter')
(0.0, 'fillna')
(0.0, 'fill_value')
(0.0, 'files')
(0.0, 'filename')
(0.0, 'figure')
(0.0, 'figsize')
(0.0, 'fig')
(0.0, 'field')
(0.0, 'features')
(0.0, 'feature_importances_')
(0.0, 'feature')
(0.0, 'fare')
(0.0, 'false')
(0.0, 'f1')
(0.0, 'extractedsubject')
(0.0, 'extractedbodytext')
(0.0, 'extract')
(0.0, 'extra_bases')
(0.0, 'except')
(0.0, 'errors')
(0.0, 'error')
(0.0, 'entry')
(0.0, 'entries')
(0.0, 'ensemble')
(0.0, 'english')
(0.0, 'endswith')
(0.0, 'end')
(0.0, 'empty')
(0.0, 'embarked')
(0.0, 'email')
(0.0, 'else')
(0.0, 'elif')
(0.0, 'ebola')
(0.0, 'each')
(0.0, 'dyads')
(0.0, 'duration')
(0.0, 'dtypes')
(0.0, 'dtype')
(0.0, 'dt')
(0.0, 'dropna')
(0.0, 'don')
(0.0, 'do')
(0.0, 'distribution')
(0.0, 'different')
(0.0, 'difference')
(0.0, 'diff')
(0.0, 'dictionary')
(0.0, 'dict')
(0.0, 'df_players')
(0.0, 'df_emails')
(0.0, 'df2')
(0.0, 'df1')
(0.0, 'df')
(0.0, 'description')
(0.0, 'describe')
(0.0, 'del')
(0.0, 'define')
(0.0, 'defeats')
(0.0, 'default')
(0.0, 'deaths')
(0.0, 'de')
(0.0, 'day')
(0.0, 'datetime')
(0.0, 'dataset')
(0.0, 'dataframe')
(0.0, 'database')
(0.0, 'data_nomonth')
(0.0, 'data_clean')
(0.0, 'data_chunks')
(0.0, 'cv')
(0.0, 'cut')
(0.0, 'cs')
(0.0, 'cross_validation')
(0.0, 'cross_val_score')
(0.0, 'cross')
(0.0, 'create')
(0.0, 'counts')
(0.0, 'country')
(0.0, 'countries_dict')
(0.0, 'countries_df')
(0.0, 'countries')
(0.0, 'counter')
(0.0, 'count')
(0.0, 'corresponding')
(0.0, 'corr')
(0.0, 'corpus')
(0.0, 'corpora')
(0.0, 'copy')
(0.0, 'continue')
(0.0, 'content')
(0.0, 'contains')
(0.0, 'confusion_matrix')
(0.0, 'concatenate')
(0.0, 'concat')
(0.0, 'compute')
(0.0, 'compound')
(0.0, 'community')
(0.0, 'com')
(0.0, 'columns')
(0.0, 'column')
(0.0, 'cols')
(0.0, 'colors')
(0.0, 'colorado')
(0.0, 'color')
(0.0, 'col')
(0.0, 'codes')
(0.0, 'code')
(0.0, 'cmap')
(0.0, 'cm')
(0.0, 'club')
(0.0, 'clf')
(0.0, 'cleaneddataset')
(0.0, 'classifier')
(0.0, 'class_index')
(0.0, 'class')
(0.0, 'civilité')
(0.0, 'chunksize')
(0.0, 'chunks')
(0.0, 'chunk')
(0.0, 'children')
(0.0, 'check')
(0.0, 'chained_assignment')
(0.0, 'ch')
(0.0, 'center')
(0.0, 'cdystonia_wide')
(0.0, 'cdystonia_grouped')
(0.0, 'cdystonia2')
(0.0, 'cdystonia')
(0.0, 'category')
(0.0, 'categories')
(0.0, 'categorical')
(0.0, 'cat')
(0.0, 'case')
(0.0, 'cantons')
(0.0, 'can')
(0.0, 'by')
(0.0, 'but')
(0.0, 'bs')
(0.0, 'both')
(0.0, 'body')
(0.0, 'blue')
(0.0, 'black')
(0.0, 'bins')
(0.0, 'between')
(0.0, 'best')
(0.0, 'because')
(0.0, 'be')
(0.0, 'bb')
(0.0, 'bases')
(0.0, 'baseball_pickle')
(0.0, 'baseball_newind')
(0.0, 'baseball_h')
(0.0, 'baseball')
(0.0, 'barcode')
(0.0, 'bar')
(0.0, 'bacteroidetes')
(0.0, 'bacteria_dict')
(0.0, 'bacteria2')
(0.0, 'bacteria')
(0.0, 'bachelor')
(0.0, 'axis')
(0.0, 'axes')
(0.0, 'ax')
(0.0, 'average')
(0.0, 'automne')
(0.0, 'at')
(0.0, 'astype')
(0.0, 'as')
(0.0, 'are')
(0.0, 'arange')
(0.0, 'apply')
(0.0, 'append')
(0.0, 'api')
(0.0, 'any')
(0.0, 'answer')
(0.0, 'and')
(0.0, 'analysis')
(0.0, 'an')
(0.0, 'also')
(0.0, 'already')
(0.0, 'alpha_3')
(0.0, 'alpha_2')
(0.0, 'alpha')
(0.0, 'all_data')
(0.0, 'all')
(0.0, 'align')
(0.0, 'ais')
(0.0, 'agg')
(0.0, 'age')
(0.0, 'after')
(0.0, 'add')
(0.0, 'actinobacteria')
(0.0, 'accuracy_score')
(0.0, 'accuracy')
(0.0, 'academic')
(0.0, 'above')
(0.0, 'ab')
(0.0, '99')
(0.0, '98')
(0.0, '96')
(0.0, '95')
(0.0, '94')
(0.0, '93')
(0.0, '90')
(0.0, '89521')
(0.0, '89')
(0.0, '85')
(0.0, '84')
(0.0, '83')
(0.0, '81')
(0.0, '80')
(0.0, '79')
(0.0, '78')
(0.0, '77')
(0.0, '76')
(0.0, '754')
(0.0, '75')
(0.0, '74')
(0.0, '73')
(0.0, '72')
(0.0, '70')
(0.0, '69')
(0.0, '68')
(0.0, '67')
(0.0, '66')
(0.0, '65')
(0.0, '64')
(0.0, '632')
(0.0, '63')
(0.0, '62')
(0.0, '61')
(0.0, '60')
(0.0, '59')
(0.0, '57')
(0.0, '569')
(0.0, '56')
(0.0, '555')
(0.0, '55')
(0.0, '54')
(0.0, '53')
(0.0, '52')
(0.0, '51')
(0.0, '5000u')
(0.0, '500')
(0.0, '50')
(0.0, '49')
(0.0, '48')
(0.0, '45')
(0.0, '433')
(0.0, '42')
(0.0, '41')
(0.0, '40')
(0.0, '39')
(0.0, '38')
(0.0, '37')
(0.0, '36')
(0.0, '35')
(0.0, '34')
(0.0, '33')
(0.0, '32')
(0.0, '31')
(0.0, '30')
(0.0, '2f')
(0.0, '29')
(0.0, '27')
(0.0, '26')
(0.0, '240')
(0.0, '23')
(0.0, '220')
(0.0, '22')
(0.0, '218')
(0.0, '215')
(0.0, '208')
(0.0, '2017')
(0.0, '2016')
(0.0, '2014')
(0.0, '2013')
(0.0, '2008')
(0.0, '2007')
(0.0, '2006')
(0.0, '200')
(0.0, '20')
(0.0, '1970')
(0.0, '193')
(0.0, '191')
(0.0, '19')
(0.0, '18')
(0.0, '173')
(0.0, '170')
(0.0, '17')
(0.0, '169')
(0.0, '168')
(0.0, '164')
(0.0, '1638')
(0.0, '161')
(0.0, '16')
(0.0, '159')
(0.0, '158')
(0.0, '153')
(0.0, '152')
(0.0, '150')
(0.0, '15')
(0.0, '149')
(0.0, '145')
(0.0, '144')
(0.0, '142')
(0.0, '14')
(0.0, '136')
(0.0, '127')
(0.0, '122')
(0.0, '120')
(0.0, '12')
(0.0, '117')
(0.0, '115')
(0.0, '114')
(0.0, '1130')
(0.0, '112')
(0.0, '110')
(0.0, '11')
(0.0, '109')
(0.0, '106')
(0.0, '103')
(0.0, '101')
(0.0, '10000u')
(0.0, '1000')
(0.0, '10')
(0.0, '05')
(0.0, '01')

In [33]:
y_test


Out[33]:
array([0, 1, 2, 3, 3, 1, 2, 4, 1, 4, 1, 0, 3, 1, 2, 4, 1, 3, 4, 3, 2, 1,
       1, 4, 0, 1, 4, 2, 1, 1, 4, 0, 3, 4, 4, 0, 4, 4, 1, 3, 0, 1, 2, 4,
       0, 3, 1, 2, 4, 4, 0, 0, 0, 3, 1, 3, 0, 3, 3, 4, 1, 4, 3, 2, 1, 0,
       3, 1, 2, 4, 1, 2, 3, 2, 1, 3, 0, 1, 3, 0])

In [34]:
clf.predict(X_test)


Out[34]:
array([0, 1, 2, 3, 3, 1, 2, 4, 1, 4, 1, 0, 3, 1, 2, 4, 1, 3, 4, 3, 2, 1,
       1, 4, 0, 1, 4, 2, 1, 1, 4, 0, 3, 4, 4, 0, 4, 4, 1, 3, 0, 1, 2, 4,
       4, 3, 1, 2, 4, 4, 0, 0, 0, 3, 1, 3, 0, 3, 3, 4, 1, 4, 3, 2, 1, 0,
       3, 1, 2, 4, 1, 1, 3, 2, 1, 3, 0, 1, 3, 0])

In [35]:
np.sum(y_test==clf.predict(X_test))/len(y_test)


Out[35]:
0.975

In [ ]:

Just function name baseline


In [65]:
import numpy as np
from nbminer.notebook_miner import NotebookMiner

hw_filenames = np.load('homework_file_names.npy')
min_val = min([len(temp) for temp in hw_filenames])
print(min_val)
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:min_val]] for temp in hw_filenames]


80

In [66]:
a = Features(hw_notebooks[0], 'hw1')
a.add_notebooks(hw_notebooks[1], 'hw2')
a.add_notebooks(hw_notebooks[2], 'hw3')
a.add_notebooks(hw_notebooks[3], 'hw4')
a.add_notebooks(hw_notebooks[4], 'hw5')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
pipe = Pipeline([gastf, rbn, gi])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x7f180555e6a0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x7f180555e358>
<nbminer.preprocess.get_imports.GetImports object at 0x7f1807cacc18>

In [67]:
nbs = a.get_list_notebooks()
print(len(nbs))
function_names = []
homework_num = []
for nb in nbs:
    temp = []
    for cell in nb.get_all_cells():
        temp.append(" ".join(cell.get_feature('short_name_string')))
    function_names.append(temp)
    homework_num.append(nb.get_feature('import_name'))


1200

In [68]:
len(hw_notebooks[0])


Out[68]:
400

In [69]:
function_names = [" ".join(temp) for temp in function_names]

In [70]:
import sklearn
from sklearn.feature_extraction import text
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
tfidf.max_features = 5000
tfidf.fit(function_names)
X = tfidf.transform(function_names).todense()
y = np.array(homework_num)

In [71]:
X.shape


Out[71]:
(1200, 1687)

In [72]:
from sklearn.model_selection import cross_val_score
p = np.random.permutation(len(X))
X = X[p]
y = y[p]
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, max_depth=2)
scores = cross_val_score(clf, X, y, cv=10)
print(scores)
print('Mean', np.mean(scores))


[0.33333333 0.33333333 0.33333333 0.33333333 0.3        0.33333333
 0.33333333 0.33333333 0.33333333 0.33333333]
Mean 0.32999999999999996

In [43]:
clf.fit(X,y)
for pair in reversed(sorted([el for el in zip(clf.feature_importances_, tfidf.get_feature_names())])):
    print (pair)


(0.03342782778554434, 'words')
(0.027263968955001498, 'dictionary')
(0.025703548927278875, 'read_excel')
(0.024245345258121356, 'wordcloud')
(0.0240558927552438, 'polarity_scores')
(0.023494779983363013, 'ldamodel')
(0.021976378280706563, 'axis')
(0.021814588778321146, 'generate')
(0.02069978185248119, 'imshow')
(0.019608860036538542, 'sort_values')
(0.01912165118056879, 'show')
(0.018988972563274344, 'log')
(0.01861402089886122, 'isin')
(0.017615206349491513, 'head')
(0.016984033438781743, 'sentimentintensityanalyzer')
(0.016750453117679084, 'type')
(0.0166581384696986, 'fit')
(0.01573668919430045, 'lower')
(0.015128460323087853, 'concat')
(0.014616462587093733, 'describe')
(0.014575802105039164, 'porterstemmer')
(0.013816413473966036, 'join')
(0.01265804058290071, 'read_csv')
(0.012420148509140971, 'stem')
(0.012036448159883268, 'figure')
(0.011959897571131792, 'silhouette_score')
(0.010884406631069527, 'astype')
(0.010559935905972151, 'series')
(0.010486646434277774, 'value_counts')
(0.010477892423834578, 'wordnetlemmatizer')
(0.010346375967520402, 'word_tokenize')
(0.010309608140940738, 'ylabel')
(0.010262803082242991, 'dataframe')
(0.009923293920091098, 'lemmatize')
(0.009804574105006953, 'set_index')
(0.009247837176525803, 'system')
(0.00911689092173278, 'mean')
(0.008721379603043018, 'doc2bow')
(0.00858605874087031, 'kmeans')
(0.008100250491083838, 'cov')
(0.008070184608796008, 'randomforestclassifier')
(0.007776223199270897, 'tokenize')
(0.007729519869330844, 'set')
(0.007227844826609645, 'cut')
(0.00696097227005167, 'add')
(0.006799937944555356, 'best_partition')
(0.006471020141606359, 'predict')
(0.0063180955904620665, 'choropleth')
(0.006248601341208945, 'pivot')
(0.006119597559587702, 'to_datetime')
(0.0061171733087566045, 'colorbar')
(0.006078141564600367, 'negative')
(0.006018385038050686, 'find')
(0.0058542902115113, 'pivot_table')
(0.005849146568432393, 'get_ipython')
(0.005843214284990862, 'crosstab')
(0.005107929393204269, 'combine_first')
(0.005103984048549549, 'date')
(0.005086533396600264, 'strptime')
(0.0050289443027618, 'take')
(0.004699363169640751, 'as_ordered')
(0.004681102518918418, 'corr')
(0.00461742387401663, 'read_pickle')
(0.004424167514727358, 'groupby')
(0.004388510491670259, 'duplicated')
(0.004285650570217172, 'rank')
(0.004284222950757564, 'weekday')
(0.004276740365925086, 'cross_val_score')
(0.00423636107934807, 'get')
(0.004106900220413386, 'argsort')
(0.0040685048893097965, 'html')
(0.00402723662871221, 'most_common')
(0.003995924755905972, 'xlabel')
(0.003981971218815328, 'ttest_ind')
(0.003917860690515545, 'apply')
(0.00385864631988212, 'random')
(0.003814723904031969, 'read_table')
(0.0037522910542717143, 'clf')
(0.0037245585253359863, 'sortlevel')
(0.0036997267744863825, 'draw_networkx_edges')
(0.003699009371772606, 'draw_networkx_nodes')
(0.0036787301776250642, 'melt')
(0.003368770509771899, 'print_topics')
(0.003274988699081505, 'sort_index')
(0.0032329879488114882, 'tail')
(0.003201404635179864, 'copy')
(0.0031463538634961844, 'fit_transform')
(0.0031023948706735828, 'unstack')
(0.0030915928267884318, 'to_pickle')
(0.0030526429490819198, 'reindex')
(0.002974503521245464, 'permutation')
(0.0029425938927024973, 'qcut')
(0.002915784062733587, 'xticks')
(0.0028662576683575504, 'train_test_split')
(0.0028248772485177597, 'query')
(0.002759407298901605, 'merge')
(0.0026588355283443683, 'concatenate')
(0.0025830025934069157, 'swaplevel')
(0.0025207440280865844, 'find_all')
(0.0024929906688579613, 'dict')
(0.0024905232823711457, 'positive')
(0.0024215919506560242, 'plot_learning_curve')
(0.002293367613207335, 'save')
(0.0022663272312538883, 'tz_localize')
(0.0022647244435102954, 'show_topics')
(0.0022093976342396743, 'now')
(0.002207064149123884, 'endswith')
(0.002196663285569309, 'get_group')
(0.0020724519124149204, 'fillna')
(0.00194938795748974, 'get_dummies')
(0.0019488220013589642, 'sample')
(0.0019369092790606206, 'categorical')
(0.00189134021128192, 'list')
(0.0018559375796581108, 'datetime')
(0.001851354240868443, 'graph')
(0.0018244004663076163, 'dropna')
(0.0017383480205541034, 'split')
(0.0016663369457451942, 'spring_layout')
(0.001623669018375396, 'transform')
(0.001573333907884433, 'print')
(0.0015351995268713405, 'findall')
(0.0015128597620932699, 'round')
(0.0014959709093735985, 'min')
(0.0014313283773856598, 'treebankwordtokenizer')
(0.0013956577675697899, 'randint')
(0.0013668895621614407, 'beautifulsoup')
(0.0013395401504599262, 'regexptokenizer')
(0.0013211280035265146, 'hist')
(0.0013166321387542124, 'xlim')
(0.0012988157184645804, 'reshape')
(0.0012861464754469773, 'add_edge')
(0.0012771459168402998, 'str')
(0.0012353434147555895, 'isnull')
(0.0012195907232902534, 'read_html')
(0.0011929598932334526, 'set_context')
(0.0011391666969498588, 'quantile')
(0.0011220941823722541, 'max')
(0.001079184837685195, 'arange')
(0.0010712736362380212, 'keys')
(0.001068916244511541, 'tz_convert')
(0.00106730080797535, 'time')
(0.001037096919647508, 'plot')
(0.0010277239129308363, 'run_line_magic')
(0.001018485796954269, 'std')
(0.0009831455500588456, 'learning_curve')
(0.0008713558983374267, 'sorted')
(0.0008671763569257966, 'text_search')
(0.0008438806694513739, 'confusion_matrix')
(0.0008270474310086067, 'counter')
(0.0007562497572893962, 'replace')
(0.0007051088317612969, 'to_dict')
(0.0006941971692459675, 'reset_index')
(0.0006903914461966328, 'rdylgn')
(0.0006866675250842091, 'drop_duplicates')
(0.0006614978811288614, 'all')
(0.0006604867860308817, 'seed')
(0.0006464737793851732, 'get_cmap')
(0.0006422420595902914, 'grid')
(0.0006178830341707735, 'fill_between')
(0.0005989934121273283, 'add_suffix')
(0.0005906520668142012, 'log10')
(0.0005884736609302276, 'title')
(0.0005838292401107568, 'int')
(0.0005574539736170818, 'parse')
(0.0005460060353816589, 'labelencoder')
(0.0005424849673186218, 'accuracy_score')
(0.0005350157383335451, 'abs')
(0.0005337625292049806, 'ks_2samp')
(0.0005319596299411272, 'sent_tokenize')
(0.000515412866744238, 'legend')
(0.00048184054489491704, 'range')
(0.0004777093841378044, 'open')
(0.00044490545966521696, 'isalpha')
(0.0004439302060529287, 'snowballstemmer')
(0.0004216063259238095, 'first')
(0.00040997811620743373, 'request')
(0.00040593688107224226, 'append')
(0.00039876402733503166, 'len')
(0.00039590382189405045, 'savehtml')
(0.0003954218214663371, 'rename')
(0.0003655883013163895, 'loadmasterdata')
(0.00036183999134178164, 'decode')
(0.00036121174692300897, 'json')
(0.0003610291755467456, 'f1_score')
(0.0003568094877685954, 'prepare')
(0.00035509799132017516, 'combinations')
(0.0003441057833000783, 'factorplot')
(0.00032580364900087236, 'fit_predict')
(0.00032277095724254326, 'get_text')
(0.0003172666113501058, 'shuffle')
(0.0003167302941837998, 'group')
(0.000293538492772071, 'get_percentage')
(0.00027876355686801905, 'map')
(0.00027281812283915023, 'subplots')
(0.00026614242686260925, 'column_maxmin')
(0.00026405731327068234, 'draw_spring')
(0.0002620577712926733, 'load')
(0.0002526228652501899, 'restructure_reduced_data')
(0.00025002601971416447, 'tight_layout')
(0.00024338457542211049, 'linspace')
(0.00023838859687946756, 'bar')
(0.00022428997756566985, 'getstudentmastersemesters')
(0.00021881361775326347, 'sum')
(0.0002184517885635216, 'filter')
(0.00021516574964696797, 'factorize')
(0.00021375188306516465, 'stack')
(0.00020552202540508564, 'count')
(0.0002006401263469694, 'getyearsemesterbachelor')
(0.0001999412993379547, 'getyear')
(0.00019940777431287423, 'delete')
(0.00019005036093305177, 'enumerate')
(0.00018809817855309967, 'randomforestregressor')
(0.00018527223531533083, 'meshgrid')
(0.0001824002685653418, 'listdir')
(0.00017631530940042967, 'from_dict')
(0.00017484526178052407, 'imputer')
(0.00016969990622166814, 'filterwarnings')
(0.00016968900674158748, 'mode')
(0.0001631989455766741, 'compute_max_min_score')
(0.0001562040395650363, 'add_legend')
(0.0001524787345386875, 'get_random_forests')
(0.00014226071409003318, 'reduce')
(0.00013867883644841302, 'loads')
(0.0001320525052237209, 'agg')
(0.00012977151837845902, 'get_level_values')
(0.00012933027945659112, 'createdata')
(0.0001203158467324147, 'prepare_feature_subsets')
(0.00011978382316251363, 'logspace')
(0.00011526809323228761, 'process_exp')
(0.00011273545696614732, 'caseless_contains')
(0.0001117193980455279, 'reindex_axis')
(9.601497159203157e-05, 'to_numeric')
(9.390570586140251e-05, 'yticks')
(9.269130903634653e-05, 'mapping_by_hand')
(9.200445082422109e-05, 'set_option')
(9.147686442064241e-05, 'foreign_policy')
(8.601379334742903e-05, 'countplot')
(8.152564033854231e-05, 'load_ebola')
(7.814004906654685e-05, 'load_md')
(7.475532580036443e-05, 'search')
(5.685454127461305e-05, 'save_dict')
(5.004622093745776e-05, 'select_fields')
(4.6413171755811685e-05, 'add_subplot')
(3.561212114632737e-05, 'choice')
(2.6827720958511894e-05, 'ones')
(0.0, 'zscore')
(0.0, 'zip_longest')
(0.0, 'zip')
(0.0, 'zfill')
(0.0, 'zeros_like')
(0.0, 'zeros')
(0.0, 'zero_one_loss')
(0.0, 'yscale')
(0.0, 'ylim')
(0.0, 'years_to_mean_semester')
(0.0, 'yearevent')
(0.0, 'yandextranslate')
(0.0, 'xscale')
(0.0, 'xs')
(0.0, 'xpath')
(0.0, 'wrong_pred')
(0.0, 'writerow')
(0.0, 'writer')
(0.0, 'write_to_csv')
(0.0, 'write')
(0.0, 'wordpuncttokenizer')
(0.0, 'wordpunct_tokenize')
(0.0, 'word_count')
(0.0, 'word_contains_number')
(0.0, 'word_cloud_creator')
(0.0, 'whitespacetokenizer')
(0.0, 'where')
(0.0, 'weighting')
(0.0, 'weight_sample')
(0.0, 'walk')
(0.0, 'vstack')
(0.0, 'visualize_silhouette_score')
(0.0, 'visualize_pca')
(0.0, 'visualize_clusters')
(0.0, 'visualise_topics')
(0.0, 'verifyresult')
(0.0, 'vega')
(0.0, 'var')
(0.0, 'values')
(0.0, 'valueref')
(0.0, 'valueerror')
(0.0, 'validation_curve')
(0.0, 'val_curve_rf')
(0.0, 'val_curve_all_params')
(0.0, 'vader_sentiment_computation')
(0.0, 'vader_approach')
(0.0, 'use')
(0.0, 'usa_cloud')
(0.0, 'urltodataframe')
(0.0, 'urlopen')
(0.0, 'urljoin')
(0.0, 'urlencode')
(0.0, 'url_param_str')
(0.0, 'upper')
(0.0, 'updateparam')
(0.0, 'updatecountrywordused')
(0.0, 'update_sentiment_dict')
(0.0, 'update_full')
(0.0, 'update_dict')
(0.0, 'update')
(0.0, 'untag_lemma_emails')
(0.0, 'unstem')
(0.0, 'unique')
(0.0, 'union1d')
(0.0, 'union')
(0.0, 'unimapper')
(0.0, 'uniform')
(0.0, 'uni_to_canton')
(0.0, 'twinx')
(0.0, 'tuple')
(0.0, 'ttest_1samp')
(0.0, 'try_query')
(0.0, 'triu_indices_from')
(0.0, 'trim')
(0.0, 'transpose')
(0.0, 'translated')
(0.0, 'translate')
(0.0, 'train_test_rf')
(0.0, 'train_forest')
(0.0, 'train')
(0.0, 'tqdm')
(0.0, 'totalgrantmoney')
(0.0, 'total_seconds')
(0.0, 'topojson')
(0.0, 'topic_display')
(0.0, 'top_words_from_nodes')
(0.0, 'tolist')
(0.0, 'tokenize_text')
(0.0, 'tokenize_and_stem')
(0.0, 'tokenization_normalization')
(0.0, 'toarray')
(0.0, 'to_string')
(0.0, 'to_step')
(0.0, 'to_rgba')
(0.0, 'to_json')
(0.0, 'to_image')
(0.0, 'to_frame')
(0.0, 'to_file')
(0.0, 'to_excel')
(0.0, 'to_csv')
(0.0, 'to_array')
(0.0, 'timestamp')
(0.0, 'ticklabel_format')
(0.0, 'tick_params')
(0.0, 'threadpoolexecutor')
(0.0, 'tfidfvectorizer')
(0.0, 'text_proce_pipeline')
(0.0, 'text')
(0.0, 'test_rfc_complete')
(0.0, 'test_rfc')
(0.0, 'test_km')
(0.0, 'term_normalization')
(0.0, 'tabletolist')
(0.0, 'tabletoframe')
(0.0, 'tabletocsv')
(0.0, 'tablestoframes')
(0.0, 'tablecontigence')
(0.0, 'symmetric_difference')
(0.0, 'svc')
(0.0, 'survival_percentage')
(0.0, 'suptitle')
(0.0, 'summary')
(0.0, 'sum_by_region')
(0.0, 'sum_by_canton')
(0.0, 'subtract')
(0.0, 'subplot')
(0.0, 'submit')
(0.0, 'subgraph')
(0.0, 'sub')
(0.0, 'study_feature_importance')
(0.0, 'studentvaliditymineur')
(0.0, 'studentspecialisation')
(0.0, 'studenthasproject')
(0.0, 'studenthasmineur')
(0.0, 'student_is_valid')
(0.0, 'student')
(0.0, 'stubborn_geocode')
(0.0, 'stripplot')
(0.0, 'strip')
(0.0, 'stringio')
(0.0, 'strftime')
(0.0, 'stratifiedkfold')
(0.0, 'stopword_removal')
(0.0, 'stopword_filtering')
(0.0, 'stepcolormap')
(0.0, 'step')
(0.0, 'stemming')
(0.0, 'stem_doc')
(0.0, 'steamming')
(0.0, 'startswith')
(0.0, 'start_year')
(0.0, 'start')
(0.0, 'standardscaler')
(0.0, 'standardize')
(0.0, 'st')
(0.0, 'sqrt')
(0.0, 'spring')
(0.0, 'splitlines')
(0.0, 'split_university_in_name_and_abbrev')
(0.0, 'split_training_predicting')
(0.0, 'split_switzerland')
(0.0, 'split_six')
(0.0, 'split_function')
(0.0, 'split_digi_letter')
(0.0, 'split_data')
(0.0, 'split_and_request')
(0.0, 'spectralclustering')
(0.0, 'specilisation')
(0.0, 'spe')
(0.0, 'soupstrainer')
(0.0, 'soup2')
(0.0, 'sorted_strings_values')
(0.0, 'sort_year')
(0.0, 'sort')
(0.0, 'sleep')
(0.0, 'skipdict')
(0.0, 'skintone_diff')
(0.0, 'skinlabels')
(0.0, 'size')
(0.0, 'simplifyunivname')
(0.0, 'simplefilter')
(0.0, 'simple_preprocess')
(0.0, 'silhouette_score_drop')
(0.0, 'silhouette_kmeans')
(0.0, 'silhouette_analysis')
(0.0, 'silhouette')
(0.0, 'sign')
(0.0, 'shufflesplit')
(0.0, 'shuffle_features')
(0.0, 'showlinkgen')
(0.0, 'showbarfreq')
(0.0, 'show_words_for_communities')
(0.0, 'show_wordcloud')
(0.0, 'show_topic')
(0.0, 'show_student_info')
(0.0, 'show_score')
(0.0, 'show_results')
(0.0, 'show_item')
(0.0, 'show_important_features_random_forest')
(0.0, 'show_features_ranking')
(0.0, 'show_cloud')
(0.0, 'shift')
(0.0, 'shape')
(0.0, 'sf')
(0.0, 'setrecursionlimit')
(0.0, 'setp')
(0.0, 'setlocale')
(0.0, 'setlevel')
(0.0, 'setformatter')
(0.0, 'setdiff1d')
(0.0, 'setdefaultencoding')
(0.0, 'setdefault')
(0.0, 'setdate')
(0.0, 'set_zlabel')
(0.0, 'set_yticklabels')
(0.0, 'set_yscale')
(0.0, 'set_ylim')
(0.0, 'set_ylabel')
(0.0, 'set_xticks')
(0.0, 'set_xticklabels')
(0.0, 'set_xscale')
(0.0, 'set_xlim')
(0.0, 'set_xlabel')
(0.0, 'set_visible')
(0.0, 'set_value')
(0.0, 'set_useoffset')
(0.0, 'set_title')
(0.0, 'set_ticklabels')
(0.0, 'set_text')
(0.0, 'set_style')
(0.0, 'set_size_inches')
(0.0, 'set_printoptions')
(0.0, 'set_position')
(0.0, 'set_params')
(0.0, 'set_palette')
(0.0, 'set_major_locator')
(0.0, 'set_major_formatter')
(0.0, 'set_levels')
(0.0, 'set_label')
(0.0, 'set_coordinates_and_canton')
(0.0, 'set_color')
(0.0, 'set_array')
(0.0, 'session')
(0.0, 'serviceexception')
(0.0, 'serialize')
(0.0, 'septables')
(0.0, 'sentimentanalysis')
(0.0, 'sentiment_vader')
(0.0, 'sentiment_score')
(0.0, 'sentiment_process')
(0.0, 'sentiment_liu_hu_mod')
(0.0, 'sentiment_liu_hu')
(0.0, 'sentiment_lexicon')
(0.0, 'sentiment_email_analysis')
(0.0, 'sentiment_dection')
(0.0, 'semilogx')
(0.0, 'semesteronetosix')
(0.0, 'semester_data')
(0.0, 'sem_start_date')
(0.0, 'select_features')
(0.0, 'select_dtypes')
(0.0, 'select')
(0.0, 'searchstring')
(0.0, 'searchcantongeonames')
(0.0, 'search_missing_cantons')
(0.0, 'search_google')
(0.0, 'search_cantons')
(0.0, 'scrapmaster')
(0.0, 'scrape_student_data')
(0.0, 'scrapbachelor')
(0.0, 'scoring_complete')
(0.0, 'scorer')
(0.0, 'score_wo_feature')
(0.0, 'score_w_feature')
(0.0, 'score_labels')
(0.0, 'score')
(0.0, 'sciper')
(0.0, 'scattercount')
(0.0, 'scatter_ma')
(0.0, 'scatter')
(0.0, 'scale')
(0.0, 'scalarmappable')
(0.0, 'sca')
(0.0, 'savefig')
(0.0, 'save_html')
(0.0, 'run_random_forest')
(0.0, 'run_query')
(0.0, 'run_once')
(0.0, 'run_lda')
(0.0, 'run_kmeans')
(0.0, 'run_forest_importance')
(0.0, 'run_cross_validation')
(0.0, 'run_clusterer')
(0.0, 'run_cell_magic')
(0.0, 'rstrip')
(0.0, 'rsplit')
(0.0, 'round_')
(0.0, 'roesti_unilingual_map_on_df')
(0.0, 'roesti_bilingual_map_on_df')
(0.0, 'roc_curve')
(0.0, 'roc_auc_score')
(0.0, 'rfind')
(0.0, 'rfecv')
(0.0, 'rfcmodel_cv_featureimportance')
(0.0, 'rfc')
(0.0, 'rf_f1score')
(0.0, 'rf_accuracy')
(0.0, 'reversed')
(0.0, 'reverse_geocoding')
(0.0, 'reverse_geocode')
(0.0, 'reverse')
(0.0, 'return_model')
(0.0, 'retrievepostalcode')
(0.0, 'retrievecanton')
(0.0, 'retrieve_dataframes')
(0.0, 'retrieve_above_threshold')
(0.0, 'result')
(0.0, 'restrict_years')
(0.0, 'reset_option')
(0.0, 'res_to_csv')
(0.0, 'request_geo_name')
(0.0, 'request2soup')
(0.0, 'req')
(0.0, 'repr')
(0.0, 'replace_to_int')
(0.0, 'replace_to_id')
(0.0, 'replace_pos_labels')
(0.0, 'replace_nan_by_mean')
(0.0, 'replace_contractions')
(0.0, 'repeat')
(0.0, 'reorder_levels')
(0.0, 'removestopwords')
(0.0, 'removesinglefeature')
(0.0, 'removepunctuation')
(0.0, 'removenoisylines')
(0.0, 'removenan')
(0.0, 'remove_unwanted_tokens')
(0.0, 'remove_stopwords')
(0.0, 'remove_stop_words')
(0.0, 'remove_special_chars')
(0.0, 'remove_rows_and_cols_with_only_nan')
(0.0, 'remove_punctuation')
(0.0, 'remove_one_feature')
(0.0, 'remove_non_cantons')
(0.0, 'remove_feat_silhouette')
(0.0, 'remove_feat_score')
(0.0, 'remove_email_terminology')
(0.0, 'remove_accents')
(0.0, 'remove')
(0.0, 'reload')
(0.0, 'relativedelta')
(0.0, 'regplot')
(0.0, 'regexify')
(0.0, 'reduced_preprocessing')
(0.0, 'reduce_overfitting')
(0.0, 'rectangle')
(0.0, 'recolor')
(0.0, 'recall_score')
(0.0, 'readme')
(0.0, 'readlines')
(0.0, 'readline')
(0.0, 'reader')
(0.0, 'read_json')
(0.0, 'read_file')
(0.0, 'read_csvs')
(0.0, 'read_country_csv')
(0.0, 'read')
(0.0, 're_longer_than')
(0.0, 'rdylbu')
(0.0, 'rawemailcleaner')
(0.0, 'ravel')
(0.0, 'ratio_total_cards_cluster_rest')
(0.0, 'ratio_total_cards_cluster0')
(0.0, 'ranksums')
(0.0, 'rankdata')
(0.0, 'randomizedsearchcv')
(0.0, 'random_forest_scores')
(0.0, 'randn')
(0.0, 'rand_jitter')
(0.0, 'rand')
(0.0, 'quotedstring')
(0.0, 'quote')
(0.0, 'query_university')
(0.0, 'query_uni_gm')
(0.0, 'query_term')
(0.0, 'query_placeid_gm')
(0.0, 'query_epfl')
(0.0, 'query_city')
(0.0, 'pvalue_signification')
(0.0, 'punctuation_numbers_stopword_removal')
(0.0, 'proportion_cal')
(0.0, 'propertyset')
(0.0, 'progressbar')
(0.0, 'product')
(0.0, 'processtext')
(0.0, 'processpoolexecutor')
(0.0, 'process_uni_data')
(0.0, 'process_text')
(0.0, 'process_mails')
(0.0, 'process_file')
(0.0, 'process_exp_wrapper')
(0.0, 'process_emails')
(0.0, 'process_df')
(0.0, 'process_database')
(0.0, 'process_data')
(0.0, 'printstats')
(0.0, 'printfieldvalues')
(0.0, 'print_wordclouds')
(0.0, 'print_unimportant_features')
(0.0, 'print_topics_from_list_of_models_topics')
(0.0, 'print_topic')
(0.0, 'print_top_words')
(0.0, 'print_scores')
(0.0, 'print_score')
(0.0, 'print_metrics')
(0.0, 'print_lda_topics')
(0.0, 'print_important_features')
(0.0, 'print_confusion_matrix')
(0.0, 'prettyprinter')
(0.0, 'pretty_print_topics')
(0.0, 'prettify')
(0.0, 'pretify_df')
(0.0, 'preprocesstext')
(0.0, 'preprocessing')
(0.0, 'preprocess_raw_email')
(0.0, 'preprocess_msg')
(0.0, 'preprocess_basic')
(0.0, 'preprocess_2')
(0.0, 'preprocess_1')
(0.0, 'preprocess2')
(0.0, 'preprocess')
(0.0, 'prepare_params')
(0.0, 'prepare_master_dataframe')
(0.0, 'prepare_gm_uni_params')
(0.0, 'prepare_gm_placeid_params')
(0.0, 'prepare_features_multi_remove')
(0.0, 'prepare_features_multi')
(0.0, 'prepare_dataframe')
(0.0, 'prepare_data')
(0.0, 'prep_ml')
(0.0, 'predict_proba')
(0.0, 'predict_class_from_reg')
(0.0, 'precision_score')
(0.0, 'pprint')
(0.0, 'pp')
(0.0, 'powerset')
(0.0, 'pow')
(0.0, 'post')
(0.0, 'pos_tagging')
(0.0, 'pos_tag')
(0.0, 'popup')
(0.0, 'populate')
(0.0, 'pop')
(0.0, 'pool')
(0.0, 'polyline')
(0.0, 'point')
(0.0, 'plotpreprocessing')
(0.0, 'plotlearningcurves2')
(0.0, 'plotlearningcurves')
(0.0, 'plotbarchart')
(0.0, 'plot_xy')
(0.0, 'plot_wordclouds')
(0.0, 'plot_wordcloud')
(0.0, 'plot_word_cloud')
(0.0, 'plot_validation_curve')
(0.0, 'plot_te_tr_curve')
(0.0, 'plot_surface')
(0.0, 'plot_sentiment_by_country')
(0.0, 'plot_most_quoted_countries')
(0.0, 'plot_ma_time')
(0.0, 'plot_importance')
(0.0, 'plot_hist_regions')
(0.0, 'plot_hist_of_features')
(0.0, 'plot_hist_cantons')
(0.0, 'plot_hist')
(0.0, 'plot_gmm')
(0.0, 'plot_frame')
(0.0, 'plot_forest_importances')
(0.0, 'plot_fig')
(0.0, 'plot_features_importances')
(0.0, 'plot_feature_removal_evolution')
(0.0, 'plot_feature_importances')
(0.0, 'plot_feature_importance')
(0.0, 'plot_dict')
(0.0, 'plot_curve')
(0.0, 'plot_countries_by_occurrences_and_sentiment')
(0.0, 'plot_countries')
(0.0, 'plot_confusion_matrix')
(0.0, 'plot_color')
(0.0, 'plot_colar_bar')
(0.0, 'plot_cnf')
(0.0, 'plot_avg_stay_per_year')
(0.0, 'plot_array')
(0.0, 'played')
(0.0, 'places')
(0.0, 'place')
(0.0, 'pipeline_text')
(0.0, 'pipeline')
(0.0, 'pie')
(0.0, 'person_from_id')
(0.0, 'percentile')
(0.0, 'penn_to_wn')
(0.0, 'peek')
(0.0, 'pdf')
(0.0, 'pca')
(0.0, 'patch')
(0.0, 'partition_skin_length')
(0.0, 'partition_at_level')
(0.0, 'parsetitle')
(0.0, 'parsetable_ma')
(0.0, 'parsetable')
(0.0, 'parseresult')
(0.0, 'parserequest')
(0.0, 'parseperson')
(0.0, 'parseperiod')
(0.0, 'parsehtml')
(0.0, 'parse_topojson')
(0.0, 'parse_table')
(0.0, 'parse_students_list')
(0.0, 'parse_student_row')
(0.0, 'parse_semesters_list')
(0.0, 'parse_name')
(0.0, 'parse_item_pdm')
(0.0, 'parse_item')
(0.0, 'parse_canton')
(0.0, 'pairplot')
(0.0, 'overlay_map')
(0.0, 'output_file')
(0.0, 'ordereddict')
(0.0, 'ord')
(0.0, 'option_context')
(0.0, 'optimize_clusters')
(0.0, 'optimize')
(0.0, 'openmapquest')
(0.0, 'opacity')
(0.0, 'onehotencoder')
(0.0, 'onbasepercentage')
(0.0, 'on_base_percentage')
(0.0, 'ols')
(0.0, 'of_minor')
(0.0, 'of_gender')
(0.0, 'obp')
(0.0, 'objdict')
(0.0, 'nunique')
(0.0, 'number_of_edges')
(0.0, 'null_check_loc')
(0.0, 'notnull')
(0.0, 'not_equal')
(0.0, 'normalizer')
(0.0, 'normalize_token')
(0.0, 'normalize_standardize')
(0.0, 'normalize_nava_tags')
(0.0, 'normalize')
(0.0, 'nonzero')
(0.0, 'nominatim')
(0.0, 'nodes_connected')
(0.0, 'nodes')
(0.0, 'nlppipeline')
(0.0, 'nlargest')
(0.0, 'nice_plot')
(0.0, 'nice_bar_plot')
(0.0, 'next')
(0.0, 'newcasess')
(0.0, 'newcasesl')
(0.0, 'newcases')
(0.0, 'neighbors')
(0.0, 'nanstd')
(0.0, 'nanify')
(0.0, 'namedtuple')
(0.0, 'name_range')
(0.0, 'name')
(0.0, 'my_square_scatter')
(0.0, 'my_parse')
(0.0, 'my_circle_scatter')
(0.0, 'multiplelocator')
(0.0, 'multiindex')
(0.0, 'multicolumnlabelencoder')
(0.0, 'most_frequent')
(0.0, 'most_common_words_for_each_partition')
(0.0, 'months_between_dates')
(0.0, 'modularity')
(0.0, 'modify_university_name')
(0.0, 'mmcorpus')
(0.0, 'mlpclassifier')
(0.0, 'mkdir')
(0.0, 'minor')
(0.0, 'minmaxscaler')
(0.0, 'minibatchkmeans')
(0.0, 'mine_data')
(0.0, 'mergefunction')
(0.0, 'meme')
(0.0, 'median')
(0.0, 'meancards')
(0.0, 'mean_absolute_error')
(0.0, 'math')
(0.0, 'matches')
(0.0, 'match')
(0.0, 'masterstart')
(0.0, 'master_time')
(0.0, 'master_duration_in_months')
(0.0, 'mask')
(0.0, 'markercluster')
(0.0, 'marker')
(0.0, 'mark_negation')
(0.0, 'margins')
(0.0, 'mapping_cantons')
(0.0, 'mapping')
(0.0, 'map_uni_to_canton')
(0.0, 'map_inst_canton')
(0.0, 'map_email_to_vader_sentiment')
(0.0, 'map_email_to_liu_and_hu_sentiment')
(0.0, 'map_email_bodies_to_countries')
(0.0, 'map_author_to_community')
(0.0, 'mannwhitneyu')
(0.0, 'makewordcloud')
(0.0, 'maketrans')
(0.0, 'makematrixforclassifier')
(0.0, 'makedirs')
(0.0, 'makecountrytable')
(0.0, 'makeadvancedplotforallsentimentdict')
(0.0, 'makeadvanceddffromdict')
(0.0, 'make_wordcloud')
(0.0, 'make_url')
(0.0, 'make_scorer')
(0.0, 'make_print_confusion_matrix')
(0.0, 'make_params')
(0.0, 'make_learning_curve')
(0.0, 'make_frame_liu')
(0.0, 'make_frame')
(0.0, 'make_forest')
(0.0, 'make_cluster')
(0.0, 'lstrip')
(0.0, 'lower_case')
(0.0, 'lookup_country')
(0.0, 'lookup_countries')
(0.0, 'lookup')
(0.0, 'loglog')
(0.0, 'logical_not')
(0.0, 'logical_and')
(0.0, 'log_perplexity')
(0.0, 'locsciper')
(0.0, 'location')
(0.0, 'loadbachelordata')
(0.0, 'loadaddressdictfromcsv')
(0.0, 'load_student_page')
(0.0, 'load_pdm_data')
(0.0, 'load_msc_data')
(0.0, 'load_mb')
(0.0, 'load_ldamodels')
(0.0, 'load_iris')
(0.0, 'load_dict')
(0.0, 'load_data')
(0.0, 'load_bsc_data')
(0.0, 'lmplot')
(0.0, 'liu_hu_lexicon_relative')
(0.0, 'liu_hu_lexicon_neutral_ponderated')
(0.0, 'liu_hu_lexicon_absolute')
(0.0, 'liu_hu_lexicon')
(0.0, 'liu_and_hu_lexicon')
(0.0, 'liu_and_hu')
(0.0, 'literal_eval')
(0.0, 'listof')
(0.0, 'linguistic_region')
(0.0, 'linearregression')
(0.0, 'linear_log_histogram_plot')
(0.0, 'line_tokenize')
(0.0, 'line')
(0.0, 'lemmatizer')
(0.0, 'lemmatize_email')
(0.0, 'lemmatization')
(0.0, 'lemmas')
(0.0, 'lemmandstem')
(0.0, 'legend_scaler')
(0.0, 'learning_curve_plot')
(0.0, 'learning_curve_mean_squared_error')
(0.0, 'ldamulticore')
(0.0, 'layercontrol')
(0.0, 'latest_date')
(0.0, 'later_date')
(0.0, 'latentdirichletallocation')
(0.0, 'lastb6')
(0.0, 'last')
(0.0, 'lancasterstemmer')
(0.0, 'kurtosis')
(0.0, 'kstest')
(0.0, 'kmeans_loop')
(0.0, 'kmean_dark_separator_score')
(0.0, 'kfold')
(0.0, 'keys_with')
(0.0, 'keep_year')
(0.0, 'keep_nouns')
(0.0, 'k_means')
(0.0, 'jsonencoder')
(0.0, 'jsondecoder')
(0.0, 'jointplot')
(0.0, 'jitter')
(0.0, 'itertuples')
(0.0, 'itersiblings')
(0.0, 'iterrows')
(0.0, 'iteritems')
(0.0, 'iterfind')
(0.0, 'iterate_for_cluster')
(0.0, 'iter_emails')
(0.0, 'iter_docs')
(0.0, 'iter_content')
(0.0, 'iter')
(0.0, 'items')
(0.0, 'itemgetter')
(0.0, 'itemfreq')
(0.0, 'item')
(0.0, 'isvalid')
(0.0, 'isupper')
(0.0, 'isonesix')
(0.0, 'isone')
(0.0, 'isnan')
(0.0, 'islice')
(0.0, 'isinstance')
(0.0, 'isfile')
(0.0, 'isdigit')
(0.0, 'isaloop')
(0.0, 'isa_url')
(0.0, 'isa')
(0.0, 'is_verb')
(0.0, 'is_valid_item')
(0.0, 'is_semester_info')
(0.0, 'is_number')
(0.0, 'is_noun')
(0.0, 'is_header')
(0.0, 'is_adverb')
(0.0, 'is_adjective')
(0.0, 'invert_yaxis')
(0.0, 'invert_xaxis')
(0.0, 'invert')
(0.0, 'inverse_transform')
(0.0, 'intslider')
(0.0, 'intersection')
(0.0, 'intersect_dicts')
(0.0, 'intersect1d')
(0.0, 'interp')
(0.0, 'interact')
(0.0, 'integrate_cantons')
(0.0, 'int64index')
(0.0, 'install_cache')
(0.0, 'insert')
(0.0, 'input')
(0.0, 'initialize_notebook')
(0.0, 'info')
(0.0, 'indices')
(0.0, 'index')
(0.0, 'in_switzerland')
(0.0, 'in1d')
(0.0, 'imread')
(0.0, 'importmid')
(0.0, 'import_all_csv')
(0.0, 'impo_graph')
(0.0, 'imagecolorgenerator')
(0.0, 'image')
(0.0, 'iloc')
(0.0, 'iglob')
(0.0, 'iframe')
(0.0, 'idxmax')
(0.0, 'idx_to_keep')
(0.0, 'identify_country')
(0.0, 'icon')
(0.0, 'html_to_df')
(0.0, 'hsv_to_rgb')
(0.0, 'hsv')
(0.0, 'hstack')
(0.0, 'homogeneity_score')
(0.0, 'home_url_options')
(0.0, 'hold')
(0.0, 'hist_ratings')
(0.0, 'hist_raters')
(0.0, 'heatmap')
(0.0, 'hasattr')
(0.0, 'guess_canton')
(0.0, 'groupshufflesplit')
(0.0, 'grouper')
(0.0, 'group_with_year')
(0.0, 'group_sum')
(0.0, 'group_data')
(0.0, 'group_average')
(0.0, 'gridsearchcv')
(0.0, 'grants_grouped_localised')
(0.0, 'grants_by_year')
(0.0, 'grad_time_gender')
(0.0, 'grab_table')
(0.0, 'googlev3')
(0.0, 'googleplaces')
(0.0, 'google_maps_query')
(0.0, 'google')
(0.0, 'good_data_index')
(0.0, 'gmtime')
(0.0, 'gmm')
(0.0, 'glob')
(0.0, 'getyearstartedmaster')
(0.0, 'getyearsemesterseasonmaster')
(0.0, 'getyearrange')
(0.0, 'getvalvector')
(0.0, 'getvalue')
(0.0, 'geturl')
(0.0, 'getunifrominst')
(0.0, 'gettotals')
(0.0, 'gettokenmapfromdata')
(0.0, 'gettext')
(0.0, 'getstudentsemestersn')
(0.0, 'getstudentsemesters')
(0.0, 'getspecificurl')
(0.0, 'getsoup')
(0.0, 'getsizeof')
(0.0, 'getsemester')
(0.0, 'getresultpage')
(0.0, 'getrawtext')
(0.0, 'getrandomforestclassifier')
(0.0, 'getprediction')
(0.0, 'getpercentfound')
(0.0, 'getparamsrequestvalue')
(0.0, 'getoutput')
(0.0, 'getnumberofsemestersbetweenyearseason')
(0.0, 'getnumberofsemesters')
(0.0, 'getnumberofbachelorsemester')
(0.0, 'getmostusedwordsofdictionary')
(0.0, 'getminyearseason')
(0.0, 'getmaxyearseason')
(0.0, 'getmaxdistancebetweensemester1and2')
(0.0, 'getmasterstudents')
(0.0, 'getmasterspecialisation')
(0.0, 'getmasterproject')
(0.0, 'getmaster3')
(0.0, 'getlogger')
(0.0, 'getlang')
(0.0, 'getgeocodejsondictionary')
(0.0, 'getgender')
(0.0, 'getfullurl')
(0.0, 'getfromdisc')
(0.0, 'getfilteredpage')
(0.0, 'getfilename')
(0.0, 'getfieldnames')
(0.0, 'getdatatable')
(0.0, 'getdataframe')
(0.0, 'getdata')
(0.0, 'getcontent')
(0.0, 'getcantonfromgeocodedictionary')
(0.0, 'getcanton')
(0.0, 'getallmasterspecialisation')
(0.0, 'getalldata')
(0.0, 'getaddress')
(0.0, 'getabr')
(0.0, 'get_xticks')
(0.0, 'get_xticklabels')
(0.0, 'get_xaxis')
(0.0, 'get_x_y')
(0.0, 'get_x')
(0.0, 'get_wordnet_pos')
(0.0, 'get_word')
(0.0, 'get_wikidata_id')
(0.0, 'get_wikidata_headquater_location')
(0.0, 'get_wikidata_geolocation')
(0.0, 'get_width')
(0.0, 'get_weighted_mean')
(0.0, 'get_values')
(0.0, 'get_url')
(0.0, 'get_university_location')
(0.0, 'get_university_amount')
(0.0, 'get_uni_cantons')
(0.0, 'get_uni_canton')
(0.0, 'get_tuple')
(0.0, 'get_train_test_score')
(0.0, 'get_total')
(0.0, 'get_topics_from_models')
(0.0, 'get_topics')
(0.0, 'get_token_sentiment')
(0.0, 'get_token')
(0.0, 'get_text_without_stopwords')
(0.0, 'get_table')
(0.0, 'get_subsets')
(0.0, 'get_students_bysemester')
(0.0, 'get_student_table')
(0.0, 'get_stop_words')
(0.0, 'get_static_is_academia')
(0.0, 'get_soup')
(0.0, 'get_silhouette_score')
(0.0, 'get_short_name')
(0.0, 'get_select_options')
(0.0, 'get_required_webpage')
(0.0, 'get_region')
(0.0, 'get_reduced_stop_list')
(0.0, 'get_query')
(0.0, 'get_position')
(0.0, 'get_place_id')
(0.0, 'get_params')
(0.0, 'get_parameters')
(0.0, 'get_page')
(0.0, 'get_only_words')
(0.0, 'get_obp')
(0.0, 'get_numeric_features')
(0.0, 'get_non_empty')
(0.0, 'get_ms_dataframe')
(0.0, 'get_ms_alldata')
(0.0, 'get_money_rostigraben')
(0.0, 'get_major_formatter')
(0.0, 'get_location')
(0.0, 'get_locality')
(0.0, 'get_local_table')
(0.0, 'get_legend')
(0.0, 'get_last_occurence')
(0.0, 'get_kmeans_result')
(0.0, 'get_key_and_value')
(0.0, 'get_isa_data')
(0.0, 'get_is_academia_page')
(0.0, 'get_info_wrapper')
(0.0, 'get_info')
(0.0, 'get_important_features')
(0.0, 'get_html_page')
(0.0, 'get_html_content')
(0.0, 'get_html')
(0.0, 'get_height')
(0.0, 'get_gps_id')
(0.0, 'get_gps_code')
(0.0, 'get_geonames')
(0.0, 'get_geocodes')
(0.0, 'get_geocode_info')
(0.0, 'get_geo_info')
(0.0, 'get_geo_dict')
(0.0, 'get_frame_from_master_html')
(0.0, 'get_frame_from_html')
(0.0, 'get_frame_from_element')
(0.0, 'get_first_occurence')
(0.0, 'get_first_last_occurence')
(0.0, 'get_filter_value')
(0.0, 'get_file_content')
(0.0, 'get_figure')
(0.0, 'get_fields_options')
(0.0, 'get_feature_names')
(0.0, 'get_entry_text')
(0.0, 'get_email_opinion')
(0.0, 'get_duplicates')
(0.0, 'get_dropped_perc')
(0.0, 'get_df_for_semester')
(0.0, 'get_df')
(0.0, 'get_details')
(0.0, 'get_dataset')
(0.0, 'get_data_url')
(0.0, 'get_data3')
(0.0, 'get_data')
(0.0, 'get_cross_val_scores')
(0.0, 'get_country_references')
(0.0, 'get_country_df')
(0.0, 'get_countries_sentiment')
(0.0, 'get_countries_by_name')
(0.0, 'get_corpus')
(0.0, 'get_compound_vader')
(0.0, 'get_common_name_map')
(0.0, 'get_clustering_scores')
(0.0, 'get_cluster')
(0.0, 'get_cleaned_name')
(0.0, 'get_classifier_score')
(0.0, 'get_classifier')
(0.0, 'get_claims')
(0.0, 'get_civilite')
(0.0, 'get_cantons_json')
(0.0, 'get_canton_name')
(0.0, 'get_canton_from_p3')
(0.0, 'get_canton_from_location')
(0.0, 'get_canton_from_geodata')
(0.0, 'get_canton_for_uni')
(0.0, 'get_canton_code')
(0.0, 'get_canton')
(0.0, 'get_bs_dataframe')
(0.0, 'get_bs_alldata')
(0.0, 'get_basic_info')
(0.0, 'get_axes')
(0.0, 'get_area_level_1')
(0.0, 'get_api_key')
(0.0, 'get_all_student_list')
(0.0, 'get_all_data')
(0.0, 'geonames_query')
(0.0, 'geonames')
(0.0, 'geojson')
(0.0, 'geocode')
(0.0, 'geo_json')
(0.0, 'genpersondf')
(0.0, 'generate_x_std')
(0.0, 'generate_wordcloud')
(0.0, 'generate_word_cloud')
(0.0, 'generate_record')
(0.0, 'generate_from_frequencies')
(0.0, 'generate_dendrogram')
(0.0, 'gen_wordcloud')
(0.0, 'gcf')
(0.0, 'gca')
(0.0, 'gaussiannb')
(0.0, 'gaussian_kde')
(0.0, 'gather')
(0.0, 'further_preprocessing')
(0.0, 'functionattrsfor')
(0.0, 'funcformatter')
(0.0, 'func_parameter_dict')
(0.0, 'func_get_semester_student')
(0.0, 'full_cluster_distrib')
(0.0, 'full')
(0.0, 'fruchterman_reingold_layout')
(0.0, 'fromstring')
(0.0, 'fromkeys')
(0.0, 'from_tuples')
(0.0, 'from_records')
(0.0, 'from_pandas_dataframe')
(0.0, 'from_iterable')
(0.0, 'from_dict_of_lists')
(0.0, 'from_csv')
(0.0, 'from_arrays')
(0.0, 'from_array')
(0.0, 'freqdist')
(0.0, 'frenchi_canton_coefficient')
(0.0, 'forward_feature_selection')
(0.0, 'formatter')
(0.0, 'formatstrformatter')
(0.0, 'format_email_text')
(0.0, 'format')
(0.0, 'form_result_soup')
(0.0, 'floor_divide')
(0.0, 'floor')
(0.0, 'floatprogress')
(0.0, 'float')
(0.0, 'flatten_list')
(0.0, 'flatten_corpus')
(0.0, 'flatten')
(0.0, 'flat_tag_bag_of_word_representation')
(0.0, 'fit_words')
(0.0, 'fit_tf')
(0.0, 'fit_kmeans_on_features')
(0.0, 'fit_data')
(0.0, 'firstb1')
(0.0, 'first_and_last')
(0.0, 'findunmappedmoney')
(0.0, 'findsubsets')
(0.0, 'findnext')
(0.0, 'finditer')
(0.0, 'findchildren')
(0.0, 'findcanton')
(0.0, 'find_sub')
(0.0, 'find_semesters')
(0.0, 'find_place_and_canton')
(0.0, 'find_place')
(0.0, 'find_location')
(0.0, 'find_feature_to_delete')
(0.0, 'find_countries_in_text')
(0.0, 'find_countries')
(0.0, 'find_col_nan')
(0.0, 'find_canton_in_uni_name')
(0.0, 'find_canton_from_google')
(0.0, 'find_canton_abrv')
(0.0, 'find_canton')
(0.0, 'find_beg_end_year1')
(0.0, 'find_all_countries')
(0.0, 'filterstopwords')
(0.0, 'filter_punctuation')
(0.0, 'filter_msc_students')
(0.0, 'filter_items')
(0.0, 'filter_for_ch')
(0.0, 'filter_extremes')
(0.0, 'filter_digits')
(0.0, 'filter_corpus')
(0.0, 'filter_bsc_students')
(0.0, 'fill_unknown_cantons')
(0.0, 'fill_nan_values')
(0.0, 'fill_empty_cantons')
(0.0, 'filehandler')
(0.0, 'figimage')
(0.0, 'ffill')
(0.0, 'fetch_json')
(0.0, 'fetch_canton_from_api')
(0.0, 'fetch_canton')
(0.0, 'features_test')
(0.0, 'features_importance_rf')
(0.0, 'feature_removal')
(0.0, 'feature_importances_plot')
(0.0, 'feature_importance')
(0.0, 'facetgrid')
(0.0, 'f_oneway')
(0.0, 'extractyear')
(0.0, 'extractuniname')
(0.0, 'extractuniid')
(0.0, 'extractunicity')
(0.0, 'extracttotals')
(0.0, 'extracttopics')
(0.0, 'extractinsticity')
(0.0, 'extractcanton')
(0.0, 'extract_year')
(0.0, 'extract_web_content')
(0.0, 'extract_top_20_words_per_partition')
(0.0, 'extract_top_20_words')
(0.0, 'extract_semester_info')
(0.0, 'extract_long_name')
(0.0, 'extract_kv_pair')
(0.0, 'extract_id')
(0.0, 'extract_gps')
(0.0, 'extract_data')
(0.0, 'extract_countries')
(0.0, 'extract_corpus')
(0.0, 'extract_canton_from_uni')
(0.0, 'extract_canton_from_institution')
(0.0, 'extract_canton_from_address')
(0.0, 'extract_canton')
(0.0, 'extract')
(0.0, 'extra_compute')
(0.0, 'extend')
(0.0, 'export_data')
(0.0, 'export')
(0.0, 'expception')
(0.0, 'exp')
(0.0, 'exists')
(0.0, 'executingrandomforest')
(0.0, 'execute_liu_hu_lexicon_tests')
(0.0, 'exception')
(0.0, 'exc_info')
(0.0, 'ex_canton_str')
(0.0, 'evaluate_random_forest_model')
(0.0, 'estimator_test')
(0.0, 'estimate_data')
(0.0, 'erdos_renyi_graph')
(0.0, 'equal')
(0.0, 'englishstemmer')
(0.0, 'encrypttext')
(0.0, 'encrypt')
(0.0, 'encodelabels')
(0.0, 'encode_df')
(0.0, 'encode')
(0.0, 'empty')
(0.0, 'emailscorpus')
(0.0, 'edges')
(0.0, 'ebola_data_sl_test')
(0.0, 'ebola_data_liberia_test')
(0.0, 'ebola_data_guinea_test')
(0.0, 'earliest_date')
(0.0, 'duration')
(0.0, 'dumps')
(0.0, 'dump')
(0.0, 'dummyregressor')
(0.0, 'dummyclassifier')
(0.0, 'dropping_stats')
(0.0, 'droplevel')
(0.0, 'drop_na_rows')
(0.0, 'drop_features')
(0.0, 'drop')
(0.0, 'draw_networkx_labels')
(0.0, 'draw_networkx')
(0.0, 'draw_cv')
(0.0, 'draw_circular')
(0.0, 'draw')
(0.0, 'download_data')
(0.0, 'download')
(0.0, 'dot')
(0.0, 'donkeyfier')
(0.0, 'do_stemming_words')
(0.0, 'divide')
(0.0, 'diverging_palette')
(0.0, 'div')
(0.0, 'distplot')
(0.0, 'display_topics')
(0.0, 'display_map')
(0.0, 'display_cloud')
(0.0, 'display')
(0.0, 'discretize_by_width')
(0.0, 'dirname')
(0.0, 'difference')
(0.0, 'diff')
(0.0, 'did_graduate')
(0.0, 'dictvectorizer')
(0.0, 'dic_param')
(0.0, 'df_join')
(0.0, 'detect_outliers')
(0.0, 'detect')
(0.0, 'details')
(0.0, 'destem_topics')
(0.0, 'despine')
(0.0, 'describe_doc_frequencies')
(0.0, 'depth_test')
(0.0, 'demo_vader_instance')
(0.0, 'demo_vader')
(0.0, 'demo_subjectivity')
(0.0, 'demo_sent_subjectivity')
(0.0, 'demo_liuhu')
(0.0, 'demo_liu_hu_lexicon')
(0.0, 'degree_iter')
(0.0, 'degree_centrality')
(0.0, 'degree')
(0.0, 'define_model')
(0.0, 'define_feat')
(0.0, 'define_dictionary')
(0.0, 'defaultdict')
(0.0, 'decrypttext')
(0.0, 'debug')
(0.0, 'deathss')
(0.0, 'deathsl')
(0.0, 'deaths')
(0.0, 'dbscan_labels')
(0.0, 'dbscan')
(0.0, 'datetimeindex')
(0.0, 'dark_skin_proportion')
(0.0, 'cvstep')
(0.0, 'cv')
(0.0, 'cut_email')
(0.0, 'customicon')
(0.0, 'custom_confusion_matrix')
(0.0, 'custom')
(0.0, 'currency')
(0.0, 'cumsum')
(0.0, 'cross_validation_kfold')
(0.0, 'cross_validation')
(0.0, 'cross_validate_model')
(0.0, 'cross_val_predict')
(0.0, 'cross_val')
(0.0, 'createurl')
(0.0, 'createtextfromtokens')
(0.0, 'create_wordcloud')
(0.0, 'create_student_entry')
(0.0, 'create_single_project_column')
(0.0, 'create_series')
(0.0, 'create_palette')
(0.0, 'create_mdl')
(0.0, 'create_marker')
(0.0, 'create_mapping_for')
(0.0, 'create_map_df')
(0.0, 'create_map')
(0.0, 'create_lda_model')
(0.0, 'create_inf_death_cols')
(0.0, 'create_df')
(0.0, 'create_corpus')
(0.0, 'create_choropleth_map')
(0.0, 'cpu_count')
(0.0, 'countwords')
(0.0, 'countvectorizer')
(0.0, 'countryebola')
(0.0, 'count_unique_values')
(0.0, 'count_nonzero')
(0.0, 'count_countries_occurrences')
(0.0, 'correlation_matrix')
(0.0, 'correcting_year_spring')
(0.0, 'correcting_year_automn')
(0.0, 'correcting_year')
(0.0, 'corpus')
(0.0, 'coord_finder')
(0.0, 'converttonum')
(0.0, 'convertcolumns')
(0.0, 'convertcatdummy')
(0.0, 'convert_to_lowercase')
(0.0, 'convert_objects')
(0.0, 'convert_id_to_name')
(0.0, 'contains_digits')
(0.0, 'contains')
(0.0, 'construct_search_keys')
(0.0, 'construct_dict_from_select')
(0.0, 'config_rfc')
(0.0, 'concatframes')
(0.0, 'concatfiles')
(0.0, 'concat_dataframes')
(0.0, 'computetotalyears')
(0.0, 'computesemesterstudentsbetween')
(0.0, 'computeprecision')
(0.0, 'computepourcentagemappedmoney')
(0.0, 'compute_tokens')
(0.0, 'compute_skin_separation_percentage')
(0.0, 'compute_ratio')
(0.0, 'compute_pos')
(0.0, 'compute_feature_importance_rfc')
(0.0, 'compute_duration')
(0.0, 'computation_liu_hu_lexicon')
(0.0, 'completeness_score')
(0.0, 'compile')
(0.0, 'compareskincolor')
(0.0, 'compare_stem')
(0.0, 'compare_and_swap')
(0.0, 'combining')
(0.0, 'combinedatayear')
(0.0, 'combinedatatotal')
(0.0, 'columns_nanvalues')
(0.0, 'columnencoder')
(0.0, 'columndatasource')
(0.0, 'column_stack')
(0.0, 'color_palette')
(0.0, 'color')
(0.0, 'collect_dataframe')
(0.0, 'collapsereferences')
(0.0, 'cohen_kappa_score')
(0.0, 'cmap')
(0.0, 'cm')
(0.0, 'clusters_vs_labels')
(0.0, 'clusteringscore')
(0.0, 'clustering_score')
(0.0, 'clustering')
(0.0, 'cluster_skintone')
(0.0, 'cluster_dark_separator_score')
(0.0, 'cluster')
(0.0, 'close')
(0.0, 'client')
(0.0, 'clear')
(0.0, 'cleanup_stem')
(0.0, 'cleaning_pipeline_series')
(0.0, 'cleaning_pipeline')
(0.0, 'cleanhtmltable')
(0.0, 'cleaner')
(0.0, 'clean_text')
(0.0, 'clean_symbol')
(0.0, 'clean_str')
(0.0, 'clean_numbers')
(0.0, 'clean_df_once_again')
(0.0, 'clean_days_month')
(0.0, 'clean_data')
(0.0, 'clean')
(0.0, 'classify')
(0.0, 'classifier')
(0.0, 'classification_report')
(0.0, 'cl_accuracy')
(0.0, 'circlemarker')
(0.0, 'circle')
(0.0, 'chr')
(0.0, 'choropleth_map')
(0.0, 'choose_best_model')
(0.0, 'checkcrossvalidationaccuracy')
(0.0, 'check_student')
(0.0, 'check_field_uniqueness_per_player')
(0.0, 'check_dict')
(0.0, 'changedfattributestonumerals')
(0.0, 'change_to_numeric')
(0.0, 'chain')
(0.0, 'ceil')
(0.0, 'cdf')
(0.0, 'catch_warnings')
(0.0, 'cat')
(0.0, 'career_with_aut_prin')
(0.0, 'capturing')
(0.0, 'capitalize')
(0.0, 'cantons_coordinates')
(0.0, 'cantons')
(0.0, 'cantondict')
(0.0, 'canton_mapping')
(0.0, 'canton_map_on_df')
(0.0, 'canton_ids')
(0.0, 'canton_geolocalization')
(0.0, 'canton_from_coordinates')
(0.0, 'canton_for_university')
(0.0, 'calculatemsspanforeveryone')
(0.0, 'calculatemeandate')
(0.0, 'calculatedate')
(0.0, 'calculatebcspanforeveryone')
(0.0, 'calculate_topics')
(0.0, 'calcmasterspan')
(0.0, 'calcbachelorspan')
(0.0, 'calc_sentiment')
(0.0, 'calc_diff_from_average')
(0.0, 'buildrequest')
(0.0, 'builddataframe')
(0.0, 'buildchoroplethmap')
(0.0, 'build_rf')
(0.0, 'build_query_url')
(0.0, 'build_query')
(0.0, 'build_ldamodels')
(0.0, 'build_key_map')
(0.0, 'build_df')
(0.0, 'build_dataframe')
(0.0, 'build_classifier')
(0.0, 'bsoup')
(0.0, 'bs')
(0.0, 'boxplot')
(0.0, 'bool')
(0.0, 'blend')
(0.0, 'bind_stem')
(0.0, 'bincount')
(0.0, 'binary_separator')
(0.0, 'binary_class')
(0.0, 'binarize')
(0.0, 'bfs')
(0.0, 'betweenness_centrality')
(0.0, 'bes')
(0.0, 'ber')
(0.0, 'bench_k_means')
(0.0, 'barplot')
(0.0, 'barh')
(0.0, 'bar_plot_sentiments')
(0.0, 'bag_of_word_representation')
(0.0, 'backward_feature_selection')
(0.0, 'bachelorsems')
(0.0, 'axvline')
(0.0, 'axisproperties')
(0.0, 'axis_titles')
(0.0, 'axhline')
(0.0, 'axes')
(0.0, 'avg_score')
(0.0, 'average')
(0.0, 'auc')
(0.0, 'attributeerror')
(0.0, 'assign_country_to_mail')
(0.0, 'assign')
(0.0, 'asarray')
(0.0, 'asanyarray')
(0.0, 'as_matrix')
(0.0, 'as_completed')
(0.0, 'array')
(0.0, 'arrange_student')
(0.0, 'around')
(0.0, 'argwhere')
(0.0, 'argmin')
(0.0, 'argmax')
(0.0, 'areraterconsistent')
(0.0, 'approxsilhouettescore')
(0.0, 'append_coordinates')
(0.0, 'any')
(0.0, 'anderson_ksamp')
(0.0, 'analyze_sentiment')
(0.0, 'amounts_by_uni')
(0.0, 'amounts_by_canton')
(0.0, 'aggscorebycountry')
(0.0, 'aggregatesentimentinformation')
(0.0, 'aggregatesent')
(0.0, 'aggregate_score')
(0.0, 'aggregate_by_country')
(0.0, 'aggregate')
(0.0, 'agg_player')
(0.0, 'agg_mean_stay')
(0.0, 'adjusted_mutual_info_score')
(0.0, 'addsent')
(0.0, 'addhandler')
(0.0, 'adddata')
(0.0, 'add_to')
(0.0, 'add_students')
(0.0, 'add_semesters_count')
(0.0, 'add_patch')
(0.0, 'add_nodes_from')
(0.0, 'add_node')
(0.0, 'add_mean_role')
(0.0, 'add_mapping_to_dict')
(0.0, 'add_edges_from')
(0.0, 'add_constant')
(0.0, 'add_columns')
(0.0, 'add_children')
(0.0, 'add_child')
(0.0, 'add_and_fill_total_col')
(0.0, 'adaboostclassifier')
(0.0, 'accuracy')
(0.0, 'accumulate')
(0.0, '_get_numeric_data')

In [44]:
a = Features(hw_notebooks[0], 'hw1')
a.add_notebooks(hw_notebooks[1], 'hw2')
a.add_notebooks(hw_notebooks[2], 'hw3')
a.add_notebooks(hw_notebooks[3], 'hw4')
a.add_notebooks(hw_notebooks[4], 'hw5')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
pipe = Pipeline([gastf, rbn, gi])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x7f1820f83518>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x7f1820f83710>
<nbminer.preprocess.get_imports.GetImports object at 0x7f17fb326320>

In [45]:
nbs = a.get_list_notebooks()
function_names = []
homework_num = []
for nb in nbs:
    temp = []
    for cell in nb.get_all_cells():
        temp.append(" ".join(cell.get_feature('full_name_string')))
    function_names.append(temp)
    homework_num.append(nb.get_feature('import_name'))
function_names = [" ".join(temp) for temp in function_names]
function_names = ['_'.join(temp.split('.')) for temp in function_names]
import sklearn
from sklearn.feature_extraction import text
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
tfidf.max_features = 1000
tfidf.fit(function_names)
X = tfidf.transform(function_names).todense()
y = np.array(homework_num)
from sklearn.model_selection import cross_val_score
p = np.random.permutation(len(X))
X = X[p]
y = y[p]
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, max_depth=2)
scores = cross_val_score(clf, X, y, cv=10)
print(scores)
print('Mean', np.mean(scores))


[0.25714286 0.25       0.25714286 0.23571429 0.25714286 0.25714286
 0.25357143 0.25714286 0.23928571 0.25714286]
Mean 0.2521428571428571

In [ ]:


In [46]:
clf.fit(X,y)
for pair in reversed(sorted([el for el in zip(clf.feature_importances_, tfidf.get_feature_names())])):
    print (pair)


(0.01902587758557955, 'join')
(0.015655180506725132, 'nltk_corpus_stopwords_words')
(0.015330963965880995, 'sklearn_metrics_silhouette_score')
(0.014779714336202107, 'bs4_beautifulsoup')
(0.01462966519355429, 'pandas_read_csv')
(0.01414343522843587, 'matplotlib_pyplot_imshow')
(0.01380937732499806, 'nltk_word_tokenize')
(0.013586658227105813, 'matplotlib_pyplot_show')
(0.012849136783448569, 'pandas_dataframe')
(0.01283751631092025, 'pandas_concat')
(0.01180176375835613, 'numpy_log')
(0.011532230566464872, 'sklearn_ensemble_randomforestclassifier')
(0.011384943280254037, 'folium_map')
(0.011176516388214488, 'set')
(0.011108118458991312, 'matplotlib_pyplot_plot')
(0.01099208419231541, 'pandas_read_excel')
(0.010614056373791689, 'requests_get')
(0.009553746067979778, 'sklearn_cluster_kmeans')
(0.009128705118361445, 'gensim_corpora_dictionary')
(0.008754268578250218, 'type')
(0.008575814117036014, 'matplotlib_pyplot_axis')
(0.008488792924410288, 'matplotlib_pyplot_figure')
(0.00789417692426331, 'len')
(0.007588830671099894, 'wordcloud_wordcloud_generate')
(0.007435265730766027, 'dictionary_doc2bow')
(0.007231425034547663, 'mb_swaplevel_head')
(0.007164020196470241, 'matplotlib_pyplot_ylabel')
(0.007065511986619732, 'scipy_stats_ttest_ind')
(0.006844563177000266, 'wordcloud_wordcloud')
(0.006546741661503598, 'numpy_argsort')
(0.0063262541193178975, 'pandas_series')
(0.006036538661336019, 'tokenizer_tokenize')
(0.005460017381452345, 'matplotlib_cm_rdylgn')
(0.005398796539477168, 'gensim_models_ldamodel_ldamodel')
(0.005301920840381628, 'pandas_merge')
(0.005208109111589797, 'matplotlib_pyplot_scatter')
(0.005194987844108447, 'networkx_graph')
(0.005184216023114287, 'sklearn_metrics_accuracy_score')
(0.005127325156473286, 'df_drop')
(0.004985126566295631, 'pandas_to_datetime')
(0.004877875486633397, 'mb_head')
(0.004848120623311125, 'numpy_arange_reshape')
(0.0047364801744469425, 'data_tail')
(0.004624201082020475, 'baseball_corr')
(0.004610657992038542, 'stemmer_stem')
(0.004572575779611527, 'matplotlib_pyplot_xlabel')
(0.004528843626896412, 'community_best_partition')
(0.004433469537814215, 'str')
(0.004325800943196745, 'soup_find_all')
(0.004298185397913799, 'sklearn_metrics_confusion_matrix')
(0.004162878663314324, 'plot_learning_curve')
(0.004132823286381872, 'bacteria2_fillna')
(0.004092284890542357, 'segments_st_time_dt_month_segments_head')
(0.003958433389097683, 'pandas_read_html')
(0.003943197447292288, 'baseball_hr_corr')
(0.0038866925011735143, 'emails_head')
(0.0037990375080274724, 'get_ipython_run_line_magic')
(0.003783359627410066, 'matplotlib_pyplot_xticks')
(0.003775818516907918, 'x_drop')
(0.0037720417955335182, 'nltk_sentiment_sentimentintensityanalyzer')
(0.0037393562971441098, 'baseball_set_index')
(0.0037197753961523143, 'gensim_models_ldamodel')
(0.003716902082698572, 'baseball_apply')
(0.003689323557822756, 'nltk_stem_wordnetlemmatizer')
(0.0036876243453995905, 'x_max')
(0.0036745352266841975, 'nltk_sentiment_vader_sentimentintensityanalyzer')
(0.003671212784649461, 'range')
(0.0036581900157217962, 'matplotlib_pyplot_legend')
(0.0036022746060027106, 'matplotlib_pyplot_clf')
(0.0035828282843567583, 'cdystonia_wide_pd_melt_head')
(0.0035807063485851927, 'get_ipython')
(0.003552382574561601, 'max')
(0.003522508028191602, 'foo_isnull')
(0.0035089414596063745, 'mb_sortlevel_head')
(0.0035033161555059577, 'numpy_std')
(0.0034544947334378888, 'baseball_sort_values_head')
(0.0034436982884979263, 'int')
(0.003396687952665467, 'top3segments_head')
(0.0033174949595088017, 'dict')
(0.0032969850887209083, 'mb_class_groupby')
(0.0032796293951063323, 'pandas_cut')
(0.0032646308871790747, 'ax_legend')
(0.003250579678506348, 'networkx_draw_networkx_nodes')
(0.0032496889024504153, 'numpy_array')
(0.003248189793718106, 'pandas_get_dummies')
(0.003235261015273462, 'matplotlib_pyplot_bar')
(0.0031558104638697766, 'vessels_type_value_counts')
(0.003154289296804797, 'vessels_merge')
(0.00314153031572944, 'data_value_copy')
(0.0031242074868104075, 'text_lower')
(0.0031030244637465032, 'nltk_tokenize_regexptokenizer')
(0.0030424505323063235, 'format')
(0.0030360981910985078, 'baseball_newind_index_pd_series_value_counts')
(0.0029269733685756707, 'get_ipython_system')
(0.002912925738309472, 're_sub')
(0.0029053509625382667, 'x_mean')
(0.0028966714538040273, 'data_dropna')
(0.0028961374166119247, 'print')
(0.002884663775670777, 'nltk_corpus_opinion_lexicon_positive')
(0.0028227381739695124, 'baseball_h_head')
(0.002805553862696315, 'id_range_baseball_reindex_head')
(0.0027975124054845923, 'df_head')
(0.0027690792643746006, 'pandas_crosstab')
(0.0027320436923518724, 'bacteria2_mean')
(0.0027150855154123403, 'networkx_spring_layout')
(0.0026575815342548175, 'chunk_tissue_mean')
(0.002632790578476572, 'df_to_csv')
(0.0026169569233485544, 'baseball_hr_max')
(0.0026050165149812577, 'df_groupby')
(0.0026003900544493497, 'master_df_xs')
(0.002581275991587088, 'baseball_to_pickle')
(0.002576745309046079, 'country_name_lower')
(0.0025702515338792936, 'titanic_pclass_value_counts')
(0.0025494508402043635, 'cdystonia_grouped_mean_add_suffix_head')
(0.002530442875486286, 'matplotlib_pyplot_grid')
(0.002530323121736104, 'baseball_newind_sort_index_head')
(0.002512927373435475, 'v_find')
(0.00250599476584361, 'networkx_draw')
(0.002496886063096973, 'data_year_fillna')
(0.002493745756765895, 'vals_replace')
(0.0024604915049207606, 'baseball_index_baseball_reindex_head')
(0.0023883060949718704, 'baseball_newind_sort_index')
(0.002377310728947556, 'itertools_combinations')
(0.002363753783796684, 'nltk_porterstemmer')
(0.002354250335344027, 'matplotlib_pyplot_colorbar')
(0.002352738745634576, 'networkx_draw_networkx_edges')
(0.0023429350102317768, 'vessels_sample')
(0.0023426701881512748, 'matplotlib_pyplot_title')
(0.0023156615269748315, 'quantiles_pd_get_dummies_head')
(0.0023027811722805807, 'token_lower')
(0.0022880039692024785, 'cdystonia_pivot_table')
(0.0022825656855851965, 'matplotlib_pyplot_ylim')
(0.0022803613470951394, 'sklearn_cluster_kmeans_fit')
(0.002274854689404972, 'clb_ax_set_title')
(0.0022429646890463318, 'vessels_head')
(0.0021995067847761242, 'numpy_max')
(0.0021601184908164785, 'df_sort_values')
(0.0021260761836059327, 't_lower')
(0.0020992163828104588, 'glob_glob')
(0.0020822287065778516, 'seaborn_set_style')
(0.002075894988170762, 'zip')
(0.0020716104837039066, 'extra_bases_sort_values')
(0.0020443843695894925, 'segments_seg_length_apply')
(0.002017508259726345, 'nltk_corpus_opinion_lexicon_negative')
(0.002011770789593192, 'baseball_mean')
(0.001980030986117991, 'cdystonia_set_index')
(0.0019731567376836067, 'os_listdir')
(0.001967595581720559, 'df_isin')
(0.0019614885254110905, 'stats_apply')
(0.0019548601921153944, 'numpy_random_permutation')
(0.001952732545712665, 'x_index_get_level_values')
(0.0019447434825049631, 'word_lower')
(0.001939242384517813, 'vessels_duplicated')
(0.001926350319605987, 'km_fit')
(0.0019237598439519538, 'swiss_map_choropleth')
(0.0019016949825453057, 'emails_dropna')
(0.0018855955936011146, 'open')
(0.001885293371631253, 'folium_marker')
(0.001856456453361313, 'cdystonia_pivot_head')
(0.001852204239123958, 'numpy_argmax')
(0.0018316561708384344, 'segments_vessels_merge_head')
(0.0018297551372427612, 'numpy_abs')
(0.0018216989062141784, 'cdystonia_treat_cat_as_ordered_head')
(0.0018132420296276316, 'soup_find')
(0.0018029157118725381, 'data_fillna')
(0.0018026580740360134, 'data_groupby')
(0.0017757533800223575, 'ch_map_choropleth')
(0.0017543123322568204, 'df_unique')
(0.0017481695852912718, 'chr')
(0.0017438942056679555, 'sid_polarity_scores')
(0.001732750929153556, 'titanic_value_counts')
(0.0017148103686536703, 'segments_st_time_dt_tz_localize')
(0.0017092506252961116, 'mb_sortlevel')
(0.0017044389889896633, 'twstrs_wide_head')
(0.0016959668682211748, 'name_endswith')
(0.0016736588259704643, 'now_weekday')
(0.0016679747207934837, 'baseball_hr_rank')
(0.001654115121629006, 'partition_values')
(0.0016461319873855174, 'partition_items')
(0.0016384989187664067, 'wnl_lemmatize')
(0.001599866527604384, 'mine_data')
(0.0015825802384083658, 'vessels5_type_pd_get_dummies_head')
(0.0015635263261958305, 'titanic_describe')
(0.0015632703948462562, 'abs')
(0.0015434163882379374, 'sklearn_model_selection_train_test_split')
(0.0015404867129663858, 'segments_st_time_dt_hour_head')
(0.0015370275581532802, 'tr_find_all')
(0.0015318094836086454, 'numpy_unique')
(0.0015305404761912671, 'baseball_newind_head')
(0.0015217726831598507, 'df_apply')
(0.0015201557773920152, 'cdystonia_groupby_mean')
(0.0015052572406888658, 'cdystonia_stack')
(0.0014891366145636437, 'vessels_drop_duplicates')
(0.0014744393281000835, 'geopy_geocoders_googlev3')
(0.0014589156440196712, 'cdystonia_treat_astype')
(0.0014533305429083718, 'float')
(0.001451776695100058, 'x_sum')
(0.0014423085419073012, 'data_drop')
(0.0014333296840165698, 'round')
(0.0014149516339285178, 'accuracy_append')
(0.0013952170489034666, 'baseball_reindex')
(0.0013870136859566375, 'msstudentss1tos2spec_months_mean')
(0.0013863471494862942, 'sorted')
(0.0013810026342459536, 'sklearn_preprocessing_labelencoder')
(0.0013758263089138278, 'mb1_head')
(0.0013717407918777415, 'dateutil_parser_parse')
(0.0013608649809441823, 'pandas_isnull')
(0.0013474338254326926, 'matplotlib_pyplot_xlim')
(0.0013286218380622397, 'json_dump')
(0.0013279849093293544, 'numpy_arange')
(0.0013186912676744287, 'cdystonia_treat_map')
(0.0013126924953132293, 'segments_vessels_pd_merge_head')
(0.0013125610011963996, 'cdystonia_grouped_mean_head')
(0.001311699551686091, 'baseball_hr_cov')
(0.0013114411608956753, 'matplotlib_pyplot_semilogx')
(0.001302800292852982, 'filter')
(0.001300367879431673, 'concat_drop')
(0.0012963355005701483, 'pandas_qcut')
(0.0012960831983862755, 'baseball_index_values_max')
(0.0012947889011516785, 'partition_keys')
(0.0012840561036974007, 'list')
(0.001273031747434212, 'cdystonia_treat_describe')
(0.001269486620868506, 'mb_swaplevel')
(0.0012680395650049956, 'googleplaces_googleplaces')
(0.001263418855221154, 'titanic_isnull')
(0.0012590864882609562, 'nanify')
(0.0012550267524169298, 'master_data_groupby')
(0.0012491917728997875, 'numpy_random_random')
(0.0012486233187227957, 'place_split')
(0.0012455889869863017, 'data_copy')
(0.0012299809484290017, 'baseball_sort_values')
(0.0012271564770346733, 'datetime_date')
(0.001221709646403376, 'dff2_index_get_level_values_str_contains')
(0.0012162452656413257, 'forest_fit')
(0.0012148198008686834, 'sklearn_preprocessing_scale')
(0.0012141381754568394, 'mb_class_groupby_sum_head')
(0.0012104005554288502, 'numpy_random_randint')
(0.001198715150901075, 'cdystonia_drop_duplicates')
(0.001197574661205777, 'matplotlib_pyplot_fill_between')
(0.0011930532208814387, 'numpy_linspace')
(0.0011886678166878216, 'cdystonia_age_describe')
(0.0011834595170846474, 'numpy_mean_cdystonia_grouped_agg_head')
(0.001173877104254489, 'geolocator_geocode')
(0.001169637118094711, 'baseball_drop')
(0.0011624338307043604, 'sklearn_metrics_recall_score')
(0.0011600487570968317, 'functools_reduce')
(0.0011583214555232041, 'df_dropna')
(0.001150502669207424, 'segments_merged_groupby_apply')
(0.0011462495677067876, 'json_load')
(0.001143921945872779, 'numpy_ones')
(0.0011382700682171449, 'proportion_cal')
(0.001134360042803421, 'soup_findall_findchildren')
(0.001131582515684834, 'numpy_vstack')
(0.0011236614070951757, 'pickle_load')
(0.001120897498470374, 'stacked_unstack_head')
(0.0011157392150090008, 'dff2_index_get_level_values')
(0.0011151522330094454, 'numpy_triu_indices_from')
(0.0011121229367592027, 'matplotlib_pyplot_subplots')
(0.0011095499627518047, 'baseball_year_astype')
(0.0010944585971378667, 'rf_fit')
(0.0010894504738405044, 'nltk_stem_porter_porterstemmer')
(0.0010876178094180317, 'titanic_groupby')
(0.0010808285698923111, 'datetime_datetime_strptime')
(0.0010766118995737374, 'cdystonia2_groupby')
(0.0010735047705658857, 'segments_seg_length_hist')
(0.0010723139914055873, 'powerset')
(0.0010678255051858418, 'l_append')
(0.0010600275674146442, 'ipython_display_image')
(0.0010483084593112633, 'sklearn_metrics_f1_score')
(0.0010434056929552009, 'clustering')
(0.0010395198763314582, 'top5_head')
(0.0010352041577514598, 'baseball_ab_corr')
(0.0010298798739483107, 'numpy_log_segments_seg_length_apply_hist')
(0.0010281097961173128, 'mb_to_csv')
(0.001024546397575538, 'stoplist_add')
(0.0010238484475756143, 'ipython_display_iframe')
(0.001021228287236484, 'urllib_request_urlopen')
(0.0010185810916813526, 'le_transform')
(0.001017963868884325, 'datetime_datetime')
(0.0010092401818999107, 'le_fit')
(0.0009990068841934717, 'sklearn_preprocessing_standardscaler')
(0.0009941469111859622, 'get_data')
(0.0009862665398083267, 'w_lower')
(0.000985182080970549, 'r_json')
(0.0009808688263030663, 'g_add_edge')
(0.0009763620791804832, 'data_set_value')
(0.0009601802543873486, 'baseball_rank_head')
(0.0009566346183215718, 'mb_sum')
(0.0009540684762862891, 'titanic_head')
(0.0009529052538508525, 'mb2_mb1_pd_concat_head')
(0.0009526123032957607, 'demo_liu_hu_lexicon')
(0.0009520255907142684, 'player_sum')
(0.0009520034818517112, 'mb1_copy')
(0.0009489977863672436, 'datetime_datetime_now')
(0.0009462819475035991, 'segments_st_time_dt_month_head')
(0.000945803691470153, 'any')
(0.0009401150339294212, 'year_split')
(0.0009379670798762031, 'response_json')
(0.0009375998803798685, 'cdystonia_treat_cat_as_ordered')
(0.0009361303688611744, 'baseball_index_values_min')
(0.0009351737425068986, 'cross_mean')
(0.0009300883123699697, 'cdystonia_grouped_mean_add_suffix')
(0.0009234670174222161, 'max_cdystonia_pivot_table_head')
(0.0009194072691505269, 'scipy_stats_ttest_1samp')
(0.0009067929613776452, 'soup_findall')
(0.0009055882170005969, 'grants_apply')
(0.0009041802953888489, 'col_string_strip')
(0.0008994270921566795, 'lda_print_topics')
(0.000885892979929666, 'geo_all_copy_str_contains')
(0.0008841933973531479, 'mb2_mb1_dict_pd_concat_head')
(0.0008778175948497356, 'baseball_newind_query')
(0.0008760916992646969, 'datetime_time')
(0.000869221032251228, 'mb_class_head')
(0.00086549618969951, 'pandas_read_csv_head')
(0.0008615490025431334, 'df_describe')
(0.0008615307915957694, 'df_mean')
(0.0008575689929635299, 'row_find_all')
(0.000857035028897631, 'countries_append')
(0.000851334044509604, 'scores_append')
(0.0008474745849208052, 'semestre_split')
(0.0008463454442823325, 'x_str_split')
(0.0008434451695787581, 'data_head')
(0.0008429095037381838, 'cantons_append')
(0.0008419692237585312, 'cdystonia2_groupby_mean')
(0.0008409660266561005, 'json_loads')
(0.0008237252054936063, 'ax_plot')
(0.0008136281379756672, 'pandas_to_numeric')
(0.0008068402020246386, 'cfile_master_dropna')
(0.0008046088769408813, 'ch_map_save')
(0.0008031042981225461, 'grouped_by_type_get_group')
(0.0008020553054828488, 'segments_head')
(0.0007936235350581086, 'numpy_concatenate')
(0.0007913558856314372, 'get_canton')
(0.0007890952356677168, 'bacteria2_isnull')
(0.0007869345696064072, 'line_find_all')
(0.0007808169964355355, 'ax_scatter')
(0.0007746702794642852, 'data_list_append')
(0.0007675230554636369, 'segments_st_time_dt_tz_localize_dt_tz_convert')
(0.0007672070168137181, 'sklearn_model_selection_shufflesplit')
(0.0007670905512309697, 'clf_score')
(0.0007636988327512997, 'sum')
(0.0007622891289603249, 'sklearn_cross_validation_train_test_split')
(0.0007522714458216463, 'ax_set_ylabel')
(0.0007502231987827984, 'numpy_mean')
(0.00074997068443156, 'kmeans_predict')
(0.0007445117696872805, 'collections_counter')
(0.000744209227478855, 'grant_data_dropna')
(0.0007432636682086621, 'cdystonia_set_index_unstack_head')
(0.0007336020202206286, 'name_split')
(0.0007317790009045293, 'gensim_models_ldamulticore')
(0.0007254037032613017, 'vessels_type_isin')
(0.0007191796415714686, 'df_players_head')
(0.0007173071569895669, 'matplotlib_pyplot_yticks')
(0.0007150291597787636, 'cdystonia2_head')
(0.0007140175860552878, 'result_head')
(0.000710802242250846, 'list__append')
(0.0007054068528668595, 'matplotlib_pyplot_gca')
(0.0006965851629598262, 'utils_generate_word_cloud')
(0.0006946615674319884, 'msdata_index_unique')
(0.0006936108749794605, 'players_drop')
(0.0006921448067942143, 'vader_analyzer_polarity_scores')
(0.000678638658905559, 'data_grouped_size_plot')
(0.0006570455610966839, 'load_data')
(0.0006554105444472851, 'soup_prettify')
(0.0006418137005439605, 'seaborn_set_context')
(0.0006379510535135473, 'cdystonia_pivot')
(0.0006348304614145546, 'table_find_all')
(0.0006282266735698075, 'f_write')
(0.0006266133112340828, 'numpy_isnan')
(0.0006158034300584292, 'vessels_type_unique')
(0.0006140910466868337, 'x_min')
(0.0006133990049705554, 'master_full_data_groupby')
(0.000611331843436996, 'mb_class_groupby_sum')
(0.0006075486652645913, 'plot_wordcloud')
(0.0006000596758495007, 'baseball_isin')
(0.0005938396707513288, 'foreign_policy_update')
(0.0005799437933085887, 'pandas_categorical')
(0.0005657322374377843, 'baseball_newind_isin')
(0.0005595600057304436, 'emails_apply')
(0.0005592674378079398, 'config_rfc')
(0.0005495094639400708, 'os_path_join')
(0.0005415990086023996, 'pandas_read_pickle')
(0.0005381740421461194, 'rfc_predict')
(0.0005377495696517585, 'sklearn_model_selection_cross_val_score')
(0.0005324732070026927, 'pandas_notnull')
(0.0005316234046135971, 'result_append')
(0.0005196420723062712, 'model_print_topics')
(0.0005150137846475072, 'df_isnull_any')
(0.0005039028343663782, 'new_set_value')
(0.0004997010464570897, 'map')
(0.0004932899580581559, 'numpy_median')
(0.0004931567959924443, 'numpy_where')
(0.00047621178589393477, 'column_maxmin')
(0.000458825136757585, 're_search')
(0.00045059169561653895, 'bc_sem_count_groupby')
(0.00044663783881687604, 'bachelor_df_head')
(0.00044287585118892595, 'axes_set_ylabel')
(0.00044239969756355483, 'titanic_age_dropna')
(0.00043522781583031606, 'player_df_count')
(0.00043071820724382916, 'cdystonia_grouped_quantile')
(0.00042327720396499236, 'mb2_mb1_combine_first_head')
(0.00041614257250216816, 'bacteria2_notnull')
(0.0004139513503957408, 'df_master_passed_groupby_size_to_frame_reset_index')
(0.00041251666484491754, 'df_refnum_value_counts')
(0.00040928838986741087, 'matplotlib_pyplot_tight_layout')
(0.00040775880787061273, 'grants_groupby')
(0.0004075646150800818, 'segments_merged_head')
(0.00040703781927461734, 'seaborn_factorplot')
(0.0003989657737463429, 'g_add_edges_from')
(0.0003978260249606191, 'enumerate')
(0.0003761256169737075, 'getfullurl')
(0.0003652343380534744, 'pandas_set_option')
(0.0003573655359658179, 'normalize_cdystonia_grouped_transform_head')
(0.0003503466354479851, 'hr2007_add')
(0.00035013350981352503, 'titanic_copy')
(0.00034089630640676187, 'cdystonia2_unstack')
(0.00033810672074358284, 'f_close')
(0.0003347379769373706, 'matplotlib_pyplot_setp')
(0.0003335596851197798, 'f_get_xticks')
(0.00032824374363153567, 'gensim_models_ldamodel_ldamodel_load')
(0.00032506345821075753, 'pandas_dataframe_from_dict')
(0.0003190954771765757, 'grants_cantons_head')
(0.00031743469621993545, 'titanic_survived_value_counts')
(0.000298460770817143, 'os_path_isfile')
(0.00028398346036939915, 'wc_generate')
(0.0002626025804199523, 'seaborn_regplot')
(0.00024783321543745204, 'numpy_log10')
(0.00024364539612041394, 'baseball_head')
(0.00024275282933675847, 'pil_image_open')
(0.0002339974194258496, 'clf_fit')
(0.0002332618154911492, 'matplotlib_pyplot_subplot')
(0.00023253434951130172, 'warnings_filterwarnings')
(0.00022871810677801404, 'df_grants_canton_isnull')
(0.00022568827062350246, 'fig_add_subplot')
(0.0002173804637909972, 'seaborn_barplot')
(0.00020045847313470772, 'sentiment_per_country_pos_plot_bar')
(0.00018633822491097914, 'wordcloud_wordcloud_fit_words')
(0.00018199778332865833, 'x_split')
(0.00017669565822762842, 'selectfields_extract_findall')
(0.00016990717535644568, 'classifier_fit')
(0.0001571043079563262, 'df_reset_index')
(0.00014693038364412128, 'baseball_player_describe')
(0.00013191285495946453, 'min')
(0.00011856488806976624, 'nltk_tokenize_treebank_treebankwordtokenizer')
(0.0001121866509217434, 'googlemaps_client')
(0.0001008405748000346, 'key_startswith')
(8.49041422697765e-05, 'cdystonia_grouped_mean')
(0.0, 'z_set_index')
(0.0, 'y_ravel')
(0.0, 'y_append')
(0.0, 'x_value_counts')
(0.0, 'x_unique')
(0.0, 'x_str_contains_any')
(0.0, 'x_str_contains')
(0.0, 'x_std')
(0.0, 'x_lower')
(0.0, 'x_head')
(0.0, 'x_games_sum')
(0.0, 'x_find')
(0.0, 'x_count')
(0.0, 'ww_params_get_get')
(0.0, 'ww_params_get')
(0.0, 'wordcloud_to_file')
(0.0, 'wordcloud_imagecolorgenerator')
(0.0, 'wordcloud_generate')
(0.0, 'valid_students_append')
(0.0, 'val_find')
(0.0, 'utils_get_random_forests')
(0.0, 'utils_create_lda_model')
(0.0, 'url_replace')
(0.0, 'url_read')
(0.0, 'updateparam')
(0.0, 'unuseful_words_append')
(0.0, 'university_with_name_data_set_value')
(0.0, 'unicanton_pop')
(0.0, 'uni_tmp_find')
(0.0, 'uni_split')
(0.0, 'uni_adresses_dict_keys')
(0.0, 'twstrs_wide_cdystonia_drop_duplicates_merge_head')
(0.0, 'tuple')
(0.0, 'train_drop')
(0.0, 'train')
(0.0, 'tr_findall')
(0.0, 'tmp_append')
(0.0, 'titanic_sex_value_counts')
(0.0, 'titanic_pd_isnull_values_any')
(0.0, 'titanic_hist')
(0.0, 'titanic_groupby_size')
(0.0, 'titanic_groupby_agg')
(0.0, 'titanic_fare_dropna')
(0.0, 'titanic_dropna')
(0.0, 'time_time')
(0.0, 'time_sleep')
(0.0, 'th_findnext')
(0.0, 'text_split')
(0.0, 'text_replace')
(0.0, 'test_table_append')
(0.0, 'test_rfc_complete')
(0.0, 'temp_frame_sum')
(0.0, 'table_findall')
(0.0, 'swiss_map_save')
(0.0, 'swiss_cantons_full_str_split')
(0.0, 'swiss_cantons_full_head')
(0.0, 'substringbeginning_find')
(0.0, 'students_head')
(0.0, 'students_groupby')
(0.0, 'students_append')
(0.0, 'student_split')
(0.0, 'student_row_append')
(0.0, 'stacked_unstack')
(0.0, 'st_stem')
(0.0, 'specm_index_get_level_values')
(0.0, 'soupe_find_all')
(0.0, 'soup_select')
(0.0, 'soup_find_all_find_all')
(0.0, 'soup1_find_all')
(0.0, 'slugging_baseball_apply_round')
(0.0, 'sklearn_preprocessing_imputer')
(0.0, 'sklearn_model_selection_learning_curve')
(0.0, 'sklearn_model_selection_kfold')
(0.0, 'sklearn_model_selection_gridsearchcv')
(0.0, 'sklearn_metrics_roc_auc_score')
(0.0, 'sklearn_metrics_precision_score')
(0.0, 'sklearn_learning_curve_learning_curve')
(0.0, 'sklearn_grid_search_gridsearchcv')
(0.0, 'sklearn_ensemble_randomforestregressor')
(0.0, 'sklearn_decomposition_pca')
(0.0, 'sklearn_cross_validation_kfold')
(0.0, 'sklearn_cross_validation_cross_val_score')
(0.0, 'simplejson_load')
(0.0, 'silhouette_kmeans')
(0.0, 'sex_get_group')
(0.0, 'sex_append')
(0.0, 'setdate')
(0.0, 'set_intersection')
(0.0, 'session_get')
(0.0, 'semester_data')
(0.0, 'semester_append')
(0.0, 'selectfields_find')
(0.0, 'selectfields_extract')
(0.0, 'select_fields_find_all')
(0.0, 'select_field_find_all')
(0.0, 'segments_take')
(0.0, 'segments_st_time_dt_tz_localize_head')
(0.0, 'segments_st_time_dt_tz_localize_dt_tz_convert_head')
(0.0, 'segments_st_time_apply')
(0.0, 'segments_name_astype')
(0.0, 'segments_merged_groupby')
(0.0, 'seaborn_distplot')
(0.0, 'seaborn_countplot')
(0.0, 'seaborn_color_palette')
(0.0, 'seaborn_boxplot')
(0.0, 'scoring_complete')
(0.0, 'scores_mean')
(0.0, 'scipy_stats_mode')
(0.0, 'scipy_stats_mannwhitneyu')
(0.0, 'scipy_stats_kstest')
(0.0, 'scipy_stats_ks_2samp')
(0.0, 'sciper_data_xs_index_split')
(0.0, 'sciper')
(0.0, 's_replace')
(0.0, 's_isdigit')
(0.0, 's_groupby')
(0.0, 's_get')
(0.0, 's_find_all')
(0.0, 'row_split')
(0.0, 'row_findchildren')
(0.0, 'row_findall')
(0.0, 'row_find')
(0.0, 'row_canton_df_medium_apply_df_medium_sum')
(0.0, 'romandie_list_append')
(0.0, 'rfc_fit')
(0.0, 'result_reset_index')
(0.0, 'result_mean')
(0.0, 'result_index_get_level_values_str_contains')
(0.0, 'result_index_get_level_values')
(0.0, 'result_groupby')
(0.0, 'result_dict_values')
(0.0, 'result_dict_items')
(0.0, 'real_key_value_append')
(0.0, 're_findall')
(0.0, 're_compile')
(0.0, 'ratings_scale_append')
(0.0, 'randomforest_predict')
(0.0, 'randomforest_fit')
(0.0, 'rand_forest_model_predict')
(0.0, 'rand_forest_model_fit')
(0.0, 'r_text_encode')
(0.0, 'queries_map_keys')
(0.0, 'pyldavis_gensim_prepare')
(0.0, 'pyldavis_display')
(0.0, 'pylab_ylabel')
(0.0, 'pylab_xlabel')
(0.0, 'pylab_suptitle')
(0.0, 'pycountry_countries_lookup')
(0.0, 'pycountry_countries_get')
(0.0, 'print_score')
(0.0, 'preprocess')
(0.0, 'prepare_features_multi')
(0.0, 'pprint_pprint')
(0.0, 'pow')
(0.0, 'position_proxy_append')
(0.0, 'plot_feature_importances')
(0.0, 'plot_confusion_matrix')
(0.0, 'players_mean')
(0.0, 'players_head')
(0.0, 'players_fillna')
(0.0, 'players_copy')
(0.0, 'playerref_grouped_div')
(0.0, 'player_player_sum')
(0.0, 'player_groups_apply')
(0.0, 'place_get_details')
(0.0, 'pie')
(0.0, 'pickle_dump')
(0.0, 'periods_append')
(0.0, 'period_startswith')
(0.0, 'period_split')
(0.0, 'pedagogicperiod_endswith')
(0.0, 'partition_get')
(0.0, 'params_update')
(0.0, 'par_split')
(0.0, 'par_get_values_split')
(0.0, 'par_get_values')
(0.0, 'pandas_series_unique')
(0.0, 'pandas_series_rank')
(0.0, 'pandas_read_table')
(0.0, 'pandas_read_csv_pd_isnull_head')
(0.0, 'pandas_read_csv_dropna')
(0.0, 'pandas_melt')
(0.0, 'pandas_index')
(0.0, 'os_path_exists')
(0.0, 'ord')
(0.0, 'option_get')
(0.0, 'operator_itemgetter')
(0.0, 'open_read')
(0.0, 'of_minor')
(0.0, 'of_gender')
(0.0, 'occurm_keys')
(0.0, 'objdict')
(0.0, 'numpy_zeros')
(0.0, 'numpy_var')
(0.0, 'numpy_sum')
(0.0, 'numpy_sqrt')
(0.0, 'numpy_set_printoptions')
(0.0, 'numpy_round')
(0.0, 'numpy_percentile')
(0.0, 'numpy_min')
(0.0, 'numpy_intersect1d')
(0.0, 'numpy_in1d')
(0.0, 'numpy_floor')
(0.0, 'numpy_empty')
(0.0, 'numpy_delete')
(0.0, 'numpy_count_nonzero')
(0.0, 'numpy_ceil')
(0.0, 'numpy_asarray')
(0.0, 'numpy_around')
(0.0, 'numpy_argwhere')
(0.0, 'numpy_argmin')
(0.0, 'numpy_append')
(0.0, 'numbersemesters_groupby')
(0.0, 'normalize_standardize')
(0.0, 'nltk_tokenize_word_tokenize')
(0.0, 'nltk_tokenize_sent_tokenize')
(0.0, 'nltk_stem_lancaster_lancasterstemmer')
(0.0, 'nltk_pos_tag')
(0.0, 'nltk_download')
(0.0, 'nltk_corpus_words_count')
(0.0, 'next')
(0.0, 'new_url_replace')
(0.0, 'new_tokens_append')
(0.0, 'new_order_segments_take_head')
(0.0, 'networkx_from_pandas_dataframe')
(0.0, 'names_append')
(0.0, 'my_table_findchildren')
(0.0, 'msc_agg_valid_groupby')
(0.0, 'most_frequent')
(0.0, 'months_rename')
(0.0, 'months_between_dates')
(0.0, 'months_apply')
(0.0, 'model_fit')
(0.0, 'mb1_index_map')
(0.0, 'mb1_combine_first')
(0.0, 'matplotlib_style_use')
(0.0, 'matplotlib_pyplot_tick_params')
(0.0, 'matplotlib_pyplot_text')
(0.0, 'matplotlib_pyplot_suptitle')
(0.0, 'matplotlib_pyplot_loglog')
(0.0, 'matplotlib_pyplot_hist')
(0.0, 'matplotlib_pyplot_boxplot')
(0.0, 'matplotlib_pyplot_axhline')
(0.0, 'match_group')
(0.0, 'master_spe_groupby')
(0.0, 'master_semesters_groupby')
(0.0, 'master_head')
(0.0, 'master_dataframes_copy')
(0.0, 'map_save')
(0.0, 'map_lang_save')
(0.0, 'map_inst_canton')
(0.0, 'map_coutry_to_emails_keys')
(0.0, 'map_choropleth')
(0.0, 'makematrixforclassifier')
(0.0, 'make_wordcloud')
(0.0, 'logger_debug')
(0.0, 'locsciper')
(0.0, 'locale_currency')
(0.0, 'link_get')
(0.0, 'lemmatizer_lemmatize')
(0.0, 'learning_curve')
(0.0, 'le_fit_transform')
(0.0, 'lda_show_topics')
(0.0, 'l_startswith')
(0.0, 'kmeans_fit')
(0.0, 'kmean_fit')
(0.0, 'kf_split')
(0.0, 'jitter')
(0.0, 'itertools_product')
(0.0, 'itertools_chain')
(0.0, 'isinstance')
(0.0, 'isaform_find')
(0.0, 'isa_url')
(0.0, 'ipywidgets_floatprogress')
(0.0, 'ipython_display_html')
(0.0, 'ipython_display_display')
(0.0, 'ipython_core_display_html')
(0.0, 'info_isin')
(0.0, 'i_tablem3_get_text_replace')
(0.0, 'i_tablem3_get_text')
(0.0, 'i_tablem2_get_text_replace')
(0.0, 'i_tablem2_get_text')
(0.0, 'i_tablem1_get_text_replace')
(0.0, 'i_tablem1_get_text')
(0.0, 'i_table_get_text_replace')
(0.0, 'i_table_get_text')
(0.0, 'i_split')
(0.0, 'i_lower')
(0.0, 'htmlcontent_prettify')
(0.0, 'htmlcontent_find')
(0.0, 'hr_total_notnull')
(0.0, 'helpers_set_coordinates_and_canton')
(0.0, 'helpers_plot_learning_curve')
(0.0, 'helpers_compute_feature_importance_rfc')
(0.0, 'helperfunctions_get_kmeans_result')
(0.0, 'helper_prepare_data')
(0.0, 'helper_plot_cnf')
(0.0, 'helper_details')
(0.0, 'grpd_apply')
(0.0, 'grp_apply')
(0.0, 'grouped_head')
(0.0, 'groupby_player_aggregate')
(0.0, 'group_semester_any')
(0.0, 'group_players_agg')
(0.0, 'group_min')
(0.0, 'group_isin_sum')
(0.0, 'group_isin_any')
(0.0, 'group_isin')
(0.0, 'group_dropna')
(0.0, 'group_any')
(0.0, 'graph_set_ylabel')
(0.0, 'graph_set_xlabel')
(0.0, 'graph_set_title')
(0.0, 'grants_head')
(0.0, 'grants_groupby_sum')
(0.0, 'grantreport_head')
(0.0, 'grantexport_final_drop')
(0.0, 'grant_data_refine_institution_str_contains')
(0.0, 'grant_data_refine_drop')
(0.0, 'grant_data_head')
(0.0, 'grant_data_count')
(0.0, 'grant_data_canton_isin')
(0.0, 'gps_df_isin')
(0.0, 'google_places_text_search')
(0.0, 'gmaps_geocode')
(0.0, 'getstudentsemestersn')
(0.0, 'getdata')
(0.0, 'getcanton')
(0.0, 'getalldata')
(0.0, 'get_table')
(0.0, 'get_frame_from_element')
(0.0, 'get_dropped_perc')
(0.0, 'get_df_for_semester')
(0.0, 'get_clustering_scores')
(0.0, 'geopy_point_point')
(0.0, 'geolocator_reverse')
(0.0, 'geocoder_google')
(0.0, 'geo_all_copy_isnull')
(0.0, 'generate_x_std')
(0.0, 'g_nodes')
(0.0, 'g_add_nodes_from')
(0.0, 'func_get_semester_student')
(0.0, 'fulldata_columns_values_tolist')
(0.0, 'frames_append')
(0.0, 'framema_groupby_mean')
(0.0, 'framema_groupby')
(0.0, 'frame_head')
(0.0, 'frame_groupby')
(0.0, 'form_table_find')
(0.0, 'form_find_all')
(0.0, 'forest_predict')
(0.0, 'forest_c_predict')
(0.0, 'forest_binary_predict')
(0.0, 'folium_icon_folium_marker_add_to')
(0.0, 'folium_icon')
(0.0, 'findcanton')
(0.0, 'find_semesters')
(0.0, 'final_replace')
(0.0, 'final_append')
(0.0, 'filtered_mat_index_get_level_values')
(0.0, 'filtered_index_get_level_values')
(0.0, 'filtered_head')
(0.0, 'filtered_groupby')
(0.0, 'filtered_apply')
(0.0, 'filter_name_find')
(0.0, 'files_remove')
(0.0, 'file_name_split')
(0.0, 'file_find')
(0.0, 'file_drop')
(0.0, 'fig_suptitle')
(0.0, 'f_score')
(0.0, 'f_fit')
(0.0, 'extractyear')
(0.0, 'extractedbodytext4_apply')
(0.0, 'extract_data')
(0.0, 'executingrandomforest')
(0.0, 'exception')
(0.0, 'evaluate_random_forest_model')
(0.0, 'error_train_append')
(0.0, 'error_test_append')
(0.0, 'emails_extractedbodytext_isnull')
(0.0, 'emails_extractedbodytext_dropna')
(0.0, 'email_lower')
(0.0, 'email_df_head')
(0.0, 'email_df_apply')
(0.0, 'el_extract_get')
(0.0, 'el_extract')
(0.0, 'duration')
(0.0, 'diffs_abs')
(0.0, 'dico_append')
(0.0, 'dic_name_value_items')
(0.0, 'dfs_append')
(0.0, 'dfma_simple_notnull')
(0.0, 'dffm2_notnull_dffm2_loc_index_get_level_values')
(0.0, 'dffm2_notnull')
(0.0, 'dffm2_index_get_level_values_str_contains_dffm2_iloc_index_get_level_values')
(0.0, 'dffm2_index_get_level_values_str_contains')
(0.0, 'dffm2_index_get_level_values_isin')
(0.0, 'dffm2_index_get_level_values')
(0.0, 'dff2_index_get_level_values_isin')
(0.0, 'df_x_load_drop')
(0.0, 'df_x_drop')
(0.0, 'df_value_counts')
(0.0, 'df_tolist')
(0.0, 'df_sum')
(0.0, 'df_str_astype')
(0.0, 'df_split')
(0.0, 'df_set_index')
(0.0, 'df_sample')
(0.0, 'df_reset_index_groupby')
(0.0, 'df_rename')
(0.0, 'df_refcountry_unique')
(0.0, 'df_plot')
(0.0, 'df_notnull')
(0.0, 'df_medium_apply')
(0.0, 'df_masters_spec_groupby')
(0.0, 'df_master_passed_groupby_size_to_frame')
(0.0, 'df_master_passed_groupby_size')
(0.0, 'df_master_passed_groupby')
(0.0, 'df_master_grouped_size_mean')
(0.0, 'df_master_grouped_size')
(0.0, 'df_join')
(0.0, 'df_iterrows')
(0.0, 'df_isnull_sum')
(0.0, 'df_isnull')
(0.0, 'df_isin_value_counts')
(0.0, 'df_isin_df_sort_values')
(0.0, 'df_hist')
(0.0, 'df_h_w_isnull_df_h_w_unique')
(0.0, 'df_h_w_isnull')
(0.0, 'df_groupby_sum')
(0.0, 'df_groupby_first')
(0.0, 'df_groupby_count')
(0.0, 'df_groupby_apply')
(0.0, 'df_groupby_agg')
(0.0, 'df_final_df_final_as_matrix')
(0.0, 'df_fillna')
(0.0, 'df_emails_head')
(0.0, 'df_emails_apply')
(0.0, 'df_copy')
(0.0, 'df_categorized_groupby')
(0.0, 'df_byshort_agg')
(0.0, 'df_bachelor_head')
(0.0, 'df_bachelor_grouped_size_mean')
(0.0, 'df_bachelor_grouped_size')
(0.0, 'df_astype')
(0.0, 'df_all_cls_df_all_target_hist')
(0.0, 'df1_set_value')
(0.0, 'df1_drop')
(0.0, 'dates_append')
(0.0, 'dataskin_groupby')
(0.0, 'data_xs')
(0.0, 'data_unique')
(0.0, 'data_tolist')
(0.0, 'data_sum')
(0.0, 'data_set_index')
(0.0, 'data_reset_index')
(0.0, 'data_replace')
(0.0, 'data_rename')
(0.0, 'data_rater2_isnull')
(0.0, 'data_rater1_isnull')
(0.0, 'data_phylum_str_endswith')
(0.0, 'data_masters_append')
(0.0, 'data_ix_head')
(0.0, 'data_isnull')
(0.0, 'data_isin')
(0.0, 'data_grouped_size')
(0.0, 'data_groupby_agg')
(0.0, 'data_clean_isnull')
(0.0, 'data_astype')
(0.0, 'data_apply')
(0.0, 'data_append')
(0.0, 'dark_skin_proportion')
(0.0, 'cross_validation')
(0.0, 'cross_val')
(0.0, 'country_upper')
(0.0, 'country_sent_head')
(0.0, 'country_names_append')
(0.0, 'country_lower')
(0.0, 'country_arr_append')
(0.0, 'country_alpha_2_lower')
(0.0, 'countries_extend')
(0.0, 'countries_dict_pop')
(0.0, 'countries_dict_keys')
(0.0, 'countries_df_str_contains')
(0.0, 'countries_df_apply')
(0.0, 'countries_clean_remove')
(0.0, 'countries_clean_append')
(0.0, 'count_unique_values')
(0.0, 'concurrent_futures_threadpoolexecutor')
(0.0, 'computesemesterstudentsbetween')
(0.0, 'computeprecision')
(0.0, 'composed_filter_countries_df_str_contains')
(0.0, 'columns_append')
(0.0, 'column_data_replace')
(0.0, 'cols_get_text')
(0.0, 'collections_defaultdict')
(0.0, 'col_titles_append')
(0.0, 'cm_max')
(0.0, 'cluster')
(0.0, 'clf_predict')
(0.0, 'cleaned_isnull')
(0.0, 'checkcrossvalidationaccuracy')
(0.0, 'check_student')
(0.0, 'cfile_master_dropna_pd_dataframe_reset_index_drop_duplicates')
(0.0, 'cfile_master_dropna_pd_dataframe_reset_index')
(0.0, 'cells_find')
(0.0, 'cdystonia_treat_value_counts')
(0.0, 'cdystonia_treat_head')
(0.0, 'cdystonia_set_index_unstack')
(0.0, 'cdystonia_head')
(0.0, 'cdystonia_grouped_transform')
(0.0, 'cdystonia_grouped_agg')
(0.0, 'cdystonia_groupby_mean_head')
(0.0, 'cdystonia_groupby')
(0.0, 'cdystonia_drop_duplicates_merge')
(0.0, 'cdystonia2_treat_replace')
(0.0, 'canton_item')
(0.0, 'calculatedate')
(0.0, 'calcmasterspan')
(0.0, 'build_classifier')
(0.0, 'borders_get')
(0.0, 'beautifulsoup_beautifulsoup')
(0.0, 'beautiful_html_find')
(0.0, 'baseball_sum')
(0.0, 'baseball_rank')
(0.0, 'baseball_hr_sort_values')
(0.0, 'baseball_describe')
(0.0, 'baseball_copy')
(0.0, 'bar_plot_sentiments')
(0.0, 'bacteria2_dropna')
(0.0, 'axes_set_ylim')
(0.0, 'axes_set_xlabel')
(0.0, 'ax_set_xticklabels')
(0.0, 'ax_set_xlabel')
(0.0, 'ax_set_title')
(0.0, 'ax_hist')
(0.0, 'ax2_set_xlabel')
(0.0, 'ax1_set_ylabel')
(0.0, 'ax1_set_xlabel')
(0.0, 'average_over_years_f_ravel_nonzero')
(0.0, 'average_over_years_f_ravel')
(0.0, 'amounts_get')
(0.0, 'all_master_students_isin')
(0.0, 'all_master_data_groupby')
(0.0, 'all_data_append')
(0.0, 'alemanique_list_append')
(0.0, 'agg_specialization_isnull')
(0.0, 'agg_head')
(0.0, 'adata_find_all')
(0.0, 'accumulate')
(0.0, 'acc_append')
(0.0, 'acad_period_items')

In [ ]: