In [1]:
# Necessary imports
import ast
import os
import time

from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
In [2]:
# Loading in the notebooks
#
# Walk one level below the testbed root: every sub-directory is scanned for
# files ending in '.ipynb', and each one found is wrapped in a NotebookMiner.
#
# os.listdir() returns entries in arbitrary order, so both listings are sorted.
# Later cells address notebooks by position (e.g. a.get_notebook(0)); sorting
# keeps those positions stable across runs and machines.
testbed_root = '../testbed/Final'
people = sorted(os.listdir(testbed_root))
notebooks = []
for person in people:
    person_dir = os.path.join(testbed_root, person)
    if not os.path.isdir(person_dir):
        # Skip stray files that sit directly in the testbed root.
        continue
    notebooks.extend(
        os.path.join(person_dir, filename)
        for filename in sorted(os.listdir(person_dir))
        if filename.endswith('.ipynb')
    )

# One NotebookMiner per notebook path, then a single ASTFeatures over all of them.
notebook_objs = [NotebookMiner(path) for path in notebooks]
a = ASTFeatures(notebook_objs)
In [3]:
# Use the notebook at index 0 of the loaded collection as a worked example.
examp_nb = a.get_notebook(0)
In [4]:
# Number of cells in the example notebook before get_new_notebook() is applied.
print (examp_nb.get_number_cells())
23
In [5]:
# Derive the re-segmented notebook from the example.
# NOTE(review): presumably this yields one top-level statement per cell --
# the checks in the following cells test exactly that; confirm against nbminer.
new_segmentation = examp_nb.get_new_notebook()
In [6]:
# Number of cells in the re-segmented example notebook.
print (new_segmentation.get_number_cells())
131
In [7]:
# For every cell of the re-segmented example notebook, record how many
# top-level statements its parsed AST body holds, then show the whole list.
lns = [
    len(segment.get_feature('ast').body)
    for segment in new_segmentation.get_all_cells()
]
print(lns)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Looks like it's working for this one notebook (every cell holds exactly one top-level statement); now checking all of the notebooks...
In [8]:
# Replace every notebook held by `a` with its re-segmented counterpart.
# The slice assignment overwrites the existing container in place.
# Caution: re-running this cell applies get_new_notebook() again to the
# entries that were already replaced.
a.nb_features[:] = [notebook.get_new_notebook() for notebook in a.nb_features]
In [9]:
# Count every cell across all notebooks and print "Failed" for each cell
# whose AST body does not contain exactly one top-level statement.
total_segments = 0
for notebook in a.nb_features:
    for segment in notebook.get_all_cells():
        total_segments += 1
        statement_count = len(segment.get_feature('ast').body)
        if statement_count == 1:
            continue
        print("Failed")
In [10]:
# Total number of cells counted across all notebooks by the preceding cell.
print (total_segments)
19882
In [11]:
# Collect the class of the first top-level AST node of every cell,
# notebook by notebook, in iteration order.
all_types = [
    type(segment.get_feature('ast').body[0])
    for notebook in a.nb_features
    for segment in notebook.get_all_cells()
]
In [12]:
# Tally how many times each AST node class occurs in all_types.
# Keys are inserted in order of first appearance.
counting_dict = {}
for node_type in all_types:
    counting_dict[node_type] = counting_dict.get(node_type, 0) + 1
In [13]:
# Print one "<node class> : <count>" line per entry of counting_dict.
for node_type, occurrences in counting_dict.items():
    print(node_type, ':', occurrences)
<class '_ast.Import'> : 1148
<class '_ast.ImportFrom'> : 1007
<class '_ast.Expr'> : 8076
<class '_ast.Assign'> : 8157
<class '_ast.For'> : 482
<class '_ast.FunctionDef'> : 687
<class '_ast.AugAssign'> : 14
<class '_ast.With'> : 40
<class '_ast.Delete'> : 265
<class '_ast.If'> : 1
<class '_ast.Try'> : 4
<class '_ast.ClassDef'> : 1
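I found it interesting that there is no `ast.Call` among the top-level node types. That is expected: `Call` is an expression node, not a statement, so it can never appear directly in a `Module` body. I printed out the top-level AST nodes below, and calls show up either wrapped in an `Expr` statement or as the value of an `Assign`... maybe we should just expand these out further.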
In [ ]:
In [14]:
# Dump the AST of every cell of the re-segmented example notebook, each
# followed by an empty line, to see which node wraps each top-level statement.
# `ast` comes from the notebook's import cell rather than being imported here,
# so all dependencies are declared in one place.
for cell in new_segmentation.get_all_cells():
    print(ast.dump(cell.get_feature('ast')))
    print('')
Module(body=[Import(names=[alias(name='pandas', asname='pd')])])
Module(body=[Import(names=[alias(name='numpy', asname='np')])])
Module(body=[Import(names=[alias(name='os', asname=None)])])
Module(body=[ImportFrom(module='sklearn.linear_model', names=[alias(name='LinearRegression', asname=None)], level=0)])
Module(body=[ImportFrom(module='sklearn', names=[alias(name='metrics', asname=None)], level=0)])
Module(body=[Import(names=[alias(name='matplotlib.pyplot', asname='plt')])])
Module(body=[Expr(value=Call(func=Attribute(value=Call(func=Name(id='get_ipython', ctx=Load()), args=[], keywords=[]), attr='magic', ctx=Load()), args=[Str(s='matplotlib inline')], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl', ctx=Store())], value=Call(func=Attribute(value=Name(id='pd', ctx=Load()), attr='read_json', ctx=Load()), args=[Str(s='epfl_en.json')], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_eth', ctx=Store())], value=Call(func=Attribute(value=Name(id='pd', ctx=Load()), attr='read_json', ctx=Load()), args=[Str(s='eth_en.json')], keywords=[]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='df_epfl', ctx=Load()), attr='head', ctx=Load()), args=[Num(n=2)], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_dw', ctx=Store())], value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Compare(left=BinOp(left=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load()), op=Mod(), right=Num(n=10)), ops=[Eq()], comparators=[Num(n=2)])), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='df_eth_dw', ctx=Store())], value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Compare(left=BinOp(left=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load()), op=Mod(), right=Num(n=10)), ops=[Eq()], comparators=[Num(n=2)])), ctx=Load()))])
Module(body=[Expr(value=Attribute(value=Name(id='df_epfl', ctx=Load()), attr='size', ctx=Load()))])
Module(body=[Expr(value=Attribute(value=Name(id='df_epfl_dw', ctx=Load()), attr='size', ctx=Load()))])
Module(body=[Expr(value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='df_epfl', ctx=Load()), attr='isnull', ctx=Load()), args=[], keywords=[]), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Attribute(value=Name(id='df_epfl', ctx=Load()), attr='index', ctx=Store())], value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load()))])
Module(body=[Assign(targets=[Attribute(value=Name(id='df_epfl_dw', ctx=Load()), attr='index', ctx=Store())], value=Subscript(value=Name(id='df_epfl_dw', ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load()))])
Module(body=[Assign(targets=[Attribute(value=Name(id='df_eth', ctx=Load()), attr='index', ctx=Store())], value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load()))])
Module(body=[Assign(targets=[Attribute(value=Name(id='df_eth_dw', ctx=Load()), attr='index', ctx=Store())], value=Subscript(value=Name(id='df_eth_dw', ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load()))])
Module(body=[Expr(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='user')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='epfl_user', ctx=Store())], value=Call(func=Attribute(value=Name(id='pd', ctx=Load()), attr='DataFrame', ctx=Load()), args=[], keywords=[]))])
Module(body=[For(target=Name(id='i', ctx=Store()), iter=Call(func=Name(id='range', ctx=Load()), args=[Num(n=0), Subscript(value=Attribute(value=Name(id='df_epfl', ctx=Load()), attr='shape', ctx=Load()), slice=Index(value=Num(n=0)), ctx=Load())], keywords=[]), body=[Assign(targets=[Name(id='epfl_user', ctx=Store())], value=Call(func=Attribute(value=Name(id='epfl_user', ctx=Load()), attr='append', ctx=Load()), args=[List(elts=[Subscript(value=Subscript(value=Attribute(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='user')), ctx=Load()), attr='iloc', ctx=Load()), slice=Index(value=Name(id='i', ctx=Load())), ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load())], ctx=Load())], keywords=[]))], orelse=[])])
Module(body=[Assign(targets=[Name(id='unique', ctx=Store())], value=Call(func=Attribute(value=Name(id='epfl_user', ctx=Load()), attr='drop_duplicates', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='eth_user', ctx=Store())], value=Call(func=Attribute(value=Name(id='pd', ctx=Load()), attr='DataFrame', ctx=Load()), args=[], keywords=[]))])
Module(body=[For(target=Name(id='i', ctx=Store()), iter=Call(func=Name(id='range', ctx=Load()), args=[Num(n=0), Subscript(value=Attribute(value=Name(id='df_eth', ctx=Load()), attr='shape', ctx=Load()), slice=Index(value=Num(n=0)), ctx=Load())], keywords=[]), body=[Assign(targets=[Name(id='eth_user', ctx=Store())], value=Call(func=Attribute(value=Name(id='eth_user', ctx=Load()), attr='append', ctx=Load()), args=[List(elts=[Subscript(value=Subscript(value=Attribute(value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='user')), ctx=Load()), attr='iloc', ctx=Load()), slice=Index(value=Name(id='i', ctx=Load())), ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load())], ctx=Load())], keywords=[]))], orelse=[])])
Module(body=[Assign(targets=[Name(id='unique2', ctx=Store())], value=Call(func=Attribute(value=Name(id='eth_user', ctx=Load()), attr='drop_duplicates', ctx=Load()), args=[], keywords=[]))])
Module(body=[Expr(value=Name(id='unique2', ctx=Load()))])
Module(body=[Assign(targets=[Name(id='favorites_epfl', ctx=Store())], value=Call(func=Attribute(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='retweet_epfl', ctx=Store())], value=Call(func=Attribute(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='favorites_eth', ctx=Store())], value=Call(func=Attribute(value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='retweet_eth', ctx=Store())], value=Call(func=Attribute(value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='plt', ctx=Load()), attr='figure', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_show', ctx=Store())], value=Call(func=Attribute(value=Name(id='pd', ctx=Load()), attr='DataFrame', ctx=Load()), args=[], keywords=[keyword(arg='data', value=List(elts=[List(elts=[Str(s='EPFL'), BinOp(left=Name(id='favorites_epfl', ctx=Load()), op=Add(), right=Name(id='retweet_epfl', ctx=Load()))], ctx=Load()), List(elts=[Str(s='ETH'), BinOp(left=Name(id='favorites_eth', ctx=Load()), op=Add(), right=Name(id='retweet_eth', ctx=Load()))], ctx=Load())], ctx=Load())), keyword(arg='columns', value=List(elts=[Str(s='Uni'), Str(s='Number of Tweet + Likes')], ctx=Load())), keyword(arg='index', value=List(elts=[Str(s='EPFL'), Str(s='ETH')], ctx=Load()))]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='df_show', ctx=Load()), attr='plot', ctx=Load()), args=[], keywords=[keyword(arg='kind', value=Str(s='bar'))]))])
Module(body=[Expr(value=Call(func=Name(id='print', ctx=Load()), args=[BinOp(left=Name(id='favorites_epfl', ctx=Load()), op=Add(), right=Name(id='retweet_epfl', ctx=Load()))], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_year', ctx=Store())], value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='df_epfl', ctx=Load()), attr='groupby', ctx=Load()), args=[Call(func=Attribute(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='year', ctx=Load()))], keywords=[])], keywords=[]), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_year', ctx=Store())], value=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='retweet_count')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='total')), ctx=Store())], value=BinOp(left=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), op=Add(), right=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load())))])
Module(body=[Assign(targets=[Name(id='df_eth_year', ctx=Store())], value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='df_eth', ctx=Load()), attr='groupby', ctx=Load()), args=[Call(func=Attribute(value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='year', ctx=Load()))], keywords=[])], keywords=[]), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_eth_year', ctx=Store())], value=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='retweet_count')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='total')), ctx=Store())], value=BinOp(left=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), op=Add(), right=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load())))])
Module(body=[Assign(targets=[Name(id='fig', ctx=Store())], value=Call(func=Attribute(value=Name(id='plt', ctx=Load()), attr='figure', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='ax', ctx=Store())], value=Call(func=Attribute(value=Name(id='fig', ctx=Load()), attr='add_subplot', ctx=Load()), args=[Num(n=111)], keywords=[]))])
Module(body=[Assign(targets=[Name(id='width', ctx=Store())], value=Num(n=0.4))])
Module(body=[Expr(value=Call(func=Attribute(value=Attribute(value=Name(id='df_epfl_year', ctx=Load()), attr='total', ctx=Load()), attr='plot', ctx=Load()), args=[], keywords=[keyword(arg='kind', value=Str(s='bar')), keyword(arg='color', value=Str(s='blue')), keyword(arg='width', value=Name(id='width', ctx=Load())), keyword(arg='position', value=Num(n=0))]))])
Module(body=[Expr(value=Call(func=Attribute(value=Attribute(value=Name(id='df_eth_year', ctx=Load()), attr='total', ctx=Load()), attr='plot', ctx=Load()), args=[], keywords=[keyword(arg='kind', value=Str(s='bar')), keyword(arg='color', value=Str(s='red')), keyword(arg='width', value=Name(id='width', ctx=Load())), keyword(arg='position', value=Num(n=1))]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='plt', ctx=Load()), attr='show', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_year', ctx=Store())], value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='df_epfl', ctx=Load()), attr='groupby', ctx=Load()), args=[Call(func=Attribute(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='month', ctx=Load()))], keywords=[])], keywords=[]), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_year', ctx=Store())], value=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='retweet_count')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='total')), ctx=Store())], value=BinOp(left=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), op=Add(), right=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load())))])
Module(body=[Assign(targets=[Name(id='df_eth_year', ctx=Store())], value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='df_eth', ctx=Load()), attr='groupby', ctx=Load()), args=[Call(func=Attribute(value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='month', ctx=Load()))], keywords=[])], keywords=[]), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_eth_year', ctx=Store())], value=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='retweet_count')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='total')), ctx=Store())], value=BinOp(left=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), op=Add(), right=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load())))])
Module(body=[Assign(targets=[Name(id='fig', ctx=Store())], value=Call(func=Attribute(value=Name(id='plt', ctx=Load()), attr='figure', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='ax', ctx=Store())], value=Call(func=Attribute(value=Name(id='fig', ctx=Load()), attr='add_subplot', ctx=Load()), args=[Num(n=111)], keywords=[]))])
Module(body=[Assign(targets=[Name(id='width', ctx=Store())], value=Num(n=0.4))])
Module(body=[Expr(value=Call(func=Attribute(value=Attribute(value=Name(id='df_epfl_year', ctx=Load()), attr='total', ctx=Load()), attr='plot', ctx=Load()), args=[], keywords=[keyword(arg='kind', value=Str(s='bar')), keyword(arg='color', value=Str(s='blue')), keyword(arg='width', value=Name(id='width', ctx=Load())), keyword(arg='position', value=Num(n=0))]))])
Module(body=[Expr(value=Call(func=Attribute(value=Attribute(value=Name(id='df_eth_year', ctx=Load()), attr='total', ctx=Load()), attr='plot', ctx=Load()), args=[], keywords=[keyword(arg='kind', value=Str(s='bar')), keyword(arg='color', value=Str(s='red')), keyword(arg='width', value=Name(id='width', ctx=Load())), keyword(arg='position', value=Num(n=1))]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='plt', ctx=Load()), attr='show', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_year', ctx=Store())], value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='df_epfl', ctx=Load()), attr='groupby', ctx=Load()), args=[Call(func=Attribute(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='hour', ctx=Load()))], keywords=[])], keywords=[]), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_year', ctx=Store())], value=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='retweet_count')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='total')), ctx=Store())], value=BinOp(left=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), op=Add(), right=Subscript(value=Name(id='df_epfl_year', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load())))])
Module(body=[Assign(targets=[Name(id='df_eth_year', ctx=Store())], value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='df_eth', ctx=Load()), attr='groupby', ctx=Load()), args=[Call(func=Attribute(value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='hour', ctx=Load()))], keywords=[])], keywords=[]), attr='sum', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_eth_year', ctx=Store())], value=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='retweet_count')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='total')), ctx=Store())], value=BinOp(left=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()), op=Add(), right=Subscript(value=Name(id='df_eth_year', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load())))])
Module(body=[Assign(targets=[Name(id='fig', ctx=Store())], value=Call(func=Attribute(value=Name(id='plt', ctx=Load()), attr='figure', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='ax', ctx=Store())], value=Call(func=Attribute(value=Name(id='fig', ctx=Load()), attr='add_subplot', ctx=Load()), args=[Num(n=111)], keywords=[]))])
Module(body=[Assign(targets=[Name(id='width', ctx=Store())], value=Num(n=0.4))])
Module(body=[Expr(value=Call(func=Attribute(value=Attribute(value=Name(id='df_epfl_year', ctx=Load()), attr='total', ctx=Load()), attr='plot', ctx=Load()), args=[], keywords=[keyword(arg='kind', value=Str(s='bar')), keyword(arg='color', value=Str(s='blue')), keyword(arg='width', value=Name(id='width', ctx=Load())), keyword(arg='position', value=Num(n=0))]))])
Module(body=[Expr(value=Call(func=Attribute(value=Attribute(value=Name(id='df_eth_year', ctx=Load()), attr='total', ctx=Load()), attr='plot', ctx=Load()), args=[], keywords=[keyword(arg='kind', value=Str(s='bar')), keyword(arg='color', value=Str(s='red')), keyword(arg='width', value=Name(id='width', ctx=Load())), keyword(arg='position', value=Num(n=1))]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='plt', ctx=Load()), attr='show', ctx=Load()), args=[], keywords=[]))])
Module(body=[Assign(targets=[Name(id='df_epfl_hashtag', ctx=Store())], value=Name(id='df_epfl', ctx=Load()))])
Module(body=[Assign(targets=[Name(id='df_hashtags', ctx=Store())], value=Call(func=Attribute(value=Name(id='pd', ctx=Load()), attr='DataFrame', ctx=Load()), args=[], keywords=[]))])
Module(body=[For(target=Name(id='i', ctx=Store()), iter=Call(func=Name(id='range', ctx=Load()), args=[Num(n=0), Subscript(value=Attribute(value=Name(id='df_epfl_hashtag', ctx=Load()), attr='shape', ctx=Load()), slice=Index(value=Num(n=0)), ctx=Load())], keywords=[]), body=[Assign(targets=[Name(id='hashtag_i', ctx=Store())], value=Subscript(value=Subscript(value=Attribute(value=Subscript(value=Name(id='df_epfl_hashtag', ctx=Load()), slice=Index(value=Str(s='entities')), ctx=Load()), attr='iloc', ctx=Load()), slice=Index(value=Name(id='i', ctx=Load())), ctx=Load()), slice=Index(value=Str(s='hashtags')), ctx=Load())), If(test=Compare(left=Name(id='hashtag_i', ctx=Load()), ops=[NotEq()], comparators=[Str(s='')]), body=[Assign(targets=[Name(id='hashtags', ctx=Store())], value=Subscript(value=Subscript(value=Name(id='hashtag_i', ctx=Load()), slice=Index(value=Num(n=0)), ctx=Load()), slice=Index(value=Str(s='text')), ctx=Load())), Assign(targets=[Subscript(value=Name(id='df_hashtags', ctx=Load()), slice=Index(value=Str(s='hashtags')), ctx=Store())], value=Call(func=Attribute(value=Name(id='df_hashtags', ctx=Load()), attr='append', ctx=Load()), args=[List(elts=[Name(id='hashtags', ctx=Load())], ctx=Load())], keywords=[]))], orelse=[Assign(targets=[Subscript(value=Name(id='df_hashtags', ctx=Load()), slice=Index(value=Str(s='hashtags')), ctx=Store())], value=Call(func=Attribute(value=Name(id='df_hashtags', ctx=Load()), attr='append', ctx=Load()), args=[List(elts=[Str(s='No_tag')], ctx=Load())], keywords=[]))])], orelse=[])])
Module(body=[Assign(targets=[Name(id='validation_size', ctx=Store())], value=Num(n=300))])
Module(body=[Assign(targets=[Name(id='Y', ctx=Store())], value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='Y2', ctx=Store())], value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='Y', ctx=Store())], value=Call(func=Attribute(value=Name(id='Y', ctx=Load()), attr='append', ctx=Load()), args=[Name(id='Y2', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='X', ctx=Store())], value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='X2', ctx=Store())], value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='X', ctx=Store())], value=Call(func=Attribute(value=Name(id='X', ctx=Load()), attr='append', ctx=Load()), args=[Name(id='X2', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='validation_features', ctx=Store())], value=Subscript(value=Name(id='X', ctx=Load()), slice=Slice(lower=None, upper=Name(id='validation_size', ctx=Load()), step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='validation_labels', ctx=Store())], value=Subscript(value=Name(id='Y', ctx=Load()), slice=Slice(lower=None, upper=Name(id='validation_size', ctx=Load()), step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='train_features', ctx=Store())], value=Subscript(value=Name(id='X', ctx=Load()), slice=Slice(lower=Name(id='validation_size', ctx=Load()), upper=None, step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='train_labels', ctx=Store())], value=Subscript(value=Name(id='Y', ctx=Load()), slice=Slice(lower=Name(id='validation_size', ctx=Load()), upper=None, step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='model', ctx=Store())], value=Call(func=Name(id='LinearRegression', ctx=Load()), args=[], keywords=[]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='model', ctx=Load()), attr='fit', ctx=Load()), args=[Call(func=Attribute(value=Attribute(value=Name(id='train_features', ctx=Load()), attr='values', ctx=Load()), attr='reshape', ctx=Load()), args=[UnaryOp(op=USub(), operand=Num(n=1)), Num(n=1)], keywords=[]), Attribute(value=Name(id='train_labels', ctx=Load()), attr='values', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='y_predict', ctx=Store())], value=Call(func=Attribute(value=Name(id='model', ctx=Load()), attr='predict', ctx=Load()), args=[Call(func=Attribute(value=Attribute(value=Name(id='validation_features', ctx=Load()), attr='values', ctx=Load()), attr='reshape', ctx=Load()), args=[UnaryOp(op=USub(), operand=Num(n=1)), Num(n=1)], keywords=[])], keywords=[]))])
Module(body=[Expr(value=Call(func=Name(id='print', ctx=Load()), args=[Str(s='R2 score given by model is'), Call(func=Attribute(value=Name(id='model', ctx=Load()), attr='score', ctx=Load()), args=[Call(func=Attribute(value=Attribute(value=Name(id='validation_features', ctx=Load()), attr='values', ctx=Load()), attr='reshape', ctx=Load()), args=[UnaryOp(op=USub(), operand=Num(n=1)), Num(n=1)], keywords=[]), Attribute(value=Name(id='validation_labels', ctx=Load()), attr='values', ctx=Load())], keywords=[])], keywords=[]))])
Module(body=[ImportFrom(module='sklearn.metrics', names=[alias(name='mean_squared_error', asname=None)], level=0)])
Module(body=[ImportFrom(module='math', names=[alias(name='sqrt', asname=None)], level=0)])
Module(body=[Assign(targets=[Name(id='rms', ctx=Store())], value=Call(func=Name(id='sqrt', ctx=Load()), args=[Call(func=Name(id='mean_squared_error', ctx=Load()), args=[Call(func=Attribute(value=Attribute(value=Name(id='validation_labels', ctx=Load()), attr='values', ctx=Load()), attr='reshape', ctx=Load()), args=[UnaryOp(op=USub(), operand=Num(n=1))], keywords=[]), Name(id='y_predict', ctx=Load())], keywords=[])], keywords=[]))])
Module(body=[Expr(value=Call(func=Name(id='print', ctx=Load()), args=[Str(s='rmse error is'), Name(id='rms', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='validation_size', ctx=Store())], value=Num(n=100))])
Module(body=[Assign(targets=[Name(id='Y', ctx=Store())], value=Subscript(value=Name(id='df_epfl_dw', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='Y2', ctx=Store())], value=Subscript(value=Name(id='df_eth_dw', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='Y', ctx=Store())], value=Call(func=Attribute(value=Name(id='Y', ctx=Load()), attr='append', ctx=Load()), args=[Name(id='Y2', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='X', ctx=Store())], value=Subscript(value=Name(id='df_epfl_dw', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='X2', ctx=Store())], value=Subscript(value=Name(id='df_eth_dw', ctx=Load()), slice=Index(value=Str(s='favorite_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='X', ctx=Store())], value=Call(func=Attribute(value=Name(id='X', ctx=Load()), attr='append', ctx=Load()), args=[Name(id='X2', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='validation_features', ctx=Store())], value=Subscript(value=Name(id='X', ctx=Load()), slice=Slice(lower=None, upper=Name(id='validation_size', ctx=Load()), step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='validation_labels', ctx=Store())], value=Subscript(value=Name(id='Y', ctx=Load()), slice=Slice(lower=None, upper=Name(id='validation_size', ctx=Load()), step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='train_features', ctx=Store())], value=Subscript(value=Name(id='X', ctx=Load()), slice=Slice(lower=Name(id='validation_size', ctx=Load()), upper=None, step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='train_labels', ctx=Store())], value=Subscript(value=Name(id='Y', ctx=Load()), slice=Slice(lower=Name(id='validation_size', ctx=Load()), upper=None, step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='model', ctx=Store())], value=Call(func=Name(id='LinearRegression', ctx=Load()), args=[], keywords=[]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='model', ctx=Load()), attr='fit', ctx=Load()), args=[Call(func=Attribute(value=Attribute(value=Name(id='train_features', ctx=Load()), attr='values', ctx=Load()), attr='reshape', ctx=Load()), args=[UnaryOp(op=USub(), operand=Num(n=1)), Num(n=1)], keywords=[]), Attribute(value=Name(id='train_labels', ctx=Load()), attr='values', ctx=Load())], keywords=[]))])
Module(body=[Expr(value=Call(func=Name(id='print', ctx=Load()), args=[Str(s='R2 score given by model is'), Call(func=Attribute(value=Name(id='model', ctx=Load()), attr='score', ctx=Load()), args=[Call(func=Attribute(value=Attribute(value=Name(id='validation_features', ctx=Load()), attr='values', ctx=Load()), attr='reshape', ctx=Load()), args=[UnaryOp(op=USub(), operand=Num(n=1)), Num(n=1)], keywords=[]), Attribute(value=Name(id='validation_labels', ctx=Load()), attr='values', ctx=Load())], keywords=[])], keywords=[]))])
Module(body=[ImportFrom(module='sklearn.metrics', names=[alias(name='mean_squared_error', asname=None)], level=0)])
Module(body=[ImportFrom(module='math', names=[alias(name='sqrt', asname=None)], level=0)])
Module(body=[Assign(targets=[Name(id='rms', ctx=Store())], value=Call(func=Name(id='sqrt', ctx=Load()), args=[Call(func=Name(id='mean_squared_error', ctx=Load()), args=[Call(func=Attribute(value=Attribute(value=Name(id='validation_labels', ctx=Load()), attr='values', ctx=Load()), attr='reshape', ctx=Load()), args=[UnaryOp(op=USub(), operand=Num(n=1))], keywords=[]), Name(id='y_predict', ctx=Load())], keywords=[])], keywords=[]))])
Module(body=[Expr(value=Call(func=Name(id='print', ctx=Load()), args=[Str(s='rmse error is'), Name(id='rms', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='validation_size', ctx=Store())], value=Num(n=300))])
Module(body=[Assign(targets=[Name(id='Y', ctx=Store())], value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='Y2', ctx=Store())], value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='retweet_count')), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='Y', ctx=Store())], value=Call(func=Attribute(value=Name(id='Y', ctx=Load()), attr='append', ctx=Load()), args=[Name(id='Y2', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='X', ctx=Store())], value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='created_at')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='X2', ctx=Store())], value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=List(elts=[Str(s='favorite_count'), Str(s='created_at')], ctx=Load())), ctx=Load()))])
Module(body=[Assign(targets=[Subscript(value=Name(id='X', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Store())], value=Call(func=Attribute(value=Subscript(value=Name(id='df_epfl', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='hour', ctx=Load()))], keywords=[]))])
Module(body=[Assign(targets=[Subscript(value=Name(id='X2', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Store())], value=Call(func=Attribute(value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='created_at')), ctx=Load()), attr='map', ctx=Load()), args=[Lambda(args=arguments(args=[arg(arg='x', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=Attribute(value=Name(id='x', ctx=Load()), attr='hour', ctx=Load()))], keywords=[]))])
Module(body=[Assign(targets=[Name(id='X', ctx=Store())], value=Call(func=Attribute(value=Name(id='X', ctx=Load()), attr='append', ctx=Load()), args=[Name(id='X2', ctx=Load())], keywords=[]))])
Module(body=[Assign(targets=[Name(id='validation_features', ctx=Store())], value=Subscript(value=Name(id='X', ctx=Load()), slice=Slice(lower=None, upper=Name(id='validation_size', ctx=Load()), step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='validation_labels', ctx=Store())], value=Subscript(value=Name(id='Y', ctx=Load()), slice=Slice(lower=None, upper=Name(id='validation_size', ctx=Load()), step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='train_features', ctx=Store())], value=Subscript(value=Name(id='X', ctx=Load()), slice=Slice(lower=Name(id='validation_size', ctx=Load()), upper=None, step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='train_labels', ctx=Store())], value=Subscript(value=Name(id='Y', ctx=Load()), slice=Slice(lower=Name(id='validation_size', ctx=Load()), upper=None, step=None), ctx=Load()))])
Module(body=[Assign(targets=[Name(id='model', ctx=Store())], value=Call(func=Name(id='LinearRegression', ctx=Load()), args=[], keywords=[]))])
Module(body=[Expr(value=Call(func=Attribute(value=Name(id='model', ctx=Load()), attr='fit', ctx=Load()), args=[Attribute(value=Name(id='train_features', ctx=Load()), attr='values', ctx=Load()), Attribute(value=Name(id='train_labels', ctx=Load()), attr='values', ctx=Load())], keywords=[]))])
Module(body=[Expr(value=Call(func=Name(id='print', ctx=Load()), args=[Str(s='R2 score given by model is'), Call(func=Attribute(value=Name(id='model', ctx=Load()), attr='score', ctx=Load()), args=[Attribute(value=Name(id='validation_features', ctx=Load()), attr='values', ctx=Load()), Attribute(value=Name(id='validation_labels', ctx=Load()), attr='values', ctx=Load())], keywords=[])], keywords=[]))])
Module(body=[ImportFrom(module='sklearn.metrics', names=[alias(name='mean_squared_error', asname=None)], level=0)])
Module(body=[ImportFrom(module='math', names=[alias(name='sqrt', asname=None)], level=0)])
Module(body=[Assign(targets=[Name(id='rms', ctx=Store())], value=Call(func=Name(id='sqrt', ctx=Load()), args=[Call(func=Name(id='mean_squared_error', ctx=Load()), args=[Attribute(value=Name(id='validation_labels', ctx=Load()), attr='values', ctx=Load()), Name(id='y_predict', ctx=Load())], keywords=[])], keywords=[]))])
Module(body=[Expr(value=Call(func=Name(id='print', ctx=Load()), args=[Str(s='rmse error is'), Name(id='rms', ctx=Load())], keywords=[]))])
In [ ]:
Content source: DataPilot/notebook-miner
Similar notebooks: