In [1]:
import datetime
import xml.etree.ElementTree as ET
def parse_file(filename):
    """Read a line-oriented dump of ``<row .../>`` XML elements.

    Every line containing the substring ``row`` is wrapped in a ``<posts>``
    root element and parsed on its own; lines that are not well-formed XML
    are skipped. For each parsed ``row`` the ``Body`` attribute and the date
    part of ``CreationDate`` (``YYYY-MM-DD`` prefix) are extracted.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of (str, datetime.timedelta)
        One ``(body, offset)`` tuple per row, in file order, where ``offset``
        is the time between 2008-01-01 and the row's creation day.

    Raises
    ------
    KeyError
        If a parsed row lacks a ``Body`` or ``CreationDate`` attribute.
    ValueError
        If ``CreationDate`` does not start with a valid ``YYYY-MM-DD`` date.
    """
    # Reference day that all creation dates are measured against.
    first_day = datetime.datetime(2008, 1, 1)
    all_code = []
    total = 0
    # The context manager guarantees the file handle is closed even on error.
    with open(filename, 'r') as posts_file:
        for line in posts_file:
            if 'row' not in line:
                continue
            s = "<posts>\n" + str(line) + "\n</posts>"
            try:
                tnode = ET.fromstring(s)
            except ET.ParseError:
                # Best effort: ignore malformed lines, but let
                # KeyboardInterrupt and other unrelated errors propagate.
                continue
            for neighbor in tnode.iter('row'):
                bod = neighbor.attrib['Body']
                cd = neighbor.attrib['CreationDate']
                # Lightweight progress indicator for very large dumps.
                if total % 100000 == 0:
                    print(cd)
                # Only the calendar day is kept; the time of day is dropped.
                d = datetime.datetime(year=int(cd[:4]), month=int(cd[5:7]), day=int(cd[8:10]))
                all_code.append((bod, d - first_day))
                total += 1
    return all_code
In [2]:
# Location of the parsed posts dump. The directory names suggest a Data Science
# Stack Exchange export — NOTE(review): absolute, machine-specific path.
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/datascience/parsed/parsed_posts.txt'
# parse_file returns a list of (post body, time since 2008-01-01) tuples.
python_code = parse_file(filename)
In [3]:
# Keep only the posts whose body contains at least one newline, i.e. spans
# more than one line; the (body, offset) tuples are passed through unchanged.
longer_python_code = [entry for entry in python_code if '\n' in entry[0]]
# Number of multi-line posts (shown as the cell output).
len(longer_python_code)
Out[3]:
In [4]:
import sys
# Extend the module search path so the `nbminer` imports below can resolve —
# presumably the package lives under this directory; TODO confirm.
# NOTE(review): absolute, machine-specific path.
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner
from nbminer.notebook_miner import NotebookMinerString
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
# Wrap the body (first tuple element) of every multi-line post in a
# NotebookMinerString, preserving the order of longer_python_code so that
# index i in the feature container matches longer_python_code[i].
nbs = [NotebookMinerString(entry[0]) for entry in longer_python_code]
a = Features(nbs)

# Preprocessing stages, applied in the order given to the Pipeline.
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
# Optional stages, currently disabled:
#agr = ASTGraphReducer(a, threshold=8, split_call=False)
#ci = CorpusIdentifier()
pipe = Pipeline([gastf, rbn, gi])

# NOTE: `a` is rebound from the raw feature container to the pipeline output.
a = pipe.transform(a)
In [5]:
time_range = 10
import_name = 'sys'
result = {}
aggregate_result = {}
for i in range(len(longer_python_code)):
import_list = a.get_notebook(i).get_feature('imports')
d = longer_python_code[i][1]
for key in import_list.keys():
time_step = int(d.days/time_range)
if key.strip() == import_name:
if time_step not in result:
result[time_step] = 0
result[time_step] += 1
if time_step not in aggregate_result:
aggregate_result[time_step] = 0
aggregate_result[time_step] += 1
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
tuples = sorted([(key, result[key]) for key in result.keys()])
x_vals = np.array([el[0] for el in tuples])
y_vals = np.array([el[1] for el in tuples])
#print(x_vals, y_vals)
y_vals = y_vals/max(y_vals)
plt.plot(x_vals,y_vals)
tuples_agg = sorted([(key, aggregate_result[key]) for key in aggregate_result.keys()])
#print(tuples_agg)
x_vals_agg = np.array([el[0] for el in tuples_agg])
y_vals_agg = np.array([el[1] for el in tuples_agg])
y_vals_agg = y_vals_agg/max(y_vals_agg)
plt.plot(x_vals_agg,y_vals_agg)
Out[5]:
In [12]:
time_range = 10
function_name = 'var'
result = {}
aggregate_result = {}
for i in range(len(longer_python_code)):
funcs = []
for cell in a.get_notebook(i).get_all_cells():
funcs.extend(cell.get_feature('short_name_string'))
d = longer_python_code[i][1]
for el in funcs:
time_step = int(d.days/time_range)
if el.strip() == function_name:
if time_step not in result:
result[time_step] = 0
result[time_step] += 1
if time_step not in aggregate_result:
aggregate_result[time_step] = 0
aggregate_result[time_step] += 1
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
tuples = sorted([(key, result[key]) for key in result.keys()])
x_vals = np.array([el[0] for el in tuples])
y_vals = np.array([el[1] for el in tuples])
#print(x_vals, y_vals)
y_vals = y_vals/max(y_vals)
plt.plot(x_vals,y_vals)
tuples_agg = sorted([(key, aggregate_result[key]) for key in aggregate_result.keys()])
#print(tuples_agg)
x_vals_agg = np.array([el[0] for el in tuples_agg])
y_vals_agg = np.array([el[1] for el in tuples_agg])
y_vals_agg = y_vals_agg/max(y_vals_agg)
plt.plot(x_vals_agg,y_vals_agg)
Out[12]:
In [ ]: