In [1]:
import datetime
import xml.etree.ElementTree as ET
    
                
                
def parse_file(filename):
    """Parse a line-oriented posts dump into ``(body, age)`` pairs.

    The file is read line by line. Every line containing the substring
    ``'row'`` is wrapped in a ``<posts>`` root element and parsed as XML;
    lines that are not well-formed XML are skipped silently (best effort).
    The creation date of the first parsed row, and of every 100000th
    successfully parsed line after it, is printed as a progress marker.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``(body, diff_time)`` tuples. ``body`` is the row's
        ``Body`` attribute; ``diff_time`` is a ``datetime.timedelta``
        between the row's creation day (time of day is dropped) and
        2008-01-01.

    Raises:
        KeyError: If a parsed row has no ``Body`` or ``CreationDate``.
        ValueError: If ``CreationDate`` does not start with ``YYYY-MM-DD``.
    """
    first_day = datetime.datetime(2008, 1, 1)
    all_code = []
    total = 0
    # The context manager closes the file even if a row raises part-way through.
    with open(filename, 'r') as posts_file:
        for line in posts_file:
            if 'row' not in line:
                continue
            s = "<posts>\n" + str(line) + "\n</posts>"

            try:
                tnode = ET.fromstring(s)
            except ET.ParseError:
                # Malformed line: skip it, but let unrelated errors
                # (e.g. KeyboardInterrupt) propagate.
                continue

            for neighbor in tnode.iter('row'):
                bod = neighbor.attrib['Body']
                cd = neighbor.attrib['CreationDate']
                if total % 100000 == 0:
                    print(cd)
                # Only the date part (YYYY-MM-DD) of the timestamp is used.
                d = datetime.datetime(year=int(cd[:4]), month=int(cd[5:7]), day=int(cd[8:10]))
                all_code.append((bod, d - first_day))

            # Counts successfully parsed lines only.
            total += 1

    return all_code

In [2]:
# Input dump read by parse_file; judging by the path this is presumably the
# Data Science Stack Exchange posts file — TODO confirm.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
filename = '/dfs/scratch2/fcipollone/stackoverflow/exchange/datascience/parsed/parsed_posts.txt'
# List of (body, timedelta since 2008-01-01) tuples, one per parsed row.
python_code = parse_file(filename)


2014-05-14T05:58:21.927

In [3]:
# Keep only the posts whose body spans more than one line, then show how many
# survived the filter.
longer_python_code = [post for post in python_code if '\n' in post[0]]
len(longer_python_code)


Out[3]:
2579

In [4]:
import sys
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner
from nbminer.notebook_miner import NotebookMinerString
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
# Wrap every multi-line post body in a NotebookMinerString and collect them
# into a Features container.
nbs = [NotebookMinerString(post[0]) for post in longer_python_code]
a = Features(nbs)

# Pipeline stages; presumably applied in list order (the cell output prints
# them in that order) — TODO confirm against nbminer.
gastf, rbn, gi = GetASTFeatures(), ResampleByNode(), GetImports()
# Disabled experiments, kept for reference:
#agr = ASTGraphReducer(a, threshold=8, split_call=False)
#ci = CorpusIdentifier()
pipe = Pipeline([gastf, rbn, gi])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x7fab53d9d748>
2579
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x7fab53d9d7b8>
2579
<nbminer.preprocess.get_imports.GetImports object at 0x7fab53d9d6d8>
2579

In [5]:
time_range = 10
import_name = 'sys'
result = {}
aggregate_result = {}
for i in range(len(longer_python_code)):
    import_list = a.get_notebook(i).get_feature('imports')
    d = longer_python_code[i][1]
    for key in import_list.keys():
        time_step = int(d.days/time_range)
        if key.strip() == import_name:
            if time_step not in result:
                result[time_step] = 0
            result[time_step] += 1
        if time_step not in aggregate_result:
            aggregate_result[time_step] = 0
        aggregate_result[time_step] += 1

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
tuples = sorted([(key, result[key]) for key in result.keys()])
x_vals = np.array([el[0] for el in tuples])
y_vals = np.array([el[1] for el in tuples])
#print(x_vals, y_vals)
y_vals = y_vals/max(y_vals)
plt.plot(x_vals,y_vals)
tuples_agg = sorted([(key, aggregate_result[key]) for key in aggregate_result.keys()])
#print(tuples_agg)
x_vals_agg = np.array([el[0] for el in tuples_agg])
y_vals_agg = np.array([el[1] for el in tuples_agg])
y_vals_agg = y_vals_agg/max(y_vals_agg)
plt.plot(x_vals_agg,y_vals_agg)


Out[5]:
[<matplotlib.lines.Line2D at 0x7fab523ac6d8>]

In [12]:
time_range = 10
function_name = 'var'
result = {}
aggregate_result = {}
for i in range(len(longer_python_code)):
    funcs = []
    for cell in a.get_notebook(i).get_all_cells():
        funcs.extend(cell.get_feature('short_name_string'))
    d = longer_python_code[i][1]
    for el in funcs:
        time_step = int(d.days/time_range)
        if el.strip() == function_name:
            if time_step not in result:
                result[time_step] = 0
            result[time_step] += 1
        if time_step not in aggregate_result:
            aggregate_result[time_step] = 0
        aggregate_result[time_step] += 1

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
tuples = sorted([(key, result[key]) for key in result.keys()])
x_vals = np.array([el[0] for el in tuples])
y_vals = np.array([el[1] for el in tuples])
#print(x_vals, y_vals)
y_vals = y_vals/max(y_vals)
plt.plot(x_vals,y_vals)
tuples_agg = sorted([(key, aggregate_result[key]) for key in aggregate_result.keys()])
#print(tuples_agg)
x_vals_agg = np.array([el[0] for el in tuples_agg])
y_vals_agg = np.array([el[1] for el in tuples_agg])
y_vals_agg = y_vals_agg/max(y_vals_agg)
plt.plot(x_vals_agg,y_vals_agg)


Out[12]:
[<matplotlib.lines.Line2D at 0x7fab4fb93240>]

In [ ]: