In [8]:
import json
import datetime
# Necessary imports
import sys
# Directory appended to the module search path before the nbminer imports below.
# NOTE(review): presumably the nbminer package is checked out under this
# directory -- confirm before running on another machine.
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import os
import time
import numpy as np
from nbminer.notebook_miner import NotebookMinerString
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.get_docstrings import GetDocstrings
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.postorder_hash import PostorderHash
# --- Input / output locations ------------------------------------------------
# `filename` is read line by line (one JSON document per line) by the loop
# below; `out_filename` receives one whitespace-separated vector per line.
filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/Posts_Reduced.xml'
out_filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/CodeBlockVecs.xml'

# --- Preprocessing pipeline --------------------------------------------------
# A single stage that attaches the 'ast' feature read back by the loop below.
gastf = GetASTFeatures()
pipe = Pipeline([gastf])

# --- Run state ---------------------------------------------------------------
notebooks = []
count = 0                          # number of posts processed so far
outfile = open(out_filename, 'w')  # stays open for the processing loop
start = time.time()                # reference point for the time estimate
# Global registry that gives every AST node type (keyed by ``str(type(node))``)
# a fixed vector index, assigned in order of first appearance.
node_types = {}
number_node_types = 0


def count_nodes(t, vec_size=100):
    """Count how often each AST node type occurs in the tree rooted at ``t``.

    A node type seen for the first time is registered in the global
    ``node_types`` mapping under the next free index (``number_node_types``).
    Types whose index is ``vec_size`` or larger are still registered but are
    not counted, so the returned vector always has a fixed length.

    Parameters
    ----------
    t : ast.AST
        Root of the tree to walk.
    vec_size : int, optional
        Length of the returned vector. Defaults to 100, the size that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``vec_size`` whose entry ``i`` is the number of
        nodes whose type has index ``i``. If walking the tree fails (for
        example ``t`` is ``None``), the counts gathered up to that point are
        returned -- all zeros when nothing could be walked.
    """
    global number_node_types  # node_types is mutated in place, never rebound
    node_type_vec = np.zeros(vec_size)
    try:
        for n in ast.walk(t):
            node_type = str(type(n))
            if node_type not in node_types:
                node_types[node_type] = number_node_types
                number_node_types += 1
            index = node_types[node_type]
            if index < vec_size:
                node_type_vec[index] += 1
    except Exception:
        # Best effort: an input that cannot be walked yields whatever was
        # counted so far. Catching Exception rather than using a bare
        # ``except:`` lets KeyboardInterrupt and SystemExit propagate, so a
        # long-running loop can still be interrupted.
        pass
    return node_type_vec
# Turn every code block of every post into a node-type count vector and write
# one vector per line to `outfile`.
#4110319 total lines
TOTAL_LINES = 4110319   # line count of the input file, used for the estimate
REPORT_EVERY = 10000    # progress interval; the run also stops at this point
try:
    with open(filename) as posts_file:
        for line in posts_file:
            line_obj = json.loads(line)
            code = [el['code'] for el in line_obj['CodeBlocks']]
            nb = NotebookMinerString(code)
            a = Features([nb])
            try:
                a = pipe.transform(a)
            except Exception:
                # Posts the pipeline cannot process are skipped. Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                continue
            for seg in a.get_notebook(0).cell_array:
                vec = count_nodes(seg.get_feature('ast'))
                outfile.write(' '.join(str(el) for el in vec))
                outfile.write('\n')
            count += 1
            if count % REPORT_EVERY == 0:
                end = time.time()
                print(count)
                print("Estimated time is,", (end - start) / (count / TOTAL_LINES) / 3600)
                # The loop stops at the first progress report, so only a
                # sample of the posts is written.
                break
finally:
    # Flush and release the output file even if the loop is interrupted;
    # otherwise buffered vectors may never reach disk.
    outfile.close()
In [9]:
# Display the node-type -> vector-index mapping filled in by count_nodes.
node_types
Out[9]:
In [5]:
from sklearn.manifold import TSNE
out_filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/AllCodeBlockVecs.xml'

# Load at most the first 1000 vectors (one whitespace-separated vector per line).
X = []
count = 0
with open(out_filename) as vec_file:  # context manager closes the handle
    for line in vec_file:
        X.append(np.array([int(float(el)) for el in line.split()]))
        count += 1
        if count == 1000:
            break
X = np.array(X)
print(X.shape)

# Two-dimensional t-SNE embedding; the fixed seed makes the layout reproducible.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)
In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
# Colour value per point: the row total is bucketed in steps of 10, scaled to
# tenths, capped at 1 and finally inverted (bigger totals -> smaller values).
y = [1 - min(float(int(np.sum(row) / 10)) / 10, 1) for row in X]
plt.scatter(
    X_embedded[:, 0],
    X_embedded[:, 1],
    c=y,
)
Out[6]:
In [10]:
from sklearn.manifold import TSNE
out_filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/AllCodeBlockVecs.xml'
plt.rcParams['figure.figsize'] = [20, 20]
fig, axes = plt.subplots(3, 3)

# One panel per total node count 10, 20, ..., 90.
for i in range(9):
    target_length = i * 10 + 10
    X = []
    count = 0
    with open(out_filename) as vec_file:  # closed after each pass
        for line in vec_file:
            # Parse each line once and reuse the vector for both the filter
            # and the sample.
            vec = np.array([int(float(el)) for el in line.split()])
            if np.sum(vec) == target_length:
                X.append(vec)
                count += 1
                if count == 1000:
                    break
    X = np.array(X)
    print(X.shape)
    # Fixed seed so each panel is reproducible across runs.
    X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)
    ax = axes[i // 3, i % 3]
    ax.scatter(X_embedded[:, 0], X_embedded[:, 1])
    ax.set_title('Manifold for code blocks of length ' + str(target_length))
In [4]:
from sklearn.manifold import TSNE
import numpy as np
out_filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/AllCodeBlockVecs.xml'

# Collect up to 3000 vectors whose total node count lies strictly between 20 and 30.
X = []
count = 0
with open(out_filename) as vec_file:  # context manager closes the handle
    for line in vec_file:
        # Parse each line once instead of three times.
        vec = np.array([int(float(el)) for el in line.split()])
        if 20 < np.sum(vec) < 30:
            X.append(vec)
            count += 1
            if count == 3000:
                break
X = np.array(X)
print(X.shape)

# Fixed seed so the embedding is reproducible across runs.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)
In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
# Plot the two embedding columns against each other.
plt.scatter(*X_embedded.T)
Out[7]:
In [8]:
from sklearn.decomposition import PCA
import numpy as np
out_filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/AllCodeBlockVecs.xml'

# Collect up to 3000 vectors whose total node count lies strictly between 20 and 30.
X = []
count = 0
with open(out_filename) as vec_file:  # context manager closes the handle
    for line in vec_file:
        # Parse each line once instead of three times.
        vec = np.array([int(float(el)) for el in line.split()])
        if 20 < np.sum(vec) < 30:
            X.append(vec)
            count += 1
            if count == 3000:
                break
X = np.array(X)
print(X.shape)

# Two-component PCA projection of the selected vectors.
X_embedded = PCA(n_components=2).fit_transform(X)
plt.scatter(X_embedded[:, 0], X_embedded[:, 1])
Out[8]:
In [9]:
def count_nodes(t, vec_size=100):
    """Count how often each AST node type occurs in the tree rooted at ``t``.

    NOTE: this re-defines (and therefore shadows) the ``count_nodes`` from the
    first cell. It uses the module-level ``node_types`` / ``number_node_types``
    registry, which must already exist when a tree is counted.

    A node type seen for the first time is registered in ``node_types`` under
    the next free index. Types whose index is ``vec_size`` or larger are
    registered but not counted.

    Parameters
    ----------
    t : ast.AST
        Root of the tree to walk.
    vec_size : int, optional
        Length of the returned vector. Defaults to 100, the size that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``vec_size`` whose entry ``i`` is the number of
        nodes whose type has index ``i``. If walking fails (for example ``t``
        is ``None``), the counts gathered so far are returned.
    """
    global node_types
    global number_node_types
    node_type_vec = np.zeros(vec_size)
    try:
        for n in ast.walk(t):
            node_type = str(type(n))
            if node_type not in node_types:
                node_types[node_type] = number_node_types
                number_node_types += 1
            index = node_types[node_type]
            if index < vec_size:
                node_type_vec[index] += 1
    except Exception:
        # Best effort: return what was counted so far. Catching Exception
        # rather than using a bare ``except:`` lets KeyboardInterrupt and
        # SystemExit propagate.
        pass
    return node_type_vec
In [23]:
filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/Posts_Reduced.xml'
count = 0

# Write code blocks whose total node count is between 20 and 30 (inclusive)
# to manual_entry.txt, one JSON object per line. Both files are opened with
# a context manager so the output is flushed and closed before any later
# cell reads it back.
with open(filename) as posts_file, \
        open('/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/manual_entry.txt', 'w') as manual_entry:
    for line in posts_file:
        line_obj = json.loads(line)
        code = [el['code'] for el in line_obj['CodeBlocks']]
        nb = NotebookMinerString(code)
        a = Features([nb])
        try:
            a = pipe.transform(a)
        except Exception:
            # Posts the pipeline cannot process are skipped. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            continue
        for seg in a.get_notebook(0).cell_array:
            vec = count_nodes(seg.get_feature('ast'))
            total_nodes = np.sum(vec)  # computed once, used for both bounds
            if total_nodes < 20 or total_nodes > 30:
                continue
            # 'python': -1 is written for every entry; the later plotting cell
            # colours an entry green only when this field reads 1.
            json_obj = {'code': seg.get_feature('code'),
                        'vec': ' '.join([str(el) for el in vec]),
                        'python': -1}
            json.dump(json_obj, manual_entry)
            manual_entry.write('\n')
            count += 1
            if count > 30:
                break
        # Leave the outer loop too once more than 30 entries were written.
        if count > 30:
            break
In [19]:
from sklearn.manifold import TSNE
import numpy as np
out_filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/AllCodeBlockVecs.xml'
X = []
y = []
count = 0

# Background sample: up to 1000 vectors with a total node count of 20..30,
# all coloured blue.
with open(out_filename) as vec_file:  # context manager closes the handle
    for line in vec_file:
        # Parse each line once instead of three times.
        vec = np.array([int(float(el)) for el in line.split()])
        if 20 <= np.sum(vec) <= 30:
            X.append(vec)
            y.append('b')
            count += 1
            if count == 1000:
                break

# Labelled entries: green when the 'python' field reads 1, red otherwise.
with open('/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/manual_entry.txt') as manual_entry:
    for line in manual_entry:
        line_obj = json.loads(line)
        vector = line_obj['vec']
        vec = np.array([int(float(el)) for el in vector.split()])
        X.append(vec)
        y.append('g' if str(line_obj['python']).strip() == '1' else 'r')

X = np.array(X)
print(X.shape)

# Fixed seed so the embedding is reproducible across runs.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)
In [20]:
# Use a 10 x 10 figure and colour every point with its entry in `y`.
plt.rcParams.update({'figure.figsize': [10, 10]})
plt.scatter(*X_embedded.T, c=y)
Out[20]:
In [21]:
from sklearn.decomposition import PCA
# Two-component PCA projection of `X`, drawn with the colours in `y`.
pca_model = PCA(n_components=2)
X_embedded = pca_model.fit_transform(X)
plt.scatter(*X_embedded.T, c=y)
Out[21]:
In [2]:
import json
filename = '/dfs/scratch2/fcipollone/stackoverflow/guesslang_and_ast/Posts_Reduced.xml'
# This cell used to open manual_entry.txt in 'w' mode without writing to it,
# which emptied the file as a side effect. The cell only prints, so the file
# is no longer opened here.
MAX_EXAMPLES = 10  # stop after this many printed examples
count = 0

# Print the code of segments whose AST contains a function call.
# NOTE(review): a segment with several calls is printed once per call --
# confirm this is intended.
with open(filename) as posts_file:  # context manager closes the handle
    for line in posts_file:
        line_obj = json.loads(line)
        code = [el['code'] for el in line_obj['CodeBlocks']]
        nb = NotebookMinerString(code)
        a = Features([nb])
        try:
            a = pipe.transform(a)
        except Exception:
            # Posts the pipeline cannot process are skipped. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            continue
        for seg in a.get_notebook(0).cell_array:
            for n in ast.walk(seg.get_feature('ast')):
                # isinstance does not depend on how the interpreter prints the
                # class name, unlike comparing str(type(n)) with a literal.
                if isinstance(n, ast.Call):
                    print(seg.get_feature('code'))
                    count += 1
                    if count >= MAX_EXAMPLES:
                        break
            if count >= MAX_EXAMPLES:
                break
        # Stop cleanly instead of dropping into the debugger.
        if count >= MAX_EXAMPLES:
            break
In [51]:
# Sample JSON document used to see which AST node types a non-Python snippet
# produces. Stored as a raw string because it contains `\/` sequences, which
# are invalid escapes in a normal Python string literal; the resulting value
# is unchanged.
test_json = r'''
{
"ListProductsResponse": {
"Products": [{
"VatAmount": 0,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "50 kr",
"ExternalProductID": "TELIA",
"Prices": {
"CurrencyIso": "SEK",
"Description": "50 kr",
"FixedDiscount": 0,
"Price": 5000,
"ValidFrom": "\/Date(1293836400000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK010TELIASEKONTANT50",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-01",
"ValidTo": "2061-11-19",
"Value": 5000
}, {
"VatAmount": 0,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "100 kr",
"ExternalProductID": "TELIA",
"Prices": {
"CurrencyIso": "SEK",
"Description": "100 kr",
"FixedDiscount": 0,
"Price": 10000,
"ValidFrom": "\/Date(1294786800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK020TELIASEKONTANT100",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-12",
"ValidTo": "2061-01-12",
"Value": 10000
}, {
"VatAmount": 0,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "150 kr",
"ExternalProductID": "TELIA",
"Prices": {
"CurrencyIso": "SEK",
"Description": "150 kr",
"FixedDiscount": 0,
"Price": 15000,
"ValidFrom": "\/Date(1294786800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK030TELIASEKONTANT150",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-12",
"ValidTo": "2061-01-12",
"Value": 15000
}, {
"VatAmount": 0,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "200 kr",
"ExternalProductID": "TELIA",
"Prices": {
"CurrencyIso": "SEK",
"Description": "200 kr",
"FixedDiscount": 0,
"Price": 20000,
"ValidFrom": "\/Date(1294786800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK040TELIASEKONTANT200",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-12",
"ValidTo": "2061-01-12",
"Value": 20000
}, {
"VatAmount": 0,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "300 kr",
"ExternalProductID": "TELIA",
"Prices": {
"CurrencyIso": "SEK",
"Description": "300 kr",
"FixedDiscount": 0,
"Price": 30000,
"ValidFrom": "\/Date(1294786800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK050TELIASEKONTANT300",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-12",
"ValidTo": "2061-01-12",
"Value": 30000
}, {
"VatAmount": 2500,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "Halvårskort SMS 299 kr",
"ExternalProductID": "TELIA-6MSMS",
"Prices": {
"CurrencyIso": "SEK",
"Description": "Halvårskort SMS 299 kr",
"FixedDiscount": 0,
"Price": 29900,
"ValidFrom": "\/Date(1323298800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK060TELIASEHALVAARSMS",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-12-08",
"ValidTo": "2061-12-09",
"Value": 29900
}, {
"VatAmount": 2500,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "Halvårskort Mobilsurf 299 kr",
"ExternalProductID": "TELIA-6MDATA",
"Prices": {
"CurrencyIso": "SEK",
"Description": "Halvårskort Mobilsurf 299 kr",
"FixedDiscount": 0,
"Price": 29900,
"ValidFrom": "\/Date(1326927600000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK070TELIASEHALVAARMOBSURF",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2012-01-19",
"ValidTo": "2062-01-19",
"Value": 29900
}, {
"VatAmount": 2500,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "Årskort Mobilsurf 499 kr",
"ExternalProductID": "TELIA-YDATA",
"Prices": {
"CurrencyIso": "SEK",
"Description": "Årskort Mobilsurf 499 kr",
"FixedDiscount": 0,
"Price": 49900,
"ValidFrom": "\/Date(1294786800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK080TELIASEKSURF1YEAR",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-12",
"ValidTo": "2061-01-12",
"Value": 49900
}, {
"VatAmount": 2500,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "Årskort SMS & Surf 899 kr",
"ExternalProductID": "TELIA-YCOMBO",
"Prices": {
"CurrencyIso": "SEK",
"Description": "Årskort SMS & Surf 899 kr",
"FixedDiscount": 0,
"Price": 89900,
"ValidFrom": "\/Date(1294786800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK090TELIASEKCOMBO1YEAR",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-12",
"ValidTo": "2061-01-12",
"Value": 89900
}, {
"VatAmount": 2500,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "Årskort Navigator & Surf 699",
"ExternalProductID": "TELIA-YNAV",
"Prices": {
"CurrencyIso": "SEK",
"Description": "Årskort Navigator & Surf 699",
"FixedDiscount": 0,
"Price": 69900,
"ValidFrom": "\/Date(1323298800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK100TELIASEARSKORT",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-12-08",
"ValidTo": "2061-12-09",
"Value": 69900
}, {
"VatAmount": 2500,
"VatFormat": "Percent",
"Bonus": {
"AdditionalValue": 0,
"Description": null,
"ValidFrom": "\/Date(-1807877931046+0200)\/",
"ValidTo": "\/Date(4503555668953+0200)\/"
},
"CurrencyIsoString": "SEK",
"Description": "Årskort SMS 499 kr",
"ExternalProductID": "TELIA-YSMS",
"Prices": {
"CurrencyIso": "SEK",
"Description": "Årskort SMS 499 kr",
"FixedDiscount": 0,
"Price": 49900,
"ValidFrom": "\/Date(1294786800000+0100)\/",
"ValidTo": "\/Date(-62135596800000+0100)\/"
},
"ProductID": "KK110TELIASEKSMS1YEAR",
"ProviderId": "TELIA",
"UnitType": null,
"ValidFrom": "2011-01-12",
"ValidTo": "2061-01-12",
"Value": 49900
}]
},
"Header": {
"AcquirerID": "OKB",
"AgreementID": "92010002",
"ClientIP": "77.40.160.226",
"MerchantID": "Test TopupService",
"TransmissionTime": "2012-10-10 00:00:00"
},
"Status": {
"OperationStatus": "0",
"OperationStatusDesc": "",
"TransactionStatus": 0,
"TransactionStatusDesc": "OK"
}
}'''
In [52]:
# Parse the sample text as Python source and print the type of every AST
# node, one per line.
parsed_tree = ast.parse(test_json)
print('\n'.join(str(type(node)) for node in ast.walk(parsed_tree)))
In [ ]:
In [ ]: