In [2]:
code_lines = sqlCtx.read.json(
    # Put the location of your data here
    'git_repos/*.json.gz',
)
code_lines = code_lines.repartition(300)
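If you want to check that the columns match the fields unpacked below, printing the DataFrame's schema is a quick sanity check (the exact field names depend on how your data was exported):
# Confirm the columns present in the loaded data
code_lines.printSchema()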
In [3]:
import re

def split_code(tokens):
    # Join the tokens back into one string, then keep only identifier-like words
    strs = ' '.join(tokens)
    patt = re.compile(ur"[\w]+", re.UNICODE)
    return patt.findall(strs)
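As a quick illustration of what this does, a single (hypothetical) line of code is reduced to its word-like tokens:
# Example: punctuation is stripped, leaving only word-like tokens
split_code(['def', 'foo(bar):'])
# -> ['def', 'foo', 'bar']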
In [4]:
words = code_lines\
    .map(
        # Unpack each row and keep only the raw line of code, split on whitespace
        lambda (
            author,
            author_mail,
            author_time,
            author_timezone,
            comment,
            commit_id,
            committer,
            committer_mail,
            committer_time,
            committer_timezone,
            filename,
            line,
            line_num,
            repo_name,
        ):
            (line.split())
    )\
    .map(lambda line: [f.lower() for f in line])\
    .map(lambda line: split_code(line))\
    .filter(lambda line: line != [])
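Before training, it can be worth pulling a few tokenized lines back to the driver to make sure the pipeline behaves as expected (the output depends entirely on your repos):
# Peek at a few tokenized lines of code
words.take(3)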
In [5]:
from pyspark.mllib.feature import Word2Vec
word2vec = Word2Vec()
word2vec.setMinCount(25) # Default 5
word2vec.setVectorSize(50) # Default 100
model = word2vec.fit(words)
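Once the model is fit you can query it directly; for example, findSynonyms returns the nearest tokens by cosine similarity. This is just a sketch, assuming a common token like 'self' survived the minimum count cutoff:
# Nearest neighbours of a token by cosine similarity ('self' assumed to be in the vocabulary)
for word, cosine_sim in model.findSynonyms('self', 5):
    print word, cosine_sim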
We save two copies: one JSON version that can be passed around to other people, and a pickle version that you can use to load the model on your own machine.
In [7]:
import json
model_dict = {k:list(v) for k,v in dict(model.getVectors()).iteritems()}
with open("/tmp/py2vec_model.json", "w") as f:
json.dump(model_dict, f, indent=4)
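Anyone you hand the JSON file to can rebuild the word-to-vector mapping without Spark; a minimal sketch that turns the lists back into numpy arrays:
# Rebuild the word -> vector mapping from the shared JSON file (no Spark required)
import json
import numpy as np
with open("/tmp/py2vec_model.json") as f:
    json_model = {k: np.array(v) for k, v in json.load(f).iteritems()}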
In [15]:
import cPickle as pickle
import numpy as np
model_dict = {k:np.array(list(v)) for k,v in dict(model.getVectors()).iteritems()}
with open("/tmp/py2vec_model.pkl", "wb") as f:
pickle.dump(model_dict, f)
In [16]:
with open("/tmp/py2vec_model.pkl", "rb") as f:
loaded_model = pickle.load(f)
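Each entry in the loaded model is just a numpy vector, so similarities can be computed locally. A small sketch, assuming both tokens made it into the vocabulary:
import numpy as np

def similarity(model, w1, w2):
    # Cosine similarity between the vectors for two tokens
    v1, v2 = model[w1], model[w2]
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

similarity(loaded_model, 'def', 'class')  # hypothetical tokens for illustration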