This notebook contains some code to process and normalize the lexical information appearing in CodeMethod comments and implementations 
(i.e., CodeMethod.comment and CodeMethod.code, respectively).
The overall processing encompasses several normalization steps (relying on the LINSEN normalizer and nltk). Once those steps are completed, the Jaccard coefficient is computed between the code and the comment of each method, and all the analysis information is then stored in a CodeLexiconInfo model instance.
This notebook requires Python 3.
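For reference, the Jaccard coefficient of two token sets A and B is |A ∩ B| / |A ∪ B|. A minimal illustrative sketch of the measure follows; this is not the actual LexicalAnalyzer implementation, just the formula it computes.
In [ ]:
    
# Illustrative sketch only: Jaccard coefficient between two token sets.
def jaccard_coefficient(comment_tokens, code_tokens):
    comment_set, code_set = set(comment_tokens), set(code_tokens)
    if not (comment_set or code_set):
        return 0.0  # both sets empty: define the coefficient as 0
    return len(comment_set & code_set) / len(comment_set | code_set)
    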
In [2]:
    
%load preamble_directives.py
    
In [3]:
    
from source_code_analysis.models import CodeLexiconInfo
    
In [ ]:
    
from lexical_analysis import LINSENnormalizer
    
In [5]:
    
from lexical_analysis import LexicalAnalyzer
    
In [5]:
    
from source_code_analysis.models import SoftwareProject
target_sw_project = SoftwareProject.objects.get(name__iexact='CoffeeMaker')
    
In [6]:
    
# Use the RelatedManager to get all the code methods associated with the target project
code_methods = target_sw_project.code_methods.all()
    
In [10]:
    
total_methods = code_methods.count()
coefficients = list()
for i, method in enumerate(code_methods):
    print('Analyzing Method {0} out of {1}: {2}'.format(i+1, total_methods, method.method_name))
    analyzer = LexicalAnalyzer(method)
    analyzer.analyse_textual_information()
    coefficients.append(analyzer.code_lexical_info.jaccard_coeff)
    
    
In [4]:
    
# NOTE: older SciPy releases re-exported these NumPy functions (scipy.median etc.);
# importing them directly from numpy works on both old and current environments.
from numpy import median, mean, var, std
import numpy as np
    
In [5]:
    
from source_code_analysis.models import SoftwareProject
projects = list()
projects.append(SoftwareProject.objects.get(name__iexact='CoffeeMaker', version__exact='1.0'))
projects.append(SoftwareProject.objects.get(name__iexact='Jfreechart', version__exact='0.6.0'))
projects.append(SoftwareProject.objects.get(name__iexact='Jfreechart', version__exact='0.7.1'))
projects.append(SoftwareProject.objects.get(name__iexact='JHotDraw', version__exact='7.4.1'))
print(projects)
    
    
In [8]:
    
for project in projects:
    code_methods = project.code_methods.all()
    coefficients = list()
    for method in code_methods:
        # Check that this method has no "wrong_association"
        n_evaluations = method.agreement_evaluations.count()
        n_eval_wrong_association = method.agreement_evaluations.filter(wrong_association=True).count()
        if n_evaluations == n_eval_wrong_association:
            # if **all** the evaluations for the current method mark it as a wrong_association
            # exclude it from the statistics
            continue
        clexicon_info = method.lexical_info
        coefficients.append(clexicon_info.jaccard_coeff)
    coeff = np.array(coefficients)
    print('{proj} ({ver}) & {total} & {min:.3} & {max:.3} & {median:.3} & {mean:.3} & {variance:.3} & {devstd:.3} \\\\'.format(
        proj=project.name.title(), ver=project.version,
        total=coeff.size, min=coeff.min(), max=coeff.max(),
        median=median(coeff), mean=coeff.mean(),
        variance=var(coeff), devstd=std(coeff)))
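    
Since the same wrong-association filter recurs in several cells below, a small helper could encapsulate it. This is an illustrative refactoring only; the cells below keep the inline version.
In [ ]:
    
# Illustrative helper (not used in the cells below): True when **all** the
# agreement evaluations flag the method as a wrong comment/code association.
def is_wrong_association(method):
    # Matches the inline check: also True when there are no evaluations at all.
    n_evaluations = method.agreement_evaluations.count()
    n_wrong = method.agreement_evaluations.filter(wrong_association=True).count()
    return n_evaluations == n_wrong
    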
    
    
In [21]:
    
# Import Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
for project in projects:
    
    # Populate the Doc Collection
    document_collection = list()
    
    # Get Methods
    code_methods = project.code_methods.all()
    for method in code_methods:
        # Check that this method has no "wrong_association"
        n_evaluations = method.agreement_evaluations.count()
        n_eval_wrong_association = method.agreement_evaluations.filter(wrong_association=True).count()
        if n_evaluations == n_eval_wrong_association:
            # if **all** the evaluations for the current method mark it as a wrong_association
            # exclude it from the statistics
            continue
        
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
    
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
    tfidf_values = vectorizer.fit_transform(document_collection)
    
    #cosine_sim_vals = list()
    #rows, cols = tfidf_values.shape
    #for i in range(0, rows, 2):
    #    cosine_sim_vals.append(tfidf_values[i].dot(tfidf_values[i+1].T)[0,0])
    #cosine_sim_vals = np.array(cosine_sim_vals)
    comments, code = tfidf_values[::2], tfidf_values[1::2]
    kernel_matrix = linear_kernel(comments, code)  # arrays are still L2 (length) normalized
    cosine_sim_vals = np.diag(kernel_matrix)
    
    print('{proj} ({ver}) & {tot} & {min:.3} & {max:.3} & {med:.3} & {mu:.3} & {var:.3} & {sigma:.3} \\\\'.format(
            proj=project.name.title(), ver=project.version, tot=cosine_sim_vals.size, min=cosine_sim_vals.min(), 
            max=cosine_sim_vals.max(), med=median(cosine_sim_vals), mu=cosine_sim_vals.mean(), 
            var=var(cosine_sim_vals), sigma=std(cosine_sim_vals)))
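    
TfidfVectorizer L2-normalizes its rows by default (norm='l2'), so the linear kernel between a comment vector and a code vector equals their cosine similarity, and the diagonal of the kernel matrix holds exactly the paired comment/code similarities. A quick self-contained check on toy data (illustrative only):
In [ ]:
    
# Toy check: with L2-normalized tf-idf rows, linear kernel == cosine similarity.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
demo_tfidf = TfidfVectorizer().fit_transform(['open the file', 'close the file'])
np.testing.assert_allclose(linear_kernel(demo_tfidf, demo_tfidf),
                           cosine_similarity(demo_tfidf, demo_tfidf))
    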
    
    
In [6]:
    
coff_maker = projects[0]
methods = coff_maker.code_methods.all()
methods = methods[0:2]
docs = list()
for method in methods:
    lex_info = method.lexical_info
    docs.append(lex_info.normalized_comment)
    docs.append(lex_info.normalized_code)
print('Methods: ', len(methods))
print('Docs: ', len(docs))
    
    
In [7]:
    
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
X = vectorizer.fit_transform(docs)
    
    
In [14]:
    
vectorizer.get_feature_names()
    
    Out[14]:
In [21]:
    
x = X[0].toarray()
from scipy.sparse import issparse
print(issparse(x))
    
    
In [30]:
    
x = x.ravel()
    
In [31]:
    
np.where(x>0)
    
    Out[31]:
In [33]:
    
np.take(x, np.where(x>0))
    
    Out[33]:
In [34]:
    
x[np.where(x>0)]
    
    Out[34]:
In [35]:
    
print(vectorizer.get_feature_names())
    
    
In [36]:
    
docs[0]
    
    Out[36]:
In [40]:
    
jhotdraw = projects[-1]
methods = jhotdraw.code_methods.all()
methods = methods[0:2]
docs = list()
for method in methods:
    lex_info = method.lexical_info
    docs.append(lex_info.normalized_comment)
    docs.append(lex_info.normalized_code)
print('Methods: ', len(methods))
print('Docs: ', len(docs))
    
    
In [42]:
    
docs[0], docs[1]
    
    Out[42]:
In [44]:
    
methods[0].lexical_info.normalized_comment
    
    Out[44]:
In [45]:
    
methods[0].lexical_info.normalized_code
    
    Out[45]:
In [46]:
    
methods[0].example.target
    
    Out[46]:
In [19]:
    
# Import Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer
## TODO: see the "Optimization" subsection below for the corresponding tests
from sklearn.metrics.pairwise import linear_kernel  # arrays are still L2 normalized
for project in projects:
    
    # Get Methods
    code_methods = project.code_methods.all()
    
    # Populate the Doc Collection
    document_collection = list()
    for method in code_methods:
        
        # Check that this method has no "wrong_association"
        n_evaluations = method.agreement_evaluations.count()
        n_eval_wrong_association = method.agreement_evaluations.filter(wrong_association=True).count()
        if n_evaluations == n_eval_wrong_association:
            # if **all** the evaluations for the current method mark it as a wrong_association
            # exclude it from the statistics
            continue
        
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
    
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=False, lowercase=False, use_idf=False)
    tf_values = vectorizer.fit_transform(document_collection)
    
    #cosine_sim_vals = list()
    #rows, cols = tf_values.shape
    #for i in range(0, rows, 2):
    #    cosine_sim_vals.append(tf_values[i].dot(tf_values[i+1].T)[0,0])
    #cosine_sim_vals = np.array(cosine_sim_vals)
    
    comments, code = tf_values[::2], tf_values[1::2]
    kernel_matrix = linear_kernel(comments, code)
    cosine_sim_vals = np.diag(kernel_matrix)
    
    print('{proj} ({ver}) & {total} & {min:.3} & {max:.3} & {median:.3} & {mean:.3} & {variance:.3} & {devstd:.3} \\\\'.format(
        proj=project.name.title(), ver=project.version,
        total=cosine_sim_vals.size,
        min=cosine_sim_vals.min(),
        max=cosine_sim_vals.max(),
        median=median(cosine_sim_vals),
        mean=cosine_sim_vals.mean(),
        variance=var(cosine_sim_vals),
        devstd=std(cosine_sim_vals)))
    
    
Trying to optimize the cosine similarity computation by replacing the per-pair cosine_sim_vals loop with vectorized alternatives (first np.vstack, then np.einsum and linear_kernel); a toy comparison of the equivalent formulations follows.
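As a toy illustration (hypothetical data, unrelated to the tf-idf matrices above), the three formulations below compute the same dot product between each even row and the following odd row, which is exactly the paired comment/code pattern used in this notebook:
In [ ]:
    
# Hypothetical toy data: rows (0,1), (2,3), (4,5) play the role of comment/code pairs.
import numpy as np
X = np.arange(12, dtype=float).reshape(6, 2)
loop = np.array([X[i].dot(X[i+1]) for i in range(0, 6, 2)])  # explicit loop
einsum = np.einsum('ij,ij->i', X[::2], X[1::2])              # row-wise dot products
diag = np.diag(X[::2] @ X[1::2].T)                           # diagonal of the kernel matrix
np.testing.assert_allclose(loop, einsum)
np.testing.assert_allclose(loop, diag)
    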
In [6]:
    
from sklearn.feature_extraction.text import TfidfVectorizer
# Target Project (as this is just an example)
project = projects[0]
    
# Get Methods
code_methods = project.code_methods.all()
# Populate the Doc Collection
document_collection = list()
for method in code_methods:
    clexicon_info = method.lexical_info
    document_collection.append(clexicon_info.normalized_comment)
    document_collection.append(clexicon_info.normalized_code)
vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
tfidf_values = vectorizer.fit_transform(document_collection)
rows, cols = tfidf_values.shape
cosine_sim_vals = tfidf_values[0].dot(tfidf_values[1].T)[0,0]
for i in range(2, rows, 2):
    cosine_sim_vals = np.vstack((cosine_sim_vals, tfidf_values[i].dot(tfidf_values[i+1].T)[0,0]))
cosine_sim_vals.ravel()
    
    
    Out[6]:
In [7]:
    
alt_method = np.einsum('ij,ij->i', tfidf_values[::2].toarray(), tfidf_values[1::2].toarray())
alt_method
    
    Out[7]:
In [8]:
    
alt_method.shape
    
    Out[8]:
In [9]:
    
cosine_sim_vals.ravel().shape
    
    Out[9]:
In [10]:
    
np.testing.assert_allclose(cosine_sim_vals.ravel(), alt_method)
    
In [11]:
    
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
    
In [12]:
    
comments, code = tfidf_values[::2], tfidf_values[1::2]
print(comments.shape, code.shape)
    
    
In [13]:
    
kernel = linear_kernel(comments, code)
np.diag(kernel)
    
    Out[13]:
In [14]:
    
from numpy.testing import assert_array_almost_equal
assert_array_almost_equal(alt_method, np.diag(kernel))
    
In [15]:
    
alt_method
    
    Out[15]:
In [16]:
    
cossim = cosine_similarity(comments, code)
np.diag(cossim)
    
    Out[16]:
In [17]:
    
assert_array_almost_equal(alt_method, np.diag(cossim))
assert_array_almost_equal(np.diag(cossim), np.diag(kernel))
    
In [12]:
    
from sklearn.feature_extraction.text import TfidfVectorizer
from evaluations import Judge
judges_combinations = (('leonardo.nole', 'rossella.linsalata'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'rossella.linsalata'),)
CODES_Labels = ('NC', 'DK', 'CO')
from collections import defaultdict
stats_results = defaultdict(list)
for pno, project in enumerate(projects):
    # Get Methods
    code_methods = project.code_methods.all()
    # Populate the Doc Collection
    document_collection = list()
    method_ids_map = dict()  # Map (dict) to store the association method.pk --> Row index in Tfidf Matrix
    for mno, method in enumerate(code_methods):
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
        method_ids_map[method.id] = mno*2
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
    tfidf_values = vectorizer.fit_transform(document_collection)
    j1_usrname, j2_usrname = judges_combinations[pno]
    j1 = Judge(j1_usrname, project.name, project.version)
    j2 = Judge(j2_usrname, project.name, project.version)
    
    j1_evals = j1.three_codes_evaluations
    j2_evals = j2.three_codes_evaluations
    
    project_stats = list()
    for code in range(3):
        j1_evals_code = j1_evals[code]
        j2_evals_code = j2_evals[code]
        
        method_ids = j1_evals_code.intersection(j2_evals_code)
        cosine_sim_vals = list()
        for mid in method_ids:
            i = method_ids_map[mid]
            cosine_sim_vals.append(tfidf_values[i].dot(tfidf_values[i+1].T)[0,0])
        cosine_sim_vals = np.array(cosine_sim_vals)
        project_stats.append(cosine_sim_vals)
    
    for code in range(3):
        vals = project_stats[code]
        label = CODES_Labels[code]
        if vals.size > 0:
            stats_results[label].append('{proj} ({ver}) & {total} & {min:.3} & {max:.3} & {median:.3} & {mean:.3} & {variance:.3} & {devstd:.3} \\\\'.format(
                proj=project.name.title(), ver=project.version,
                total=vals.size,
                min=vals.min(),
                max=vals.max(),
                median=median(vals),
                mean=vals.mean(),
                variance=var(vals),
                devstd=std(vals)))
        else:
            # raw string so that \multicolumn is not parsed as an escape sequence
            stats_results[label].append(r'{proj} ({ver}) & \multicolumn{{7}}{{c|}}{{N.A.}} \\'.format(
                proj=project.name.title(), ver=project.version))
            
for label in stats_results:
    print('\n{0}\n'.format(label))
    for value in stats_results[label]:
        print(value)
    
    
In [13]:
    
judges_combinations = (('leonardo.nole', 'rossella.linsalata'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'rossella.linsalata'),)
CODES_Labels = ('NC', 'DK', 'CO')
from collections import defaultdict
import os  # needed below for the output folder handling
stats_results_paths = defaultdict(list)
pwd_out = !pwd
current_dir = pwd_out[0]
folder_path = os.path.join(current_dir, 'distributions_per_rate_tfidf')
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
for pno, project in enumerate(projects):
    # Get Methods
    code_methods = project.code_methods.all()
    # Populate the Doc Collection
    document_collection = list()
    method_ids_map = dict()  # Map (dict) to store the association method.pk --> Row index in Tfidf Matrix
    for mno, method in enumerate(code_methods):
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
        method_ids_map[method.id] = mno*2
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
    tfidf_values = vectorizer.fit_transform(document_collection)
    j1_usrname, j2_usrname = judges_combinations[pno]
    j1 = Judge(j1_usrname, project.name, project.version)
    j2 = Judge(j2_usrname, project.name, project.version)
    
    j1_evals = j1.three_codes_evaluations
    j2_evals = j2.three_codes_evaluations
    
    project_stats = list()
    for code in range(3):
        j1_evals_code = j1_evals[code]
        j2_evals_code = j2_evals[code]
        
        method_ids = j1_evals_code.intersection(j2_evals_code)
        cosine_sim_vals = list()
        for mid in method_ids:
            i = method_ids_map[mid]
            cosine_sim_vals.append(tfidf_values[i].dot(tfidf_values[i+1].T)[0,0])
        cosine_sim_vals = np.array(cosine_sim_vals)
        project_stats.append(cosine_sim_vals)
    
    for code in range(3):
        vals = project_stats[code]
        label = CODES_Labels[code]
        if vals.size > 0:
            filename = '{label}_{proj}_({ver})_{total}.txt'.format(label=label, 
                                                                   proj=project.name.title(), 
                                                                   ver=project.version,
                                                                   total=vals.size)
            filepath = os.path.join(folder_path, filename)
            np.savetxt(filepath, vals)
            stats_results_paths[label].append(filepath)
            
for label in stats_results_paths:
    print('\n{0}\n'.format(label))
    for path in stats_results_paths[label]:
        print('Saved Filepath:', path)