In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
import paths
import povray
import pandas as pd
from saapy.analysis import *
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis
import pyLDAvis.gensim
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.pylab as pylab
matplotlib.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8, 6
In [50]:
warnings.filterwarnings("ignore", category=DeprecationWarning)
In [4]:
pv = povray.Povray('../../../3party')
ent_frame = pd.read_csv(pv.analysis_dir_path / 'entities.csv')
lexeme_parser = LexemeParser()
In [5]:
metrics_frame = ent_frame.drop(['name', 'defs', 'kindname', 'longname'], axis=1)
In [6]:
lexeme_frame = ent_frame[['name', 'defs']]
parsed_lexemes = lexeme_parser.parse_lexeme_frame(lexeme_frame)
miss_frame = misses_to_frame(parsed_lexemes)
miss_frame.to_csv(pv.analysis_dir_path / 'lexeme-misses.csv', index=True)
print('manually edit the term column in', pv.analysis_dir_path / 'lexeme-misses.csv', 'before executing the next cell')
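Before editing the CSV by hand it helps to gauge how much was missed; a minimal check, assuming misses_to_frame produces one row per unresolved token:
print(len(miss_frame), 'unresolved lexemes')
miss_frame.head()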
In [7]:
term_frame = pd.read_csv(pv.analysis_dir_path / 'lexeme-misses.csv')
term_frame = term_frame.drop(['lexemes'], axis=1).set_index('miss')
term_frame.head()
Out[7]:
In [8]:
terms = term_frame.to_dict()['term']
lexeme_parser.add_terms(terms)
parsed_lexemes = lexeme_parser.parse_lexeme_frame(lexeme_frame)
flattened_lexemes = flatten_lexeme_series(parsed_lexemes, skip_miss=False)
flattened_lexemes.head()
Out[8]:
In [9]:
flattened_lexemes.to_csv(pv.analysis_dir_path / 'flat-lexemes.csv', index=False, header=False)
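LineSentence in the next cell treats each line of this file as one whitespace-delimited sentence, so a quick look at the first few lines confirms the format (a minimal sketch, assuming analysis_dir_path is a pathlib.Path):
with open(pv.analysis_dir_path / 'flat-lexemes.csv') as f:
    for _ in range(3):
        print(next(f).rstrip())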
In [10]:
entity_sentences = LineSentence(str(pv.analysis_dir_path / 'flat-lexemes.csv'))
entity_dict = Dictionary(entity_sentences)
entity_dict.filter_extremes(no_below=10, no_above=0.4)
entity_dict.compactify()
entity_dict.save(str(pv.analysis_dir_path / 'entity.dict'))
# entity_dict = Dictionary.load(str(pv.analysis_dir_path / 'entity.dict'))
entity_dict.save_as_text(str(pv.analysis_dir_path / 'entity.dict.txt'))
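A cheap check that filter_extremes left a sensible vocabulary is the dictionary size and a few token-to-id mappings:
print(len(entity_dict), 'terms kept after filtering')
print(list(entity_dict.token2id.items())[:10])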
In [11]:
def bow_generator(filepath, entity_dict):
    """
    generator function to read flattened entity lexemes from a file
    and yield a bag-of-words representation for each entity
    """
    for sentence in LineSentence(filepath):
        yield entity_dict.doc2bow(sentence)
In [12]:
# generate bag-of-words representations for
# all entities and save them as a matrix
MmCorpus.serialize(str(pv.analysis_dir_path / 'flat-lexemes.mm'),
bow_generator(str(pv.analysis_dir_path / 'flat-lexemes.csv'), entity_dict))
# load the finished bag-of-words corpus from disk
bow_corpus = MmCorpus(str(pv.analysis_dir_path / 'flat-lexemes.mm'))
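Printing the corpus object gives a one-line summary (document count, vocabulary size, non-zero entries) that should line up with the dictionary built above:
print(bow_corpus)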
In [13]:
lda = LdaMulticore(bow_corpus, num_topics=5,
id2word=entity_dict, workers=3)
lda.save(str(pv.analysis_dir_path / 'flat-lexemes.lda'))
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus, entity_dict)
In [14]:
pyLDAvis.display(LDAvis_prepared)
Out[14]:
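For a plain-text record of the same model, gensim can list the top words per topic; a minimal sketch:
for topic_id, words in lda.show_topics(num_topics=5, num_words=10, formatted=True):
    print(topic_id, words)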
In [15]:
metrics_frame.head()
Out[15]:
In [16]:
metrics_frame.columns
Out[16]:
In [17]:
mf = metrics_frame[['CountLineCode', 'AvgEssential', 'MaxEssential',
                    'MaxNesting', 'PercentLackOfCohesion', 'SumEssential']]
In [18]:
mf.head()
Out[18]:
In [19]:
mf.info()
In [20]:
from sklearn.linear_model import LogisticRegression
In [32]:
x_train = mf['CountLineCode'].to_frame()
y_train = mf['AvgEssential']
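# note: LogisticRegression is a classifier, so the AvgEssential values
# are treated here as discrete class labels, not as a continuous metric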
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
# Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log
Out[32]:
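Since AvgEssential is really a continuous complexity metric, an ordinary least-squares fit is arguably the more natural baseline than classification; a minimal sketch with scikit-learn's LinearRegression:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(x_train, y_train)
# R^2 on the training data, loosely analogous to the accuracy score above
round(linreg.score(x_train, y_train), 4)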
In [33]:
x_train.plot()
Out[33]:
In [34]:
y_train.plot()
Out[34]:
In [46]:
def plot_correlation_map(df, corr_method='spearman'):
    corr = df.corr(method=corr_method)
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12}
    )
In [51]:
plot_correlation_map(mf)
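Spearman is the default in the helper above; software metrics tend to be heavily skewed, and rank correlation is more robust to that than Pearson. For comparison:
plot_correlation_map(mf, corr_method='pearson')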
In [104]:
c1s = pd.Series(['2017-04-02T11:22:44+02:00',
                 '2017-04-03T11:21:44+02:00',
                 '2017-04-02T11:22:44+02:00'])  # pd.date_range(start='2017-01-01', periods=3)
c1 = pd.to_datetime(c1s, infer_datetime_format=True, utc=True)
c1[2] = c1[2] + pd.DateOffset(seconds=0.1)
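The fractional-seconds DateOffset above is unusual; pd.Timedelta expresses the same 100 ms shift more directly, as an equivalent sketch:
c1[2] + pd.Timedelta(milliseconds=100)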
In [113]:
pd.Timestamp('2017-04-02T11:22:44+02:00').astimezone(None)
Out[113]:
In [105]:
c2 = {c1[0]: 1, c1[1]: 3}
c3 = {c1[1]: 'a', c1[2]: 'b', c1[0]: 'c'}
tdf = pd.DataFrame.from_dict(dict(c2=c2, c3=c3)).fillna(0)
tdf
Out[105]:
In [87]:
tdf.index
Out[87]:
In [79]:
pv.load_git_graph()
Out[79]:
In [80]:
pv.git_graph.commit_node(ref_name='origin/master')
Out[80]:
In [100]:
pd.DateOffset?