In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
import paths
import povray
import pandas as pd
from saapy.analysis import *
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis
import pyLDAvis.gensim
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.pylab as pylab
matplotlib.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8, 6
In [50]:
warnings.filterwarnings("ignore", category=DeprecationWarning)
In [4]:
pv = povray.Povray('../../../3party')
ent_frame = pd.read_csv(pv.analysis_dir_path / 'entities.csv')
lexeme_parser = LexemeParser()
In [5]:
metrics_frame = ent_frame.drop(['name', 'defs', 'kindname', 'longname'], axis=1)
In [6]:
lexeme_frame = ent_frame[['name', 'defs']]
parsed_lexemes = lexeme_parser.parse_lexeme_frame(lexeme_frame)
miss_frame = misses_to_frame(parsed_lexemes)
miss_frame.to_csv(pv.analysis_dir_path / 'lexeme-misses.csv', index=True)
print('manually edit the term column in', pv.analysis_dir_path / 'lexeme-misses.csv', 'before executing the next cell')
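Before editing the CSV by hand it helps to gauge how much was missed; a minimal check, assuming misses_to_frame produces one row per unresolved token:
print(len(miss_frame), 'unresolved lexemes')
miss_frame.head()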
In [7]:
term_frame = pd.read_csv(pv.analysis_dir_path / 'lexeme-misses.csv')
term_frame = term_frame.drop(['lexemes'], axis=1).set_index('miss')
term_frame.head()
Out[7]:
In [8]:
terms = term_frame.to_dict()['term']
lexeme_parser.add_terms(terms)
parsed_lexemes = lexeme_parser.parse_lexeme_frame(lexeme_frame)
flattened_lexemes = flatten_lexeme_series(parsed_lexemes, skip_miss=False)
flattened_lexemes.head()
Out[8]:
In [9]:
flattened_lexemes.to_csv(pv.analysis_dir_path / 'flat-lexemes.csv', index=False, header=False)
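LineSentence in the next cell treats each line of this file as one whitespace-delimited sentence, so a quick look at the first few lines confirms the format (a minimal sketch, assuming analysis_dir_path is a pathlib.Path):
with open(pv.analysis_dir_path / 'flat-lexemes.csv') as f:
    for _ in range(3):
        print(next(f).rstrip())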
In [10]:
entity_sentences = LineSentence(str(pv.analysis_dir_path / 'flat-lexemes.csv'))
entity_dict = Dictionary(entity_sentences)
entity_dict.filter_extremes(no_below=10, no_above=0.4)
entity_dict.compactify()
entity_dict.save(str(pv.analysis_dir_path / 'entity.dict'))
# entity_dict = Dictionary.load(str(pv.analysis_dir_path / 'entity.dict'))
entity_dict.save_as_text(str(pv.analysis_dir_path / 'entity.dict.txt'))
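A cheap check that filter_extremes left a sensible vocabulary is the dictionary size and a few token-to-id mappings:
print(len(entity_dict), 'terms kept after filtering')
print(list(entity_dict.token2id.items())[:10])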
In [11]:
def bow_generator(filepath, entity_dict):
    """
    generator function to read flattened entity lexemes from a file
    and yield a bag-of-words representation for each entity
    """
    for sentence in LineSentence(filepath):
        yield entity_dict.doc2bow(sentence)
In [12]:
# generate bag-of-words representations for
# all entities and save them as a matrix
MmCorpus.serialize(str(pv.analysis_dir_path / 'flat-lexemes.mm'),
bow_generator(str(pv.analysis_dir_path / 'flat-lexemes.csv'), entity_dict))
# load the finished bag-of-words corpus from disk
bow_corpus = MmCorpus(str(pv.analysis_dir_path / 'flat-lexemes.mm'))
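Printing the corpus object gives a one-line summary (document count, vocabulary size, non-zero entries) that should line up with the dictionary built above:
print(bow_corpus)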
In [13]:
lda = LdaMulticore(bow_corpus, num_topics=5,
id2word=entity_dict, workers=3)
lda.save(str(pv.analysis_dir_path / 'flat-lexemes.lda'))
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus, entity_dict)
In [14]:
pyLDAvis.display(LDAvis_prepared)
Out[14]:
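For a plain-text record of the same model, gensim can list the top words per topic; a minimal sketch:
for topic_id, words in lda.show_topics(num_topics=5, num_words=10, formatted=True):
    print(topic_id, words)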
In [15]:
metrics_frame.head()
Out[15]:
In [16]:
metrics_frame.columns
Out[16]:
In [17]:
mf = metrics_frame[['CountLineCode', 'AvgEssential', 'MaxEssential',
                    'MaxNesting', 'PercentLackOfCohesion', 'SumEssential']]
In [18]:
mf.head()
Out[18]:
In [19]:
mf.info()
In [20]:
from sklearn.linear_model import LogisticRegression
In [32]:
x_train = mf['CountLineCode'].to_frame()
y_train = mf['AvgEssential']
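# note: LogisticRegression is a classifier, so the AvgEssential values
# are treated here as discrete class labels, not as a continuous metric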
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
# Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log
Out[32]:
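Since AvgEssential is really a continuous complexity metric, an ordinary least-squares fit is arguably the more natural baseline than classification; a minimal sketch with scikit-learn's LinearRegression:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(x_train, y_train)
# R^2 on the training data, loosely analogous to the accuracy score above
round(linreg.score(x_train, y_train), 4)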
In [33]:
x_train.plot()
Out[33]:
In [34]:
y_train.plot()
Out[34]:
In [46]:
def plot_correlation_map(df, corr_method='spearman'):
    corr = df.corr(method=corr_method)
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12}
    )
In [51]:
plot_correlation_map(mf)
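Spearman is the default in the helper above; software metrics tend to be heavily skewed, and rank correlation is more robust to that than Pearson. For comparison:
plot_correlation_map(mf, corr_method='pearson')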
In [104]:
c1s = pd.Series(['2017-04-02T11:22:44+02:00',
                 '2017-04-03T11:21:44+02:00',
                 '2017-04-02T11:22:44+02:00'])  # pd.date_range(start='2017-01-01', periods=3)
c1 = pd.to_datetime(c1s, infer_datetime_format=True, utc=True)
c1[2] = c1[2] + pd.DateOffset(seconds=0.1)
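The fractional-seconds DateOffset above is unusual; pd.Timedelta expresses the same 100 ms shift more directly, as an equivalent sketch:
c1[2] + pd.Timedelta(milliseconds=100)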
In [113]:
pd.Timestamp('2017-04-02T11:22:44+02:00').astimezone(None)
Out[113]:
In [105]:
c2 = {c1[0]: 1, c1[1]: 3}
c3 = {c1[1]: 'a', c1[2]: 'b', c1[0]: 'c'}
tdf = pd.DataFrame.from_dict(dict(c2=c2, c3=c3)).fillna(0)
tdf
Out[105]:
In [87]:
tdf.index
Out[87]:
In [79]:
pv.load_git_graph()
Out[79]:
In [80]:
pv.git_graph.commit_node(ref_name='origin/master')
Out[80]:
In [100]:
pd.DateOffset?