In [2]:
import os
# Move from the notebooks/ directory up to the repo root so the `src`
# package below is importable.
# NOTE(review): non-idempotent — re-running this cell walks another
# directory level up; guard it or chdir to an absolute path.
os.chdir('..')
import src.wrangle.create_corpus
import src.analyze.model
In [16]:
# Pick up edits to the model module, then re-run subject scoring.
# Fixes: the bare `reload` builtin is Python 2 only (use importlib.reload),
# and the package was imported as `src.analyze.model` above, not `bills.*`.
import importlib
importlib.reload(src.analyze.model)
src.analyze.model.score_all_subjects()
In [ ]:
# Reload both project modules and fetch the train/test split from the DB.
# Fixes: Python-2-only `reload` builtin, and `bills.*` does not match the
# `src.*` import names at the top of the notebook.
import importlib
importlib.reload(src.wrangle.create_corpus)
importlib.reload(src.analyze.model)
X_train, X_test, y_train, y_test = src.analyze.model.get_bill_info(
    'bills_db', 'Joel')
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; English stop words are dropped before counting.
vect = CountVectorizer(stop_words='english')
In [ ]:
# Train/evaluate a logistic model on the 'Health' subject.
# Fixes: Python-2-only `reload` builtin, and `bills.*` does not match the
# `src.*` import names at the top of the notebook.
import importlib
importlib.reload(src.analyze.model)
results = src.analyze.model.get_us_data('bills_db', 'Joel', 'Health', vect, 'logistic')
In [ ]:
# Inspect the second element of the results tuple.
results[1]
In [ ]:
import pickle
from sklearn import metrics
In [ ]:
# Restore a previously trained model and its train/test artifacts.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
# pickle.load on a file you don't control can execute arbitrary code.
mod, vect, y_pred_class, y_pred_prob,X_train_dtm, X_test_dtm, y_train, y_test = pickle.load(open('/Users/Joel/Desktop/Insight/data/model.p', 'rb'))
In [ ]:
# Coefficient vector of the (binary) classifier — one weight per term.
mod.coef_[0]
In [ ]:
# Vocabulary size; should equal len(mod.coef_[0]) checked below.
# NOTE(review): get_feature_names() was removed in newer scikit-learn
# (use get_feature_names_out()) — confirm the pinned version.
len(vect.get_feature_names())
In [ ]:
len(mod.coef_[0])
In [ ]:
# Vocabulary terms, index-aligned with the coefficient vector.
feats = vect.get_feature_names()
In [ ]:
feat_weights = [(feats[i], mod.coef_[0][i]) for i in range(len(feats))]
In [ ]:
sort_weights = sorted(feat_weights, key=lambda (_, x): -abs(x))
In [ ]:
sort_weights[0:100]
In [ ]:
# Self-contained repeat of the cells above: reload the pickled model and
# rebuild the sorted term/weight list in one cell.
# NOTE(review): hardcoded absolute path; pickle on untrusted files is unsafe.
# Fix: the Python-2-only tuple-parameter lambda was a SyntaxError in Python 3.
mod, vect, y_pred_class, y_pred_prob,X_train_dtm, X_test_dtm, y_train, y_test = pickle.load(open('/Users/Joel/Desktop/Insight/data/model.p', 'rb'))
feats = vect.get_feature_names()
feat_weights = [(feats[i], mod.coef_[0][i]) for i in range(len(feats))]
sort_weights = sorted(feat_weights, key=lambda fw: -abs(fw[1]))
In [ ]:
In [ ]:
# The 100 most influential terms by |weight|.
sort_weights[0:100]
In [ ]:
len(mod.coef_[0])
In [ ]:
# NOTE(review): `test` is not assigned until a much later cell — this
# relies on out-of-order execution and fails on a fresh kernel.
type(test)
In [ ]:
test.size
In [ ]:
# Nonzero entries as a fraction of the coefficient-vector length
# (the *1.0 forces float division, a Python 2 habit).
test.size*1.0/len(mod.coef_[0])
In [ ]:
len(y_train)
In [ ]:
In [ ]:
# NOTE(review): `pars` is never defined anywhere in this notebook —
# this cell always raises NameError.
pars
In [ ]:
# Fix: a stray "(" after the import fused two statements into a
# SyntaxError; split them back apart.
import pandas as pd
# NOTE(review): `arr` is undefined here — presumably this meant `mod`
# (the fitted classifier); confirm before re-running.
pd.Series(arr.coef_[0])
In [ ]:
# Number of stored (nonzero) entries in the test document-term matrix.
X_test_dtm.size
In [ ]:
# Overall test-set accuracy.
metrics.accuracy_score(y_test, y_pred_class)
In [ ]:
# Confusion matrix: rows = true class, columns = predicted class.
metrics.confusion_matrix(y_test, y_pred_class)
In [ ]:
tpr, fpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
In [ ]:
import matplotlib.pyplot as plt
# Render figures inline in the notebook.
%matplotlib inline
In [ ]:
# Plot the test-set ROC curve with the area-under-curve in the legend.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_prob)
roc_auc = metrics.auc(fpr, tpr)
# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.set_title('Intellectual Property\nReceiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')
# Chance diagonal for reference.
ax.plot([0, 1], [0, 1],'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()
In [ ]:
# Densify the training document-term matrix (memory-heavy for a large vocab).
dense = X_train_dtm.toarray()
In [ ]:
# Total token count per training document (row sums).
X_train_dtm.sum(axis=1)
In [ ]:
len(X_train_dtm[y_train==1])
In [ ]:
In [ ]:
# Reload project modules and build the LDA / tf-idf artifacts.
# Fixes: Python-2-only `reload` builtin, and `bills.*` does not match the
# `src.*` import names at the top of the notebook.
import importlib
importlib.reload(src.wrangle.create_corpus)
importlib.reload(src.analyze.model)
results = src.analyze.model.create_lda_tfidf()
In [ ]:
# tf-idf model is the 4th element of the results tuple.
tfidf_mod = results[3]
In [ ]:
# NOTE(review): `my_corpus` is not assigned until a later cell
# (my_corpus = results[1]) — out-of-order; fails on a fresh kernel.
tfidf_mod[my_corpus[9]]
In [ ]:
# tf-idf (sparse gensim) vector of the first document.
tfidf_mod[my_corpus][0]
In [ ]:
# Stack the per-document tf-idf vectors into one sparse matrix.
# Fixes: the loop body had lost its indentation (SyntaxError as exported),
# and `import scipy` alone does not guarantee the scipy.sparse submodule
# is loaded — import it explicitly.
import scipy.sparse
test = scipy.sparse.coo_matrix(tfidf_mod[my_corpus][0], shape=(1,1037))
for i in range(1, len(tfidf_mod[my_corpus])):
    test2 = scipy.sparse.coo_matrix(tfidf_mod[my_corpus][i], shape=(1, 1037))
    print(test.size)
    print(test2.size)
    # TODO(review): repeated vstack is O(n^2); if this cell is kept,
    # collect the rows in a list and vstack once at the end.
    test = scipy.sparse.vstack([test, test2])
In [ ]:
# Densify the stacked matrix to eyeball it.
test.toarray()
In [ ]:
# NOTE(review): `text_corps` is never defined in this notebook — likely a
# typo for one of the corpus variables; this cell raises NameError.
text_corps
In [ ]:
text_corps
In [ ]:
import gensim
import pandas as pd
# Build a dense (terms x documents) tf-idf matrix, one column per document.
tfidf_matrix = pd.DataFrame(gensim.matutils.sparse2full(tfidf_mod[my_corpus][0], 1237))
print(len(tfidf_mod[my_corpus]))
print(tfidf_matrix.size)
# Fixes: pd.concat takes a *list* of objects — passing two frames
# positionally is a TypeError — and concatenating inside the loop is
# quadratic; collect the columns first and concatenate once.  The loop
# body had also lost its indentation in the export.
cols = [pd.DataFrame(gensim.matutils.sparse2full(tfidf_mod[my_corpus][i], 1237))
        for i in range(1, len(tfidf_mod[my_corpus]))]
tfidf_matrix = pd.concat([tfidf_matrix] + cols, ignore_index=True, axis=1)
In [ ]:
tfidf_matrix.size
In [ ]:
# LDA model is the 6th element of the results tuple.
lda_mod = results[5]
In [ ]:
# Gensim corpus.  NOTE(review): earlier cells already reference
# `my_corpus` — on a fresh kernel this assignment must run first.
my_corpus = results[1]
In [ ]:
my_corpus.
In [ ]:
# Topic distribution inferred for document 3.
print(lda_mod[my_corpus[3]])
In [ ]:
# Top words per LDA topic.
lda_mod.print_topics()
In [ ]:
my_corpus[0]
In [ ]:
# NOTE(review): pandas is re-imported in several cells — consolidate all
# imports into one cell at the top of the notebook.
import pandas as pd
pd.DataFrame(results[1][1])
In [ ]:
import numpy as np
import gensim
In [ ]:
# Length of document 5's tf-idf vector when densified to 1000 terms.
len(gensim.matutils.sparse2full(results[3][results[1][5]], 1000))
In [ ]:
from scipy.sparse import csr_matrix
# NOTE(review): csr_matrix is imported but unused here.  The indexing
# results[3][results[1]][2] differs from results[3][results[1][5]] used
# above — possibly a misplaced bracket; confirm the intended expression.
numpy_matrix = gensim.matutils.corpus2dense(results[3][results[1]][2],10)
In [ ]:
import scipy.sparse
# Small random sparse matrix (5x2) as a sanity check of the sparse API;
# default density is 0.01, so it is likely all-zero.
scipy_sparse_matrix = scipy.sparse.random(5,2)
In [ ]:
scipy_sparse_matrix
In [ ]:
from gensim import corpora, models
In [ ]:
test = models.LdaModel.
In [ ]:
# Load a previously trained LDA model.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
test = models.LdaModel.load("/Users/Joel/Desktop/Insight/data/lda_text.mdl")
In [ ]:
# Token dictionary built over the full text corpus.
test_dict = corpora.dictionary.Dictionary.load("/Users/Joel/Desktop/Insight/data/full_text.dict")
In [ ]:
# NOTE(review): relative path here versus absolute paths above — these
# cannot all resolve from the same working directory; unify them.
text_corpus = corpora.MmCorpus('../../data/text_cor.mm')
In [ ]:
# Print the inferred topic distribution for every document in the corpus.
# Fixes: Python-2 print statement, and the loop body had lost its
# indentation in the export.
for doc in test[text_corpus]:
    print(doc)
In [ ]:
# Scratch arithmetic — presumably tallying document/class counts, but the
# provenance of these numbers is unrecorded; totals 2734.
204+32+117+2381
In [ ]:
2734-2462-42-148
In [ ]:
2678+25+12+23
In [ ]:
In [ ]:
In [ ]:
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
import numpy as np
# Hyperparameter grid for the search below.
param_dict = {}
In [ ]:
# 8 values of the inverse regularisation strength C, log-spaced 1e-3..10.
param_dict['C'] = np.logspace(-3, 1, 8, base=10)
In [ ]:
# L1 penalty for sparse coefficients.
# NOTE(review): newer scikit-learn requires solver='liblinear' or 'saga'
# with penalty='l1' — confirm against the pinned version.
mod = LogisticRegression(penalty='l1')
In [ ]:
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20;
# modern versions provide sklearn.model_selection.GridSearchCV.
from sklearn.grid_search import GridSearchCV
In [ ]:
# NOTE(review): this scoring name was later renamed
# 'neg_mean_squared_error'; MSE is also an unusual score for a
# classifier — accuracy or roc_auc would be more conventional.
err_formula='mean_squared_error'
In [ ]:
n_jobs=1
In [ ]:
# 10-fold cross-validated grid search over C for the L1 logistic model,
# scored with err_formula (see the caveat where it is defined above).
grid = GridSearchCV(mod, param_dict, cv=10,
scoring=err_formula, n_jobs=n_jobs)
# np.ravel flattens y_train in case it is a column vector / DataFrame.
grid.fit(X_train_dtm, np.ravel(y_train))
In [ ]:
configs = load('../')
In [9]:
import yaml
# Load the project configuration file.
# Fix: yaml.load without an explicit Loader can construct arbitrary
# Python objects from tagged YAML (unsafe) and requires a Loader argument
# in PyYAML >= 6; safe_load is the right call for a plain config file.
with open("../configs.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
In [10]:
print cfg['dbname']
In [11]:
print cfg['username']
In [4]:
import os
# NOTE(review): sys is imported but unused in the visible notebook.
import sys
# Sanity-check the working directory (it was mutated by os.chdir at the top).
os.getcwd()
Out[4]: