In [2]:
import os
# Move from the notebooks/ directory up to the repo root so the `src`
# package below is importable.
# NOTE(review): non-idempotent — re-running this cell walks another
# directory level up; guard it or chdir to an absolute path.
os.chdir('..')
import src.wrangle.create_corpus
import src.analyze.model
In [16]:
# Pick up edits to the model module, then re-run subject scoring.
# Fixes: the bare `reload` builtin is Python 2 only (use importlib.reload),
# and the package was imported as `src.analyze.model` above, not `bills.*`.
import importlib
importlib.reload(src.analyze.model)
src.analyze.model.score_all_subjects()
In [ ]:
# Reload both project modules and fetch the train/test split from the DB.
# Fixes: Python-2-only `reload` builtin, and `bills.*` does not match the
# `src.*` import names at the top of the notebook.
import importlib
importlib.reload(src.wrangle.create_corpus)
importlib.reload(src.analyze.model)
X_train, X_test, y_train, y_test = src.analyze.model.get_bill_info(
    'bills_db', 'Joel')
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; English stop words are dropped before counting.
vect = CountVectorizer(stop_words='english')
In [ ]:
# Train/evaluate a logistic model on the 'Health' subject.
# Fixes: Python-2-only `reload` builtin, and `bills.*` does not match the
# `src.*` import names at the top of the notebook.
import importlib
importlib.reload(src.analyze.model)
results = src.analyze.model.get_us_data('bills_db', 'Joel', 'Health', vect, 'logistic')
In [ ]:
# Inspect the second element of the results tuple.
results[1]
In [ ]:
import pickle
from sklearn import metrics
In [ ]:
# Restore a previously trained model and its train/test artifacts.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
# pickle.load on a file you don't control can execute arbitrary code.
mod, vect, y_pred_class, y_pred_prob,X_train_dtm, X_test_dtm, y_train, y_test = pickle.load(open('/Users/Joel/Desktop/Insight/data/model.p', 'rb'))
In [ ]:
# Coefficient vector of the (binary) classifier — one weight per term.
mod.coef_[0]
In [ ]:
# Vocabulary size; should equal len(mod.coef_[0]) checked below.
# NOTE(review): get_feature_names() was removed in newer scikit-learn
# (use get_feature_names_out()) — confirm the pinned version.
len(vect.get_feature_names())
In [ ]:
len(mod.coef_[0])
In [ ]:
# Vocabulary terms, index-aligned with the coefficient vector.
feats = vect.get_feature_names()
In [ ]:
feat_weights = [(feats[i], mod.coef_[0][i]) for i in range(len(feats))]
In [ ]:
sort_weights = sorted(feat_weights, key=lambda (_, x): -abs(x))
In [ ]:
sort_weights[0:100]
In [ ]:
# Self-contained repeat of the cells above: reload the pickled model and
# rebuild the sorted term/weight list in one cell.
# NOTE(review): hardcoded absolute path; pickle on untrusted files is unsafe.
# Fix: the Python-2-only tuple-parameter lambda was a SyntaxError in Python 3.
mod, vect, y_pred_class, y_pred_prob,X_train_dtm, X_test_dtm, y_train, y_test = pickle.load(open('/Users/Joel/Desktop/Insight/data/model.p', 'rb'))
feats = vect.get_feature_names()
feat_weights = [(feats[i], mod.coef_[0][i]) for i in range(len(feats))]
sort_weights = sorted(feat_weights, key=lambda fw: -abs(fw[1]))
In [ ]:
In [ ]:
# The 100 most influential terms by |weight|.
sort_weights[0:100]
In [ ]:
len(mod.coef_[0])
In [ ]:
# NOTE(review): `test` is not assigned until a much later cell — this
# relies on out-of-order execution and fails on a fresh kernel.
type(test)
In [ ]:
test.size
In [ ]:
# Nonzero entries as a fraction of the coefficient-vector length
# (the *1.0 forces float division, a Python 2 habit).
test.size*1.0/len(mod.coef_[0])
In [ ]:
len(y_train)
In [ ]:
In [ ]:
# NOTE(review): `pars` is never defined anywhere in this notebook —
# this cell always raises NameError.
pars
In [ ]:
# Fix: a stray "(" after the import fused two statements into a
# SyntaxError; split them back apart.
import pandas as pd
# NOTE(review): `arr` is undefined here — presumably this meant `mod`
# (the fitted classifier); confirm before re-running.
pd.Series(arr.coef_[0])
In [ ]:
# Number of stored (nonzero) entries in the test document-term matrix.
X_test_dtm.size
In [ ]:
# Overall test-set accuracy.
metrics.accuracy_score(y_test, y_pred_class)
In [ ]:
# Confusion matrix: rows = true class, columns = predicted class.
metrics.confusion_matrix(y_test, y_pred_class)
In [ ]:
tpr, fpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
In [ ]:
import matplotlib.pyplot as plt
# Render figures inline in the notebook.
%matplotlib inline
In [ ]:
# Plot the test-set ROC curve with the area-under-curve in the legend.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_prob)
roc_auc = metrics.auc(fpr, tpr)
# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.set_title('Intellectual Property\nReceiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')
# Chance diagonal for reference.
ax.plot([0, 1], [0, 1],'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()
In [ ]:
# Densify the training document-term matrix (memory-heavy for a large vocab).
dense = X_train_dtm.toarray()
In [ ]:
# Total token count per training document (row sums).
X_train_dtm.sum(axis=1)
In [ ]:
len(X_train_dtm[y_train==1])
In [ ]:
In [ ]:
# Reload project modules and build the LDA / tf-idf artifacts.
# Fixes: Python-2-only `reload` builtin, and `bills.*` does not match the
# `src.*` import names at the top of the notebook.
import importlib
importlib.reload(src.wrangle.create_corpus)
importlib.reload(src.analyze.model)
results = src.analyze.model.create_lda_tfidf()
In [ ]:
# tf-idf model is the 4th element of the results tuple.
tfidf_mod = results[3]
In [ ]:
# NOTE(review): `my_corpus` is not assigned until a later cell
# (my_corpus = results[1]) — out-of-order; fails on a fresh kernel.
tfidf_mod[my_corpus[9]]
In [ ]:
# tf-idf (sparse gensim) vector of the first document.
tfidf_mod[my_corpus][0]
In [ ]:
# Stack the per-document tf-idf vectors into one sparse matrix.
# Fixes: the loop body had lost its indentation (SyntaxError as exported),
# and `import scipy` alone does not guarantee the scipy.sparse submodule
# is loaded — import it explicitly.
import scipy.sparse
test = scipy.sparse.coo_matrix(tfidf_mod[my_corpus][0], shape=(1,1037))
for i in range(1, len(tfidf_mod[my_corpus])):
    test2 = scipy.sparse.coo_matrix(tfidf_mod[my_corpus][i], shape=(1, 1037))
    print(test.size)
    print(test2.size)
    # TODO(review): repeated vstack is O(n^2); if this cell is kept,
    # collect the rows in a list and vstack once at the end.
    test = scipy.sparse.vstack([test, test2])
In [ ]:
# Densify the stacked matrix to eyeball it.
test.toarray()
In [ ]:
# NOTE(review): `text_corps` is never defined in this notebook — likely a
# typo for one of the corpus variables; this cell raises NameError.
text_corps
In [ ]:
text_corps
In [ ]:
import gensim
import pandas as pd
# Build a dense (terms x documents) tf-idf matrix, one column per document.
tfidf_matrix = pd.DataFrame(gensim.matutils.sparse2full(tfidf_mod[my_corpus][0], 1237))
print(len(tfidf_mod[my_corpus]))
print(tfidf_matrix.size)
# Fixes: pd.concat takes a *list* of objects — passing two frames
# positionally is a TypeError — and concatenating inside the loop is
# quadratic; collect the columns first and concatenate once.  The loop
# body had also lost its indentation in the export.
cols = [pd.DataFrame(gensim.matutils.sparse2full(tfidf_mod[my_corpus][i], 1237))
        for i in range(1, len(tfidf_mod[my_corpus]))]
tfidf_matrix = pd.concat([tfidf_matrix] + cols, ignore_index=True, axis=1)
In [ ]:
tfidf_matrix.size
In [ ]:
# LDA model is the 6th element of the results tuple.
lda_mod = results[5]
In [ ]:
# Gensim corpus.  NOTE(review): earlier cells already reference
# `my_corpus` — on a fresh kernel this assignment must run first.
my_corpus = results[1]
In [ ]:
my_corpus.
In [ ]:
# Topic distribution inferred for document 3.
print(lda_mod[my_corpus[3]])
In [ ]:
# Top words per LDA topic.
lda_mod.print_topics()
In [ ]:
my_corpus[0]
In [ ]:
# NOTE(review): pandas is re-imported in several cells — consolidate all
# imports into one cell at the top of the notebook.
import pandas as pd
pd.DataFrame(results[1][1])
In [ ]:
import numpy as np
import gensim
In [ ]:
# Length of document 5's tf-idf vector when densified to 1000 terms.
len(gensim.matutils.sparse2full(results[3][results[1][5]], 1000))
In [ ]:
from scipy.sparse import csr_matrix
# NOTE(review): csr_matrix is imported but unused here.  The indexing
# results[3][results[1]][2] differs from results[3][results[1][5]] used
# above — possibly a misplaced bracket; confirm the intended expression.
numpy_matrix = gensim.matutils.corpus2dense(results[3][results[1]][2],10)
In [ ]:
import scipy.sparse
# Small random sparse matrix (5x2) as a sanity check of the sparse API;
# default density is 0.01, so it is likely all-zero.
scipy_sparse_matrix = scipy.sparse.random(5,2)
In [ ]:
scipy_sparse_matrix
In [ ]:
from gensim import corpora, models
In [ ]:
test = models.LdaModel.
In [ ]:
# Load a previously trained LDA model.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
test = models.LdaModel.load("/Users/Joel/Desktop/Insight/data/lda_text.mdl")
In [ ]:
# Token dictionary built over the full text corpus.
test_dict = corpora.dictionary.Dictionary.load("/Users/Joel/Desktop/Insight/data/full_text.dict")
In [ ]:
# NOTE(review): relative path here versus absolute paths above — these
# cannot all resolve from the same working directory; unify them.
text_corpus = corpora.MmCorpus('../../data/text_cor.mm')
In [ ]:
# Print the inferred topic distribution for every document in the corpus.
# Fixes: Python-2 print statement, and the loop body had lost its
# indentation in the export.
for doc in test[text_corpus]:
    print(doc)
In [ ]:
# Scratch arithmetic — presumably tallying document/class counts, but the
# provenance of these numbers is unrecorded; totals 2734.
204+32+117+2381
In [ ]:
2734-2462-42-148
In [ ]:
2678+25+12+23
In [ ]:
In [ ]:
In [ ]:
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
import numpy as np
# Hyperparameter grid for the search below.
param_dict = {}
In [ ]:
# 8 values of the inverse regularisation strength C, log-spaced 1e-3..10.
param_dict['C'] = np.logspace(-3, 1, 8, base=10)
In [ ]:
# L1 penalty for sparse coefficients.
# NOTE(review): newer scikit-learn requires solver='liblinear' or 'saga'
# with penalty='l1' — confirm against the pinned version.
mod = LogisticRegression(penalty='l1')
In [ ]:
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20;
# modern versions provide sklearn.model_selection.GridSearchCV.
from sklearn.grid_search import GridSearchCV
In [ ]:
# NOTE(review): this scoring name was later renamed
# 'neg_mean_squared_error'; MSE is also an unusual score for a
# classifier — accuracy or roc_auc would be more conventional.
err_formula='mean_squared_error'
In [ ]:
n_jobs=1
In [ ]:
# 10-fold cross-validated grid search over C for the L1 logistic model,
# scored with err_formula (see the caveat where it is defined above).
grid = GridSearchCV(mod, param_dict, cv=10,
scoring=err_formula, n_jobs=n_jobs)
# np.ravel flattens y_train in case it is a column vector / DataFrame.
grid.fit(X_train_dtm, np.ravel(y_train))
In [ ]:
configs = load('../')
In [9]:
import yaml
# Load the project configuration file.
# Fix: yaml.load without an explicit Loader can construct arbitrary
# Python objects from tagged YAML (unsafe) and requires a Loader argument
# in PyYAML >= 6; safe_load is the right call for a plain config file.
with open("../configs.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
In [10]:
print cfg['dbname']
In [11]:
print cfg['username']
In [4]:
import os
# NOTE(review): sys is imported but unused in the visible notebook.
import sys
# Sanity-check the working directory (it was mutated by os.chdir at the top).
os.getcwd()
Out[4]: