In [2]:
import os
os.chdir('..')
import src.wrangle.create_corpus
import src.analyze.model


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-2-bd995318c3f0> in <module>()
      1 import os
      2 os.chdir('..')
----> 3 import src.wrangle.create_corpus
      4 import src.analyze.model

ImportError: No module named src.wrangle.create_corpus

In [16]:
reload(bills.analyze.model)
bills.analyze.model.score_all_subjects()


1.0
[[3]]
0.0
1.0
[[3]]
0.0
1.0
[[3]]
0.0
1.0
[[3]]
0.0
1.0
[[3]]
0.0
1.0
[[3]]
0.0
1.0
[[3]]
0.0
1.0
[[3]]
0.0

In [ ]:
reload(bills.wrangle.create_corpus)
reload(bills.analyze.model)
X_train, X_test, y_train, y_test = bills.analyze.model.get_bill_info(
'bills_db', 'Joel')

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')

In [ ]:
reload(bills.analyze.model)
results = bills.analyze.model.get_us_data('bills_db', 'Joel', 'Health', vect, 'logistic')

In [ ]:
results[1]

In [ ]:
import pickle
from sklearn import metrics

In [ ]:
mod, vect, y_pred_class, y_pred_prob,X_train_dtm, X_test_dtm, y_train, y_test = pickle.load(open('/Users/Joel/Desktop/Insight/data/model.p', 'rb'))

In [ ]:
mod.coef_[0]

In [ ]:
len(vect.get_feature_names())

In [ ]:
len(mod.coef_[0])

In [ ]:
feats = vect.get_feature_names()

In [ ]:
feat_weights = [(feats[i], mod.coef_[0][i]) for i in range(len(feats))]

In [ ]:
sort_weights = sorted(feat_weights, key=lambda (_, x): -abs(x))

In [ ]:
sort_weights[0:100]

In [ ]:
mod, vect, y_pred_class, y_pred_prob,X_train_dtm, X_test_dtm, y_train, y_test = pickle.load(open('/Users/Joel/Desktop/Insight/data/model.p', 'rb'))
feats = vect.get_feature_names()
feat_weights = [(feats[i], mod.coef_[0][i]) for i in range(len(feats))]
sort_weights = sorted(feat_weights, key=lambda (_, x): -abs(x))

In [ ]:


In [ ]:
sort_weights[0:100]

In [ ]:
len(mod.coef_[0])

In [ ]:
type(test)

In [ ]:
test.size

In [ ]:
test.size*1.0/len(mod.coef_[0])

In [ ]:
len(y_train)

In [ ]:


In [ ]:
pars

In [ ]:
import pandas as pd(
pd.Series(arr.coef_[0]))

In [ ]:
X_test_dtm.size

In [ ]:
metrics.accuracy_score(y_test, y_pred_class)

In [ ]:
metrics.confusion_matrix(y_test, y_pred_class)

In [ ]:
tpr, fpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)

In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_prob)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt

plt.title('Intellectual Property\nReceiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [ ]:
dense = X_train_dtm.toarray()

In [ ]:
X_train_dtm.sum(axis=1)

In [ ]:
len(X_train_dtm[y_train==1])

In [ ]:


In [ ]:
reload(bills.wrangle.create_corpus)
reload(bills.analyze.model)
results = bills.analyze.model.create_lda_tfidf()

In [ ]:
tfidf_mod = results[3]

In [ ]:
tfidf_mod[my_corpus[9]]

In [ ]:
tfidf_mod[my_corpus][0]

In [ ]:
import scipy
test = scipy.sparse.coo_matrix(tfidf_mod[my_corpus][0], shape=(1,1037))
for i in range(1, len(tfidf_mod[my_corpus])):
    test2 = scipy.sparse.coo_matrix(tfidf_mod[my_corpus][i], shape=(1, 1037))
    print(test.size)
    print(test2.size)
    test = scipy.sparse.vstack([test, test2])

In [ ]:
test.toarray()

In [ ]:
text_corps

In [ ]:
text_corps

In [ ]:
import gensim
import pandas as pd
tfidf_matrix = pd.DataFrame(gensim.matutils.sparse2full(tfidf_mod[my_corpus][0], 1237))
print(len(tfidf_mod[my_corpus]))
print(tfidf_matrix.size)
for i in range(1, len(tfidf_mod[my_corpus])):
    tfidf_matrix = pd.concat(tfidf_matrix, pd.DataFrame(gensim.matutils.sparse2full(tfidf_mod[my_corpus][i], 1237)), ignore_index=True, axis=1)

In [ ]:
tfidf_matrix.size

In [ ]:
lda_mod = results[5]

In [ ]:
my_corpus = results[1]

In [ ]:
my_corpus.

In [ ]:
print(lda_mod[my_corpus[3]])

In [ ]:
lda_mod.print_topics()

In [ ]:
my_corpus[0]

In [ ]:
import pandas as pd
pd.DataFrame(results[1][1])

In [ ]:
import numpy as np
import gensim

In [ ]:
len(gensim.matutils.sparse2full(results[3][results[1][5]], 1000))

In [ ]:
from scipy.sparse import csr_matrix
numpy_matrix = gensim.matutils.corpus2dense(results[3][results[1]][2],10)

In [ ]:
import scipy.sparse
scipy_sparse_matrix = scipy.sparse.random(5,2)

In [ ]:
scipy_sparse_matrix

In [ ]:
from gensim import corpora, models

In [ ]:
test = models.LdaModel.

In [ ]:
test = models.LdaModel.load("/Users/Joel/Desktop/Insight/data/lda_text.mdl")

In [ ]:
test_dict = corpora.dictionary.Dictionary.load("/Users/Joel/Desktop/Insight/data/full_text.dict")

In [ ]:
text_corpus = corpora.MmCorpus('../../data/text_cor.mm')

In [ ]:
for doc in test[text_corpus]:
    print doc

In [ ]:
204+32+117+2381

In [ ]:
2734-2462-42-148

In [ ]:
2678+25+12+23

In [ ]:


In [ ]:


In [ ]:


In [ ]:
from sklearn.linear_model import LogisticRegression

In [ ]:
import numpy as np
param_dict = {}

In [ ]:
param_dict['C'] = np.logspace(-3, 1, 8, base=10)

In [ ]:
mod = LogisticRegression(penalty='l1')

In [ ]:
from sklearn.grid_search import GridSearchCV

In [ ]:
err_formula='mean_squared_error'

In [ ]:
n_jobs=1

In [ ]:
# minimize mean absolute error (will be the check for the results)
grid = GridSearchCV(mod, param_dict, cv=10,
                    scoring=err_formula, n_jobs=n_jobs)

# print(trainX)
# print(np.ravel(trainy))
# pdb.set_trace()
grid.fit(X_train_dtm, np.ravel(y_train))

In [ ]:
configs = load('../')

In [9]:
import yaml

with open("../configs.yml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile)

In [10]:
print cfg['dbname']


bills_db

In [11]:
print cfg['username']


Joel

In [4]:
import os
import sys
os.getcwd()


Out[4]:
'/Users/Joel/Desktop/Insight/bill_taxonomy/notebooks'