In [1]:
from setup import *
import sys
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *
In [3]:
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 300)
pd.set_option('display.precision', 2)
%pprint
In [4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
In [5]:
lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))
lsi4 = LsiModel.load(os.path.join(DATA_PATH, 'lsi4'))
lsi.num_topics  # only the last expression in the cell is echoed in Out[5]
lsi.show_topics(1, 10)  # the first topic as its 10 highest-weighted words
Out[5]:
In [17]:
# favorites
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'))
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'))
# tweet text (loaded before the vocabulary, which is built from it)
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0)
vocab = Dictionary(df.txt.str.split())
tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))
# Bags of words: one list of (token_id, count) pairs per tweet
bows = pd.Series(vocab.doc2bow(toks) for toks in df.txt.str.split())
In [40]:
topics = lsi[tfidf[bows]]  # sparse LSI topic vectors: one list of (topic_id, weight) per tweet
topics = pd.DataFrame((dict(top) for top in topics), index=df.index)
print(topics.isnull().sum())   # topics missing from a document's sparse vector show up as NaN
topics.fillna(0, inplace=True)
print(topics.isnull().sum())
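An equivalent way to densify the sparse LSI output is gensim's own helper; a sketch (corpus2dense returns one row per topic, hence the transpose):

    from gensim.matutils import corpus2dense
    topics_dense = pd.DataFrame(
        corpus2dense(lsi[tfidf[bows]], num_terms=lsi.num_topics).T,
        index=df.index)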
In [41]:
topics.round(2)  # preview the topic weights rounded to 2 decimal places (round returns a copy)
Out[41]:
When I first ran this, my DataFrames weren't "aligned": the rows of one didn't line up with the rows of the others.
So it's very important to check your datasets after every load.
The row-for-row correspondence between the dates, the topic vectors, and the numerical features is critical for training!
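If they ever do drift apart, one fix is to align everything on a shared index before fitting. A minimal sketch, assuming the three DataFrames use comparable index labels:

    # keep only the rows present in both nums and topics, in the same order,
    # then conform dates to that index
    nums_aligned, topics_aligned = nums.align(topics, join='inner', axis=0)
    dates_aligned = dates.reindex(topics_aligned.index)
    assert len(nums_aligned) == len(topics_aligned) == len(dates_aligned)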
In [42]:
print(len(dates))
print(len(topics))
print(len(nums))
In [43]:
sum(nums.index == dates.index) == len(dates)
Out[43]:
In [44]:
sum(nums.index == topics.index) == len(topics)
Out[44]:
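A slightly more robust version of the same check is Index.equals, which returns False instead of raising when the lengths differ:

    nums.index.equals(dates.index) and nums.index.equals(topics.index)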
In [45]:
disc = LinearDiscriminantAnalysis()
disc
Out[45]:
In [46]:
# squash the heavy-tailed favorite counts into a handful of small integer classes
category = (np.ceil(nums.favorite_count ** .13)).astype(np.int8)
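To get a feel for what that 0.13 power does, here is where a few representative counts land (just for intuition):

    for count in (0, 1, 10, 100, 1000, 10000):
        print(count, int(np.ceil(count ** .13)))
    # 0 -> 0, 1 -> 1, 10 -> 2, 100 -> 2, 1000 -> 3, 10000 -> 4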
In [47]:
disc = LinearDiscriminantAnalysis().fit(topics, category)
In [50]:
predicted_favorites = disc.predict(topics)
predicted_favorites[:100]
Out[50]:
In [52]:
np.sum(predicted_favorites > 0)
Out[52]:
In [163]:
np.sum(nums.favorite_count >= 1)
Out[163]:
But not in a good way:
there are roughly 10x more true favorites than predicted ones.
Our unbalanced training set makes it easy for the judge to be tough.
Let's mellow our judge a bit...
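You can put a number on that gap with the arrays already in memory:

    true_pos = np.sum(nums.favorite_count >= 1)
    pred_pos = np.sum(predicted_favorites > 0)
    print(true_pos / float(pred_pos))   # roughly how many real favorites per predicted favorite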
In [165]:
results = pd.DataFrame()
results['predicted'] = predicted_favorites
results['truth'] = pd.Series(nums.favorite_count >= 1)
conf = Confusion(results)
conf
Out[165]:
In [166]:
results.predicted.corr(results.truth)
Out[166]:
In [167]:
conf.stats_dict
Out[167]:
High accuracy, but low MCC (correlation).
Balance the training set?
Get rid of some negatives?
Accentuate the positive? <-- give this a try yourself (one possible sketch follows)
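One way to "accentuate the positive" is to oversample the favorited tweets instead of throwing away unfavorited ones. This is only a sketch of that idea; the name disc_over and the 3x repetition factor are arbitrary choices, not part of the original notebook:

    # repeat every positive row 3 extra times to reduce the class imbalance
    pos_idx = np.flatnonzero(nums.favorite_count.values >= 1)
    oversampled = np.concatenate([np.arange(len(nums)), np.tile(pos_idx, 3)])
    disc_over = LinearDiscriminantAnalysis().fit(
        topics.values[oversampled], (nums.favorite_count.values >= 1)[oversampled])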
In [168]:
pos = np.array(nums.favorite_count >= 1)
neg = ~pos
portion_pos = float(sum(pos)) / len(nums)
# keep every positive example, plus each negative with probability portion_pos (~50/50 balance)
mask = ((np.random.binomial(1, portion_pos, size=len(nums)).astype(bool) & neg) | pos)
disc = LinearDiscriminantAnalysis().fit(topics[mask], (nums.favorite_count[mask] >= 1))
print(sum(mask))       # rows kept for training
print(sum(pos) * 2)    # expectation: roughly twice the number of positives
In [169]:
results = pd.DataFrame()
results['predicted'] = disc.predict(topics.values)
results['truth'] = nums.favorite_count.values >= 1
conf = Confusion(results)
conf
Out[169]:
In [170]:
results.predicted.corr(results.truth)
Out[170]:
In [171]:
conf.stats_dict
Out[171]:
So let's add some more negative examples back in.
A 50x imbalance is definitely misleading.
But a 2-5x imbalance is probably OK: keeping every positive and sampling negatives with probability 3 * portion_pos leaves roughly three negatives per positive.
In [172]:
portion_neg = 3 * portion_pos
mask = ((np.random.binomial(1, portion_neg, size=len(nums)).astype(bool) & neg) | pos)
disc = LinearDiscriminantAnalysis().fit(topics[mask], nums.favorite_count[mask] >= 1)
print(sum(mask))       # rows kept; with 3x negatives this will be closer to 4 * sum(pos)
print(sum(pos) * 2)
In [173]:
results = pd.DataFrame()
results['predicted'] = disc.predict(topics.values)
results['truth'] = nums.favorite_count.values > 0
conf = Confusion(results)
conf
Out[173]:
At least the confusion matrix looks balanced now.
In [174]:
results.predicted.corr(results.truth)
Out[174]:
We should have known: adding the imbalance back doesn't help...
In [179]:
portion_neg = 2 * portion_pos
mask = ((np.random.binomial(1, portion_neg, size=len(nums)).astype(bool) & neg) | pos)
disc = LinearDiscriminantAnalysis().fit(topics.values[mask], (nums.favorite_count.values > 0)[mask])
print(sum(mask))       # with 2x negatives this is closer to 3 * sum(pos)
print(sum(pos) * 2)
In [180]:
results = pd.DataFrame()
results['predicted'] = disc.predict(topics.values)
results['truth'] = nums.favorite_count.values > 0
conf = Confusion(results)
conf
Out[180]:
In [181]:
results.predicted.corr(results.truth)
Out[181]:
So it looks like 38% correlation is all we can squeeze out of this simple model.
Next up... adding the number of followers and other features.
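As a preview, here is a sketch of what that next step could look like, assuming nums has a follower-count column (the name followers_count is a guess; check your own CSV):

    # append a log-scaled follower count (assumed column name) to the topic vectors
    followers = np.log1p(nums['followers_count'].values).reshape(-1, 1)
    X = np.hstack([topics.values, followers])
    disc_more = LinearDiscriminantAnalysis().fit(X[mask], (nums.favorite_count.values > 0)[mask])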