In [ ]:
import numpy as np
import pandas as pd

# Read the card export into a DataFrame; the path is relative to the
# directory the notebook is run from.
rawData = pd.read_json(path_or_buf='data/sgdir-cards-20161001.json')

# Print the column names, dtypes and non-null counts.
rawData.info()

In [ ]:
# `name` column of the row labelled 101 (bracket access cannot clash with
# DataFrame attributes the way dotted access can).
rawData["name"][101]

In [ ]:
# `desc` column of the row labelled 101.
rawData["desc"][101]

In [ ]:
# `idLabels` column of the row labelled 101.
rawData["idLabels"][101]

In [ ]:
# One text document per row: name and description joined by a single space.
# Missing values are replaced by '' first, because `str + NaN` yields NaN and
# a NaN document makes CountVectorizer raise when the text is vectorized.
# NOTE(review): whether the export actually contains missing names or
# descriptions is not visible from here; the fill is a no-op when it does not.
rawText = np.array(
    rawData["name"].fillna('') + ' ' + rawData["desc"].fillna('')
)

rawText[101]

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
countVectorizer = CountVectorizer()

# Learn a five-word vocabulary from a single toy document.
countVectorizer.fit(['coucou hello theodo bam sicara'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps the printed form identical to the old list-returning API.
if hasattr(countVectorizer, "get_feature_names_out"):
    featureNames = list(countVectorizer.get_feature_names_out())
else:
    featureNames = countVectorizer.get_feature_names()
print("features:", featureNames)

# Each row of the result counts the vocabulary words found in one sentence;
# words that were not in the fitted document ("je", "suis") get no column.
Xtest = countVectorizer.transform([
    "coucou je suis theodo theodo",
    "hello je suis bam",
    "hello hello je suis sicara"
])

print(Xtest.toarray())

In [ ]:
# Refit the vectorizer on the card texts (this replaces the toy vocabulary
# learned earlier) and build the document-term count matrix in one call.
X_trello = countVectorizer.fit_transform(raw_documents=rawText)

# (number of documents, vocabulary size)
X_trello.shape

In [ ]:
# Per-card label ids as a NumPy array, in the same row order as rawData.
labels = np.array(rawData["idLabels"])

# Peek at the labels of a few cards.
for cardIdx in (0, 100, 101):
    print(labels[cardIdx])

In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer
# Toy example: fit on one sample that carries every known label
# (fit returns the binarizer itself, so the two steps can be chained).
multiLabelBinarizer = MultiLabelBinarizer().fit([["a", "b", "c"]])

print("classes:", multiLabelBinarizer.classes_)

# Each label set becomes one 0/1 row with a column per class;
# the empty set maps to an all-zero row.
multiLabelBinarizer.transform([["a", "b"], [], ["a", "c"]])

In [ ]:
# Refit the binarizer on the card labels and encode them as a 0/1
# indicator matrix in one call.
Y_trello = multiLabelBinarizer.fit_transform(y=labels)

# (number of cards, number of distinct labels)
Y_trello.shape

In [ ]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Two Gaussian blobs in 2-D; the fixed random_state makes them reproducible.
centers = [[-2, 0], [1, 1]]
X, y = make_blobs(n_samples=1000, centers=centers, random_state=40)

plt.figure()

# np.where returns a 1-tuple of index arrays; id0 / id1 select the points of
# class 0 and class 1 respectively.
id0, id1 = np.where(y == 0), np.where(y == 1)
for classIdx, classColor in ((id0, 'red'), (id1, 'green')):
    plt.scatter(X[classIdx, 0], X[classIdx, 1], c=classColor)

plt.show()

In [ ]:
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
estimator.fit(X, y)

# Mean accuracy measured on the same points the model was fitted on.
print(estimator.score(X, y))

# coef is the weight vector of the single binary problem (one weight per
# feature); intercept is kept as the array scikit-learn exposes.
coef = estimator.coef_[0]
intercept = estimator.intercept_

print(coef)
print(intercept)

print('should be red', estimator.predict_proba([(0, -10)]))
print('should be green', estimator.predict_proba([(5, 5)]))

# The decision boundary satisfies coef[0]*x0 + coef[1]*x1 + intercept = 0, so
# on the x1 = 0 axis it sits at x0 = -intercept / coef[0]. Computing it from
# the fitted parameters (instead of hard-coding rounded values copied from a
# previous run) keeps the check valid whatever solver or version is used.
boundaryX0 = -intercept[0] / coef[0]
print('should be 50/50', estimator.predict_proba([(boundaryX0, 0)]))

In [ ]:
plt.figure()

# Redraw both classes: class 0 in red, then class 1 in green.
id0, id1 = np.where(y == 0), np.where(y == 1)
for classIdx, classColor in ((id0, 'red'), (id1, 'green')):
    plt.scatter(X[classIdx, 0], X[classIdx, 1], c=classColor)

def line(x0, weights=None, bias=None):
    """Return the x1 coordinate of the decision boundary at abscissa ``x0``.

    Solves ``weights[0] * x0 + weights[1] * x1 + bias = 0`` for ``x1``.

    Parameters
    ----------
    x0 : number
        Position along the first feature axis.
    weights : sequence of two numbers, optional
        Weights of the linear model. Defaults to the notebook-level ``coef``.
    bias : number or array, optional
        Intercept of the linear model. Defaults to the notebook-level
        ``intercept``.

    Returns
    -------
    number or array
        The matching x1 value. The result has the same kind as ``bias``: when
        ``bias`` is an array, the result is an array of the same shape.

    Notes
    -----
    A vertical boundary (``weights[1] == 0``) is not handled.
    """
    # Fall back to the fitted parameters stored at notebook level so existing
    # calls of the form line(x0) keep working unchanged.
    if weights is None:
        weights = coef
    if bias is None:
        bias = intercept
    return (-(x0 * weights[0]) - bias) / weights[1]
# Draw the decision boundary as a segment between x0 = -2 and x0 = 1.
boundaryXs = [-2, 1]
plt.plot(boundaryXs, [line(x0) for x0 in boundaryXs], color='blue')

plt.show()

In [ ]:
from sklearn.multiclass import OneVsRestClassifier
# One binary logistic regression per label column, trained on every card
# (fit returns the classifier itself, so construction and fit are chained).
estimator = OneVsRestClassifier(LogisticRegression()).fit(X_trello, Y_trello)

# Compare the known label row of one card with what the model predicts.
sampleRow = 100
print('true:       ', Y_trello[sampleRow])
print('estimation:', estimator.predict(X_trello[sampleRow]))

# Scored on the training data itself, so this is an optimistic figure.
print(estimator.score(X_trello, Y_trello))

In [ ]:
# Simple hold-out check: fit on the first 280 rows, score on the remainder.
trainEnd = 280
estimator.fit(X_trello[:trainEnd], Y_trello[:trainEnd])

print(estimator.score(X_trello[trainEnd:], Y_trello[trainEnd:]))

In [ ]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation using the estimator's own score method.
scores = cross_val_score(estimator, X_trello, y=Y_trello, cv=10)

# Per-fold scores on one line, their mean on the next.
print(scores, scores.mean(), sep='\n')

In [ ]:
from sklearn.metrics import label_ranking_average_precision_score

# The only true label is the third one. Across the three score vectors it is
# ranked first, then second, then last, so the printed metric decreases.
demoTruth = [[0, 0, 1]]
for demoScores in ([0.33, 0.45, 0.55], [0.33, 0.45, 0.44], [0.33, 0.45, 0.32]):
    print(label_ranking_average_precision_score(demoTruth, [demoScores]))

In [ ]:
def myScore(estimator, X, Y):
    """Score a fitted estimator with label ranking average precision.

    Follows the ``scorer(estimator, X, y)`` calling convention so it can be
    passed as ``scoring=`` to ``cross_val_score``.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict_proba``
    X : feature matrix to evaluate on
    Y : true label indicator matrix for ``X``

    Returns
    -------
    The value of ``label_ranking_average_precision_score`` computed from
    ``Y`` and the estimator's predicted probabilities for ``X``.
    """
    labelProbabilities = estimator.predict_proba(X)
    return label_ranking_average_precision_score(Y, labelProbabilities)

# Same 10-fold cross-validation, scored with myScore instead of the
# estimator's default score method.
scores = cross_val_score(
    estimator,
    X_trello,
    Y_trello,
    scoring=myScore,
    cv=10,
)

# Per-fold scores first, then their mean.
for reported in (scores, scores.mean()):
    print(reported)