In [ ]:
import numpy as np
import pandas as pd
# Load the exported Trello cards into a DataFrame and print a summary of
# its columns and dtypes.
rawData = pd.read_json(path_or_buf='data/sgdir-cards-20161001.json')
rawData.info()
In [ ]:
# Show the title of sample card 101.
rawData["name"][101]
In [ ]:
# Show the description of sample card 101.
rawData["desc"][101]
In [ ]:
# Show the label ids attached to sample card 101.
rawData["idLabels"][101]
In [ ]:
# Build one text document per card: its title, a space, then its description.
rawText = np.array(
    rawData["name"] + ' ' + rawData["desc"]
)
rawText[101]
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy example: fit the vectorizer on a single five-word sentence, then count
# occurrences of the learned vocabulary in a few new sentences.
countVectorizer = CountVectorizer()
countVectorizer.fit(['coucou hello theodo bam sicara'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(countVectorizer, "get_feature_names_out"):
    feature_names = countVectorizer.get_feature_names_out().tolist()
else:
    feature_names = countVectorizer.get_feature_names()
print("features:", feature_names)

Xtest = countVectorizer.transform([
    "coucou je suis theodo theodo",
    "hello je suis bam",
    "hello hello je suis sicara",
])
# Convert the transform() result to a dense array so the counts are readable.
print(Xtest.toarray())
In [ ]:
# Re-fit the vectorizer on the real card texts and build the feature matrix
# (one row per card); the trailing expression displays its shape.
X_trello = countVectorizer.fit_transform(raw_documents=rawText)
X_trello.shape
In [ ]:
# Collect the per-card label ids and print them for a few sample cards.
labels = np.array(rawData["idLabels"])
for card_index in (0, 100, 101):
    print(labels[card_index])
In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer
# Toy example: learn the classes "a", "b", "c", then encode three label sets
# (including an empty one) as binary indicator rows.
multiLabelBinarizer = MultiLabelBinarizer()
multiLabelBinarizer.fit([list("abc")])
print("classes:", multiLabelBinarizer.classes_)
multiLabelBinarizer.transform([list("ab"), [], list("ac")])
In [ ]:
# Re-fit the binarizer on the real card labels and build the target matrix
# (one row per card); the trailing expression displays its shape.
Y_trello = multiLabelBinarizer.fit_transform(y=labels)
Y_trello.shape
In [ ]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Generate a reproducible two-class 2-D toy dataset around two centres and
# plot each class in its own colour.
centers = [[-2, 0], [1, 1]]
X, y = make_blobs(n_samples=1000, centers=centers, random_state=40)

id0 = np.where(y == 0)
id1 = np.where(y == 1)

plt.figure()
for members, colour in ((id0, 'red'), (id1, 'green')):
    plt.scatter(X[members, 0], X[members, 1], c=colour)
plt.show()
In [ ]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the two blobs and inspect the learned model.
estimator = LogisticRegression()
estimator.fit(X, y)
# Accuracy measured on the same points the model was trained on.
print(estimator.score(X, y))

# Keep the fitted parameters around: the plotting cell below reads them.
coef = estimator.coef_[0]
intercept = estimator.intercept_
print(coef)
print(intercept)

print('should be red', estimator.predict_proba([(0, -10)]))
print('should be green', estimator.predict_proba([(5, 5)]))
# The 50/50 point lies on the decision boundary
#     coef[0] * x0 + coef[1] * x1 + intercept = 0.
# Solve it for x1 = 0 from the fitted parameters instead of hard-coding
# numbers copied from one particular run, which stop being on the boundary
# as soon as the fit changes (different solver or regularisation defaults).
boundary_x0 = -intercept[0] / coef[0]
print('should be 50/50', estimator.predict_proba([(boundary_x0, 0)]))
In [ ]:
# Redraw both classes, then overlay the fitted decision boundary.
plt.figure()
id0 = np.where(y == 0)
id1 = np.where(y == 1)
for members, colour in ((id0, 'red'), (id1, 'green')):
    plt.scatter(X[members, 0], X[members, 1], c=colour)


def line(x0):
    """Return the x1 coordinate of the decision boundary at abscissa ``x0``.

    Solves ``coef[0] * x0 + coef[1] * x1 + intercept = 0`` for ``x1``,
    reading the module-level ``coef`` and ``intercept``.
    """
    return -(coef[0] * x0 + intercept) / coef[1]


# Two points are enough to draw the straight boundary.
boundary_xs = [-2, 1]
plt.plot(boundary_xs, [line(x0) for x0 in boundary_xs], color='blue')
plt.show()
In [ ]:
from sklearn.multiclass import OneVsRestClassifier
# Multi-label model on the Trello data: a one-vs-rest wrapper around
# logistic regression, fitted on every card.
estimator = OneVsRestClassifier(estimator=LogisticRegression()).fit(
    X_trello, Y_trello
)

# Compare the true label row of card 100 with the model's prediction for it.
card_index = 100
print('true: ', Y_trello[card_index])
print('estimation:', estimator.predict(X_trello[card_index]))

# Score computed on the training data itself.
print(estimator.score(X_trello, Y_trello))
In [ ]:
# Hold-out evaluation: train on the first 280 cards, score on the remainder.
train_rows = slice(0, 280)
test_rows = slice(280, None)
estimator.fit(X_trello[train_rows], Y_trello[train_rows])
print(estimator.score(X_trello[test_rows], Y_trello[test_rows]))
In [ ]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation with the estimator's default score; print the
# per-fold scores followed by their mean.
scores = cross_val_score(estimator, X_trello, y=Y_trello, cv=10)
print(scores, scores.mean(), sep='\n')
In [ ]:
from sklearn.metrics import label_ranking_average_precision_score
# Illustrate label ranking average precision on one sample whose only true
# label is the third one, as that label's predicted score drops from the
# highest to the lowest of the three.
true_labels = [[0, 0, 1]]
for predicted_scores in (
    [0.33, 0.45, 0.55],
    [0.33, 0.45, 0.44],
    [0.33, 0.45, 0.32],
):
    print(label_ranking_average_precision_score(true_labels, [predicted_scores]))
In [ ]:
def myScore(estimator, X, Y):
    """Score a fitted estimator by label ranking average precision.

    Has the ``(estimator, X, Y)`` signature so it can be passed as the
    ``scoring`` callable of ``cross_val_score``.

    Args:
        estimator: fitted model exposing ``predict_proba``.
        X: feature rows to score.
        Y: true label indicators for those rows.

    Returns:
        The value of ``label_ranking_average_precision_score`` computed from
        ``Y`` and the estimator's predicted probabilities for ``X``.
    """
    predicted_probabilities = estimator.predict_proba(X)
    return label_ranking_average_precision_score(Y, predicted_probabilities)


# 10-fold cross-validation using the ranking-based scorer defined above;
# print the per-fold scores followed by their mean.
scores = cross_val_score(estimator, X_trello, y=Y_trello, cv=10, scoring=myScore)
print(scores, scores.mean(), sep='\n')