We import the libraries we need.
In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neural_network import MLPClassifier
Load the data: a CSV of country names and their continents.
In [2]:
countries = pd.read_csv("countries.csv", names=['name', 'continent'])
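A quick sanity check never hurts; assuming countries.csv really does hold one name,continent pair per row, this shows the first few rows and the class balance.
In [ ]:
# Peek at the parsed data and count countries per continent (assumes the two-column layout above)
print(countries.head())
countries.continent.value_counts()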
Configure the feature extractors.
In [11]:
cv = CountVectorizer(analyzer='char', ngram_range=(1, 1))  # character unigram counts from each name
vec = DictVectorizer(sparse=False)                         # one-hot encoding of the continent labels
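To get a feel for what character unigrams look like, here is a toy run on two made-up names; the vocabulary and counts below come from these strings only, not from the real data.
In [ ]:
# Toy illustration of character unigram features (get_feature_names_out needs scikit-learn >= 1.0)
toy = CountVectorizer(analyzer='char', ngram_range=(1, 1))
toy_counts = toy.fit_transform(["India", "Chad"])
print(toy.get_feature_names_out())  # ['a' 'c' 'd' 'h' 'i' 'n']
print(toy_counts.toarray())         # per-name character counts, e.g. 'India' -> [1 0 1 0 2 1]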
Transform inputs and outputs into a suitable format.
In [12]:
inputs = cv.fit_transform(countries.name)
outputs = vec.fit_transform(countries.continent.apply(lambda c: {'continent': c}))
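It can help to check the shapes before training; the exact numbers depend on your countries.csv, so treat this as a sketch.
In [ ]:
# inputs: one row per country, one column per character seen across all names
# outputs: one-hot rows with one column per continent, named like 'continent=Asia'
print(inputs.shape, outputs.shape)
print(vec.get_feature_names_out())  # requires scikit-learn >= 1.0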
Set up a neural network.
In [62]:
model = MLPClassifier(activation='logistic', solver='lbfgs', hidden_layer_sizes=(25,))  # one hidden layer of 25 logistic units, trained with L-BFGS
Fit on the entire input set. We don't care about overfitting here; we actually want a 100% fit.
In [63]:
model.fit(inputs, outputs)
predicted_outputs = model.predict(inputs)
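If the scores below come out short of 1, here is a sketch of the usual knobs to turn; the numbers are guesses, not tuned values.
In [ ]:
# Hypothetical fallback: more hidden units and more L-BFGS iterations
bigger = MLPClassifier(activation='logistic', solver='lbfgs',
                       hidden_layer_sizes=(100,), max_iter=1000)
bigger.fit(inputs, outputs)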
We compute the accuracy, precision and recall. All of these need to be 1.
In [64]:
accuracy = accuracy_score(outputs, predicted_outputs)
precision = precision_score(outputs, predicted_outputs, average='samples')
recall = recall_score(outputs, predicted_outputs, average='samples')
(accuracy, precision, recall)
Out[64]:
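Because the outputs are one-hot rows, this is scored as a multilabel problem, so average='samples' computes precision and recall per row and then averages. A tiny worked illustration with made-up arrays:
In [ ]:
import numpy as np
# 3 samples, 2 labels; the second prediction picks the wrong label
y_true = np.array([[1, 0], [0, 1], [1, 0]])
y_pred = np.array([[1, 0], [1, 0], [1, 0]])
precision_score(y_true, y_pred, average='samples')  # (1 + 0 + 1) / 3 ≈ 0.67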
In [65]:
def predict_continent(country):
    X = cv.transform([country])
    y = model.predict(X)
    # DictVectorizer features are named like 'continent=Asia'; return the active one
    for k, v in vec.inverse_transform(y)[0].items():
        if v == 1.0:
            return k.split('=')[1].title()
    return "Unknown"
This function now predicts the continent perfectly for every country in the training data.
In [66]:
print(predict_continent("India"))
print(predict_continent("United States"))
In [67]:
[c.shape for c in model.coefs_]
Out[67]:
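The two weight matrices map characters to the 25 hidden units, and the hidden units to the continent outputs, so the shapes should line up with the two vectorizers. A quick check (assuming a recent scikit-learn for get_feature_names_out):
In [ ]:
# coefs_[0]: (number of characters, 25); coefs_[1]: (25, number of continents)
n_chars = len(cv.get_feature_names_out())
n_continents = len(vec.get_feature_names_out())
assert model.coefs_[0].shape == (n_chars, 25)
assert model.coefs_[1].shape == (25, n_continents)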