In [ ]:
import pandas as pd
In [ ]:
# Load the headline dataset. ``header=0`` combined with ``names`` discards the
# file's own header row and relabels the two columns as 'country' and 'title'.
data = pd.read_csv(
    '/Users/walter_tyrna/Finalized_DS_Code/cc_title_may.csv',
    header=0,
    names=['country', 'title'],
)
The CSV file contains only two columns: each title and its country (locale) of origin.
In [ ]:
# Display the (rows, columns) dimensions of the loaded frame.
data.shape
(77245, 2)
In [ ]:
# Encode each locale code as an integer class label for the classifier.
# Any locale missing from this table comes out of ``map`` as NaN.
locale_to_label = {
    'en-us': 0,
    'en-uk': 1,
    'en-au': 2,
    'en-in': 3,
    'en-ca': 4,
    'fr-fr': 5,
}
data['country_number'] = data['country'].map(locale_to_label)
In [ ]:
# Preview the first 10 rows of the frame.
data.head(10)
country title country_number 0 en-au Indigenous Man Stuck In The US After Sudden De... 2 1 en-us I Wore Pinterest-Style Updos For A Week And Th... 0 2 en-us The Mountain From "Game Of Thrones" Has A Ridi... 0 3 en-us What Does Your Favorite Milkshake Flavor Say A... 0 4 en-us How Well Do You Actually Know Your Best Friend? 0 5 en-us Men Recreate Iconic Photos And Get Photoshoppe... 0 6 en-us 14 Expert Ways To Tell If Clothes Are Well-Mad... 0 7 en-au 19 Times Chrissy Teigen Was Goddamn Hilarious ... 2 8 en-uk Only A Foodie Can Get More Than 70% On This Quiz 1 9 en-au 21 Tumblr Posts About Sexting Guaranteed To Ma... 2
In [ ]:
# Features are the raw title strings; the target is the integer locale label.
X, y = data['title'], data['country_number']
In [ ]:
# ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and removed
# in 0.20; ``train_test_split`` now lives in ``sklearn.model_selection``.
# Fall back to the old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
In [ ]:
# Hold out part of the data for evaluation. random_state pins the shuffle so
# the split is reproducible; no test_size is given, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
In [ ]:
# Bag-of-words token counter with default settings.
vect = CountVectorizer()
In [ ]:
# Learn the vocabulary from the training titles only and build their
# document-term matrix in one step.
X_train_dtm = vect.fit_transform(X_train)
In [ ]:
# Encode the test titles with the already-fitted vectorizer (transform only,
# no refit), so the held-out data does not influence the vocabulary.
X_test_dtm = vect.transform(X_test)
In [ ]:
from sklearn.naive_bayes import MultinomialNB
In [ ]:
# Multinomial Naive Bayes classifier with default hyperparameters.
nb = MultinomialNB()
In [ ]:
# Train the classifier on the training document-term matrix and its labels.
nb.fit(X_train_dtm, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [ ]:
# Predict a class label for every held-out title.
y_pred_class = nb.predict(X_test_dtm)
In [ ]:
from sklearn import metrics
In [ ]:
# Fraction of held-out titles whose predicted label equals the true label.
metrics.accuracy_score(y_test, y_pred_class)
0.9474420049710025
In [ ]:
# Per-class error breakdown. True labels are passed first and predictions
# second, so rows are the true class and columns the predicted class
# (scikit-learn's convention).
metrics.confusion_matrix(y_test, y_pred_class)
array([[11115, 262, 216, 78, 32, 13], [ 186, 3569, 34, 11, 8, 1], [ 49, 52, 2229, 6, 6, 1], [ 35, 6, 0, 958, 0, 0], [ 11, 7, 1, 0, 426, 0], [ 0, 0, 0, 0, 0, 0]])
In [ ]:
from sklearn.pipeline import Pipeline
In [ ]:
# Bundle vectorisation and classification into one estimator so raw title
# strings can be passed straight to the model.
model = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("classifier", MultinomialNB()),
])
# A freshly built Pipeline is unfitted, and predicting with it raises
# NotFittedError, so train it on the raw training titles and their labels.
model.fit(X_train, y_train)
In [ ]:
# Two unseen sample headlines used to spot-check the pipeline.
test = [
    "17 Mean Girls Quotes That Sum Up Life In Australia",
    "21 Fucking Funny Aussie Olympic Tweets",
]
In [ ]:
# Classify the sample headlines; the result holds one predicted label per
# input string. Requires `model` to be fitted first; scikit-learn raises
# NotFittedError otherwise.
model.predict(test)
array([0, 2])