In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [9]:
# Load our data into two Python lists
with open('clickbait.txt') as f:
    lines = [line.strip().split("\t") for line in f]
    headlines, labels = zip(*lines)

In [10]:
headlines[:5]


Out[10]:
("Egypt's top envoy in Iraq confirmed killed",
 'Carter: Race relations in Palestine are worse than apartheid',
 'After Years Of Dutiful Service, The Shiba Who Ran A Tobacco Shop Retires',
 'In Books on Two Powerbrokers, Hints of the Future',
 'These Horrifyingly Satisfying Photos Of "Baby Foot" Will Haunt You')

In [11]:
labels[:5]


Out[11]:
('0', '0', '1', '0', '1')

In [12]:
len(headlines)


Out[12]:
10000

In [13]:
# Break dataset into train and test sets
train_headlines = headlines[: 8000]
test_headlines = headlines[8000: ]

train_labels = labels[: 8000]
test_labels = labels[8000: ]

In [16]:
# Create a vectorizer and classifier
vectorizer = TfidfVectorizer()
svm = LinearSVC()

In [27]:
# Transform our text data into numerical vectors
train_vectors = vectorizer.fit_transform(train_headlines)
test_vectors = vectorizer.transform(test_headlines)

In [28]:
# Train the classifier 
svm.fit(train_vectors, train_labels)


Out[28]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [29]:
predictions = svm.predict(test_vectors)

In [30]:
test_headlines[:5]


Out[30]:
('The Earliest I\'ve Said "I Love You"',
 "Stop What You're Doing And Worship These Matt Bomer Pictures",
 '23 Of The Funniest "Nancy Drew" Game Memes',
 'Policeman killed in football-related violence in Italy',
 'Do You Remember Which Disney Star Sang These Lyrics')

In [31]:
predictions[:5]


Out[31]:
array(['1', '1', '1', '0', '1'],
      dtype='<U1')

In [32]:
test_labels[:5]


Out[32]:
('1', '1', '1', '0', '1')

In [33]:
accuracy_score(test_labels, predictions)


Out[33]:
0.96099999999999997

In [36]:
new_headlines = ["How India's Political Parties Hijacked Twitter's Trending Column For Partisan Bickering", "We Tried Tanmay Bhat's Diet For 30 Days"]
new_vectors = vectorizer.transform(new_headlines)
new_predictions = svm.predict(new_vectors)

In [37]:
new_predictions


Out[37]:
array(['0', '1'],
      dtype='<U1')

In [ ]: