notebook.community

Edit and run



In [1]:

    
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score



In [9]:

    
# Load our data into two Python lists
with open('clickbait.txt') as f:
    lines = [line.strip().split("\t") for line in f]
    headlines, labels = zip(*lines)



In [10]:

    
headlines[:5]









    Out[10]:





("Egypt's top envoy in Iraq confirmed killed",
 'Carter: Race relations in Palestine are worse than apartheid',
 'After Years Of Dutiful Service, The Shiba Who Ran A Tobacco Shop Retires',
 'In Books on Two Powerbrokers, Hints of the Future',
 'These Horrifyingly Satisfying Photos Of "Baby Foot" Will Haunt You')



In [11]:

    
labels[:5]









    Out[11]:





('0', '0', '1', '0', '1')



In [12]:

    
len(headlines)









    Out[12]:





10000



In [13]:

    
# Break dataset into train and test sets
train_headlines = headlines[: 8000]
test_headlines = headlines[8000: ]

train_labels = labels[: 8000]
test_labels = labels[8000: ]



In [16]:

    
# Create a vectorizer and classifier
vectorizer = TfidfVectorizer()
svm = LinearSVC()



In [27]:

    
# Transform our text data into numerical vectors
train_vectors = vectorizer.fit_transform(train_headlines)
test_vectors = vectorizer.transform(test_headlines)



In [28]:

    
# Train the classifier 
svm.fit(train_vectors, train_labels)









    Out[28]:





LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)



In [29]:

    
predictions = svm.predict(test_vectors)



In [30]:

    
test_headlines[:5]









    Out[30]:





('The Earliest I\'ve Said "I Love You"',
 "Stop What You're Doing And Worship These Matt Bomer Pictures",
 '23 Of The Funniest "Nancy Drew" Game Memes',
 'Policeman killed in football-related violence in Italy',
 'Do You Remember Which Disney Star Sang These Lyrics')



In [31]:

    
predictions[:5]









    Out[31]:





array(['1', '1', '1', '0', '1'],
      dtype='<U1')



In [32]:

    
test_labels[:5]









    Out[32]:





('1', '1', '1', '0', '1')



In [33]:

    
accuracy_score(test_labels, predictions)









    Out[33]:





0.96099999999999997



In [36]:

    
new_headlines = ["How India's Political Parties Hijacked Twitter's Trending Column For Partisan Bickering", "We Tried Tanmay Bhat's Diet For 30 Days"]
new_vectors = vectorizer.transform(new_headlines)
new_predictions = svm.predict(new_vectors)



In [37]:

    
new_predictions









    Out[37]:





array(['0', '1'],
      dtype='<U1')



In [ ]: