In [1]:
import os

import pandas as pd
import psycopg2

from nlp import cluster_prep
/usr/local/lib/python2.7/dist-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.
warnings.warn("The twython library has not been installed. "
In [2]:
# Connection settings are read from the environment so the notebook can be
# pointed at another database without editing code. The fallbacks are the
# values from the original connection string, so behaviour is unchanged when
# no variables are set.
# NOTE(review): the password fallback is the literal that was embedded in the
# original DSN -- set CAP_DB_PASSWORD instead of committing a real password.
conn = psycopg2.connect(
    dbname=os.environ.get('CAP_DB_NAME', 'cap'),
    user=os.environ.get('CAP_DB_USER', 'postgres'),
    host=os.environ.get('CAP_DB_HOST', 'ec2-34-215-56-46.us-west-2.compute.amazonaws.com'),
    port=int(os.environ.get('CAP_DB_PORT', 9000)),
    password=os.environ.get('CAP_DB_PASSWORD', 'secret'),
)
# Fetch a single row from the articles table to use as the sample document.
article = pd.read_sql_query('SELECT * FROM articles LIMIT 1', conn)
In [3]:
# The query returned one row; take its body text as the sample document.
a = article['body'].tolist()[0]
a
Out[3]:
"Quick, do you know the difference between Danish and German pop? How about pointing out the nuances that shift EDM music from the deep neo-synthpop genre into tribal house? Yeah, me neither. Win a trip to Amsterdam! We've teamed up with Product Hunt to offer you the chance to win an all expense paid trip to TNW Conference 2017! CHECK IT OUT Engineer Glenn McDonald, in an attempt to chart different types of music, created a website back in 2013 that brought together over 500 genres and attempted to find the artists that best fit inside. Dubbed a \xe2\x80\x9cmusic intelligence platform,\xe2\x80\x9d McDonald\xe2\x80\x99s creation, \xe2\x80\x98Every Noise at Once,\xe2\x80\x99 uses AI to help answer questions like, \xe2\x80\x9cwhat is rock?\xe2\x80\x9d Once the AI was dialed in to answer simpler queries, McDonald expanded on it to find the answer to more obscure questions, or genres, as it were. Now, his creation features over 1,500 genres, and some are quite obscure. I mean, have you ever heard of indie poptimism, deep filthstep, or nu gaze? McDonald\xe2\x80\x99s algorithm has. And not only does it categorize genres by others the musical stylings closely align with, it provides an audio sample of each with a click. Or, you can click the arrow after hovering to see the artists that best fit inside based on scraped Spotify data. Here are just a few of the artists Every Noise at Once found for the gangster rap genre: It\xe2\x80\x99s not perfect, but it\xe2\x80\x99s damn close. And just like the main genre page, you can click each artist to get a sample of their sound. Or, you can click the double arrow to open a Spotify playlist of the selected artist. And if you\xe2\x80\x99re feeling really adventurous, go to the main page and hit the scan button to listen to five second samples of each genre. It\xe2\x80\x99s definitely got your weekend tunes covered. via Reddit Every Noise at Once on Every Noise Read next: Did Burger King pull the biggest publicity stunt of the year?"
In [6]:
# Preprocess the article with the project's cluster_prep helper and join the
# resulting tokens back into one space-separated string for vectorizing.
z = ' '.join(cluster_prep(a))
z
[u'Quick', u',', u'do', u'you', u'know', u'the', u'difference', u'between', u'Danish', u'and', u'German', u'pop', u'?', u'How', u'about', u'pointing', u'out', u'the', u'nuances', u'that', u'shift', u'EDM', u'music', u'from', u'the', u'deep', u'neo-synthpop', u'genre', u'into', u'tribal', u'house', u'?', u'Yeah', u',', u'me', u'neither', u'.', u'Win', u'a', u'trip', u'to', u'Amsterdam', u'!', u'We', u"'ve", u'teamed', u'up', u'with', u'Product', u'Hunt', u'to', u'offer', u'you', u'the', u'chance', u'to', u'win', u'an', u'all', u'expense', u'paid', u'trip', u'to', u'TNW', u'Conference', u'2017', u'!', u'CHECK', u'IT', u'OUT', u'Engineer', u'Glenn', u'McDonald', u',', u'in', u'an', u'attempt', u'to', u'chart', u'different', u'types', u'of', u'music', u',', u'created', u'a', u'website', u'back', u'in', u'2013', u'that', u'brought', u'together', u'over', u'500', u'genres', u'and', u'attempted', u'to', u'find', u'the', u'artists', u'that', u'best', u'fit', u'inside', u'.', u'Dubbed', u'a', u'\u201cmusic', u'intelligence', u'platform', u',', u'\u201d', u'McDonald\u2019s', u'creation', u',', u'\u2018Every', u'Noise', u'at', u'Once', u',', u'\u2019', u'uses', u'AI', u'to', u'help', u'answer', u'questions', u'like', u',', u'\u201cwhat', u'is', u'rock', u'?', u'\u201d', u'Once', u'the', u'AI', u'was', u'dialed', u'in', u'to', u'answer', u'simpler', u'queries', u',', u'McDonald', u'expanded', u'on', u'it', u'to', u'find', u'the', u'answer', u'to', u'more', u'obscure', u'questions', u',', u'or', u'genres', u',', u'as', u'it', u'were', u'.', u'Now', u',', u'his', u'creation', u'features', u'over', u'1,500', u'genres', u',', u'and', u'some', u'are', u'quite', u'obscure', u'.', u'I', u'mean', u',', u'have', u'you', u'ever', u'heard', u'of', u'indie', u'poptimism', u',', u'deep', u'filthstep', u',', u'or', u'nu', u'gaze', u'?', u'McDonald\u2019s', u'algorithm', u'has', u'.', u'And', u'not', u'only', u'does', u'it', u'categorize', u'genres', u'by', u'others', u'the', u'musical', 
u'stylings', u'closely', u'align', u'with', u',', u'it', u'provides', u'an', u'audio', u'sample', u'of', u'each', u'with', u'a', u'click', u'.', u'Or', u',', u'you', u'can', u'click', u'the', u'arrow', u'after', u'hovering', u'to', u'see', u'the', u'artists', u'that', u'best', u'fit', u'inside', u'based', u'on', u'scraped', u'Spotify', u'data', u'.', u'Here', u'are', u'just', u'a', u'few', u'of', u'the', u'artists', u'Every', u'Noise', u'at', u'Once', u'found', u'for', u'the', u'gangster', u'rap', u'genre', u':', u'It\u2019s', u'not', u'perfect', u',', u'but', u'it\u2019s', u'damn', u'close', u'.', u'And', u'just', u'like', u'the', u'main', u'genre', u'page', u',', u'you', u'can', u'click', u'each', u'artist', u'to', u'get', u'a', u'sample', u'of', u'their', u'sound', u'.', u'Or', u',', u'you', u'can', u'click', u'the', u'double', u'arrow', u'to', u'open', u'a', u'Spotify', u'playlist', u'of', u'the', u'selected', u'artist', u'.', u'And', u'if', u'you\u2019re', u'feeling', u'really', u'adventurous', u',', u'go', u'to', u'the', u'main', u'page', u'and', u'hit', u'the', u'scan', u'button', u'to', u'listen', u'to', u'five', u'second', u'samples', u'of', u'each', u'genre', u'.', u'It\u2019s', u'definitely', u'got', u'your', u'weekend', u'tunes', u'covered', u'.', u'via', u'Reddit', u'Every', u'Noise', u'at', u'Once', u'on', u'Every', u'Noise', u'Read', u'next', u':', u'Did', u'Burger', u'King', u'pull', u'the', u'biggest', u'publicity', u'stunt', u'of', u'the', u'year', u'?']
[u'quick', u'know', u'difference', u'danish', u'german', u'pop', u'how', u'pointing', u'nuances', u'shift', u'edm', u'music', u'deep', u'neo-synthpop', u'genre', u'tribal', u'house', u'yeah', u'neither', u'win', u'trip', u'amsterdam', u'we', u'teamed', u'product', u'hunt', u'offer', u'chance', u'win', u'expense', u'paid', u'trip', u'tnw', u'conference', u'2017', u'check', u'it', u'out', u'engineer', u'glenn', u'mcdonald', u'attempt', u'chart', u'different', u'types', u'music', u'created', u'website', u'back', u'2013', u'brought', u'together', u'500', u'genres', u'attempted', u'find', u'artists', u'best', u'fit', u'inside', u'dubbed', u'\u201cmusic', u'intelligence', u'platform', u'\u201d', u'mcdonald\u2019s', u'creation', u'\u2018every', u'noise', u'once', u'\u2019', u'uses', u'ai', u'help', u'answer', u'questions', u'like', u'\u201cwhat', u'rock', u'\u201d', u'once', u'ai', u'dialed', u'answer', u'simpler', u'queries', u'mcdonald', u'expanded', u'find', u'answer', u'obscure', u'questions', u'genres', u'now', u'creation', u'features', u'1,500', u'genres', u'quite', u'obscure', u'i', u'mean', u'ever', u'heard', u'indie', u'poptimism', u'deep', u'filthstep', u'nu', u'gaze', u'mcdonald\u2019s', u'algorithm', u'and', u'categorize', u'genres', u'others', u'musical', u'stylings', u'closely', u'align', u'provides', u'audio', u'sample', u'click', u'or', u'click', u'arrow', u'hovering', u'see', u'artists', u'best', u'fit', u'inside', u'based', u'scraped', u'spotify', u'data', u'here', u'artists', u'every', u'noise', u'once', u'found', u'gangster', u'rap', u'genre', u':', u'it\u2019s', u'perfect', u'it\u2019s', u'damn', u'close', u'and', u'like', u'main', u'genre', u'page', u'click', u'artist', u'get', u'sample', u'sound', u'or', u'click', u'double', u'arrow', u'open', u'spotify', u'playlist', u'selected', u'artist', u'and', u'you\u2019re', u'feeling', u'really', u'adventurous', u'go', u'main', u'page', u'hit', u'scan', u'button', u'listen', u'five', u'second', u'samples', 
u'genre', u'it\u2019s', u'definitely', u'got', u'weekend', u'tunes', u'covered', u'via', u'reddit', u'every', u'noise', u'once', u'every', u'noise', u'read', u'next', u':', u'did', u'burger', u'king', u'pull', u'biggest', u'publicity', u'stunt', u'year']
[u'quick', u'know', u'difference', u'danish', u'german', u'pop', u'how', u'pointing', u'nuance', u'shift', u'edm', u'music', u'deep', u'neo-synthpop', u'genre', u'tribal', u'house', u'yeah', u'neither', u'win', u'trip', u'amsterdam', u'we', u'team', u'product', u'hunt', u'offer', u'chance', u'win', u'expense', u'pay', u'trip', u'tnw', u'conference', u'2017', u'check', u'it', u'out', u'engineer', u'glenn', u'mcdonald', u'attempt', u'chart', u'different', u'type', u'music', u'create', u'website', u'back', u'2013', u'bring', u'together', u'500', u'genre', u'attempt', u'find', u'artists', u'best', u'fit', u'inside', u'dub', u'\u201cmusic', u'intelligence', u'platform', u'\u201d', u'mcdonald\u2019s', u'creation', u'\u2018every', u'noise', u'once', u'\u2019', u'us', u'ai', u'help', u'answer', u'question', u'like', u'\u201cwhat', u'rock', u'\u201d', u'once', u'ai', u'dial', u'answer', u'simpler', u'query', u'mcdonald', u'expand', u'find', u'answer', u'obscure', u'question', u'genre', u'now', u'creation', u'feature', u'1,500', u'genre', u'quite', u'obscure', u'i', u'mean', u'ever', u'hear', u'indie', u'poptimism', u'deep', u'filthstep', u'nu', u'gaze', u'mcdonald\u2019s', u'algorithm', u'and', u'categorize', u'genre', u'others', u'musical', u'stylings', u'closely', u'align', u'provide', u'audio', u'sample', u'click', u'or', u'click', u'arrow', u'hover', u'see', u'artist', u'best', u'fit', u'inside', u'base', u'scrap', u'spotify', u'data', u'here', u'artists', u'every', u'noise', u'once', u'find', u'gangster', u'rap', u'genre', u':', u'it\u2019s', u'perfect', u'it\u2019s', u'damn', u'close', u'and', u'like', u'main', u'genre', u'page', u'click', u'artist', u'get', u'sample', u'sound', u'or', u'click', u'double', u'arrow', u'open', u'spotify', u'playlist', u'select', u'artist', u'and', u'you\u2019re', u'feeling', u'really', u'adventurous', u'go', u'main', u'page', u'hit', u'scan', u'button', u'listen', u'five', u'second', u'sample', u'genre', u'it\u2019s', u'definitely', 
u'get', u'weekend', u'tune', u'cover', u'via', u'reddit', u'every', u'noise', u'once', u'every', u'noise', u'read', u'next', u':', u'do', u'burger', u'king', u'pull', u'biggest', u'publicity', u'stunt', u'year']
Out[6]:
u'quick know difference danish german pop how pointing nuance shift edm music deep neo-synthpop genre tribal house yeah neither win trip amsterdam we team product hunt offer chance win expense pay trip tnw conference 2017 check it out engineer glenn mcdonald attempt chart different type music create website back 2013 bring together 500 genre attempt find artists best fit inside dub \u201cmusic intelligence platform \u201d mcdonald\u2019s creation \u2018every noise once \u2019 us ai help answer question like \u201cwhat rock \u201d once ai dial answer simpler query mcdonald expand find answer obscure question genre now creation feature 1,500 genre quite obscure i mean ever hear indie poptimism deep filthstep nu gaze mcdonald\u2019s algorithm and categorize genre others musical stylings closely align provide audio sample click or click arrow hover see artist best fit inside base scrap spotify data here artists every noise once find gangster rap genre : it\u2019s perfect it\u2019s damn close and like main genre page click artist get sample sound or click double arrow open spotify playlist select artist and you\u2019re feeling really adventurous go main page hit scan button listen five second sample genre it\u2019s definitely get weekend tune cover via reddit every noise once every noise read next : do burger king pull biggest publicity stunt year'
In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.externals import joblib
In [8]:
# Exploratory check: fit a fresh tf-idf vectorizer (at most 250 features) on
# this single document only, so its vocabulary comes from this article alone.
tfidf = TfidfVectorizer(max_features=250)
x_test = tfidf.fit_transform([z])
In [9]:
# Display the sparse-matrix summary (shape and stored-element count).
x_test
Out[9]:
<1x150 sparse matrix of type '<type 'numpy.float64'>'
with 150 stored elements in Compressed Sparse Row format>
In [10]:
# Restore the saved clustering model and the saved vectorizer (in that order),
# then vectorize the prepared article with the saved vectorizer.
km, vectorizer = map(
    joblib.load,
    ('model/kmeans_model.pkl', 'model/tf_vectorizer_obj.pkl'),
)
x_test = vectorizer.transform([z])
x_test
Out[10]:
<1x250 sparse matrix of type '<type 'numpy.float64'>'
with 35 stored elements in Compressed Sparse Row format>
In [11]:
# Predict a cluster label for every row of x_test.
res = km.predict(x_test)
In [12]:
# Only one document was vectorized, so the first entry is its label.
res[0]
Out[12]:
13
In [5]:
def predict_cluster(article_string, km=None, vectorizer=None):
    '''
    Predict the cluster label for one article body.

    article_string - string, the raw article body to preprocess and classify
    km             - optional, an already-loaded clustering model exposing
                     predict(); loaded from 'model/kmeans_model.pkl' when None
    vectorizer     - optional, an already-loaded vectorizer exposing
                     transform(); loaded from 'model/tf_vectorizer_obj.pkl'
                     when None

    returns the label predicted for the article (first element of the array
    returned by km.predict)

    Passing km and vectorizer lets a caller that classifies many articles load
    the two pickles once instead of re-reading them on every call.
    '''
    # cluster_prep yields a sequence of tokens; the vectorizer expects a single
    # whitespace-separated string per document.
    prepared_text = ' '.join(cluster_prep(article_string))

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code --
    # only load model files from a trusted location.
    if km is None:
        km = joblib.load('model/kmeans_model.pkl')
    if vectorizer is None:
        vectorizer = joblib.load('model/tf_vectorizer_obj.pkl')

    # transform() takes an iterable of documents, so wrap the one string in a
    # list; the result has exactly one row.
    features = vectorizer.transform([prepared_text])
    labels = km.predict(features)
    return labels[0]
In [6]:
# Run the helper on the same article body used in the step-by-step cells.
predict_cluster(a)
Out[6]:
13
In [ ]:
Content source: jaredwelch1/CapstoneI
Similar notebooks: