In [ ]:
#First recreate our data
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#create a dataframe called "df" from the tab-separated reviews file
df = pandas.read_csv("../Data/BDHSI2016_music_reviews.csv", sep = '\t', encoding = 'utf-8')


def _feature_names(vectorizer):
    """Return the fitted vectorizer's vocabulary, in column order.

    get_feature_names() was removed in scikit-learn 1.2 in favour of
    get_feature_names_out(); fall back to the old name on older installs.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


#count vectorizer
countvec = CountVectorizer()
#fit once and keep the sparse document-term matrix
sklearn_dtm = countvec.fit_transform(df.body)
#dense DataFrame view of the same matrix (reused, not re-fitted): one row per review, one column per word
dtm_df = pandas.DataFrame(sklearn_dtm.toarray(), columns=_feature_names(countvec), index=df.index)
#tfidf vectorizer
tfidfvec = TfidfVectorizer()
#create the dtm, but with cells weighted by the tf-idf score.
dtm_tfidf_df = pandas.DataFrame(tfidfvec.fit_transform(df.body).toarray(), columns=_feature_names(tfidfvec), index=df.index)
You folks are experts at this now. Write Python code using pandas to do the following exploration of the data:
In [ ]:
# Show how many reviews fall under each value of the three categorical columns
for column in ('genre', 'critic', 'artist'):
    print(df[column].value_counts())
In [ ]:
# Exercise: average number of times each word is used in a review,
# printed from the highest average down to the lowest.
mean_word_counts = dtm_df.mean()
print(mean_word_counts.sort_values(ascending=False))
In [ ]:
# Number of reviews per artist
artist_review_counts = df['artist'].value_counts()
print(artist_review_counts)
In [ ]:
#attach each review's artist to a COPY of the tf-idf matrix.
#assign() returns a new DataFrame, so dtm_tfidf_df itself stays purely numeric;
#a plain `=` would only alias it and the ARTIST column would leak into the original.
dtm_tfidf_df_artist = dtm_tfidf_df.assign(ARTIST=df['artist'])
#pull out the reviews for two artists, R.E.M. and Arcade Fire
dtm_rem_artist = dtm_tfidf_df_artist[dtm_tfidf_df_artist['ARTIST']=="R.E.M."]
dtm_af_artist = dtm_tfidf_df_artist[dtm_tfidf_df_artist['ARTIST']=="Arcade Fire"]
#print the 20 words with the highest tf-idf scores for each artist
#(numeric_only=True skips the string ARTIST column when taking the per-word max)
print("R.E.M. Words")
print(dtm_rem_artist.max(numeric_only=True).sort_values(ascending=False).head(20))
print()
print("Arcade Fire Words")
print(dtm_af_artist.max(numeric_only=True).sort_values(ascending=False).head(20))