In [ ]:
#First recreate our data
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#load the tab-separated reviews file into a dataframe called "df"
df = pandas.read_csv(
    "../Data/BDHSI2016_music_reviews.csv",
    encoding='utf-8',
    sep='\t',
)
#count how many rows each artist has, so we can find two artists to compare
artist_counts = df['artist'].value_counts()
print(artist_counts)
In [ ]:
#concatenate all of the reviews for our two artists together
#R.E.M.
rem_df = df[df['artist']=="R.E.M."]
#join with a space so the last word of one review and the first word of the
#next are not fused into a single bogus token when the text is vectorized;
#dropna() skips rows with a missing body, which would make str.join raise
rem_string = ' '.join(rem_df['body'].dropna())
#peek at the first 100 characters as a sanity check
rem_string[:100]
In [ ]:
#Arcade Fire
af_df = df[df['artist']=="Arcade Fire"]
#join with a space so the last word of one review and the first word of the
#next are not fused into a single bogus token when the text is vectorized;
#dropna() skips rows with a missing body, which would make str.join raise
af_string = ' '.join(af_df['body'].dropna())
#peek at the first 100 characters as a sanity check
af_string[:100]
In [ ]:
#Collect these two strings into one list: R.E.M. first, Arcade Fire second
text_list = [rem_string, af_string]
#show the first entry (the R.E.M. text)
text_list[0]
In [ ]:
#turn our list into a DTM dataframe (one row per entry of text_list,
#one column per word, English stop words removed)
countvec = CountVectorizer(stop_words="english")
counts = countvec.fit_transform(text_list)
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use its replacement when present and fall back on older installs
if hasattr(countvec, "get_feature_names_out"):
    vocab = countvec.get_feature_names_out()
else:
    vocab = countvec.get_feature_names()
text_dtm = pandas.DataFrame(counts.toarray(), columns=vocab)
#calculate difference of proportions
#total counted words per row; must be computed before the division below
text_dtm['word_count'] = text_dtm.sum(axis=1)
#divide every row by its total so raw counts become proportions; the
#word_count column is divided as well and therefore becomes 1.0
text_dtm = text_dtm.div(text_dtm['word_count'], axis=0)
#row 2 = row 0 minus row 1 (first text_list entry minus the second)
text_dtm.loc[2] = text_dtm.loc[0] - text_dtm.loc[1]
In [ ]:
#print most distinctive words
#R.E.M. is positive, Arcade Fire is negative
#row 2 of text_dtm holds the per-word difference in proportions
proportion_diff = text_dtm.loc[2]
proportion_diff.sort_values(ascending=False)