In [ ]:
#import the necessary packages
import pandas
import nltk
from nltk import word_tokenize
import string
#read the Music Reviews corpus (tab-separated, UTF-8) into a Pandas dataframe
df = pandas.read_csv("../Data/BDHSI2016_music_reviews.csv", encoding='utf-8', sep='\t')
#strip every digit character from the review text
df['body'] = df['body'].apply(lambda text: ''.join(ch for ch in text if not ch.isdigit()))
#lowercase the text, then split it into tokens with NLTK
df['body_tokens'] = df['body'].str.lower().apply(nltk.word_tokenize)
#drop tokens that consist only of punctuation characters.
#NOTE: `word not in string.punctuation` would be a substring test against one long string,
#so multi-character tokens such as '...', '--', "''" or '``' would slip through and
#inflate token_count; checking character by character removes them as well.
punctuation_chars = set(string.punctuation)
df['body_tokens'] = df['body_tokens'].apply(
    lambda tokens: [word for word in tokens
                    if not all(ch in punctuation_chars for ch in word)])
#number of remaining tokens per review
df['token_count'] = df['body_tokens'].apply(len)
#view the dataframe
df
In [ ]:
#Read in dictionary files; each with-block closes its file handle as soon as the text is read
with open("../Data/positive_words.txt", encoding='utf-8') as pos_file:
    pos_sent = pos_file.read()
with open("../Data/negative_words.txt", encoding='utf-8') as neg_file:
    neg_sent = neg_file.read()
#view part of the pos_sent variable, to see how it's formatted.
print(pos_sent[:101])
In [ ]:
#remember the split function? We'll split on the newline character (\n) to create a list
#Each entry is stripped of surrounding whitespace and blank entries are dropped, so a
#trailing newline at the end of a file does not leave an empty string '' in the list.
positive_words = [line.strip() for line in pos_sent.split('\n') if line.strip()]
negative_words = [line.strip() for line in neg_sent.split('\n') if line.strip()]
#view the first elements in the lists
print(positive_words[:10])
print(negative_words[:10])
Great! You know what to do now.
Exercise:
In [ ]:
#exercise code here
#Build the lookup sets once: `word in some_set` is a constant-time test, whereas
#`word in some_list` scans the whole dictionary list for every single token.
positive_set = set(positive_words)
negative_set = set(negative_words)
#1. Create a column with the number of positive words and another with the proportion of positive words
df['pos_num'] = df['body_tokens'].apply(lambda tokens: sum(1 for word in tokens if word in positive_set))
df['pos_prop'] = df['pos_num'] / df['token_count']
#2. Create a column with the number of negative words, and another with the proportion of negative words
df['neg_num'] = df['body_tokens'].apply(lambda tokens: sum(1 for word in tokens if word in negative_set))
df['neg_prop'] = df['neg_num'] / df['token_count']
#view the dataframe with the four new columns
df
In [ ]:
#3. Print the average proportion of negative and positive words by genre
#group the reviews by genre once; the grouped object is reused for each average
grouped = df.groupby('genre')
print("Average proportion of positive words by genre")
print(grouped['pos_prop'].mean().sort_values(ascending=False))
print()
print("Average proportion of negative words by genre")
#last expression of the cell, so the notebook displays it as the cell output
grouped['neg_prop'].mean().sort_values(ascending=False)
In [ ]:
# 4. Compare this to the average score by genre
print("Average score by genre")
#mean review score per genre, highest first (uses the genre grouping created earlier)
grouped['score'].mean().sort_values(ascending=False)
In [ ]:
#import the function CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer()
#count every vocabulary term in every review body
term_counts = countvec.fit_transform(df.body)
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use get_feature_names_out() and fall back only on older releases that lack it
if hasattr(countvec, 'get_feature_names_out'):
    vocabulary = countvec.get_feature_names_out()
else:
    vocabulary = countvec.get_feature_names()
#create our document term matrix as a pandas dataframe (one row per review, one column per term)
dtm_df = pandas.DataFrame(term_counts.toarray(), columns=vocabulary, index=df.index)
Now we can keep only those columns that occur in our positive words list. To do this, we'll first save a list of the column names as a variable, and then keep only the elements of that list that occur in our positive words list. We'll then create a new dataframe keeping only those selected columns.
In [ ]:
#create a columns variable that is a list of all column names
columns = list(dtm_df)
#set membership is a constant-time test; a list would be scanned once per column name
positive_set = set(positive_words)
pos_columns = [word for word in columns if word in positive_set]
#create a dtm from our dtm_df that keeps only positive sentiment columns
#.copy() makes an independent frame, so adding a column below does not raise
#pandas' SettingWithCopyWarning
dtm_pos = dtm_df[pos_columns].copy()
#count the number of positive words for each document
dtm_pos['pos_count'] = dtm_pos.sum(axis=1)
dtm_pos['pos_count']
EX: Do the same for negative words.
EX: Calculate the proportion of negative and positive words for each document.
In [ ]:
#EX: Do the same for negative words.
#set membership is a constant-time test; a list would be scanned once per column name
negative_set = set(negative_words)
neg_columns = [word for word in columns if word in negative_set]
#keep only the negative sentiment columns; .copy() makes an independent frame, so
#adding a column below does not raise pandas' SettingWithCopyWarning
dtm_neg = dtm_df[neg_columns].copy()
#count the number of negative words for each document
dtm_neg['neg_count'] = dtm_neg.sum(axis=1)
dtm_neg['neg_count']
In [ ]:
#EX: Calculate the proportion of negative and positive words for each document.
#total number of counted terms per document; computed once and shared by both
#ratios instead of summing the whole document term matrix twice
doc_totals = dtm_df.sum(axis=1)
dtm_pos['pos_proportion'] = dtm_pos['pos_count'] / doc_totals
print(dtm_pos['pos_proportion'])
print()
dtm_neg['neg_proportion'] = dtm_neg['neg_count'] / doc_totals
print(dtm_neg['neg_proportion'])