Homework5


张艺馨

15210130100

1.使用另外一种sklearn分类器进行情感分析


In [30]:
import nltk

pos_tweets = [('I love this car', 'positive'),
    ('This view is amazing', 'positive'),
    ('I feel great this morning', 'positive'),
    ('I am so excited about the concert', 'positive'),
    ('He is my best friend', 'positive')]

neg_tweets = [('I do not like this car', 'negative'),
    ('This view is horrible', 'negative'),
    ('I feel tired this morning', 'negative'),
    ('I am not looking forward to the concert', 'negative'),
    ('He is my enemy', 'negative')]

In [2]:
#把正负面的tweets合并到一起
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3] # 过滤大小写
    tweets.append((words_filtered, sentiment)) 
tweets[0:2]


Out[2]:
[(['love', 'this', 'car'], 'positive'),
 (['this', 'view', 'amazing'], 'positive')]

In [3]:
test_tweets = [
    (['feel', 'happy', 'this', 'morning'], 'positive'),
    (['larry', 'friend'], 'positive'),
    (['not', 'like', 'that', 'man'], 'negative'),
    (['house', 'not', 'great'], 'negative'),
    (['your', 'song', 'annoying'], 'negative')]

In [4]:
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(tweets))
' '.join(word_features)


Out[4]:
'forward great like love concert tired this car about morning looking feel amazing friend horrible not the enemy excited best view'

In [5]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features

In [16]:
training_set = nltk.classify.util.apply_features(extract_features, tweets) #nltk有划分的功能
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [17]:
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    return MaxentClassifier(label_probdist, feature_probdist)

In [18]:
tweet_positive = 'Larry is my friend'
print classifier.classify(extract_features(tweet_positive.split()))


positive

In [19]:
tweet_positive = 'Larry is not my friend'
print classifier.classify(extract_features(tweet_positive.split()))


negative

In [20]:
def classify_tweet(tweet):
    return classifier.classify(extract_features(tweet)) 
    # nltk.word_tokenize(tweet)

total = accuracy = float(len(test_tweets))

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1

print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))


Total accuracy: 80.000000% (4/20).

In [21]:
nltk_classifiers = dir(nltk)
for i in nltk_classifiers:
    if 'Classifier' in i:
        print i


ClassifierBasedPOSTagger
ClassifierBasedTagger
ClassifierI
ConditionalExponentialClassifier
DecisionTreeClassifier
MaxentClassifier
MultiClassifierI
NaiveBayesClassifier
PositiveNaiveBayesClassifier
SklearnClassifier
WekaClassifier

In [25]:
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
classif = SklearnClassifier(RandomForestClassifier())
svm_classifier = classif.train(training_set)

In [26]:
tweet_negative2 = 'Your song is annoying'
print svm_classifier.classify(extract_features(tweet_negative2.split()))


positive

2.使用https://github.com/victorneo/Twitter-Sentimental-Analysis 所提供的推特数据进行情感分析


In [31]:
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier


def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      all_words.extend(words)
    return all_words


def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features


def read_tweets(fname, t_type):
    tweets = []
    f = open(fname, 'r')
    line = f.readline()
    while line != '':
        tweets.append([line, t_type])
        line = f.readline()
    f.close()
    return tweets

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features

def classify_tweet(tweet):
    return \
        classifier.classify(extract_features(nltk.word_tokenize(tweet)))

pos_tweets = read_tweets('/Users/zhangyixin/Twitter-Sentimental-Analysis/happy.txt', 'positive')
neg_tweets = read_tweets('/Users/zhangyixin/Twitter-Sentimental-Analysis/sad.txt', 'negative')

tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

word_features = get_word_features(\
                    get_words_in_tweets(tweets))

training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)

test_tweets = read_tweets('/Users/zhangyixin/Twitter-Sentimental-Analysis/happy_test.txt', 'positive')
test_tweets.extend(read_tweets('/Users/zhangyixin/Twitter-Sentimental-Analysis/sad_test.txt', 'negative'))
total = accuracy = float(len(test_tweets))

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1

print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))


Total accuracy: 90.000000% (18/20).

In [ ]: