In [1]:
import pandas as pd
import sklearn
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
In [2]:
# Load the labelled training reviews (tab-separated, header on the first row).
# quoting=3 is csv.QUOTE_NONE: quote characters inside the review text are kept
# as literal characters instead of being interpreted as field delimiters. The
# test file is read with the same options, so both splits are parsed identically.
train = pd.read_csv(
    "./data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
In [3]:
# Preview the first rows of the training frame as a sanity check on the load.
train.head()
Out[3]:
In [4]:
# (rows, columns) of the training frame.
train.shape
Out[4]:
In [5]:
# First 600 characters of the review stored at index label 0, shown raw
# (before any cleaning) to see what markup and punctuation it contains.
train["review"][0][:600]
Out[5]:
In [6]:
# Pre-processing helper shared by the training and test cleaning loops.

# Cache for the NLTK English stop-word set; filled on first use so the corpus
# is not re-read for every single review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(review, remove_stopwords=True):
    """Convert one raw review into a space-joined string of lower-case words.

    Steps: strip HTML markup, replace every character outside a-z / A-Z with
    a space, lower-case, split on whitespace and (optionally) drop English
    stop words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML.
    remove_stopwords : bool, default True
        When true, words found in NLTK's English stop-word list are removed.
        Previously this flag was accepted but ignored and stop words were
        always removed; the default is True so that calls which omit the
        argument keep producing exactly the same output as before.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if nothing is left).
    """
    # Strip HTML tags, keeping only the text content.
    review_text = BeautifulSoup(review, "html5lib").get_text()
    # Replace anything that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Normalise case and tokenise on whitespace.
    words = letters_only.lower().split()
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return " ".join(words)
In [7]:
# Clean every training review, reporting progress every 5000 reviews.
print("开始清洗并解析影评......")
num_reviews = train["review"].size
clean_train_reviews = []
# Iterate over the values directly (not by index label) and count from 1 so
# the progress message reports the same number the modulo test is based on
# (e.g. "5000 of 25000" rather than "4999 of 25000").
for count, raw_review in enumerate(train["review"], start=1):
    if count % 5000 == 0:
        print("影评 {} of {}".format(count, num_reviews))
    clean_train_reviews.append(review_to_words(raw_review))
In [8]:
# Inspect the first cleaned review to confirm the cleaning step worked.
clean_train_reviews[0]
Out[8]:
In [9]:
# Bag of words: turn the cleaned training reviews into a dense count matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Word-level analyzer with no extra tokenizer / preprocessor / stop-word list
# (the reviews were already cleaned above); vocabulary capped at 5000 features.
vectorizer = CountVectorizer(
    max_features=5000,
    analyzer="word",
    stop_words=None,
    preprocessor=None,
    tokenizer=None,
)

# Learn the vocabulary, encode the reviews, and densify the sparse result
# into a numpy array in one step.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()
In [10]:
# (number of reviews, number of vocabulary features) of the dense count matrix.
train_data_features.shape
Out[10]:
In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees trained on the bag-of-words counts against the
# "sentiment" labels. random_state is fixed so that Restart & Run All yields
# the same fitted model (and therefore the same predictions) on every run;
# without it each run trains a different forest.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, train["sentiment"])
In [12]:
# Load the unlabeled test reviews; quoting=3 (csv.QUOTE_NONE) keeps quote
# characters in the text as literal characters.
test = pd.read_csv("./data/testData.tsv", header=0, delimiter="\t", quoting=3)

# Show (rows, columns) of the loaded frame as a quick sanity check.
print(test.shape)

# Clean the test reviews in order with the same helper used for training,
# announcing progress after every 5000th review.
num_reviews = len(test["review"])
clean_test_reviews = []
print("Cleaning and parsing the test set movie reviews...\n")
for position, raw_review in enumerate(test["review"], start=1):
    if position % 5000 == 0:
        print("Review {} of {}\n".format(position, num_reviews))
    clean_test_reviews.append(review_to_words(raw_review))

# Encode with the vocabulary already fitted on the training set (transform
# only, no re-fitting) and densify to a numpy array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict a sentiment label for each test review with the trained forest.
result = forest.predict(test_data_features)

# Pair each test id with its predicted label and write the result as CSV
# without an index column and without quoting.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)
In [ ]: