In [2]:
import os
import numpy as np
import pickle
import random
In [6]:
# Seed Python's global RNG with a fixed value so that the random.shuffle call
# made later in shuffle_data() draws the same random sequence on every run.
random.seed(0)
def folder_list(path, label):
    """
    Read every file in a folder and attach the same label to each one.

    PARAMETER PATH IS THE PATH OF YOUR LOCAL FOLDER.
    PARAMETER LABEL is appended as the last element of every review.

    Returns a list with one entry per file; each entry is the list of words
    produced by read_data() for that file, followed by `label`.
    """
    review = []
    # os.listdir() returns entries in arbitrary, platform-dependent order.
    # Sorting makes the result deterministic, so the seeded shuffle that is
    # applied afterwards yields the same split on every machine.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        words = list(read_data(filepath))
        words.append(label)
        review.append(words)
    return review
def read_data(file):
    """
    Read a file into a list of strings (words).

    The file content is split on single spaces; every character listed in
    `symbols` is deleted from each piece, surrounding whitespace is stripped,
    and pieces that end up empty are dropped.

    Example:
    ["it's", 'a', 'curious', 'thing', "i've", 'found', 'that', 'when', 'willis', 'is', 'not', 'called', 'on',
    ...'to', 'carry', 'the', 'whole', 'movie', "he's", 'much', 'better', 'and', 'so', 'is', 'the', 'movie']

    PARAMETER FILE is the path of the text file to read.
    Returns the list of cleaned, non-empty words in file order.
    """
    symbols = '${}()[].,:;+-*/&|<>=~" '
    # str.translate() expects a table mapping code points to replacements.
    # Passing the raw symbol string does NOT delete those characters (it
    # indexes the string by code point instead, e.g. turning '\n' into ';'),
    # so build a real deletion table with str.maketrans().
    delete_symbols = str.maketrans('', '', symbols)
    # The context manager guarantees the file handle is closed.
    with open(file) as f:
        pieces = f.read().split(' ')
    cleaned = (piece.translate(delete_symbols).strip() for piece in pieces)
    return [word for word in cleaned if word]
def shuffle_data(
    pos_path="C:/Users/Lihan/Documents/WORD/2017 Spring/Machine Learning/hw3/txt_sentoken/pos",
    neg_path="C:/Users/Lihan/Documents/WORD/2017 Spring/Machine Learning/hw3/txt_sentoken/neg",
):
    """
    Load all reviews, shuffle them, and save train/validation splits.

    pos_path is where you save positive review data (labelled 1).
    neg_path is where you save negative review data (labelled -1).
    Both default to the original local folders, so calling shuffle_data()
    with no arguments behaves as before.

    The combined list is shuffled with the global `random` generator; the
    first 1500 reviews are pickled to "train.p" and reviews 1500-1999 to
    "valid.p" in the current working directory. Any reviews past index 2000
    are not written.

    Returns 0.
    """
    pos_review = folder_list(pos_path, 1)
    neg_review = folder_list(neg_path, -1)
    review = pos_review + neg_review
    random.shuffle(review)
    # Use context managers so both output files are flushed and closed
    # before anything tries to read them back.
    with open("train.p", "wb") as train_file:
        pickle.dump(review[0:1500], train_file)
    with open("valid.p", "wb") as valid_file:
        pickle.dump(review[1500:2000], valid_file)
    return 0
'''
Now all the files have been read into the list 'review', and the list has been shuffled.
Save the shuffled result with pickle.
*Pickle is a useful module for serializing a Python object structure.
*Check it out: https://wiki.python.org/moin/UsingPickle
'''
# Build the shuffled train/validation pickles on disk, then load them back.
shuffle_data()
# Open each pickle in a context manager so the file handle is closed as soon
# as the data has been loaded. These files are produced locally by
# shuffle_data(); never unpickle files from an untrusted source.
with open("train.p", "rb") as train_file:
    train = pickle.load(train_file)
with open("valid.p", "rb") as valid_file:
    valid = pickle.load(valid_file)
In [ ]:
In [ ]: