In [2]:
import os
import numpy as np
import pickle
import random

In [6]:
# Seed the global `random` generator so that random.shuffle() in shuffle_data()
# produces the same permutation on every fresh run.
random.seed(0)
def folder_list(path,label):
    '''
    Read every file in a folder and tag each one with a label.

    Parameters:
        path:  path of your local folder containing the files to read.
        label: value appended as the last element of every entry.

    Returns:
        A list with one entry per file in `path`, in sorted file-name order.
        Each entry is a list holding the items yielded by read_data(file)
        followed by `label`.
    '''
    review = []
    # os.listdir() returns names in arbitrary, platform-dependent order.
    # Sorting them makes the result deterministic, so a seeded shuffle of
    # this list is reproducible across machines and runs.
    for infile in sorted(os.listdir(path)):
        file = os.path.join(path,infile)
        r = list(read_data(file))
        r.append(label)
        review.append(r)
    return review

def read_data(file):
    '''
    Read a file into a list of words with punctuation symbols removed.

    The text is split on single spaces. Every character listed in `symbols`
    is deleted from each piece, surrounding whitespace is stripped, and
    pieces that end up empty are dropped.

    Example:
    ["it's", 'a', 'curious', 'thing', "i've", 'found', 'that', 'when', 'willis', 'is', 'not', 'called', 'on', 
    ...'to', 'carry', 'the', 'whole', 'movie', "he's", 'much', 'better', 'and', 'so', 'is', 'the', 'movie']

    Parameters:
        file: path of the text file to read.

    Returns:
        A list of non-empty strings.
    '''
    symbols = '${}()[].,:;+-*/&|<>=~" '
    # str.translate() expects a translation table. Passing the raw string of
    # symbols makes it look characters up by code point in that string, which
    # deletes nothing and turns e.g. a newline into ';'. Build a real
    # "delete these characters" table instead.
    delete_symbols = str.maketrans('', '', symbols)
    # The context manager closes the file handle even if reading fails.
    with open(file) as f:
        lines = f.read().split(' ')
    words = (element.translate(delete_symbols).strip() for element in lines)
    # Return a concrete list (as the docstring promises), without empty strings.
    return [word for word in words if word]
	
    
def shuffle_data(pos_path="C:/Users/Lihan/Documents/WORD/2017 Spring/Machine Learning/hw3/txt_sentoken/pos",
                 neg_path="C:/Users/Lihan/Documents/WORD/2017 Spring/Machine Learning/hw3/txt_sentoken/neg",
                 train_file="train.p",
                 valid_file="valid.p",
                 train_size=1500,
                 valid_size=500):
    '''
    Load the labelled reviews, shuffle them and pickle a train/validation split.

    Parameters:
        pos_path:    folder where you save positive review data (label 1).
        neg_path:    folder where you save negative review data (label -1).
        train_file:  output path of the pickled training part.
        valid_file:  output path of the pickled validation part.
        train_size:  number of shuffled reviews written to `train_file`.
        valid_size:  number of the following reviews written to `valid_file`.

    All parameters are optional; the defaults reproduce the original
    hard-coded paths, file names and the 1500 / 500 split.

    The shuffle uses the global `random` generator, so seed it beforehand
    (random.seed) if a repeatable split is needed.

    Returns:
        0 once both files have been written.
    '''
    pos_review = folder_list(pos_path,1)
    neg_review = folder_list(neg_path,-1)

    review = pos_review + neg_review
    random.shuffle(review)

    valid_end = train_size + valid_size
    # Context managers flush and close the pickle files before returning, so
    # a subsequent pickle.load() always sees complete data.
    with open(train_file, "wb") as train_out:
        pickle.dump(review[:train_size], train_out)
    with open(valid_file, "wb") as valid_out:
        pickle.dump(review[train_size:valid_end], valid_out)
    return 0
'''
shuffle_data() reads all the review files into the list `review`, shuffles it,
and saves the shuffled result with pickle (train.p and valid.p).
* Pickle is a useful module for serializing a Python object structure.
* Check it out: https://wiki.python.org/moin/UsingPickle
'''
# Build the shuffled split on disk, then load both parts back into memory.
shuffle_data()
# Context managers close the pickle files as soon as they have been read.
# NOTE: pickle.load() can execute arbitrary code; only load files you created.
with open("train.p", "rb") as _fh:
    train = pickle.load(_fh)
with open("valid.p", "rb") as _fh:
    valid = pickle.load(_fh)

In [ ]:


In [ ]: