In [ ]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import sys
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
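
In [ ]:
# Quick illustrative check: the Porter stemmer reduces inflected word forms
# to a common stem (sample words chosen just for demonstration)
print(ps.stem("dancing"), ps.stem("dancers"), ps.stem("danced"))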

In [ ]:
print(os.getcwd())

In [ ]:
# If necessary, change the working directory
#os.chdir('c:\\Users\..')

In [ ]:
data = pd.read_csv("nightlife_sanfrancisco_en.csv", header=0, delimiter=",")

In [ ]:
# Explore the data set
data.shape

In [ ]:
data.columns.values

In [ ]:
print data["text"][0]

In [ ]:
# Download the NLTK resources needed for stop word removal
import nltk
nltk.download('popular')  # Download text data sets, including the stop word list

In [ ]:
from nltk.corpus import stopwords # Import the stop word list
print(stopwords.words("english"))
#words = [w for w in words if w not in stopwords.words("english")]
#print(words)  # under Python 2 each word would be shown with a "u" prefix, marking it as a unicode string
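
In [ ]:
# Small illustrative example of stop word filtering on a made-up sentence
sample_words = "the bar was loud but the cocktails were great".split()
print([w for w in sample_words if w not in stopwords.words("english")])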

In [ ]:
# Clean all records
def text_to_words( raw_text ):
    # 1. Remove newlines
    without_end_line = re.sub(r'\n', ' ', raw_text)
    # 2. Remove carriage returns
    without_start_line = re.sub(r'\r', ' ', without_end_line)
    # 3. Remove punctuation
    without_punctual = re.sub(r'[\W_]+', ' ', without_start_line)
    # 4. Replace each run of numbers with the placeholder XnumberX
    without_number = re.sub(r'(\d+\s*)+', ' XnumberX ', without_punctual)
    # 5. Remove remaining non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", without_number)
    # 6. Convert to lower case
    lower_case = letters_only.lower()
    # 7. Split into individual words
    words = lower_case.split()
    # 8. Stemming with the Porter stemmer
    meaningful_words = [ps.stem(word) for word in words]
    # 9. Stop word removal is skipped here: it is handled later, in the bag-of-words step,
    #    by CountVectorizer (note that stemming may alter some stop words, so a few can slip through)
    #stops = set(stopwords.words("english"))
    #meaningful_words = [w for w in words if w not in stops]
    # 10. Join the words back into one string separated by spaces and return the result
    return " ".join(meaningful_words)
    #return meaningful_words

In [ ]:
clean_text = text_to_words( data["text"][0] )
print(clean_text)

In [ ]:
# Get the number of texts from the dataframe column size
num_text = data["text"].size
# Initialize an empty list to hold the cleaned texts
clean_data = []
# Loop over each text with an index i that goes from 0 to the number of texts
print("Cleaning and parsing the data set text...\n")
for i in range(0, num_text):
    # Print a progress message every 1000 texts
    if (i+1) % 1000 == 0:
        print("Text %d of %d\n" % (i+1, num_text))
    clean_data.append(text_to_words(data["text"][i]))  # in case of error run "pip install -U nltk"

In [ ]:
# Compare original and edited text
data['text'][0]

In [ ]:
clean_data[0]

In [ ]:
print "Creating the bag of words...\n"
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000)

# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(clean_data)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()
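
In [ ]:
# Toy example on two made-up sentences, showing what fit_transform produces:
# each row is a document, each column a vocabulary word, each cell a count
toy_vectorizer = CountVectorizer()
toy_features = toy_vectorizer.fit_transform(["great bar great music", "quiet bar"])
print(toy_vectorizer.get_feature_names())  # on scikit-learn >= 1.0 use get_feature_names_out()
print(toy_features.toarray())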

In [ ]:
print(train_data_features.shape)

In [ ]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()  # on scikit-learn >= 1.0 use get_feature_names_out()
print(vocab)

In [ ]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print(count, tag)
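
In [ ]:
# Illustrative: list the 20 most frequent vocabulary words by sorting the summed counts
for count, tag in sorted(zip(dist, vocab), reverse=True)[:20]:
    print(count, tag)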

In [ ]:
# Use the features in a model: random forest example
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as
# features and the star ratings as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, data["stars"] )
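
In [ ]:
# Minimal evaluation sketch, assuming a simple random split is acceptable here:
# hold out part of the data, retrain, and check accuracy on the held-out portion
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_data_features, data["stars"], test_size=0.2, random_state=42)
eval_forest = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
print("Held-out accuracy:", eval_forest.score(X_test, y_test))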