Task 6: Hygiene Prediction


In [3]:
import os

basePath = 'dataminingcapstone-001'
hygienePath = 'Hygiene'
workingDir = os.path.join(os.curdir, basePath, hygienePath)

reviewsPath = os.path.join(workingDir, 'hygiene.dat')
labelsPath = os.path.join(workingDir, 'hygiene.dat.labels')

1. Straightforward solution


In [4]:
# The first N = 546 reviews are labeled; the remaining ones must be predicted.
N = 546

with open(reviewsPath, 'r') as f:
    data_train = [next(f) for x in xrange(N)]
    data_pred = [x for x in f]

# Labels keep their trailing newlines so predictions can be written back verbatim later.
with open(labelsPath, 'r') as f:
    y_train = [next(f) for x in xrange(N)]

In [5]:
print "Train data length: {}".format(len(data_train))
print "Predicted data length: {}".format(len(data_pred))


Train data length: 546
Predicted data length: 12753

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_simple = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words='english')
X_train = tfidf_simple.fit_transform(data_train)
X_test = tfidf_simple.transform(data_pred)
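
A quick shape check (my addition) mirrors the one used later in section 2:


In [ ]:
print "Train matrix: n_samples=%d, n_features=%d" % X_train.shape
print "Test matrix: n_samples=%d, n_features=%d" % X_test.shape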

In [10]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
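
The notebook never evaluates locally before submitting. A minimal cross-validation sketch on the labeled subset (my addition; it reuses X_train and y_train, stripping the trailing newlines kept in the labels):


In [ ]:
from sklearn.cross_validation import cross_val_score

# strip trailing newlines from the labels before scoring
y_clean = [y.strip() for y in y_train]
scores = cross_val_score(RandomForestClassifier(n_estimators=100), X_train, y_clean, cv=5)
print "Mean CV accuracy: {:.3f} (+/- {:.3f})".format(scores.mean(), scores.std())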

In [11]:
predictedLabelsPath = os.path.join(workingDir, 'output1.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in pred:
        f.write(line)

2. Preprocessing improvements

First of all, I'll sanity-check a couple of example cases for each of the chosen preprocessing steps.


In [17]:
s = u'I\'ll be waiting for you! I don\'t mind. What\'s the hell? I\'m gonna home!!'

In [18]:
punctuation = """!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""

remove_punctuation_map = dict((ord(char), None) for char in punctuation)
s = s.translate(remove_punctuation_map).lower()

In [19]:
from nltk.tokenize.stanford import StanfordTokenizer

path_to_jar = '/Users/igorsokolov/stanford-postagger-2015-04-20/stanford-postagger-3.5.2.jar'
tokens = StanfordTokenizer(path_to_jar=path_to_jar).tokenize(s)

print tokens


[u'i', u"'ll", u'be', u'waiting', u'for', u'you', u'i', u'do', u"n't", u'mind', u'what', u"'s", u'the', u'hell', u'i', u"'m", u'gon', u'na', u'home']

In [20]:
from nltk.stem.porter import PorterStemmer

porterStemmer = PorterStemmer()
stems = [porterStemmer.stem(item) for item in tokens]

print stems


[u'i', u"'ll", u'be', u'wait', u'for', u'you', u'i', u'do', u"n't", u'mind', u'what', u"'s", u'the', u'hell', u'i', u"'m", u'gon', u'na', u'home']

After the exercises above we're ready to implement a subroutine for each preprocessing step.


In [21]:
# I could use the string.punctuation constant, but it contains the symbol ' which is widely
# used in casual speech. So I defined my own constant without it.
punctuation = """!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""

remove_punctuation_map = dict((ord(char), None) for char in punctuation)

def remove_punctuation(line):
    return line.translate(remove_punctuation_map).lower()

In [22]:
from nltk.tokenize.stanford import StanfordTokenizer

path_to_jar = '/Users/igorsokolov/stanford-postagger-2015-04-20/stanford-postagger-3.5.2.jar'
tokenizer = StanfordTokenizer(path_to_jar=path_to_jar, options={"americanize": True})

def tokenize(line):
    return tokenizer.tokenize(line)

In [23]:
from nltk.stem.porter import PorterStemmer

porterStemmer = PorterStemmer()

def stemming(tokens):
    return [porterStemmer.stem(item) for item in tokens]

In [24]:
# Expand the contraction fragments left by the Stanford tokenizer (e.g. "n't", "'ll") into
# full words. Note that "'s" is ambiguous (possessive vs. "is"); mapping it to "is" is a
# deliberate simplification.
reductions_map = {'\'m': 'am', 'n\'t': 'not', '\'ll': 'will', '\'s': 'is', '\'ve': 'have', '\'d': 'would',
                  '\'re': 'are'}

def replace_reductions(line):
    return reduce(lambda x, y: x.replace(y, reductions_map[y]), reductions_map, line)
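
A quick check of this routine (my addition), reusing the stems list from the stemming example above:


In [ ]:
print replace_reductions(" ".join(stems))
# expected: i will be wait for you i do not mind what is the hell i am gon na home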

In [25]:
# sc is the SparkContext provided by the PySpark notebook environment
reviewsRDD = sc.textFile(reviewsPath, use_unicode=True)
labelsRDD = sc.textFile(labelsPath)

In [30]:
preprocessed_reviews_RDD = (reviewsRDD
                              .map(lambda line: line.lower())
                              .map(lambda line: line.replace('&#160;', ''))
                              .map(lambda line: remove_punctuation(line))
                              .map(lambda line: tokenize(line))
                              .map(lambda tokens: stemming(tokens))
                              .map(lambda tokens: " ".join(tokens))
                              .map(lambda line: replace_reductions(line))
                          )

preprocessed_reviews_RDD.take(1)


Out[30]:
[u"the baguett and roll are excel and although i have not tri them yet i am excit about the dozenplu type of fill croissant on offer at ridicul low price chees with or without ham blueberri with or without cream chees chocol almond thi could be danger i have a bad bakeri habit but at least at q bakeri i wo not go broke while i get fat i have tri four differ banh mi and i will agre with matthew that the basic one are somewhat american mushroom wa more interest than chicken or bbq pork obvious that made thi a good place to get food for the inlaw that seem exot but not too scari note to self do not publish thi on facebook where the inlaw might see it q bakeri is locat on two divid street make it a slight pain to get to but it is worth a stop if you want afford varieti thi use to be the locat of my favorit bahn mi shop in seattl king baguett so when i saw that after one year as a pho restaur that a new bahn mi joint had taken up resid with dream of king baguett ' chicken bahn mi danc in my head i quickli rush inat first examin q bakeri is a lot more welcom than the old king baguett there are a varieti of bake good in the case and they have both bahn mi and regular deli sandwich avail i had both a pork and a teriyaki chicken bahn mi and i found that they did not quit hit the mark on the plu side q bakeri is not stingi with the meat and the meat is both flavor and juici the teriyaki chicken wa especi interest as i have never had a bahn mi like it befor and i quit enjoy it also q bakeri is merci light on the mayonnais on the down side both my sandwich lack much kick they tast like an american bahn mi with all the asian flavor tone way down the baguett itself wa also kind of disapoint be more chewi than crustyfor a brand new place howev i think q bakeri ha a lot of potenti and i look forward to tri them out again in the near futur yum i alway alway look forward to visit my famili in seattl wa so i can get all type of bake goodi from q is they are known for their banh mi is i usual order the banh mi cha lua the custom servic is get better english is obvious not their second languag but mani establish these day are not anyway love thi place we mostli go there for the vietnames sandwich i usual go for the veggi one they are full of flavor tofu and a great assort of veggi i do not feel like i have just had a sandwich of bread after eat there it actual feel like i have eaten my veggi my husband get the meat sandwich he say he like q is sandwich becaus chicken is not shred the sandwich is bigger and the store is great becaus it is better lit better arrang and tidier than somehav live in franc and eaten my way though manyapatisseri i have to add that q is pastri select and tast are pretti awesom i am talk straight up delici pain au chocolat and pain aux raisin they also have all sort of candi in jar and sesam treat they have one or 2 tabl w 4 or 6 chair for eat in they are open til 7pm also they take card which make it handi if we do not have cashon star off becaus it is been a littl tough to commun with the folk there sinc we do not speak vietnames would not be a problem except we have end up with the wrong food befor overal great littl place and i will total be bring my parent there when they come to visit thi weekend thank for the idea matthewp brought my mom there when she came for a visit and she love it natur the ` rent had never had a vietnames sandwich befor great way to introduc them to the magic of banh mi"]

In [ ]:
preprocessed_reviews = preprocessed_reviews_RDD.collect()

preprocessed_data_train = preprocessed_reviews[:N]
preprocessed_data_pred = preprocessed_reviews[N:]

print "Train data length: {}".format(len(preprocessed_data_train))
print "Predicted data length: {}".format(len(preprocessed_data_pred))

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8,
                             stop_words='english')
preprocessed_X_train = vectorizer.fit_transform(preprocessed_data_train)
preprocessed_X_test = vectorizer.transform(preprocessed_data_pred)

print "n_samples: %d, n_features: %d" % preprocessed_X_train.shape

In [269]:
from sklearn.linear_model import RidgeClassifier

clf = RidgeClassifier(tol=1e-2, solver="lsqr")
clf.fit(preprocessed_X_train, y_train)


Out[269]:
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, solver='lsqr', tol=0.01)

In [270]:
preprocessed_pred = clf.predict(preprocessed_X_test)

In [271]:
predictedLabelsPath = os.path.join(workingDir, 'output7.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in preprocessed_pred:
        f.write(line)
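
The same write-out boilerplate repeats for every submission file. A small helper could replace it (my addition; write_predictions is a hypothetical name, and 'sis' is the submission header used throughout):


In [ ]:
def write_predictions(filename, predictions, header='sis'):
    # predictions keep the trailing newline inherited from y_train, so write them verbatim
    path = os.path.join(workingDir, filename)
    with open(path, 'w') as f:
        f.write(header + '\n')
        for line in predictions:
            f.write(line)

# e.g. write_predictions('output7.txt', preprocessed_pred)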

3. Attempts to apply other classifiers

Chi-squared feature selection and naive Bayes


In [265]:
from sklearn.feature_selection import SelectKBest, chi2

# With k equal to the total number of features this selection is a no-op;
# it is kept as a placeholder for trying smaller values of k.
select_chi2 = preprocessed_X_train.shape[1]
ch2 = SelectKBest(chi2, k=select_chi2)
preprocessed_X_train = ch2.fit_transform(preprocessed_X_train, y_train)
preprocessed_X_test = ch2.transform(preprocessed_X_test)

In [266]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

clf = MultinomialNB(alpha=.01)
clf.fit(preprocessed_X_train, y_train)
bayes_pred = clf.predict(preprocessed_X_test)

In [267]:
predictedLabelsPath = os.path.join(workingDir, 'output6.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in bayes_pred:
        f.write(line)

Passive-aggressive classifier


In [272]:
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(preprocessed_X_train, y_train)

pred = clf.predict(preprocessed_X_test)

predictedLabelsPath = os.path.join(workingDir, 'output8.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in pred:
        f.write(line)

Bayes-based classifiers


In [273]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
clf = MultinomialNB(alpha=.01)
clf.fit(preprocessed_X_train, y_train)

pred = clf.predict(preprocessed_X_test)

predictedLabelsPath = os.path.join(workingDir, 'output9.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in pred:
        f.write(line)
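
None of these classifiers is compared locally before submission. A quick cross-validation comparison on the labeled subset (my addition, reusing the preprocessed matrices from section 2 and sklearn's old cross_validation API):


In [ ]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import RidgeClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB

candidates = [('ridge', RidgeClassifier(tol=1e-2, solver='lsqr')),
              ('passive-aggressive', PassiveAggressiveClassifier(n_iter=50)),
              ('multinomial NB', MultinomialNB(alpha=.01))]

# strip trailing newlines from the labels before scoring
y_clean = [y.strip() for y in y_train]
for name, clf in candidates:
    scores = cross_val_score(clf, preprocessed_X_train, y_clean, cv=5)
    print "{}: mean CV accuracy {:.3f}".format(name, scores.mean())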

4. Combining text features with additional information


In [274]:
addsPath = os.path.join(workingDir, 'hygiene.dat.additional')

addsRDD = sc.textFile(addsPath, use_unicode=True)

In [287]:
import re
def parseLine(line):
    """Split one line of hygiene.dat.additional into its category list
    and the [zip_code, review_count, rating] fields."""
    t = line.split('"')
    categories = re.compile("'(\w*)'").findall(t[1])
    numbers = t[2].split(',')[1:]

    zip_code = numbers[0]
    review_count = int(numbers[1])
    rating = float(numbers[2])

    return categories, [zip_code, review_count, rating]
    
parseLine("\"['Vietnamese', 'Sandwiches', 'Restaurants']\",98118,4,4.0")


Out[287]:
(['Vietnamese', 'Sandwiches', 'Restaurants'], ['98118', 4, 4.0])

In [288]:
additionals = addsRDD.map(lambda x: parseLine(x)).collect()

In [463]:
from sklearn.feature_extraction import DictVectorizer
dict_vectorizer = DictVectorizer()

# One-hot encode the zip code (adds[0]) for each restaurant.
dict_index_train = [{str(adds[0]): 1} for categories, adds in additionals[:N]]
dict_index_test = [{str(adds[0]): 1} for categories, adds in additionals[N:]]

index_X_train = dict_vectorizer.fit_transform(dict_index_train)
index_X_test = dict_vectorizer.transform(dict_index_test)

In [464]:
from sklearn.feature_extraction import DictVectorizer
cat_dict_vectorizer = DictVectorizer()

def map_items(categories_list):
    return {cat: 1 for cat in categories_list}

categories_map_train = [map_items(categories) for categories, adds in additionals[:N]]
categories_map_test = [map_items(categories) for categories, adds in additionals[N:]]

categories_map_X_train = cat_dict_vectorizer.fit_transform(categories_map_train)
categories_map_X_test = cat_dict_vectorizer.transform(categories_map_test)
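
As a quick sanity check of the encoding (my addition), the learned category vocabulary can be inspected:


In [ ]:
print cat_dict_vectorizer.get_feature_names()[:10]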

In [465]:
# review_count and rating for each restaurant
ratings_train = [[float(x[1][1]), float(x[1][2])] for x in additionals[:N]]
ratings_test = [[float(x[1][1]), float(x[1][2])] for x in additionals[N:]]

print len(ratings_train)


546
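
One caveat before stacking: review_count and rating live on much larger scales than the TF-IDF weights, which can dominate linear models such as RidgeClassifier. A minimal rescaling sketch (my addition; the hstack cell below still stacks the raw values):


In [ ]:
from sklearn.preprocessing import MinMaxScaler

# rescale review_count and rating into [0, 1] so they don't swamp the TF-IDF features
scaler = MinMaxScaler()
ratings_train_scaled = scaler.fit_transform(ratings_train)
ratings_test_scaled = scaler.transform(ratings_test)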

In [466]:
preprocessed_X_train.toarray()


Out[466]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [476]:
from scipy.sparse import hstack

combined_X_train = hstack([preprocessed_X_train, ratings_train, index_X_train, categories_map_X_train])
combined_X_test = hstack([preprocessed_X_test, ratings_test, index_X_test, categories_map_X_test])

print combined_X_train.toarray()


[[ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  1.  1.  0.]]

In [461]:
from sklearn.linear_model import RidgeClassifier

clf = RidgeClassifier(tol=1e-2, solver="lsqr")
clf.fit(combined_X_train, y_train)


Out[461]:
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, solver='lsqr', tol=0.01)

In [462]:
combined_pred = clf.predict(combined_X_test)

predictedLabelsPath = os.path.join(workingDir, 'output14.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in combined_pred:
        f.write(line)

In [469]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')

svc.fit(combined_X_train, y_train)
svc_pred = svc.predict(combined_X_test)

predictedLabelsPath = os.path.join(workingDir, 'output15.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in svc_pred:
        f.write(line)

5. Selecting the K best features and fine-tuning the classifier


In [500]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

ridge_pipeline = Pipeline([
        ('feature_kbest', SelectKBest(chi2)),  # score_func is overridden by the grid below
        ('ridge_classifier', RidgeClassifier(tol=1e-2, solver="lsqr"))
        ])


param_grid = dict(feature_kbest__score_func = [chi2, f_classif],
                  feature_kbest__k=[10, 15, 20, 25, 30, 50, 100, 1000, 10000, 20000, 'all'],
                  ridge_classifier__tol=[1e-8, 1e-6, 1e-4, 1e-2, 1e-1],
                  ridge_classifier__solver=['auto', 'cholesky', 'lsqr', 'sparse_cg'],
                  ridge_classifier__normalize=[True, False])

grid_search = GridSearchCV(ridge_pipeline, param_grid=param_grid)
grid_search.fit(combined_X_train, y_train)
print(grid_search.best_estimator_)


Pipeline(steps=[('feature_kbest', SelectKBest(k=20, score_func=<function chi2 at 0x108ac11b8>)), ('ridge_classifier', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=True, solver='lsqr', tol=0.01))])
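
The printed estimator shows the winning settings but not the score they achieved; that is worth recording too (my addition, using the fitted grid_search object above):


In [ ]:
print "Best CV score: {:.3f}".format(grid_search.best_score_)
print grid_search.best_params_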

In [ ]:
# Note: these hand-picked parameters differ from the best estimator reported by the grid search above.
tuned_ridge_pipeline = Pipeline([
        ('feature_kbest', SelectKBest(chi2, k=10)),
        ('ridge_classifier', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True,
                                             fit_intercept=True, max_iter=None, normalize=False,
                                             solver='auto', tol=0.0001))
        ])

tuned_ridge_pipeline.fit(combined_X_train, y_train)
tuned_ridge_pipeline_pred = tuned_ridge_pipeline.predict(combined_X_test)

predictedLabelsPath = os.path.join(workingDir, 'output18.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in tuned_ridge_pipeline_pred:
        f.write(line)