In [3]:
basePath = 'dataminingcapstone-001'
hygienePath = 'Hygiene'
workingDir = os.path.join(os.curdir, basePath, hygienePath)
reviewsPath = os.path.join(workingDir, 'hygiene.dat')
labelsPath = os.path.join(workingDir, 'hygiene.dat.labels')
In [4]:
N = 546
# The first N lines of both files are the labeled samples; the remainder of
# hygiene.dat is the unlabeled data we need to predict.
with open(reviewsPath, 'r') as f:
    data_train = [next(f) for x in xrange(N)]
    data_pred = [x for x in f]
with open(labelsPath, 'r') as f:
    y_train = [next(f) for x in xrange(N)]  # each label keeps its trailing '\n'
In [5]:
print "Train data length: {}".format(len(data_train))
print "Predicted data length: {}".format(len(data_pred))
In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_simple = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words='english')
X_train = tfidf_simple.fit_transform(data_train)
X_test = tfidf_simple.transform(data_pred)
In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
In [11]:
predictedLabelsPath = os.path.join(workingDir, 'output1.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in pred:
        f.write(line)
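Since the held-out labels aren't available locally, a quick cross-validation score on the labeled split gives a rough sense of model quality before writing a submission file. This is a minimal sketch, assuming the older sklearn.cross_validation API (consistent with sklearn.grid_search used later in this notebook); clf, X_train, and y_train are the objects defined above.
In [ ]:
from sklearn.cross_validation import cross_val_score
# 5-fold CV accuracy for the random forest on the labeled split
scores = cross_val_score(clf, X_train, [y.strip() for y in y_train], cv=5)
print "CV accuracy: {:.3f} (+/- {:.3f})".format(scores.mean(), scores.std() * 2)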
First of all, I'll sanity-check the chosen preprocessing steps on a couple of simple cases.
In [17]:
s = u'I\'ll be waiting for you! I don\'t mind. What\'s the hell? I\'m gonna home!!'
In [18]:
punctuation = """!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""
remove_punctuation_map = dict((ord(char), None) for char in punctuation)
s = s.translate(remove_punctuation_map).lower()
In [19]:
from nltk.tokenize.stanford import StanfordTokenizer
path_to_jar = '/Users/igorsokolov/stanford-postagger-2015-04-20/stanford-postagger-3.5.2.jar'
tokens = StanfordTokenizer(path_to_jar=path_to_jar).tokenize(s)
print tokens
In [20]:
from nltk.stem.porter import PorterStemmer
porterStemmer = PorterStemmer()
stems = [porterStemmer.stem(item) for item in tokens]
print stems
After the exercises above we're ready to implement a subroutine for each preprocessing step.
In [21]:
# I could use the string.punctuation constant, but it contains the apostrophe ('),
# which is widely used in casual speech, so I define my own constant without it.
punctuation = """!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""
remove_punctuation_map = dict((ord(char), None) for char in punctuation)
def remove_punctuation(line):
    return line.translate(remove_punctuation_map).lower()
In [22]:
from nltk.tokenize.stanford import StanfordTokenizer
path_to_jar = '/Users/igorsokolov/stanford-postagger-2015-04-20/stanford-postagger-3.5.2.jar'
tokenizer = StanfordTokenizer(path_to_jar=path_to_jar, options={"americanize": True})
def tokenize(line):
    return tokenizer.tokenize(line)
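A quick check that the americanize option is in effect; per the Stanford tokenizer's documentation it rewrites British spellings as American ones. This assumes the same jar as above, and the input sentence is just an illustration:
In [ ]:
print tokenize(u"I like the colour of this cafe")
# expect 'colour' to come back as 'color'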
In [23]:
from nltk.stem.porter import PorterStemmer
porterStemmer = PorterStemmer()
def stemming(tokens):
    return [porterStemmer.stem(item) for item in tokens]
In [24]:
import re
reductions_map = {'\'m': 'am', 'n\'t': 'not', '\'ll': 'will', '\'s': 'is', '\'ve': 'have', '\'d': 'would',
                  '\'re': 'are'}
def replace_reductions(line):
    return reduce(lambda x, y: x.replace(y, reductions_map[y]), reductions_map, line)
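A quick demonstration on a string shaped like the tokenizer's output (contractions split off as separate tokens), which is the form replace_reductions sees in the pipeline below; the sentence itself is made up:
In [ ]:
print replace_reductions("i 'll be home , i do n't mind")
# i will be home , i do not mind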
In [25]:
reviewsRDD = sc.textFile(reviewsPath, use_unicode=True)
labelsRDD = sc.textFile(labelsPath)
In [30]:
preprocessed_reviews_RDD = (reviewsRDD
                            .map(lambda line: line.lower())
                            .map(lambda line: line.replace(u'\xa0', u' '))  # normalize non-breaking spaces
                            .map(lambda line: remove_punctuation(line))
                            .map(lambda line: tokenize(line))
                            .map(lambda tokens: stemming(tokens))
                            .map(lambda tokens: " ".join(tokens))
                            .map(lambda line: replace_reductions(line))
                            )
preprocessed_reviews_RDD.take(1)
Out[30]:
In [ ]:
preprocessed_reviews = preprocessed_reviews_RDD.collect()
preprocessed_data_train = preprocessed_reviews[:N]
preprocessed_data_pred = preprocessed_reviews[N:]
print "Train data length: {}".format(len(preprocessed_data_train))
print "Data to predict length: {}".format(len(preprocessed_data_pred))
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8,
                             stop_words='english')
preprocessed_X_train = vectorizer.fit_transform(preprocessed_data_train)
preprocessed_X_test = vectorizer.transform(preprocessed_data_pred)
print("n_samples: %d, n_features: %d" % preprocessed_X_train.shape)
In [269]:
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier(tol=1e-2, solver="lsqr")
clf.fit(preprocessed_X_train, y_train)
Out[269]:
In [270]:
preprocessed_pred = clf.predict(preprocessed_X_test)
In [271]:
predictedLabelsPath = os.path.join(workingDir, 'output7.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in preprocessed_pred:
        f.write(line)
In [265]:
from sklearn.feature_selection import SelectKBest, chi2
select_chi2 = preprocessed_X_train.shape[1]  # k equals the feature count, so no features are dropped
ch2 = SelectKBest(chi2, k=select_chi2)
preprocessed_X_train = ch2.fit_transform(preprocessed_X_train, y_train)
preprocessed_X_test = ch2.transform(preprocessed_X_test)
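With k equal to the full vocabulary size, SelectKBest only ranks the features; it becomes a real filter with a smaller k. A sketch for inspecting which terms chi2 ranks highest, assuming the vectorizer fitted above (the k value here is arbitrary):
In [ ]:
ch2_small = SelectKBest(chi2, k=1000)
ch2_small.fit(preprocessed_X_train, y_train)
feature_names = vectorizer.get_feature_names()
# terms that survive the chi2 cut
kept = [feature_names[i] for i in ch2_small.get_support(indices=True)]
print kept[:20]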
In [266]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
clf = MultinomialNB(alpha=.01)
clf.fit(preprocessed_X_train, y_train)
bayes_pred = clf.predict(preprocessed_X_test)
In [267]:
predictedLabelsPath = os.path.join(workingDir, 'output6.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in bayes_pred:
        f.write(line)
In [272]:
from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(preprocessed_X_train, y_train)
pred = clf.predict(preprocessed_X_test)
predictedLabelsPath = os.path.join(workingDir, 'output8.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in pred:
        f.write(line)
In [273]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
clf = MultinomialNB(alpha=.01)
clf.fit(preprocessed_X_train, y_train)
pred = clf.predict(preprocessed_X_test)
predictedLabelsPath = os.path.join(workingDir, 'output9.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in pred:
        f.write(line)
In [274]:
addsPath = os.path.join(workingDir, 'hygiene.dat.additional')
addsRDD = sc.textFile(addsPath, use_unicode=True)
In [287]:
import re
# Parse one line of hygiene.dat.additional: "['cat1', 'cat2', ...]",zip,review_count,rating
def parseLine(line):
    t = line.split('"')
    categories = re.compile("'(\w*)'").findall(t[1])
    numbers = t[2].split(',')[1:]
    zip_code = numbers[0]
    review_count = int(numbers[1])
    rating = float(numbers[2])
    return categories, [zip_code, review_count, rating]
parseLine("\"['Vietnamese', 'Sandwiches', 'Restaurants']\",98118,4,4.0")
Out[287]:
(['Vietnamese', 'Sandwiches', 'Restaurants'], ['98118', 4, 4.0])
In [288]:
additionals = addsRDD.map(lambda x: parseLine(x)).collect()
In [463]:
from sklearn.feature_extraction import DictVectorizer
dict_vectorizer = DictVectorizer()
# one-hot encode the zip code (adds[0])
dict_index_train = [{str(adds[0]): 1} for categories, adds in additionals[:N]]
dict_index_test = [{str(adds[0]): 1} for categories, adds in additionals[N:]]
index_X_train = dict_vectorizer.fit_transform(dict_index_train)
index_X_test = dict_vectorizer.transform(dict_index_test)
In [464]:
from sklearn.feature_extraction import DictVectorizer
cat_dict_vectorizer = DictVectorizer()
def map_items(categories_list):
    return {cat: 1 for cat in categories_list}
categories_map_train = [map_items(categories) for categories, adds in additionals[:N]]
categories_map_test = [map_items(categories) for categories, adds in additionals[N:]]
categories_map_X_train = cat_dict_vectorizer.fit_transform(categories_map_train)
categories_map_X_test = cat_dict_vectorizer.transform(categories_map_test)
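To make the encoding concrete, here is what DictVectorizer produces for two toy category dicts (hypothetical values, not taken from the dataset):
In [ ]:
demo_vectorizer = DictVectorizer()
demo_X = demo_vectorizer.fit_transform([{'Vietnamese': 1, 'Restaurants': 1}, {'Pizza': 1}])
print demo_vectorizer.get_feature_names()
# ['Pizza', 'Restaurants', 'Vietnamese']
print demo_X.toarray()
# [[ 0.  1.  1.]
#  [ 1.  0.  0.]]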
In [465]:
# review_count and rating columns as dense numeric features
ratings_train = [[float(x[1][1]), float(x[1][2])] for x in additionals[:N]]
ratings_test = [[float(x[1][1]), float(x[1][2])] for x in additionals[N:]]
print len(ratings_train), len(ratings_test)
In [466]:
preprocessed_X_train.toarray()
Out[466]:
In [476]:
from scipy.sparse import hstack
combined_X_train = hstack([preprocessed_X_train, ratings_train, index_X_train, categories_map_X_train])
combined_X_test = hstack([preprocessed_X_test, ratings_test, index_X_test, categories_map_X_test])
print combined_X_train.toarray()
In [461]:
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier(tol=1e-2, solver="lsqr")
clf.fit(combined_X_train, y_train)
Out[461]:
In [462]:
combined_pred = clf.predict(combined_X_test)
predictedLabelsPath = os.path.join(workingDir, 'output14.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in combined_pred:
        f.write(line)
In [469]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(combined_X_train, y_train)
svc_pred = svc.predict(combined_X_test)
predictedLabelsPath = os.path.join(workingDir, 'output15.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in svc_pred:
        f.write(line)
In [500]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
from sklearn.grid_search import GridSearchCV
ridge_pipeline = Pipeline([
    ('feature_kbest', SelectKBest(chi2)),  # score_func and k are overridden by the grid below
    ('ridge_classifier', RidgeClassifier(tol=1e-2, solver="lsqr"))
])
param_grid = dict(feature_kbest__score_func=[chi2, f_classif],
                  feature_kbest__k=[10, 15, 20, 25, 30, 50, 100, 1000, 10000, 20000, 'all'],
                  ridge_classifier__tol=[1e-8, 1e-6, 1e-4, 1e-2, 1e-1],
                  ridge_classifier__solver=['auto', 'cholesky', 'lsqr', 'sparse_cg'],
                  ridge_classifier__normalize=[True, False])
# note: this grid has 2 * 11 * 5 * 4 * 2 = 880 parameter combinations
grid_search = GridSearchCV(ridge_pipeline, param_grid=param_grid)
grid_search.fit(combined_X_train, y_train)
print(grid_search.best_estimator_)
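GridSearchCV also exposes the winning parameter combination and its cross-validated score directly, which is easier to read than the estimator repr:
In [ ]:
print(grid_search.best_params_)
print(grid_search.best_score_)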
In [ ]:
tuned_ridge_pipeline = Pipeline([
    ('feature_kbest', SelectKBest(chi2, k=10)),
    ('ridge_classifier', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True,
                                         fit_intercept=True, max_iter=None, normalize=False,
                                         solver='auto', tol=0.0001))
])
tuned_ridge_pipeline.fit(combined_X_train, y_train)
tuned_ridge_pipeline_pred = tuned_ridge_pipeline.predict(combined_X_test)
predictedLabelsPath = os.path.join(workingDir, 'output18.txt')
with open(predictedLabelsPath, 'w') as f:
    f.write('sis\n')
    for line in tuned_ridge_pipeline_pred:
        f.write(line)