In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess, lemmatize
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from gensim.models.ldamodel import LdaModel
from scipy.stats import skew, kurtosis
from nltk.corpus import stopwords
from os import listdir
import numpy as np
import codecs
import re
In [2]:
# TF-IDF, LDA, LSI, RP, HDP
class MyCorpus(object):
    def __init__(self, path):
        self.path = path
        self.text, self.klasses = self.get_klass_ids()
        self.dictionary = corpora.Dictionary(self.text.get(text) for text in self.klasses.keys())
    def __iter__(self):
        for sent in self.klasses.keys():
            #yield self.dictionary.doc2bow(self.text.get(sent))
            yield self.text.get(sent)
    def text_bow(self, sent):
        # bag-of-words vector for a single document id, e.g. 'SENT_0'
        return self.dictionary.doc2bow(self.text.get(sent))
    def get_data(self, fname):
        data = open(self.path + fname).readlines()
        for sent in data:
            yield self.pre_process(sent)
    def get_klass_ids(self):
        # one .data file per class; each line becomes one document
        ids_text = {}
        ids_klasses = {}
        i = 0
        for klass in listdir(self.path):
            for row in self.get_data(klass):
                if row is not None:
                    ids_text['SENT_%s' % i] = row
                    ids_klasses['SENT_%s' % i] = klass.replace('.data', '')
                    i = i + 1
        return ids_text, ids_klasses
    def pre_process(self, text):
        sentence = re.sub('[.,"]', '', text)
        #sentence = sentence.lower().decode('ISO-8859-7').split()
        # REMOVING STOPWORDS ~~~~~~~~~~~~~~~~~~~~
        # stopCached = set(stopwords.words('english'))
        # sentence = [word for word in sentence.lower().decode('ISO-8859-7').split() if word not in stopCached]
        ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # LEMMATIZE TEXT ~~~~~~~~~~~~~~~~~~~~~~~~
        # gensim's lemmatize needs the Pattern library and returns tokens tagged like 'word/NN'
        sentence = lemmatize(sentence)
        ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        return sentence
In [ ]:
mc = MyCorpus('Bases/Reyes/')
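In [ ]:
# Quick sanity check on the loaded corpus (a minimal sketch -- it assumes
# 'Bases/Reyes/' holds one .data file per class and that 'SENT_0' exists):
print 'documents:', len(mc.text)
print 'classes:', sorted(set(mc.klasses.values()))
print 'dictionary size:', len(mc.dictionary)
print 'sample bag-of-words:', mc.text_bow('SENT_0')[:10]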
In [ ]:
from sklearn.decomposition import LatentDirichletAllocation
def split_into_lemmas(message):
    # the tokens were already lemmatized in MyCorpus.pre_process,
    # so this analyzer simply passes them through unchanged
    return [word for word in message]
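In [ ]:
# The header comment also lists TF-IDF and LSI; here is a minimal gensim
# sketch of that path (an assumption for illustration -- this notebook
# itself only fits sklearn's LDA below; num_topics=10 is arbitrary):
bow_corpus = [mc.text_bow(sent) for sent in mc.klasses.keys()]
tfidf_model = models.TfidfModel(bow_corpus)
lsi_model = models.LsiModel(tfidf_model[bow_corpus], id2word=mc.dictionary, num_topics=10)
for topic in lsi_model.show_topics(num_topics=5):
    print topic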
In [ ]:
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(mc)
print len(bow_transformer.vocabulary_)
In [ ]:
messages_bow = bow_transformer.transform(mc)
In [ ]:
message4 = mc.text.get('SENT_5')
print message4
In [ ]:
bow4 = bow_transformer.transform([message4])
print bow4
print bow4.shape
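In [ ]:
# Map the non-zero entries of bow4 back to vocabulary terms (an illustrative
# step; column indices follow bow_transformer's vocabulary order):
feature_names = bow_transformer.get_feature_names()
for idx in bow4.nonzero()[1]:
    print feature_names[idx], bow4[0, idx]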
In [ ]:
print bow_transformer.get_feature_names()[1000]
print bow_transformer.get_feature_names()[8013]
In [ ]:
messages_bow = bow_transformer.transform(mc.text.values())
print 'sparse matrix shape:', messages_bow.shape
print 'number of non-zeros:', messages_bow.nnz
print 'sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
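In [ ]:
# TF-IDF weighting over the raw counts (a sketch of the TF-IDF step named in
# the header comment but never run here; transformer defaults are assumed):
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)
print 'tf-idf matrix shape:', messages_tfidf.shape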
In [ ]:
transformer_lda = LatentDirichletAllocation().fit(messages_bow)
In [ ]:
messages_lda = transformer_lda.transform(messages_bow)
print messages_lda.shape
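In [ ]:
# Inspect the fitted LDA by listing the ten top-weighted terms per topic
# (components_ holds sklearn's topic-word weight matrix):
feature_names = bow_transformer.get_feature_names()
for topic_idx, topic in enumerate(transformer_lda.components_):
    top_terms = [feature_names[i] for i in topic.argsort()[-10:][::-1]]
    print 'topic %d: %s' % (topic_idx, ' '.join(top_terms))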
In [ ]:
%time spam_detector = LinearSVC().fit(messages_lda, mc.klasses.values())
In [ ]:
# accuracy on the training data itself (optimistic; see the held-out split below)
print spam_detector.score(messages_lda, mc.klasses.values())
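In [ ]:
# Held-out evaluation sketch using the utilities imported above (the 80/20
# split and random_state are assumptions, not values from the notebook):
X_train, X_test, y_train, y_test = train_test_split(
    messages_lda, mc.klasses.values(), test_size=0.2, random_state=42)
classifier = LinearSVC().fit(X_train, y_train)
predictions = classifier.predict(X_test)
print classification_report(y_test, predictions)
print confusion_matrix(y_test, predictions)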