In [3]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
path='/Users/sebastienconort/Documents/DataScience/DataScienceNet/CDiscount/'
#%% load and normalize the data
def norm(x):
    # lowercase, decompose accents (NFKD), then drop any non-ASCII characters
    return unicodedata.normalize('NFKD', unicode(x).lower()).encode('ASCII', 'ignore')
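A quick sanity check of norm (illustrative; assumes Python 2, where unicode() is a builtin): accents are decomposed away and the text is lowercased.
In [ ]:
# accented French product text becomes plain lowercase ASCII
print norm(u'Écran LED Iiyama')   # -> 'ecran led iiyama'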
In [5]:
# load the lightweight training data
tr_df=pd.read_csv(path+'data/training_norm_light_th100_2.csv',sep=";",encoding='utf-8')
un_cat3=tr_df.Categorie3.unique()
# map each Categorie3 label to an integer id, in order of first appearance
rep=pd.Series(range(un_cat3.shape[0]),index=un_cat3)
y=tr_df.Categorie3.map(rep)
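The mapping above assigns ids in order of first appearance; sklearn's LabelEncoder gives an equivalent integer encoding (with ids assigned in sorted label order instead), shown here as an alternative sketch:
In [ ]:
from sklearn.preprocessing import LabelEncoder
# alternative target encoding; inverse_transform recovers the original labels
le=LabelEncoder()
y_alt=pd.Series(le.fit_transform(tr_df.Categorie3),index=tr_df.index)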
In [10]:
te_df=pd.read_csv(path+'data/test.csv',sep=";",encoding='utf-8')
te_df['Description']=te_df.Description.map(norm)
te_df['Libelle']=te_df.Libelle.map(norm)
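One caveat with norm on raw CSV columns: missing values pass through unicode() and come out as the literal string 'nan'. A hedged variant that blanks out missing text first, usable in place of the two lines above:
In [ ]:
# optional: fill missing values with an empty string before normalizing
te_df['Description']=te_df.Description.fillna(u'').map(norm)
te_df['Libelle']=te_df.Libelle.fillna(u'').map(norm)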
In [19]:
# generic batch iterator
from sklearn.pipeline import Pipeline
import time
import sys
import scipy
from nltk.classify.maxent import MaxentClassifier
from sklearn.base import clone
from sklearn.metrics.pairwise import cosine_similarity
r=np.random.RandomState(42)
def draw(a,size):
    # shuffle once, then cycle through the permutation until `size` items are drawn
    aa=r.permutation(a)
    return [aa[i%len(aa)] for i in range(size)]
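# draw cycles through a single shuffle, so size > len(a) repeats elements,
# e.g. draw([1, 2, 3], 5) could yield [3, 1, 2, 3, 1]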
classes=y.unique()
# hold out one example per Categorie3 as a small validation set
one_te=list(pd.DataFrame(y).groupby("Categorie3").apply(lambda x: r.choice(x.index,1)[0]))
batch_size=5
batch_nb=1
cur_set=set(tr_df.index)
cur_set=cur_set.difference(one_te)
# multinomial naive Bayes with light Lidstone smoothing; fit_prior=False keeps class priors uniform
mnb_iter2=MultinomialNB(fit_prior=False,alpha=0.25)
# for each category, pre-draw batch_size*batch_nb training indices (wrapping around if needed)
bycat3=pd.DataFrame(y[list(cur_set)]).groupby("Categorie3").apply(lambda x:draw(x.index,batch_size*batch_nb))
HV=TfidfVectorizer(sublinear_tf=True,ngram_range=(1, 1),max_features=None,stop_words=None)
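# (sublinear_tf=True above replaces raw term counts tf with 1 + log(tf),
#  damping tokens repeated many times within one document)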
# fit the tf-idf vocabulary on the test-set text; HVB is a tokenizer assumed to be
# defined in an earlier cell (it returns a list of tokens for a description string)
HV.fit(te_df.apply(lambda x: ' '.join(HVB(unicode(x.Description))) + ' ' + unicode(x.Marque) + ' ' + unicode(x.Libelle), axis=1))
ind_all=[xx for x in bycat3 for xx in x]
for mod in [("mnb_iter4",mnb_iter2)]:
    cl=mod[1]
    #cl2=clone(cl)
    for e in range(batch_nb):
        t0=time.time()
        # gather this epoch's training indices: batch_size draws per category
        ind=[]
        for b in range(batch_size):
            ind.extend(list(bycat3.apply(lambda x:x[(e*batch_size+b)])))
        ind=list(set(ind))
        print 'fitting tfidf on hash vector'
        sys.stdout.flush()
        X=HV.transform(tr_df.loc[ind].apply(lambda x: ' '.join(HVB(unicode(x.Description))) + ' ' + unicode(x.Marque) + ' ' + unicode(x.Libelle), axis=1))
        X_te=HV.transform(tr_df.loc[one_te].apply(lambda x: ' '.join(HVB(unicode(x.Description))) + ' ' + unicode(x.Marque) + ' ' + unicode(x.Libelle), axis=1))
        print 'tfidf on hash vector fitted'
        print 'X size is ', X.shape
        t1=time.time()
        print t1-t0, 'sec'
        sys.stdout.flush()
        print 'fitting model'
        sys.stdout.flush()
        # inverse-frequency sample weights so every class contributes equally
        # (1.0/a forces float division; 1/a would truncate to zero in Python 2)
        a=y[ind].value_counts()
        w=1.0/a*max(a)
        weights=[w[x] for x in y[ind]]
        cl.fit(X,y[ind].values,sample_weight=weights)
        #cl.partial_fit(X,y[ind].values,classes=classes,sample_weight=weights)
        print 'model fitted'
        t2=time.time()
        print t2-t1, 'sec'
        sys.stdout.flush()
        prob_te=cl.predict_proba(X_te)
        # map argmax positions back to class labels via classes_
        pred_via_prob=cl.classes_[prob_te.argmax(1)]
        pred=cl.predict(X_te)
        # three equivalent ways to compute held-out accuracy
        print "score method 1", cl.score(X_te,y[one_te])
        print "score method 2", np.mean(pred==y[one_te])
        print "score method 3", np.mean(pred_via_prob==y[one_te])