In [3]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cross_validation import train_test_split

path='/Users/sebastienconort/Documents/DataScience/DataScienceNet/CDiscount/'
#%% load et normalisation des donnees
def norm(x):
    return unicodedata.normalize('NFKD', unicode(x).lower()).encode('ASCII', 'ignore')

In [5]:
#load de la donnee light
tr_df=pd.read_csv(path+'data/training_norm_light_th100_2.csv',sep=";",encoding='utf-8')

un_cat3=tr_df.Categorie3.unique()
rep=pd.Series(range(tr_df.Categorie3.unique().shape[0]),index=tr_df.Categorie3.unique())
y=tr_df.Categorie3.map(rep)

In [10]:
te_df=pd.read_csv(path+'data/test.csv',sep=";",encoding='utf-8')
te_df['Description']=te_df.Description.map(norm)
te_df['Libelle']=te_df.Libelle.map(norm)

In [19]:
#iterateur generique
from sklearn.pipeline import Pipeline
import time
import sys
import scipy
from nltk.classify.maxent import MaxentClassifier
from sklearn.base import clone
from sklearn.metrics.pairwise import cosine_similarity

r=np.random.RandomState(42)

def draw(a,size):
    aa=r.permutation(a)
    return [aa[i%len(aa)] for i in range(size)]
   
classes=y.unique()
one_te=list(pd.DataFrame(y[:]).groupby("Categorie3").apply(lambda x: r.choice(x.index,1)[0]))

batch_size=5
batch_nb=1

cur_set=set(tr_df.index)
cur_set=cur_set.difference(one_te)
mnb_iter2=MultinomialNB(fit_prior=False,alpha=0.25)
bycat3=pd.DataFrame(y[list(cur_set)]).groupby("Categorie3").apply(lambda x:draw(x.index,batch_size*batch_nb))

HV=TfidfVectorizer(sublinear_tf=True,ngram_range=(1, 1),max_features=None,stop_words=None)
HV.fit(te_df.ix[:].apply(lambda x:    (' ').join(HVB(unicode(x.Description))[:]) + ' ' + unicode(x.Marque)+ ' ' +  unicode(x.Libelle) ,axis=1))

ind_all=[xx  for x in bycat3 for xx in x]

for mod in [("mnb_iter4",mnb_iter2)]:
    cl=mod[1]
    #cl2=clone(cl)
    for e in range(0,0+batch_nb):
        t0=time.time()
        ind=[]
        for b in range(batch_size):
            ind.extend(list(bycat3.apply(lambda x:x[(e*batch_size+b)])))
        ind=list(set(ind))
        print 'fitting tfidf on hash vector'
        sys.stdout.flush()        
        X=HV.transform(tr_df.ix[ind].apply(lambda x:   (' ').join(HVB(unicode(x.Description))[:]) + ' ' + unicode(x.Marque)+ ' ' +    unicode(x.Libelle) ,axis=1))
        X_te=HV.transform(tr_df.ix[one_te].apply(lambda x:   (' ').join(HVB(unicode(x.Description))[:]) + ' ' + unicode(x.Marque)+ ' ' +   unicode(x.Libelle),axis=1))
        print 'tfidf on hash vector fitted'
        print 'X size is ', X.shape
        t1=time.time()
        print t1-t0, 'sec'        
        sys.stdout.flush()
        #tr=range(len(ind))
        print 'fitting model'
        sys.stdout.flush()
        a=y[ind].value_counts()
        w=1/a*max(a)
        weights=[w[x] for x in y[ind]]
        cl.fit(X,y[ind].values,sample_weight=weights)
        #cl.partial_fit(X,y[ind].values,classes=classes,sample_weight=weights)
        print 'model fitted'
        t2=time.time()
        print t2-t1, 'sec'
        sys.stdout.flush()
        prob_te=cl.predict_proba(X_te)
        pred_via_prob=prob_te.argmax(1)
        pred=cl.predict(X_te)
        #score methode 1
        print "score methode 1", cl.score(X_te,y[one_te])
        #score methode 2
        print "score methode 1", np.mean(pred==y[one_te])
        #score methode 3
        print "score methode 1", np.mean(pred_via_prob==y[one_te])


fitting tfidf on hash vector
tfidf on hash vector fitted
X size is  (26968, 44203)
2.30995202065 sec
fitting model
model fitted
14.5418629646 sec
score methode 1 0.435481084816
score methode 1 0.435481084816
score methode 1 0.216617723268

In [ ]:


In [ ]:


In [ ]: