notebook.community

Edit and run



In [35]:

    
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

# Load in the Client Name data
# Make sure all names uppercase (there are some mixed instances)
pd.set_option('display.max_rows', 30)
path = '/home/zongyi/bimbo_data/'
# path = '/Users/zonemercy/jupyter_notebook/bimbo_data/'
vf = pd.read_csv(path+'cliente_tabla.csv',header=0)
vf['NombreCliente'] = vf['NombreCliente'].str.upper()



In [3]:

    
vf['NombreCliente'].value_counts()[0:10]









    Out[3]:





NO IDENTIFICADO    281670
LUPITA               4863
MARY                 3016
LA PASADITA          2426
LA VENTANITA         2267
LA GUADALUPANA       1299
ROSY                 1246
ALEX                 1242
GABY                 1238
LA ESCONDIDA         1216
Name: NombreCliente, dtype: int64



In [4]:

    
# Let's also generate a list of individual word frequency across all names
def tfidf_score_list(vf2, list_len):
    from sklearn.feature_extraction.text import TfidfVectorizer
    v = TfidfVectorizer()

    vf2['New'] = 'na'
    a = " ".join(vf2['NombreCliente'])
    vf2['New'][0] = a

    tfidf = v.fit_transform(vf2['New'])

    feature_names = v.get_feature_names()

    freq = []
    doc = 0
    feature_index = tfidf[doc,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf[doc, x] for x in feature_index])
    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
            freq.append((w.encode('utf-8'),s))
    
    del vf2['New']
    
    import numpy as np
    names = ['word','score']
    formats = ['S50','f8']
    dtype = dict(names = names, formats=formats)
    array = np.array(freq, dtype=dtype)

    b = np.sort(array, order='score')
    
    if list_len > len(b)+1:
        list_len = len(b)+1
    for i in range(1,list_len):
        print(b[-i])



In [5]:

    
tfidf_score_list(vf, 200)









    



/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    



('no', 0.6888126004393861)
('identificado', 0.6849292193081505)
('la', 0.14990532034895288)
('el', 0.08328478631485127)
('abarrotes', 0.0800093233318993)
('de', 0.060769929775020375)
('maria', 0.046819424108208656)
('miscelanea', 0.038050812737366206)
('super', 0.03520332413080589)
('los', 0.02841652592336794)
('san', 0.025741686070920162)
('oxxo', 0.02257321668206611)
('del', 0.021668634477420134)
('garcia', 0.021376833766244014)
('hernandez', 0.02099506116912192)
('jose', 0.019835153342196838)
('lupita', 0.019261278610217134)
('gonzalez', 0.01775850494766011)
('martinez', 0.01764664800837593)
('lopez', 0.017150586799376526)
('mini', 0.015331695699712036)
('rodriguez', 0.014891562960354718)
('las', 0.01388971385198337)
('don', 0.012542567235386944)
('comodin', 0.01238450851683321)
('guadalupe', 0.012357760118308733)
('mary', 0.012270219904955897)
('jesus', 0.012024620973049328)
('ramirez', 0.011995440901931718)
('juan', 0.011966260830814104)
('casa', 0.011929785741917089)
('ag', 0.011820360475226043)
('sanchez', 0.011567466525540072)
('perez', 0.011192988946197384)
('farmacia', 0.011115175423217084)
('hermanos', 0.010881734854276187)
('cruz', 0.01042458040676693)
('flores', 0.009619696778439463)
('mi', 0.008992325249410802)
('rosa', 0.008676207812303337)
('carmen', 0.008549760837460353)
('diconsa', 0.008435472225583038)
('papeleria', 0.008204463329235275)
('luis', 0.008189873293676469)
('torres', 0.007915094290652288)
('tienda', 0.007907799272872884)
('ventanita', 0.007778920625436765)
('ii', 0.007338787886079448)
('gomez', 0.007270701053471686)
('pasadita', 0.007265837708285418)
('cremeria', 0.007226930946795268)
('reyes', 0.007141822406035567)
('francisco', 0.006660351232594967)
('antonio', 0.0066554878874086985)
('esperanza', 0.006653056214815564)
('ana', 0.00665062454222243)
('puesto', 0.00655578931109019)
('martha', 0.006529040912565712)
('luz', 0.006475544115516757)
('santa', 0.0064536590621785475)
('angel', 0.006293168671031681)
('vazquez', 0.006271283617693472)
('gutierrez', 0.006174016713968098)
('hamburguesas', 0.006154563333223023)
('tiendita', 0.006059728102090784)
('diaz', 0.005938144472434066)
('miguel', 0.005792244116846006)
('jimenez', 0.00564877543385108)
('escuela', 0.0056025736545815275)
('gloria', 0.0055588035479051095)
('centro', 0.005527191804194363)
('misc', 0.005403176501944511)
('manuel', 0.0054007448293513765)
('elena', 0.005303477925626003)
('angeles', 0.005293751235253466)
('ma', 0.005174599278189883)
('morales', 0.005145419207072271)
('abts', 0.005121102481140927)
('teresa', 0.004958180417400927)
('ruiz', 0.0049095469655382395)
('margarita', 0.004839028460337343)
('juana', 0.004804985044033463)
('guadalupana', 0.004768509955136448)
('chavez', 0.00475878326476391)
('alex', 0.004683401414376746)
('do\xc3\xb1a', 0.004671243051411074)
('carniceria', 0.004639631307700328)
('martin', 0.004622609599548387)
('isabel', 0.004610451236582715)
('mendoza', 0.0045131843328573415)
('alicia', 0.004413485756538833)
('aguilar', 0.0043964640483868936)
('carlos', 0.004330808888372266)
('fruteria', 0.0043040604898477875)
('gaby', 0.004270017073543907)
('rosy', 0.004265153728357638)
('nueva', 0.004235973657240027)
('estrella', 0.004165455152039131)
('deposito', 0.004141138426107787)
('juarez', 0.004128980063142115)
('alvarez', 0.004082778283872563)
('moreno', 0.004056029885348085)
('cafeteria', 0.00405359821275495)
('ortiz', 0.004031713159416741)
('rivera', 0.004000101415705995)
('castillo', 0.003978216362367786)
('dany', 0.003953899636436442)
('pedro', 0.0038882444764218155)
('luna', 0.0038882444764218155)
('paty', 0.003871222768269875)
('secundaria', 0.0037228907400886795)
('guzman', 0.003681552306005396)
('dulces', 0.0036402138719221124)
('providencia', 0.003618328818583903)
('escondida', 0.0036037387830250974)
('rosario', 0.0035696953667212167)
('diana', 0.0035186302422653953)
('ramos', 0.0035064718792997235)
('chiquita', 0.0034845868259615145)
('express', 0.003477291808182111)
('medina', 0.003474860135588977)
('reyna', 0.0034456800644713645)
('esc', 0.0034456800644713645)
('juquilita', 0.0033994782852018123)
('laura', 0.0033775932318636033)
('sol', 0.0033630031963047973)
('romero', 0.0033459814881528567)
('flor', 0.0033411181429665883)
('silvia', 0.003299779708883304)
('rosita', 0.003290053018510767)
('minisuper', 0.0032851896733244977)
('blanca', 0.0032657362925794233)
('jorge', 0.0032584412748000204)
('javier', 0.003253577929613752)
('modelorama', 0.0032389878940549456)
('leon', 0.0031976494599716613)
('guerrero', 0.0031976494599716613)
('fe', 0.003190354442192259)
('vargas', 0.003122267609584497)
('eleven', 0.003012842342893452)
('victoria', 0.002990957289555243)
('castro', 0.002930165474726884)
('loncheria', 0.0029058487487955407)
('tres', 0.0028985537310161373)
('esquina', 0.0028888270406436)
('angelica', 0.0028888270406436)
('leticia', 0.0028450569339671817)
('josefina', 0.0028304668984083757)
('mercado', 0.002803718499883898)
('lucy', 0.002789128464325092)
('irma', 0.0027769701013594206)
('mendez', 0.002769675083580017)
('lety', 0.002769675083580017)
('vinos', 0.002767243410986883)
('comercial', 0.0027648117383937483)
('claudia', 0.0027575167206143458)
('espinoza', 0.0027356316672761363)
('tere', 0.0027210416317173304)
('yolanda', 0.0027137466139379274)
('villa', 0.0027015882509722556)
('esmeralda', 0.002689429888006584)
('herrera', 0.0026748398524477778)
('luisa', 0.0026675448346683748)
('restaurant', 0.0026651131620752406)
('licores', 0.0026602498168889718)
('doa', 0.0026578181442958376)
('expendio', 0.00263106974577136)
('hotel', 0.002599458002060613)
('dios', 0.00259216298428121)
('juanita', 0.0025872996390949414)
('colegio', 0.0025872996390949414)
('valle', 0.00257514127612927)
('dulceria', 0.0025654145857567324)
('primaria', 0.002555687895384195)
('caseta', 0.0025532562227910606)
('local', 0.0025508245501979264)
('hot', 0.0025459612050116576)
('contreras', 0.00253623451463912)
('estrada', 0.002528939496859717)
('anita', 0.00251921280648718)
('velazquez', 0.0025143494613009115)
('karen', 0.002497327753148971)
('salvador', 0.002477874372403896)
('rocio', 0.0024535576464725524)
('cv', 0.0024438309561000153)
('alejandra', 0.0024316725931343435)
('consumo', 0.0024195142301686717)
('socorro', 0.0024122192123892687)
('rafael', 0.002387902486457925)
('comedor', 0.002385470813864791)
('patricia', 0.0023757441234922537)
('ab', 0.002370880778305985)
('sa', 0.002363585760526582)
('jugos', 0.002356290742747179)
('rio', 0.0023538590701540443)
('lourdes', 0.002341700707188373)
('ortega', 0.0023392690345952383)
('salazar', 0.002329542344222701)
('plaza', 0.0023271106716295665)



In [7]:

    
# print(vf[vf['NombreCliente'].str.contains('.*CAFE.*')])
print len(vf[vf['NombreCliente'].str.contains('NEZ|JOSE|NZO|TES')])



In [9]:

    
# --- Begin Filtering for specific terms

# Note that the order of filtering is significant.
# For example: 
# The regex of .*ERIA.* will assign "FRUITERIA" to 'Eatery' rather than 'Fresh Market'.
# In other words, the first filters to occur have a bigger priority.

def filter_specific(vf2):
    
    # Known Large Company / Special Group Types
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*REMISION.*','Consignment')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*DISTRIBUIDORA.*','Distribut')
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*WAL MART.*','.*SAMS CLUB.*'],'Walmart', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*OXXO.*','Oxxo Store')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*CONASUPO.*','Govt Store')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*BIMBO.*','Bimbo Store')

    

    # General term search for a random assortment of words I picked from looking at
    # their frequency of appearance in the data and common spanish words for these categories
#     vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*NEZ.*','.*JOSE.*','.*NZO.*','.*TES.*'],'Xicans', regex=True)

    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*COLEG.*','.*UNIV.*','.*ESCU.*','.*INSTI.*',\
                                                        '.*PREPAR.*'],'School', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*PUESTO.*','Post')
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*FARMA.*','.*HOSPITAL.*','.*CLINI.*'],'Hospital', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*CAFE.*','.*CREMERIA.*','.*DULCERIA.*',\
                                                        '.*REST.*','.*BURGER.*','.*TACO.*', '.*TORTA.*',\
                                                        '.*TAQUER.*','.*HOT DOG.*',\
                                                        '.*COMEDOR.*', '.*ERIA.*','.*BURGU.*'],'Eatery', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*SUPER.*','Supermarket')
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*COMERCIAL.*','.*BODEGA.*','.*DEPOSITO.*',\
                                                            '.*ABARROTES.*','.*MERCADO.*','.*CAMBIO.*',\
                                                        '.*MARKET.*','.*MART .*','.*MINI .*',\
                                                        '.*PLAZA.*','.*MISC.*','.*ELEVEN.*','.*SEVEN.*','.*EXP.*',\
                                                         '.*SNACK.*', '.*PAPELERIA.*', '.*CARNICERIA.*',\
                                                         '.*LOCAL.*','.*COMODIN.*','.*PROVIDENCIA.*'
                                                        ],'General Market'\
                                                       , regex=True)

    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*VERDU.*','.*FRUT.*'],'Fresh Market', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*HOTEL.*','.*MOTEL.*'],'Hotel', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*NEZ.*','.*JOSE.*','.*NZO.*','.*TES.*'],'Xicans', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*MODELOR.*','Modelor')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*ARTELI.*','Arteli')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*CALIMAX.*','Calimax')



In [10]:

    
filter_specific(vf)



In [12]:

    
# --- Begin filtering for more general terms
# The idea here is to look for names with particles of speech that would
# not appear in a person's name.
# i.e. "Individuals" should not contain any participles or numbers in their names.
def filter_participle(vf2):
    vf2['NombreCliente'] = vf2['NombreCliente'].replace([
            '.*LA .*','.*EL .*','.*DE .*','.*LOS .*','.*DEL .*','.*Y .*', '.*SAN .*', '.*SANTA .*',\
            '.*AG .*','.*LAS .*','.*MI .*','.*MA .*', '.*II.*', '.*[0-9]+.*'\
    ],'Small Franchise', regex=True)



In [13]:

    
filter_participle(vf)



In [15]:

    
# Any remaining entries should be "Individual" Named Clients, there are some outliers.
# More specific filters could be used in order to reduce the percentage of outliers in this final set.
def filter_remaining(vf2):
    def function_word(data):
        # Avoid the single-words created so far by checking for upper-case
        if (data.isupper()) and (data != "NO IDENTIFICADO"): 
            return 'Individual'
        else:
            return data
    vf2['NombreCliente'] = vf2['NombreCliente'].map(function_word)



In [16]:

    
filter_remaining(vf)



In [18]:

    
vf['NombreCliente'].value_counts()









    Out[18]:





Individual         335692
NO IDENTIFICADO    281670
Small Franchise    154007
General Market      66378
Eatery              30412
Xicans              22352
Supermarket         16015
Oxxo Store           9313
Hospital             5782
School               5705
Post                 2667
Modelor              1280
Hotel                1127
Fresh Market         1068
Govt Store            959
Bimbo Store           320
Distribut             288
Walmart               220
Calimax                73
Arteli                 20
Consignment            14
Name: NombreCliente, dtype: int64



In [20]:

    
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(vf['NombreCliente'])

print le.classes_

vf['c_clt'] = le.transform(vf['NombreCliente'])









    



['Arteli' 'Bimbo Store' 'Calimax' 'Consignment' 'Distribut' 'Eatery'
 'Fresh Market' 'General Market' 'Govt Store' 'Hospital' 'Hotel'
 'Individual' 'Modelor' 'NO IDENTIFICADO' 'Oxxo Store' 'Post' 'School'
 'Small Franchise' 'Supermarket' 'Walmart' 'Xicans']



In [22]:

    
vf.head()









    Out[22]:






  
    
      
      Cliente_ID
      NombreCliente
      c_clt
    
  
  
    
      0
      0
      Individual
      11
    
    
      1
      1
      Oxxo Store
      14
    
    
      2
      2
      Individual
      11
    
    
      3
      3
      Small Franchise
      17
    
    
      4
      4
      Small Franchise
      17



In [27]:

    
len(vf) #935362









    Out[27]:





935362



In [24]:

    
vf = vf[['Cliente_ID','c_clt']]



In [32]:

    
# vf = vf[vf['Cliente_ID'].unique()]
vf = vf.drop_duplicates(subset='Cliente_ID', keep='last')



In [33]:

    
vf









    Out[33]:






  
    
      
      Cliente_ID
      c_clt
    
  
  
    
      0
      0
      11
    
    
      1
      1
      14
    
    
      2
      2
      11
    
    
      3
      3
      17
    
    
      5
      4
      17
    
    
      6
      5
      17
    
    
      7
      6
      11
    
    
      8
      7
      17
    
    
      9
      8
      18
    
    
      10
      9
      18
    
    
      11
      10
      17
    
    
      12
      11
      9
    
    
      13
      12
      5
    
    
      14
      13
      11
    
    
      15
      14
      11
    
    
      ...
      ...
      ...
    
    
      935347
      10142492
      14
    
    
      935348
      10142588
      7
    
    
      935349
      10281997
      14
    
    
      935350
      10303963
      4
    
    
      935351
      10351784
      14
    
    
      935352
      10351790
      14
    
    
      935353
      10351796
      14
    
    
      935354
      10351802
      14
    
    
      935355
      10351808
      14
    
    
      935356
      10351814
      14
    
    
      935357
      11011586
      14
    
    
      935358
      11693264
      18
    
    
      935359
      19988629
      13
    
    
      935360
      99999999
      13
    
    
      935361
      2015152015
      13
    
  

930500 rows × 2 columns



In [34]:

    
vf.to_csv(path+'clients.csv',index=False)



In [ ]:

	Cliente_ID	NombreCliente	c_clt
0	0	Individual	11
1	1	Oxxo Store	14
2	2	Individual	11
3	3	Small Franchise	17
4	4	Small Franchise	17

	Cliente_ID	c_clt
0	0	11
1	1	14
2	2	11
3	3	17
5	4	17
6	5	17
7	6	11
8	7	17
9	8	18
10	9	18
11	10	17
12	11	9
13	12	5
14	13	11
15	14	11
...	...	...
935347	10142492	14
935348	10142588	7
935349	10281997	14
935350	10303963	4
935351	10351784	14
935352	10351790	14
935353	10351796	14
935354	10351802	14
935355	10351808	14
935356	10351814	14
935357	11011586	14
935358	11693264	18
935359	19988629	13
935360	99999999	13
935361	2015152015	13