In [35]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

# Load in the Client Name data
# Make sure all names uppercase (there are some mixed instances)
pd.set_option('display.max_rows', 30)
path = '/home/zongyi/bimbo_data/'
# path = '/Users/zonemercy/jupyter_notebook/bimbo_data/'
vf = pd.read_csv(path+'cliente_tabla.csv',header=0)
vf['NombreCliente'] = vf['NombreCliente'].str.upper()

In [3]:
vf['NombreCliente'].value_counts()[0:10]


Out[3]:
NO IDENTIFICADO    281670
LUPITA               4863
MARY                 3016
LA PASADITA          2426
LA VENTANITA         2267
LA GUADALUPANA       1299
ROSY                 1246
ALEX                 1242
GABY                 1238
LA ESCONDIDA         1216
Name: NombreCliente, dtype: int64

In [4]:
# Let's also generate a list of individual word frequency across all names
def tfidf_score_list(vf2, list_len):
    from sklearn.feature_extraction.text import TfidfVectorizer
    v = TfidfVectorizer()

    vf2['New'] = 'na'
    a = " ".join(vf2['NombreCliente'])
    vf2['New'][0] = a

    tfidf = v.fit_transform(vf2['New'])

    feature_names = v.get_feature_names()

    freq = []
    doc = 0
    feature_index = tfidf[doc,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf[doc, x] for x in feature_index])
    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
            freq.append((w.encode('utf-8'),s))
    
    del vf2['New']
    
    import numpy as np
    names = ['word','score']
    formats = ['S50','f8']
    dtype = dict(names = names, formats=formats)
    array = np.array(freq, dtype=dtype)

    b = np.sort(array, order='score')
    
    if list_len > len(b)+1:
        list_len = len(b)+1
    for i in range(1,list_len):
        print(b[-i])

In [5]:
tfidf_score_list(vf, 200)


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
('no', 0.6888126004393861)
('identificado', 0.6849292193081505)
('la', 0.14990532034895288)
('el', 0.08328478631485127)
('abarrotes', 0.0800093233318993)
('de', 0.060769929775020375)
('maria', 0.046819424108208656)
('miscelanea', 0.038050812737366206)
('super', 0.03520332413080589)
('los', 0.02841652592336794)
('san', 0.025741686070920162)
('oxxo', 0.02257321668206611)
('del', 0.021668634477420134)
('garcia', 0.021376833766244014)
('hernandez', 0.02099506116912192)
('jose', 0.019835153342196838)
('lupita', 0.019261278610217134)
('gonzalez', 0.01775850494766011)
('martinez', 0.01764664800837593)
('lopez', 0.017150586799376526)
('mini', 0.015331695699712036)
('rodriguez', 0.014891562960354718)
('las', 0.01388971385198337)
('don', 0.012542567235386944)
('comodin', 0.01238450851683321)
('guadalupe', 0.012357760118308733)
('mary', 0.012270219904955897)
('jesus', 0.012024620973049328)
('ramirez', 0.011995440901931718)
('juan', 0.011966260830814104)
('casa', 0.011929785741917089)
('ag', 0.011820360475226043)
('sanchez', 0.011567466525540072)
('perez', 0.011192988946197384)
('farmacia', 0.011115175423217084)
('hermanos', 0.010881734854276187)
('cruz', 0.01042458040676693)
('flores', 0.009619696778439463)
('mi', 0.008992325249410802)
('rosa', 0.008676207812303337)
('carmen', 0.008549760837460353)
('diconsa', 0.008435472225583038)
('papeleria', 0.008204463329235275)
('luis', 0.008189873293676469)
('torres', 0.007915094290652288)
('tienda', 0.007907799272872884)
('ventanita', 0.007778920625436765)
('ii', 0.007338787886079448)
('gomez', 0.007270701053471686)
('pasadita', 0.007265837708285418)
('cremeria', 0.007226930946795268)
('reyes', 0.007141822406035567)
('francisco', 0.006660351232594967)
('antonio', 0.0066554878874086985)
('esperanza', 0.006653056214815564)
('ana', 0.00665062454222243)
('puesto', 0.00655578931109019)
('martha', 0.006529040912565712)
('luz', 0.006475544115516757)
('santa', 0.0064536590621785475)
('angel', 0.006293168671031681)
('vazquez', 0.006271283617693472)
('gutierrez', 0.006174016713968098)
('hamburguesas', 0.006154563333223023)
('tiendita', 0.006059728102090784)
('diaz', 0.005938144472434066)
('miguel', 0.005792244116846006)
('jimenez', 0.00564877543385108)
('escuela', 0.0056025736545815275)
('gloria', 0.0055588035479051095)
('centro', 0.005527191804194363)
('misc', 0.005403176501944511)
('manuel', 0.0054007448293513765)
('elena', 0.005303477925626003)
('angeles', 0.005293751235253466)
('ma', 0.005174599278189883)
('morales', 0.005145419207072271)
('abts', 0.005121102481140927)
('teresa', 0.004958180417400927)
('ruiz', 0.0049095469655382395)
('margarita', 0.004839028460337343)
('juana', 0.004804985044033463)
('guadalupana', 0.004768509955136448)
('chavez', 0.00475878326476391)
('alex', 0.004683401414376746)
('do\xc3\xb1a', 0.004671243051411074)
('carniceria', 0.004639631307700328)
('martin', 0.004622609599548387)
('isabel', 0.004610451236582715)
('mendoza', 0.0045131843328573415)
('alicia', 0.004413485756538833)
('aguilar', 0.0043964640483868936)
('carlos', 0.004330808888372266)
('fruteria', 0.0043040604898477875)
('gaby', 0.004270017073543907)
('rosy', 0.004265153728357638)
('nueva', 0.004235973657240027)
('estrella', 0.004165455152039131)
('deposito', 0.004141138426107787)
('juarez', 0.004128980063142115)
('alvarez', 0.004082778283872563)
('moreno', 0.004056029885348085)
('cafeteria', 0.00405359821275495)
('ortiz', 0.004031713159416741)
('rivera', 0.004000101415705995)
('castillo', 0.003978216362367786)
('dany', 0.003953899636436442)
('pedro', 0.0038882444764218155)
('luna', 0.0038882444764218155)
('paty', 0.003871222768269875)
('secundaria', 0.0037228907400886795)
('guzman', 0.003681552306005396)
('dulces', 0.0036402138719221124)
('providencia', 0.003618328818583903)
('escondida', 0.0036037387830250974)
('rosario', 0.0035696953667212167)
('diana', 0.0035186302422653953)
('ramos', 0.0035064718792997235)
('chiquita', 0.0034845868259615145)
('express', 0.003477291808182111)
('medina', 0.003474860135588977)
('reyna', 0.0034456800644713645)
('esc', 0.0034456800644713645)
('juquilita', 0.0033994782852018123)
('laura', 0.0033775932318636033)
('sol', 0.0033630031963047973)
('romero', 0.0033459814881528567)
('flor', 0.0033411181429665883)
('silvia', 0.003299779708883304)
('rosita', 0.003290053018510767)
('minisuper', 0.0032851896733244977)
('blanca', 0.0032657362925794233)
('jorge', 0.0032584412748000204)
('javier', 0.003253577929613752)
('modelorama', 0.0032389878940549456)
('leon', 0.0031976494599716613)
('guerrero', 0.0031976494599716613)
('fe', 0.003190354442192259)
('vargas', 0.003122267609584497)
('eleven', 0.003012842342893452)
('victoria', 0.002990957289555243)
('castro', 0.002930165474726884)
('loncheria', 0.0029058487487955407)
('tres', 0.0028985537310161373)
('esquina', 0.0028888270406436)
('angelica', 0.0028888270406436)
('leticia', 0.0028450569339671817)
('josefina', 0.0028304668984083757)
('mercado', 0.002803718499883898)
('lucy', 0.002789128464325092)
('irma', 0.0027769701013594206)
('mendez', 0.002769675083580017)
('lety', 0.002769675083580017)
('vinos', 0.002767243410986883)
('comercial', 0.0027648117383937483)
('claudia', 0.0027575167206143458)
('espinoza', 0.0027356316672761363)
('tere', 0.0027210416317173304)
('yolanda', 0.0027137466139379274)
('villa', 0.0027015882509722556)
('esmeralda', 0.002689429888006584)
('herrera', 0.0026748398524477778)
('luisa', 0.0026675448346683748)
('restaurant', 0.0026651131620752406)
('licores', 0.0026602498168889718)
('doa', 0.0026578181442958376)
('expendio', 0.00263106974577136)
('hotel', 0.002599458002060613)
('dios', 0.00259216298428121)
('juanita', 0.0025872996390949414)
('colegio', 0.0025872996390949414)
('valle', 0.00257514127612927)
('dulceria', 0.0025654145857567324)
('primaria', 0.002555687895384195)
('caseta', 0.0025532562227910606)
('local', 0.0025508245501979264)
('hot', 0.0025459612050116576)
('contreras', 0.00253623451463912)
('estrada', 0.002528939496859717)
('anita', 0.00251921280648718)
('velazquez', 0.0025143494613009115)
('karen', 0.002497327753148971)
('salvador', 0.002477874372403896)
('rocio', 0.0024535576464725524)
('cv', 0.0024438309561000153)
('alejandra', 0.0024316725931343435)
('consumo', 0.0024195142301686717)
('socorro', 0.0024122192123892687)
('rafael', 0.002387902486457925)
('comedor', 0.002385470813864791)
('patricia', 0.0023757441234922537)
('ab', 0.002370880778305985)
('sa', 0.002363585760526582)
('jugos', 0.002356290742747179)
('rio', 0.0023538590701540443)
('lourdes', 0.002341700707188373)
('ortega', 0.0023392690345952383)
('salazar', 0.002329542344222701)
('plaza', 0.0023271106716295665)

In [7]:
# print(vf[vf['NombreCliente'].str.contains('.*CAFE.*')])
print len(vf[vf['NombreCliente'].str.contains('NEZ|JOSE|NZO|TES')])


57514

In [9]:
# --- Begin Filtering for specific terms

# Note that the order of filtering is significant.
# For example: 
# The regex of .*ERIA.* will assign "FRUITERIA" to 'Eatery' rather than 'Fresh Market'.
# In other words, the first filters to occur have a bigger priority.

def filter_specific(vf2):
    
    # Known Large Company / Special Group Types
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*REMISION.*','Consignment')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*DISTRIBUIDORA.*','Distribut')
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*WAL MART.*','.*SAMS CLUB.*'],'Walmart', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*OXXO.*','Oxxo Store')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*CONASUPO.*','Govt Store')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*BIMBO.*','Bimbo Store')

    

    # General term search for a random assortment of words I picked from looking at
    # their frequency of appearance in the data and common spanish words for these categories
#     vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*NEZ.*','.*JOSE.*','.*NZO.*','.*TES.*'],'Xicans', regex=True)

    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*COLEG.*','.*UNIV.*','.*ESCU.*','.*INSTI.*',\
                                                        '.*PREPAR.*'],'School', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*PUESTO.*','Post')
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*FARMA.*','.*HOSPITAL.*','.*CLINI.*'],'Hospital', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*CAFE.*','.*CREMERIA.*','.*DULCERIA.*',\
                                                        '.*REST.*','.*BURGER.*','.*TACO.*', '.*TORTA.*',\
                                                        '.*TAQUER.*','.*HOT DOG.*',\
                                                        '.*COMEDOR.*', '.*ERIA.*','.*BURGU.*'],'Eatery', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*SUPER.*','Supermarket')
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*COMERCIAL.*','.*BODEGA.*','.*DEPOSITO.*',\
                                                            '.*ABARROTES.*','.*MERCADO.*','.*CAMBIO.*',\
                                                        '.*MARKET.*','.*MART .*','.*MINI .*',\
                                                        '.*PLAZA.*','.*MISC.*','.*ELEVEN.*','.*SEVEN.*','.*EXP.*',\
                                                         '.*SNACK.*', '.*PAPELERIA.*', '.*CARNICERIA.*',\
                                                         '.*LOCAL.*','.*COMODIN.*','.*PROVIDENCIA.*'
                                                        ],'General Market'\
                                                       , regex=True)

    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*VERDU.*','.*FRUT.*'],'Fresh Market', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*HOTEL.*','.*MOTEL.*'],'Hotel', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].replace(['.*NEZ.*','.*JOSE.*','.*NZO.*','.*TES.*'],'Xicans', regex=True)
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*MODELOR.*','Modelor')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*ARTELI.*','Arteli')
    vf2['NombreCliente'] = vf2['NombreCliente'].str.replace('.*CALIMAX.*','Calimax')

In [10]:
filter_specific(vf)

In [12]:
# --- Begin filtering for more general terms
# The idea here is to look for names with particles of speech that would
# not appear in a person's name.
# i.e. "Individuals" should not contain any participles or numbers in their names.
def filter_participle(vf2):
    vf2['NombreCliente'] = vf2['NombreCliente'].replace([
            '.*LA .*','.*EL .*','.*DE .*','.*LOS .*','.*DEL .*','.*Y .*', '.*SAN .*', '.*SANTA .*',\
            '.*AG .*','.*LAS .*','.*MI .*','.*MA .*', '.*II.*', '.*[0-9]+.*'\
    ],'Small Franchise', regex=True)

In [13]:
filter_participle(vf)

In [15]:
# Any remaining entries should be "Individual" Named Clients, there are some outliers.
# More specific filters could be used in order to reduce the percentage of outliers in this final set.
def filter_remaining(vf2):
    def function_word(data):
        # Avoid the single-words created so far by checking for upper-case
        if (data.isupper()) and (data != "NO IDENTIFICADO"): 
            return 'Individual'
        else:
            return data
    vf2['NombreCliente'] = vf2['NombreCliente'].map(function_word)

In [16]:
filter_remaining(vf)

In [18]:
vf['NombreCliente'].value_counts()


Out[18]:
Individual         335692
NO IDENTIFICADO    281670
Small Franchise    154007
General Market      66378
Eatery              30412
Xicans              22352
Supermarket         16015
Oxxo Store           9313
Hospital             5782
School               5705
Post                 2667
Modelor              1280
Hotel                1127
Fresh Market         1068
Govt Store            959
Bimbo Store           320
Distribut             288
Walmart               220
Calimax                73
Arteli                 20
Consignment            14
Name: NombreCliente, dtype: int64

In [20]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(vf['NombreCliente'])

print le.classes_

vf['c_clt'] = le.transform(vf['NombreCliente'])


['Arteli' 'Bimbo Store' 'Calimax' 'Consignment' 'Distribut' 'Eatery'
 'Fresh Market' 'General Market' 'Govt Store' 'Hospital' 'Hotel'
 'Individual' 'Modelor' 'NO IDENTIFICADO' 'Oxxo Store' 'Post' 'School'
 'Small Franchise' 'Supermarket' 'Walmart' 'Xicans']

In [22]:
vf.head()


Out[22]:
Cliente_ID NombreCliente c_clt
0 0 Individual 11
1 1 Oxxo Store 14
2 2 Individual 11
3 3 Small Franchise 17
4 4 Small Franchise 17

In [27]:
len(vf) #935362


Out[27]:
935362

In [24]:
vf = vf[['Cliente_ID','c_clt']]

In [32]:
# vf = vf[vf['Cliente_ID'].unique()]
vf = vf.drop_duplicates(subset='Cliente_ID', keep='last')

In [33]:
vf


Out[33]:
Cliente_ID c_clt
0 0 11
1 1 14
2 2 11
3 3 17
5 4 17
6 5 17
7 6 11
8 7 17
9 8 18
10 9 18
11 10 17
12 11 9
13 12 5
14 13 11
15 14 11
... ... ...
935347 10142492 14
935348 10142588 7
935349 10281997 14
935350 10303963 4
935351 10351784 14
935352 10351790 14
935353 10351796 14
935354 10351802 14
935355 10351808 14
935356 10351814 14
935357 11011586 14
935358 11693264 18
935359 19988629 13
935360 99999999 13
935361 2015152015 13

930500 rows × 2 columns


In [34]:
vf.to_csv(path+'clients.csv',index=False)

In [ ]: