Buscar cada nombre de la lista de datos y sustituirlo por el nombre equivalente de la lista de nombres correctos

Cambios en los ficheros:

de "espacio" a "_"

de "," a "%"

de ";" a ","

In [1]:
import numpy as np
import os as os
import pandas as pd

In [6]:
# Load the raw data list and the list of correct names as plain numpy arrays.
# With read_csv's default header handling the first line of each file is taken
# as the column header (skiprows=0 skips nothing), so that line is not part of
# the resulting arrays.
datospanda = pd.read_csv('STOCKb-codigos.txt', skiprows=0)
datos = np.array(datospanda)

nombrespanda = pd.read_csv('lista_nombres_correctos.txt', skiprows=0)
nombres = np.array(nombrespanda)

num_rows = datos.shape[0]
num_nom = nombres.shape[0]

# Output files: one line per data row with the matched name, plus a log of
# the rows for which no correct name was found.
root = 'lista_datos_con nombre.txt'
root_error = 'lista_datos_no_encontrados.txt'

# Best-effort removal of stale output from a previous run. Only filesystem
# errors (typically "file does not exist") are ignored; anything else, such as
# a KeyboardInterrupt, is no longer silently swallowed.
for _ruta in (root, root_error):
    try:
        os.remove(_ruta)
    except OSError:
        pass

# Both handles stay open for the matching loop further down, which closes them.
tabla = open(root, mode='w')
tabla_error = open(root_error, mode='w')

# Characters that are dropped outright before comparing two names.
_TABLA_BORRADO = str.maketrans('', '', '"_.-%')


def _normalizar_nombre(nombre):
    """Return the comparison key used to match a raw name against another.

    Steps, applied in this order:

    1. Convert to ``str`` and delete every ``"``, ``_``, ``.``, ``-`` and ``%``.
    2. Delete the zeros that directly follow the first ``/``, then delete
       every ``/``.
    3. Strip leading and trailing zeros.
    4. Delete the zeros that directly precede the first lowercase ``x``.

    Unlike the previous index-based loops, this never raises ``IndexError``:
    a value that is empty, consists only of zeros, or ends right after a
    ``/`` simply normalizes to a shorter (possibly empty) key.

    NOTE(review): stripping trailing zeros is lossy (e.g. ``"10"`` and ``"1"``
    produce the same key). This matches the original behaviour; confirm it is
    intended.

    Args:
        nombre: Raw cell value; anything accepted by ``str()``.

    Returns:
        The normalized key as a ``str``.
    """
    aa = str(nombre).translate(_TABLA_BORRADO)

    # Only the zeros right after the FIRST slash are dropped; later slashes
    # are removed but the digits around them are kept.
    cabeza, _, cola = aa.partition('/')
    aa = cabeza + cola.lstrip('0').replace('/', '')

    aa = aa.strip('0')

    # Zeros immediately before the first 'x' are dropped as well.
    cabeza, sep, cola = aa.partition('x')
    if sep:
        aa = cabeza.rstrip('0') + sep + cola
    return aa


# Normalized keys, index-aligned with the first column of each input array.
nombres_2 = [_normalizar_nombre(nombre) for nombre in nombres[:, 0]]
datos_2 = [_normalizar_nombre(nombre) for nombre in datos[:, 0]]
    
# For every data row, write the correct name whose normalized key equals the
# row's normalized key, or 'no_encontrado' (and a line in the error log) when
# there is none.

# Normalized key -> index of its FIRST occurrence in nombres_2. setdefault
# keeps the earliest index, which is the entry a front-to-back scan that stops
# at the first hit would select. One dict lookup per row replaces a full scan
# of the name list per row.
indice_por_clave = {}
for indice, clave in enumerate(nombres_2):
    indice_por_clave.setdefault(clave, indice)

try:
    for row in range(num_rows):
        indice = indice_por_clave.get(datos_2[row])

        if indice is None:
            tabla_error.write(str(row) + str(datos[row]) + '\n')
            tabla.write('no_encontrado' + '\n')
        else:
            # str() so that a non-string cell (e.g. a number parsed by
            # pandas) is written instead of raising on concatenation.
            tabla.write(str(nombres[indice, 0]) + '\n')

        # Progress report every 1000 rows (not for row 0).
        if row and row % 1000 == 0:
            print('row: ', row, '/', num_rows)

    print('FINISHED')
finally:
    # Close both outputs even if the loop fails part-way through.
    tabla.close()
    tabla_error.close()


row:  1000 / 34998
row:  2000 / 34998
row:  3000 / 34998
row:  4000 / 34998
row:  5000 / 34998
row:  6000 / 34998
row:  7000 / 34998
row:  8000 / 34998
row:  9000 / 34998
row:  10000 / 34998
row:  11000 / 34998
row:  12000 / 34998
row:  13000 / 34998
row:  14000 / 34998
row:  15000 / 34998
row:  16000 / 34998
row:  17000 / 34998
row:  18000 / 34998
row:  19000 / 34998
row:  20000 / 34998
row:  21000 / 34998
row:  22000 / 34998
row:  23000 / 34998
row:  24000 / 34998
row:  25000 / 34998
row:  26000 / 34998
row:  27000 / 34998
row:  28000 / 34998
row:  29000 / 34998
row:  30000 / 34998
row:  31000 / 34998
row:  32000 / 34998
row:  33000 / 34998
row:  34000 / 34998
FINISHED

In [3]:
# Scratch value used to try out string slicing in the next cell.
x = 'aaabcdef'

In [5]:
# Everything except the last character of x.
x[:-1]


Out[5]:
'aaabcde'

In [30]:
# Scratch check: rebinding a name that was read out of a tuple leaves the
# tuple itself untouched.
a = tuple(range(4))

b = a[1]  # b refers to the value stored at index 1

b = 4  # rebinds b only; a is not modified

print(a)


(0, 1, 2, 3)

In [ ]:
import numpy as np
import os as os
import pandas as pd


# Load the three input tables as plain numpy arrays. skiprows=1 drops the
# first physical line of each file; the following line is then taken as the
# column header by read_csv's default header handling.
# NOTE(review): the name 'Importadortxtdelimtab.txt' suggests a tab-delimited
# export, but it is read with read_csv's default ',' separator - confirm.
importadorpanda = pd.read_csv('Importadortxtdelimtab.txt', skiprows=1)
importador = np.array(importadorpanda)

contactosclientespanda = pd.read_csv('contactosclientescomas.txt', skiprows=1)
contactosclientes = np.array(contactosclientespanda)

contactosproovpanda = pd.read_csv('Contactosproveedores.txt', skiprows=1)
contactosproov = np.array(contactosproovpanda)

#print(importador.shape, contactosclientes.shape, contactosproov.shape)
#print (importador[0,:])

num_rows = importador.shape[0]
root = 'tabla_nueva.txt'

# Best-effort removal of the output of a previous run. Only filesystem errors
# (typically "file does not exist") are ignored; anything else, such as a
# KeyboardInterrupt, is no longer silently swallowed.
try:
    os.remove(root)
except OSError:
    pass

# Customer contacts first, supplier contacts stacked below them; np.vstack
# requires both tables to have the same number of columns.
contactos = np.vstack([contactosclientes, contactosproov])

# Not referenced by any active code in this cell.
substring = '*'
def iguales(a, b):
    """Return whether two company names are treated as the same company.

    The comparison is deliberately fuzzy. An earlier, since removed, draft
    compared against explicit legal-form suffixes such as 'S.A.' or 'S.L.';
    the prefix rules below replace it. Two names match when any of these
    holds:

    * they are exactly equal;
    * they differ in length and the shorter one is a prefix of the longer
      one, or both agree on the first ``len(shorter) - 2`` characters;
    * they have the same length and agree on everything except the last
      four characters.

    The "a is longer than b" case used to be unreachable because its
    condition repeated ``length1 < length2``; such pairs fell through to the
    equal-length rule. Both directions are now handled symmetrically.

    NOTE(review): for very short names the compared prefix is empty (or the
    slice end goes negative), so e.g. any two 4-character names match. This
    is unchanged from the original; confirm whether a minimum length is
    wanted.

    Args:
        a: First name; must support ``len`` and slicing (``str`` expected).
        b: Second name, same requirements.

    Returns:
        True when one of the rules above matches, False otherwise.
    """
    length1 = len(a)
    length2 = len(b)

    if a == b:
        return True

    if length1 < length2:
        # a is the shorter name.
        return a == b[:length1] or a[:length1 - 2] == b[:length1 - 2]

    if length1 > length2:
        # b is the shorter name (mirror image of the branch above).
        return a[:length2] == b or a[:length2 - 2] == b[:length2 - 2]

    # Same length, different text: tolerate a differing 4-character tail.
    return a[:length2 - 4] == b[:length2 - 4]


print(contactos[133,:])

# Merge `importador` with `contactos` into the file named by `root`.
#
# NOTE(review): every position below is hard-coded - first contact row 133,
# first importer row to merge 153, column counts 20 / 14 / 6 - presumably
# tuned to one specific export; confirm before reusing with other files.
data_row = 133

# `with` guarantees the output is flushed and closed even if a row fails.
with open(root, mode='w') as tabla:
    # Rows before 153 are copied through unchanged (first 20 columns).
    for row in range(153):
        line = ''.join(str(importador[row, cell]) + ',' for cell in range(20))
        tabla.write(line + '\n')

    # From row 153 on, importer rows are paired with consecutive contact rows
    # for as long as the company names match according to iguales().
    for row in range(153, num_rows):
        company = importador[row, 1]
        data_company = contactos[data_row, 0]
        print (row, company, '--' ,data_company)
        line = ''.join(str(importador[row, cell]) + ',' for cell in range(14))

        if not iguales(company, data_company):
            # No contact for this company: pad the six contact columns.
            tabla.write(line + 6 * ',' + '\n')
            continue

        # First matching contact shares the line with the importer data.
        line += ''.join(str(contactos[data_row, cell]) + ',' for cell in range(1, 7))
        tabla.write(line + '\n')

        data_row = data_row + 1
        # NOTE(review): 3219 here and 3221 below look like row limits of
        # `contactos` but disagree with each other. Reaching 3219 stops the
        # whole export (remaining importer rows are NOT written), whereas
        # reaching 3221 only ends the inner scan. Kept as-is; consider
        # deriving a single limit from contactos.shape[0].
        if data_row == 3219:
            break
        data_company = contactos[data_row, 0]

        # Further contacts of the same company go on their own lines, with
        # the 14 importer columns left empty.
        while iguales(company, data_company):
            line = 14 * ',' + ''.join(
                str(contactos[data_row, cell]) + ',' for cell in range(1, 7)
            )
            tabla.write(line + '\n')

            data_row = data_row + 1
            if data_row == 3221:
                break
            data_company = contactos[data_row, 0]