In [1]:
import codecs
from chardet.universaldetector import UniversalDetector

In [2]:
# Input file read by the cells below (encoding detection, then parsing into records).
# NOTE(review): absolute, machine-specific Windows path — presumably an EMME transit-line
# (d221) batch file, judging by its name; confirm and adjust before running elsewhere.
d221 = r'D:\TRENMO_JASP\CARRIS\emme\emme_files_jasp\d221_sc1_2009_modo_n.in'

In [3]:
# Detect the encoding of the input file by feeding it to chardet line by line
# until the detector reports that it has seen enough.
detector = UniversalDetector()

# Binary mode: chardet works on undecoded bytes. The context manager guarantees
# the handle is closed even when we break out early.
with open(d221, 'rb') as raw_file:
    for line in raw_file:
        detector.feed(line)
        if detector.done:
            break

detector.close()
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(detector.result)


{'confidence': 0.73, 'language': '', 'encoding': 'ISO-8859-1'}

In [4]:
# Get file data - a line starting with the letter 'a' opens a new record; the
# lines that follow belong to that record until the next 'a' line.
with codecs.open(d221, mode='r', encoding=detector.result['encoding']) as d221Obj:

    data = []

    # Lines of the record currently being collected (None until the first 'a' line)
    lnh_array = None
    for lnh in d221Obj:
        if lnh.startswith('a'):
            # A new record begins: store the one collected so far, if any
            if lnh_array is not None:
                data.append(lnh_array)
            lnh_array = []
        elif lnh_array is None:
            # Line found before the first record (e.g. a header or comment):
            # there is no record to attach it to, so skip it instead of
            # failing on None.append
            continue

        # codecs.open already yields decoded text, so no re-encoding is needed
        lnh_array.append(lnh)

    # The last record has no following 'a' line to trigger its storage
    if lnh_array is not None:
        data.append(lnh_array)

In [5]:
# Collapse every record (a list of lines) into one string, in place
for i, record_lines in enumerate(data):
    data[i] = "".join(record_lines)

In [6]:
import pandas
# Load the records into a DataFrame with one text column, 'str_value'
# (at this point `data` is expected to hold one joined string per record).
df = pandas.DataFrame(data, columns=['str_value'])

In [7]:
# Turn the line feeds inside each record into spaces
df["str_value"] = df.str_value.str.replace(pat="\n", repl=" ")

In [8]:
# Working copy of each record: drop its first character, remove single quotes,
# then split on whitespace and re-join so every whitespace run becomes one space.
df["tst"] = (
    df.str_value
    .str.slice(1)
    .str.replace("'", "")
    .str.split()
    .str.join(" ")
)

In [9]:
# Remove the spaces around ' -> ' so it is not broken apart when splitting on spaces later
df["tst"] = df.tst.str.replace(pat=' -> ', repl='->')

In [10]:
# Split each record on 'path=no ' or ' lay' and keep the second piece (index 1),
# i.e. the text sitting between those two markers
df["interest"] = df["tst"].str.split(r'\s*path=no \s*|\s* lay\s*').str.get(1)

In [11]:
# Replace the two space-delimited ttf tokens with a single space
df["interest"] = (
    df["interest"]
    .str.replace(" ttf=11 ", " ")
    .str.replace(" ttf=0 ", " ")
)

In [12]:
# Tokenise the itinerary text on single spaces
df["pre_stops"] = df["interest"].str.split(pat=" ")

In [13]:
def get_stop_codes(row):
    """
    Filter the tokens in row["pre_stops"] down to the ones kept as stops and
    store them in row["real_stops"].

    Rules applied to the token sequence:
    * the first and the last token are always kept;
    * 'dwt=.50' and 'dwt=>.50' switch keeping ON for the tokens that follow;
    * 'dwt=#.00' switches keeping OFF for the tokens that follow;
    * tokens starting with 'ttf' are dropped and leave the switch untouched;
    * any other token is kept only while the switch is ON (it starts OFF).

    The row is modified in place and also returned, so the function can be
    used with DataFrame.apply(..., axis=1).
    """
    tokens = row["pre_stops"]
    last_pos = len(tokens) - 1

    kept = []
    keeping = False
    for pos, token in enumerate(tokens):
        # Both ends of the sequence are kept unconditionally
        if pos == 0 or pos == last_pos:
            kept.append(token)
        elif token in ('dwt=.50', 'dwt=>.50'):
            keeping = True
        elif token == 'dwt=#.00':
            keeping = False
        elif token.startswith('ttf'):
            # Not a stop and no effect on the switch
            pass
        elif keeping:
            kept.append(token)

    row["real_stops"] = kept

    return row

In [14]:
# Compute the 'real_stops' column row by row
df = df.apply(get_stop_codes, axis=1)

In [ ]:
from gasp.toarray import series_to_list

In [ ]:
# Hand the 'real_stops' column to the project helper series_to_list
# (presumably returns a plain Python list — its code is not visible here)
stops = series_to_list(df.real_stops)

In [ ]:
import numpy

In [ ]:
# Flatten the per-record stop lists into one array (concatenation along the first axis)
nnstops = numpy.concatenate(stops)

In [ ]:
# Keep each stop code once; numpy.unique returns the unique values sorted
final_stops = numpy.unique(nnstops)

In [ ]:
from gasp.toxls import df_to_xls

In [ ]:
# One-column table holding the unique stop codes
result = pandas.DataFrame({"stops": final_stops})

In [ ]:
# Export the table of unique stops through the project helper df_to_xls.
# NOTE(review): the helper's behaviour is not visible here, and the target is an
# absolute, machine-specific path — confirm both before re-running.
df_to_xls(result, r'D:\TRENMO_JASP\CARRIS\emme\emme_files_jasp\stops_d221_sc1_2009_modo_n2.xlsx')

In [15]:
# Write new d221.in

# Head of each record: the text before " path=no", with line-feed and
# carriage-return characters removed
df["first_line"] = (
    df["str_value"]
    .str.split(" path=no").str.get(0)
    .str.replace("\n", "")
    .str.replace("\r", "")
)

In [16]:
# Tail of each record: the text after the first "lay=", with the "lay=" prefix
# put back and carriage returns removed
df["last_line"] = (
    "lay=" + df["str_value"].str.split("lay=").str.get(1)
).str.replace("\r", "")

In [ ]:
# Inspect the record heads from character 30 onwards
print(df.first_line.str.slice(30))

In [17]:
def adjust_stop_cod(row, offset=200000):
    """
    Re-code the stops of one record by adding a fixed offset to each code.

    Every entry of row["real_stops"] is converted to an integer, increased by
    `offset` and stored back as text, in a new list that replaces
    row["real_stops"].

    Parameters
    ----------
    row : mapping with a "real_stops" entry (e.g. a DataFrame row)
        Sequence of stop codes convertible with int().
    offset : int, optional
        Value added to each code. Defaults to 200000, the value that was
        previously hard-coded.

    Returns
    -------
    The same `row` object, modified in place.

    Raises
    ------
    ValueError
        If an entry of row["real_stops"] cannot be converted by int().

    Note: the column is overwritten, so applying this twice adds the offset twice.
    """
    # u"{}".format(...) yields a unicode string on Python 2 and a str on
    # Python 3, replacing the Python-2-only unicode(str(...), 'utf-8')
    row["real_stops"] = [
        u"{}".format(offset + int(code)) for code in row["real_stops"]
    ]

    return row

In [18]:
# Re-code the stops of every record, row by row
df = df.apply(adjust_stop_cod, axis=1)

In [ ]:
# Debug output of the first record's 'real_stops2' value.
# NOTE(review): no visible cell creates a 'real_stops2' column (adjust_stop_cod
# writes to 'real_stops'), so this presumably fails on a fresh kernel — confirm or remove.
print df.real_stops2[0]

In [ ]:
# Show the stop list stored under index label 0
print(df["real_stops"][0])

In [19]:
# Write the re-coded records to a new file: a fixed header, then for every record
# its head line, a " path=no" line with the first three stops, further lines of
# up to three stops each, and finally the record's own "lay=" tail.
with codecs.open(r'D:\TRENMO_JASP\CARRIS\emme\emme_files_jasp\d221_sul_tejo.in', 'w', encoding='utf-8') as txt:
    txt.write(u"t lines\n")
    txt.write(u"c\nc BOA VIAGEM\nc\n")

    first_lines = df.first_line.values.tolist()
    stops = df.real_stops.tolist()
    lay = df.last_line.tolist()

    for l in range(len(first_lines)):
        # Stops of this record in chunks of three, one chunk per output line
        l_stops = [
            stops[l][start:start + 3]
            for start in range(0, len(stops[l]), 3)
        ]
        towrite = u"{}\n path=no {}\n {}{}{}{}".format(
            first_lines[l], " ".join(l_stops[0]),
            "\n ".join([" ".join(s) for s in l_stops[1:]]),
            "\n " if len(l_stops[1:]) else "",
            # Tail of THIS record: index with the record counter `l`. The previous
            # code used `i`, the index leaked by the list comprehension, which
            # picked the tail of an unrelated record.
            lay[l],
            # No trailing newline after the very last record
            "" if l + 1 == len(first_lines) else "\n"
        )

        txt.write(towrite)
    # No explicit close: the `with` block closes the file on exit

In [ ]: