Load DCC Data


In [144]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [145]:
dfile = '../../zzData_RAW/2017-01-16-tc-dcc-scrub/order_comment.txt'
hfile = '../../zzData_RAW/2017-01-16-tc-dcc-scrub/backorder.FDF'

In [146]:
!!head -2 $dfile
#!!cat $hfile


Out[146]:
['10063442\tSL\t-999.999\t.01\t&**GSP BILL ONLY-IMPRINTED TOOTHBRUSHES-NO PRODUCTS SHIPPED* PO#20170001  GSP FILE DATE:JAN.5.2017                       CONTACT ORAL B AT:1-800-543-2577 IF ANY ISSUES              MARIA 4345  TORONTO\t2613256\t2613275\t01/16/17\t01/16/17',
 '10073511\tSO\t-999.999\t.01\t&GSP FILE DATE    JAN.10.2017                                GRACE X4331 TORONTO                                         GSP 5413401\t2613256\t3282748\t01/16/17\t']

In [147]:
cols = ['doc_no','doc_type','line_id','seq_id','note_line','bill_to',\
        'shipto', 'order_date','ship_to']
cols


Out[147]:
['doc_no',
 'doc_type',
 'line_id',
 'seq_id',
 'note_line',
 'bill_to',
 'shipto',
 'order_date',
 'ship_to']

In [148]:
df = pd.read_table(dfile, encoding='latin1', header=0, names=cols, \
                  usecols=[0,1,3,4,7] )

In [149]:
df.head()


Out[149]:
doc_no doc_type seq_id note_line order_date
0 10073511 SO 0.01 &GSP FILE DATE JAN.10.2017 ... 01/16/17
1 10081122 SL 0.01 &**GSP BILL ONLY-IMPRINTED TOOTHBRUSHES-NO PRO... 01/16/17
2 10081833 SL 0.01 &**GSP BILL ONLY-IMPRINTED TOOTHBRUSHES-NO PRO... 01/16/17
3 10081872 SL 0.01 &**GSP BILL ONLY-IMPRINTED TOOTHBRUSHES-NO PRO... 01/16/17
4 10082310 SZ 0.01 &Customer PO--- A0000045629 ... 01/16/17

Scrub


In [150]:
df.seq_id = np.trunc(df.seq_id * 100)
df.seq_id = df.seq_id.astype(int)
df.note_line = df.note_line.str.lstrip('&')

In [151]:
df.head()


Out[151]:
doc_no doc_type seq_id note_line order_date
0 10073511 SO 1 GSP FILE DATE JAN.10.2017 ... 01/16/17
1 10081122 SL 1 **GSP BILL ONLY-IMPRINTED TOOTHBRUSHES-NO PROD... 01/16/17
2 10081833 SL 1 **GSP BILL ONLY-IMPRINTED TOOTHBRUSHES-NO PROD... 01/16/17
3 10081872 SL 1 **GSP BILL ONLY-IMPRINTED TOOTHBRUSHES-NO PROD... 01/16/17
4 10082310 SZ 1 Customer PO--- A0000045629 ... 01/16/17

Consolidate TBD


In [178]:
df.seq_id[68]


Out[178]:
2

In [194]:
df.to_csv('../data/dcc_order_comment.txt', sep='\t' )

In [195]:
ls -lah ../data/dcc_order_comment.txt


-rw-r--r-- 1 jovyan users 4.2M Jan 17 04:13 ../data/dcc_order_comment.txt

In [ ]: