In [1]:
import pandas as pd
import urllib
import requests  
import os

In [2]:
%matplotlib inline

In [3]:
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [4]:
def get_receipt_data(date):
    month_year = date.strftime('%m/%Y')
    date_month_year = date.strftime('%d-%m-%Y')
    uri = 'https://treasury.ap.gov.in/cybertry/index1.php'
    body = {'service': 'datewise',
            'hdn': 2,
            'edate': month_year,
            'hoacc': date_month_year,
            'bankname': date_month_year,
            'mmyy': month_year
           }
    resp = requests.post(uri, data=body)
    data_tables = pd.read_html(resp.text)
    data = data_tables[0]
    print(data.shape, date_month_year)
    data.columns = data.iloc[0]
    data.drop(0, inplace=True)
    return data

In [5]:
def get_receipt_data_date_range(start_day, end_day):
    '''date time format mm-dd-yyyy'''
    combined_data = pd.DataFrame()
    existing_files = os.listdir('temp_data/')
    for date in pd.date_range(start_day, end_day):
        date_month_year = date.strftime('%d-%m-%Y')
        filename = '{0}.csv'.format(date_month_year)
        if filename not in existing_files:
            data = get_receipt_data(date)
            data.to_csv('temp_data/{0}'.format(filename))
        else:
            data = pd.read_csv('temp_data/{0}'.format(filename))
        combined_data = pd.concat([combined_data, data])
    return combined_data

In [7]:
sample_data = get_receipt_data_date_range('04-01-2017', '05-01-2018')

In [8]:
hoas = pd.read_csv('hoa_2018.csv')

In [9]:
hoas.columns = hoas.iloc[0]
hoas.drop(0, inplace=True)
hoas.head()


Out[9]:
0 demand demandname sdept dept_code sdeptname mh mh_desc smh smh_desc ... dh_desc sdh sdh_desc cv accts_1516 be_1617 re_1617 be_1718 mhtype hoa
1 1 NaN NaN NaN NaN NaN 0020 Corporation tax 00 Not Applicable ... Not Applicable 000 Not Applicable V 68897100 77293400 80499006 85837400 1 0020009010001000000NVN
2 2 NaN NaN NaN NaN NaN 0021 Taxes on Income Other Than Corporation Tax 00 Not Applicable ... Not Applicable 000 Not Applicable V 47920700 59900200 60842567 75044200 1 0021009010001000000NVN
3 3 NaN NaN NaN NaN NaN 0028 Other Taxes on Income and Expenditure 00 Not Applicable ... Not Applicable 000 Not Applicable V 2229564 3444158 3516485 3691306 1 0028001070001000000NVN
4 4 NaN NaN NaN NaN NaN 0028 Other Taxes on Income and Expenditure 00 Not Applicable ... Not Applicable 000 Not Applicable V 123089 0 0 203788 1 0028001070003000000NVN
5 5 NaN NaN NaN NaN NaN 0028 Other Taxes on Income and Expenditure 00 Not Applicable ... Not Applicable 000 Not Applicable V 150511 0 0 249188 1 0028001070004000000NVN

5 rows × 27 columns


In [10]:
# Check if head of account information present.
hoas_intersection = set(sample_data['Head of Account'].astype(str).unique()) & set(hoas.hoa.astype(str).unique())
len(hoas_intersection) == len(sample_data['Head of Account'].unique())


Out[10]:
True

In [11]:
sample_data['Head of Account'].unique()


Out[11]:
array(['0030021030001000000NVN', '0040001020005000000NVN',
       '0045001010001000000NVN', '8443008000009000000NVN',
       '0040001010001000000NVN', '0045008000002000000NVN',
       '0040001070001000000NVN', '0851001020081800000NVN',
       '0230001040001000000NVN', '0028001070001000000NVN',
       '0045001050001000000NVN', '0217608000081000000NVN',
       '0230001030001000000NVN', '1452008000081800000NVN',
       '0055001020081001000NVN', '0042001060001000000NVN',
       '0070601090002001000NVN', '0043001020001000000NVN',
       '8443001060001000000NVN', '0030021020001000000NVN',
       '0055008000008800000NVN', '0045001050081000000NVN',
       '0035001010001002000NVN', '8443001010001000000NVN',
       '0039001060001000000NVN', '0210041040001000000NVN',
       '0030038000081001000NVN', '0030011020001000000NVN',
       '8782001020001000000NVN', '0030011010001000000NVN'], dtype=object)

In [12]:
sample_data.shape


Out[12]:
(965676, 12)

In [13]:
sample_data.columns


Out[13]:
Index(['Unnamed: 0', 'Challanno', 'Dept TransId', 'Amount', 'Bank TransId',
       'DDOCode', 'Remitters Name', 'Bank Date', 'Bank Status', 'Scroll Date',
       'Head of Account', 'Bank'],
      dtype='object')

In [14]:
sample_data.head()


Out[14]:
Unnamed: 0 Challanno Dept TransId Amount Bank TransId DDOCode Remitters Name Bank Date Bank Status Scroll Date Head of Account Bank
0 1 1700386301 1703311104190314 6100.000 CP80357914 2012308006 MVRAO999 31-03-2017 Success 2017-04-01 0030021030001000000NVN SBIMOPS
1 2 1700387438 1703311216050312 1100.000 CP80376591 2012308009 dbsayyappa 31-03-2017 Success 2017-04-01 0030021030001000000NVN SBIMOPS
2 3 1700387514 1703311221350314 96050.000 CP80377881 2012308006 BTPRGANILKUMAR 31-03-2017 Success 2017-04-01 0030021030001000000NVN SBIMOPS
3 4 1700388282 1703310120010314 12400.000 CP80393318 2012308006 jyothi3310 31-03-2017 Success 2017-04-01 0030021030001000000NVN SBIMOPS
4 5 1700388439 1703310134120314 9100.000 CP80396476 2012308006 KOTESWARARAO9 31-03-2017 Success 2017-04-01 0030021030001000000NVN SBIMOPS

In [15]:
sample_data['Scroll Date'].unique()


Out[15]:
array(['2017-04-01', '2017-04-03', '2017-04-04', '2017-04-02',
       '2017-04-11', '2017-04-05', '2017-04-06', '2017-04-07', nan,
       '2017-04-10', '2017-04-08', '2017-04-09', '2017-04-12',
       '2017-04-13', '2017-04-19', '2017-04-16', '2017-04-15',
       '2017-04-14', '2017-04-17', '2017-04-18', '2017-04-20',
       '2017-04-21', '2017-04-24', '2017-04-26', '2017-04-23',
       '2017-04-22', '2017-04-25', '2017-04-27', '2017-04-28',
       '2017-04-29', '2017-05-02', '2017-05-01', '2017-04-30',
       '2017-05-03', '2017-05-09', '2017-05-04', '2017-05-05',
       '2017-05-06', '2017-05-08', '2017-05-07', '2017-05-11',
       '2017-05-10', '2017-05-12', '2017-05-15', '2017-05-14',
       '2017-05-16', '2017-05-13', '2017-05-17', '2017-05-18',
       '2017-05-21', '2017-05-19', '2017-05-20', '2017-05-22',
       '2017-05-27', '2017-05-23', '2017-05-24', '2017-05-29',
       '2017-05-25', '2017-05-26', '2017-06-01', '2017-05-31',
       '2017-05-30', '2017-05-28', '2017-06-02', '2017-06-03',
       '2017-06-06', '2017-06-07', '2017-06-05', '2017-06-04',
       '2017-06-09', '2017-06-08', '2017-06-10', '2017-06-12',
       '2017-06-11', '2017-06-13', '2017-06-14', '2017-06-15',
       '2017-06-16', '2017-06-17', '2017-06-19', '2017-06-18',
       '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23',
       '2017-06-27', '2017-06-28', '2017-06-26', '2017-06-24',
       '2017-06-25', '2017-06-29', '2017-06-30', '2017-07-01',
       '2017-07-03', '2017-07-02', '2017-07-04', '2017-07-05',
       '2017-07-06', '2017-07-12', '2017-07-07', '2017-07-10',
       '2017-07-08', '2017-07-09', '2017-07-11', '2017-07-13',
       '2017-07-14', '2017-07-15', '2017-07-17', '2017-07-16',
       '2017-07-18', '2017-07-19', '2017-07-20', '2017-07-21',
       '2017-07-27', '2017-07-24', '2017-07-22', '2017-07-23',
       '2017-07-25', '2017-07-26', '2017-07-28', '2017-07-29',
       '2017-07-31', '2017-07-30', '2017-08-01', '2017-08-02',
       '2017-03-08', '2017-08-09', '2017-08-03', '2017-08-04',
       '2017-08-05', '2017-08-07', '2017-08-06', '2017-08-08',
       '2017-08-10', '2017-11-08', '2017-08-11', '2017-08-14',
       '2017-08-15', '2017-08-12', '2017-08-13', '2017-08-16',
       '2017-08-18', '2017-08-17', '2017-08-19', '2017-08-21',
       '2017-08-22', '2017-08-20', '2017-08-23', '2017-08-24',
       '2017-09-05', '2017-08-28', '2017-08-25', '2017-08-26',
       '2017-08-29', '2017-08-27', '2017-08-31', '2017-08-30',
       '2017-09-01', '2017-12-04', '2017-09-04', '2017-09-02',
       '2017-09-03', '2017-09-06', '2017-09-07', '2017-09-08',
       '2017-09-11', '2017-09-12', '2017-09-13', '2017-09-09',
       '2017-09-10', '2017-09-16', '2017-09-18', '2017-09-14',
       '2017-09-15', '2017-09-17', '2017-09-20', '2017-09-19',
       '2017-09-22', '2017-09-25', '2017-09-21', '2017-09-26',
       '2017-09-27', '2017-09-23', '2017-09-24', '2017-09-28',
       '2017-09-29', '2017-09-30', '2017-10-04', '2017-10-05',
       '2017-10-03', '2017-10-01', '2017-10-02', '2017-10-06',
       '2017-10-07', '2017-10-09', '2017-10-10', '2017-10-08',
       '2017-10-11', '2017-10-12', '2017-10-13', '2017-10-16',
       '2017-10-17', '2017-10-14', '2017-10-15', '2017-10-18',
       '2017-10-21', '2017-10-19', '2017-10-20', '2017-10-23',
       '2017-10-24', '2017-10-22', '2017-10-25', '2017-10-26',
       '2017-10-27', '2017-11-10', '2017-10-30', '2017-10-31',
       '2017-11-02', '2017-10-28', '2017-10-29', '2017-11-01',
       '2017-11-03', '2017-11-06', '2017-11-07', '2017-11-14',
       '2017-11-04', '2017-11-05', '2017-11-09', '2017-11-13',
       '2017-11-11', '2017-11-12', '2017-11-15', '2017-11-16',
       '2017-11-17', '2017-11-18', '2017-11-21', '2017-11-20',
       '2017-11-19', '2017-11-22', '2017-11-23', '2017-11-24',
       '2017-11-27', '2017-11-28', '2017-11-25', '2017-11-26',
       '2017-11-29', '2017-11-30', '2017-12-02', '2017-12-01',
       '2017-12-03', '2017-12-06', '2017-12-05', '2017-12-07',
       '2017-12-08', '2017-12-11', '2017-12-09', '2017-12-10',
       '2017-12-13', '2017-12-12', '2017-12-16', '2017-12-14',
       '2017-12-15', '2017-12-17', '2017-12-18', '2017-12-19',
       '2017-12-20', '2017-12-21', '2017-12-22', '2017-12-24',
       '2017-12-25', '2017-12-26', '2017-12-27', '2017-12-23',
       '2017-12-28', '2017-12-29', '2017-12-30', '2018-01-01',
       '2017-12-31', '2018-01-03', '2018-01-02', '2018-03-31',
       '2018-01-04', '2018-01-09', '2018-01-05', '2018-01-06',
       '2018-01-08', '2018-01-07', '2018-01-10', '2018-01-12',
       '2018-01-11', '2018-01-15', '2018-01-16', '2018-01-13',
       '2018-01-14', '2018-01-17', '2018-01-18', '2018-01-19',
       '2018-01-20', '2018-01-23', '2018-01-22', '2018-01-21',
       '2018-01-24', '2018-01-25', '2018-01-26', '2018-01-30',
       '2018-01-27', '2018-01-28', '2018-01-29', '2018-01-31',
       '2018-02-01', '2018-02-02', '2018-02-03', '2018-02-05',
       '2018-02-06', '2018-02-04', '2018-02-07', '2018-02-08',
       '2018-02-09', '2018-02-10', '2018-03-01', '2018-02-12',
       '2018-02-13', '2018-02-14', '2018-02-11', '2018-02-16',
       '2018-02-15', '2018-02-17', '2018-02-18', '2018-02-19',
       '2018-02-20', '2018-02-21', '2018-02-22', '2018-02-23',
       '2018-02-24', '2018-02-25', '2018-02-26', '2018-02-27',
       '2018-02-28', '2018-03-02', '2018-03-04', '2018-03-03',
       '2018-03-09', '2018-03-05', '2018-03-06', '2018-03-07',
       '2018-03-08', '2018-03-13', '2018-03-14', '2018-03-12',
       '2018-03-10', '2018-03-11', '2018-03-15', '2018-03-22',
       '2018-03-19', '2018-03-16', '2018-03-17', '2018-03-23',
       '2018-03-18', '2018-03-20', '2018-03-21', '2018-03-24',
       '2018-03-27', '2018-03-26', '2018-03-30', '2018-03-25',
       '2018-03-28', '2018-03-29'], dtype=object)

we need to convert the data based on months, but there are nan values present which we.. lets see if we can resolve that


In [16]:
sample_data[pd.isnull(sample_data['Scroll Date'])]['Amount'].shape


Out[16]:
(115,)

In [18]:
sample_data[pd.isnull(sample_data['Scroll Date'])]['Amount'].sum()


Out[18]:
620741.0

In [19]:
sample_data['Combined Date'] = sample_data['Scroll Date'].fillna(sample_data['Bank Date'])

In [20]:
sample_data['date'] = pd.to_datetime(sample_data['Combined Date'])

In [21]:
sample_data['month_year'] = sample_data.date.apply(lambda x: x.strftime('%Y-%m'))

In [22]:
sample_data['Amount'] = sample_data['Amount'].astype(float)

In [23]:
sample_data.groupby(['month_year'])['Amount'].sum()


Out[23]:
month_year
2017-01         12502.000
2017-03       3910235.000
2017-04   36155057598.590
2017-05   32523582527.750
2017-06   30971253505.750
2017-07   33273929781.330
2017-08   23931005683.140
2017-09   23127588640.250
2017-10   22563026605.850
2017-11   21562642904.400
2017-12   24693686824.700
2018-01   26118365581.230
2018-02   22079600232.050
2018-03   24435124023.000
Name: Amount, dtype: float64

In [71]:
sample_data['day'] = sample_data.date.apply(lambda x: x.day)

In [72]:
sample_data[sample_data['month_year'] == '2016-04'].groupby(['day'])['Amount'].sum()


Out[72]:
day
1       62523306.000
2      687287021.000
3       28690055.000
4      727217669.000
5      557357465.000
6      583107320.000
7      835706188.000
8       26869268.000
9       80286343.000
10      40079560.000
11    1454886744.000
12     733750335.000
13     927403803.000
14      73833954.000
15     260888000.000
16    1278999922.000
17      52737090.000
18    2744658197.000
19    1642041965.000
20   11328319249.000
21     871506023.000
22     712699603.000
23     256509275.000
24      37551722.000
25    1333508020.000
26     623290707.000
27     592815053.000
28     741021481.000
29     701431501.000
30     895397893.000
Name: Amount, dtype: float64

In [ ]: