In [1]:
import pandas as pd
import requests
import csv
In [2]:
# process columns like major head as they are converted to numbers, which shouldnt happen
# MH SMH MINH GSH SH DH SDH
# data.columns[2:9]
In [3]:
fill_config = {u'MH': 4,
u'SMH': 2,
u'MINH': 3,
u'GSH': 2,
u'SH': 2,
u'DH': 3,
u'SDH': 3}
In [4]:
# this data should not be null, thus no error handling for nan values
def fill_process_col(data, n_fill):
return data.astype(int).astype(str).apply(lambda x: '0' * (n_fill - len(x.strip())) + x.strip())
In [5]:
def get_ap_treasury_data(year, district_code):
url = 'https://treasury.ap.gov.in/budreq/index.php?service=REPORTN'
params = {'fin_year': year,
'district': district_code,
'chkall': '',
'april': '04',
'may': '05',
'june': '06',
'july': '07',
'aug': '08',
'sep': '09',
'oct':'10',
'nov':'11',
'dec':'12',
'jan':'01',
'feb':'02',
'mar':'03',
'mh':'MH',
'smh':'SMH',
'minh':'MINH',
'gsh':'GSH',
'sh':'SH',
'dh':'DH',
'sdh':'SDH',
'npn':'NPN',
'dist':'DIST',
'sub':'Submit'}
resp = requests.get(url=url, params=params)
print(resp.status_code)
parsed_data = pd.read_html(resp.text, header=[0])
data = parsed_data[0]
return data[data['MH TYPE'] != 'TOTAL']
In [6]:
hoas_selected_cols = [
'mh',
'mh_desc',
'smh',
'smh_desc',
'minh',
'minh_desc',
'gsh',
'gsh_desc',
'sh',
'sh_desc',
'dh',
'dh_desc',
'sdh',
'sdh_desc',
'cv',
'mhtype',
'hoa']
def get_haos_data():
hoas = pd.read_csv('hoa_2018.csv', dtype=str)
hoas.columns = hoas.iloc[0]
hoas_processed = hoas.drop('0', axis=1).drop(0)
return hoas_processed
def extract_ap_districts_treasury_data(districts, years):
for district in districts:
district_data = pd.DataFrame()
for year in years:
data = get_ap_treasury_data(year, district)
for col in fill_config:
data[col] = fill_process_col(data[col], fill_config[col])
time_cols = data.columns[11:23]
cat_cols = data.columns[1:11]
processed_data = pd.DataFrame()
for index, row in data.iterrows():
cat_info = row[cat_cols]
for t_col in time_cols:
cat_info['MONTH'] = t_col
cat_info['AMOUNT'] = row[t_col]
processed_data = processed_data.append(cat_info, ignore_index=False)
processed_data.reset_index(inplace=True, drop=True)
hoas_group_cols = [key.lower() for key in fill_config.keys()]
hoas_processed = get_haos_data()
hoas_unique = hoas_processed.drop_duplicates('hoa')
data_group_cols = fill_config.keys()
combined_data = pd.merge(left=processed_data, left_on=list(data_group_cols),
right=hoas_unique[hoas_selected_cols], right_on=list(hoas_group_cols),
how='left')
combined_data.drop(hoas_group_cols, axis=1, inplace=True)
district_name = combined_data['DISTRICT'].iloc[0]
combined_data.to_csv('../csvs/ditrict_treasuries/ap/{}/{}.csv'.format(year, district_name),
index=False, quoting=csv.QUOTE_ALL)
print('Completed for ', district, year)
In [7]:
extract_ap_districts_treasury_data(['0100', '0200', '0300', '0400'], ['2016-17', '2017-18'])
In [73]:
data = get_ap_treasury_data('2016-17', '0500')
In [74]:
for col in fill_config:
data[col] = fill_process_col(data[col], fill_config[col])
In [75]:
data[data.columns[2:9]]
Out[75]:
In [76]:
# Now I want to put it in a format that is usable as a time series
data.columns[11:23]
Out[76]:
In [77]:
time_cols = data.columns[11:23]
cat_cols = data.columns[1:11]
In [78]:
processed_data = pd.DataFrame()
In [79]:
for index, row in data.iterrows():
cat_info = row[cat_cols]
for t_col in time_cols:
cat_info['MONTH'] = t_col
cat_info['AMOUNT'] = row[t_col]
processed_data = processed_data.append(cat_info, ignore_index=False)
In [80]:
processed_data.reset_index(inplace=True, drop=True)
processed_data
Out[80]:
In [81]:
# Now we want to get information on head of account
hoas = pd.read_csv('hoa_2018.csv', dtype=str)
In [82]:
hoas.columns = hoas.iloc[0]
hoas_processed = hoas.drop('0', axis=1).drop(0)
hoas_processed
Out[82]:
In [83]:
processed_data.columns
Out[83]:
In [84]:
hoas_group_cols = [key.lower() for key in fill_config.keys()]
hoas_group_cols
Out[84]:
In [85]:
hoas_unique = hoas_processed.drop_duplicates('hoa')
In [86]:
data_group_cols = fill_config.keys()
data_group_cols
Out[86]:
In [87]:
hoas_selected_cols = [
'mh',
'mh_desc',
'smh',
'smh_desc',
'minh',
'minh_desc',
'gsh',
'gsh_desc',
'sh',
'sh_desc',
'dh',
'dh_desc',
'sdh',
'sdh_desc',
'cv',
'mhtype',
'hoa']
In [88]:
combined_data = pd.merge(left=processed_data, left_on=list(data_group_cols),
right=hoas_unique[hoas_selected_cols], right_on=list(hoas_group_cols),
how='left')
In [89]:
hoas_group_cols
Out[89]:
In [90]:
combined_data.drop(hoas_group_cols, axis=1, inplace=True)
In [91]:
combined_data.to_csv('../csvs/ditrict_treasuries/ap/2016-17/krishna.csv', index=False, quoting=csv.QUOTE_ALL)
In [38]:
combined_data[combined_data.MH == '2202'].to_csv('Major_head_2202_Krishna_monthly.csv', quoting=csv.QUOTE_ALL, index=False)
In [35]:
combined_data[combined_data.MH == '4602'].to_csv('Major_head_4602_Krishna_monthly.csv', index=False)
combined_data[combined_data.MH == '6202'].to_csv('Major_head_6202_Krishna_monthly.csv', index=False)
In [37]:
combined_data.to_csv('ap_monthly_treasury_data.csv', quoting=csv.QUOTE_ALL)
In [ ]: