In [ ]:
import os
import time

import pandas as pd

import dir_constants as dc
# Run-level constants _________________________________________________________
platform = 'lendingclub'
# Local-time stamp taken once, when this cell runs.
now = time.strftime("%Y_%m_%d_%Hh_%Mm_%Ss")
# Directory that is scanned for csv files below _______________________________
data_path = f'{dc.home_path}/rsync_dl_rig/unzipped_lc_csvs'
# Get the loan_info csvs to iterate over ______________________________________
# Directory entries starting with any of these prefixes are excluded.
skip_prefixes = ('.', 'lc_', 'PMTHIST', 'LCData')
# os.listdir returns entries in arbitrary order; sort so that the row order of
# the concatenated frame is reproducible from run to run.
files = sorted(os.listdir(data_path))
loan_info_files = [
    file_ for file_ in files if not file_.startswith(skip_prefixes)
]
# header=1 takes the column names from the second line of each file and
# skipfooter=2 drops the last two lines; skipfooter needs the python engine.
to_concat = [
    pd.read_csv(
        os.path.join(data_path, file_),
        header=1,
        engine='python',
        skipfooter=2)
    for file_ in loan_info_files
]
loan_info = pd.concat(to_concat)
# Drop rows that aren't actually loans ________________________________________
# A real loan needs a term, and a positive int_rate / term / funded_amnt.
has_term = loan_info['term'].notnull()
loan_info = loan_info[has_term]
# int_rate: strip '%' characters from both ends, then cast to float.
loan_info['int_rate'] = loan_info['int_rate'].str.strip('%').astype(float)
# term: keep only the first three characters, then cast to int.
loan_info['term'] = loan_info['term'].str[:3].astype(int)
is_real_loan = (
    loan_info['int_rate'].gt(0)
    & loan_info['term'].gt(0)
    & loan_info['funded_amnt'].gt(0)
)
loan_info = loan_info[is_real_loan]
# Re-index by loan id _________________________________________________________
# Discard the index left over from concatenation/filtering, cast id to str,
# and use it as the index while also keeping it as a regular column.
loan_info = (
    loan_info
    .reset_index(drop=True)
    .assign(id=lambda frame: frame['id'].astype(str))
    .set_index('id', drop=False)
)
# Save in HDFStore ____________________________________________________________
# store = pd.HDFStore(
# dc.home_path + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
# format(platform),
# append=True)
# store['loan_info_merged'] = loan_info
# print("{:,}".format(len(loan_info)) + " loans saved " +
# 'for {0}'.format(platform))
# print(store.keys())
# store.close()
In [ ]:
# IPython introspection (trailing `?`): displays the help for reset_index.
# This is notebook-only syntax and is not valid in a plain Python script.
loan_info.reset_index?
In [ ]:
# trying out feather data format
PATH = '/home/justin/justin_tinkering/data_science/lendingclub/data/'
# Feather can only serialize a frame with a default index, while loan_info is
# indexed by 'id' at this point. reset_index() returns a new frame instead of
# modifying loan_info, so the returned frame is the one that has to be written.
# drop=True is needed because 'id' is also still a regular column; a plain
# reset_index() would fail trying to insert a second 'id' column.
# NOTE: the PATH directory must already exist when this runs.
loan_info.reset_index(drop=True).to_feather(f'{PATH}loan_info')
In [ ]:
# Create the output directory together with any missing parent directories.
# exist_ok=True makes re-running this cell a no-op instead of raising
# FileExistsError once the directory is already there.
os.makedirs(
    '/home/justin/justin_tinkering/data_science/lendingclub/data',
    exist_ok=True)
In [ ]: