In [ ]:
import sframe
import psycopg2
In [ ]:
def OneHotEncoder (srcSFrame, headers, cols ) :
print "One hot encoding"
TempSFrame=srcSFrame.copy()
limit=10 # Values occuring less often than limit are removed
for col in cols :
print(col)
colSumm=srcSFrame.groupby(key_columns=col,operations={'count': sframe.aggregate.COUNT()})
colSumm=colSumm.dropna(how='any')
colSumm=colSumm[(colSumm['count'] >= limit)]
oneHot = {}
for i, l in enumerate(list(colSumm[col])):
bits = ['0']*colSumm.num_rows(); bits[i] = '1'
oneHot[l] = ''.join(bits)
headers.extend(map(lambda x: col+"_"+x,sorted(oneHot, key=oneHot.__getitem__, reverse=True)))
mapList=[]
for i in oneHot.values(): #I hate this jugaad for making an sframe column that holds lists
mapList.append(list(i)) #Someone please tell me a pythonic way of doing it
colMap=sframe.SFrame([sframe.SArray(oneHot.keys()),sframe.SArray(mapList)])
print(colMap.shape)
print(time.ctime())
TempSFrame=TempSFrame.join(colMap, on={col:'X1'},how='left')
print('Joined')
print(time.ctime())
TempSFrame.rename({'X2': col+"_OneHot"})
print('Renamed')
print(time.ctime())
TempSFrame=TempSFrame.unpack(col+"_OneHot",na_value='0')
print('Unpack')
print(time.ctime())
TempSFrame.remove_columns(cols)
return TempSFrame
In [ ]:
def ToFloat(value):
try:
return float(value)
except:
return 0
In [ ]:
def DataPreper (srcSFrame,headers,depColName,contFeatures,categFeatures) :
TempSFrame=sframe.SFrame({depColName:srcSFrame[depColName]})
print ('Adding continuous features')
for k,c in enumerate(contFeatures) :
print(c)
TempSFrame=TempSFrame.add_column(srcSFrame[c].apply(lambda x: ToFloat(x)))
categHeaders=[]
if len(categFeatures) > 0 :
TempSFrame2=OneHotEncoder(srcSFrame[categFeatures],categHeaders,categFeatures)
TempSFrame=TempSFrame.add_columns(TempSFrame2)
print(TempSFrame.head())
headers.extend([depColName])
headers.extend(contFeatures)
headers.extend(categHeaders)
TempSFrame.rename(dict((v,k) for k,v in enumerate(TempSFrame.column_names())))
print(TempSFrame.head())
return TempSFrame
In [ ]:
conn = psycopg2.connect('postgresql://username@serveraddress:port/database')
In [ ]:
%%time
data01=sframe.SFrame.from_sql(conn, "select * from tablename", cursor_arraysize=100000)
In [ ]:
#data01.save('data01')
In [ ]:
#data01 = sframe.load_sframe('data01')
In [ ]:
data01.shape
In [ ]:
#data01.head
In [ ]:
%%time
headers=[]
data02=DataPreper(data01,headers,'DependentCol',['ContinuousCol1','ContinuousCol2'],['CategoricalCol1','CategoricalCol2'])
In [ ]:
#data02.save('data02')
In [ ]:
#data02 = sframe.load_sframe('data02')
In [ ]:
%%time
array03= data02.apply(lambda x: sorted({int(k1):float(v1) for k1,v1 in x.iteritems() if v1!=None}.iteritems()))
In [ ]:
%%time
array03.materialize()
In [ ]:
%%time
array03.save('XGB_Data', format='text')
In [ ]:
%%time
!sed -e 's/^.\{5\}//g' -e 's/ /:/g' -e 's/,/ /g' -e 's/"//g' -e 's/]//g' -e 's/[[]//g' XGB_Data > XGB_Data.libsvm
In [ ]:
headers