In [ ]:
import sframe
import psycopg2

In [ ]:
def OneHotEncoder (srcSFrame, headers, cols ) :
    print "One hot encoding"
    TempSFrame=srcSFrame.copy()
    limit=10 # Values occuring less often than limit are removed
    for col in cols :
        print(col)
        colSumm=srcSFrame.groupby(key_columns=col,operations={'count': sframe.aggregate.COUNT()})
        colSumm=colSumm.dropna(how='any')
        colSumm=colSumm[(colSumm['count'] >= limit)]
        oneHot = {}
        for i, l in enumerate(list(colSumm[col])):
            bits = ['0']*colSumm.num_rows(); bits[i] = '1'
            oneHot[l] = ''.join(bits)
        headers.extend(map(lambda x: col+"_"+x,sorted(oneHot, key=oneHot.__getitem__, reverse=True)))
        mapList=[]
        for i in oneHot.values(): #I hate this jugaad for making an sframe column that holds lists
            mapList.append(list(i)) #Someone please tell me a pythonic way of doing it
        colMap=sframe.SFrame([sframe.SArray(oneHot.keys()),sframe.SArray(mapList)])
        print(colMap.shape)
        print(time.ctime())
        TempSFrame=TempSFrame.join(colMap, on={col:'X1'},how='left')
        print('Joined')
        print(time.ctime())
        TempSFrame.rename({'X2': col+"_OneHot"})
        print('Renamed')
        print(time.ctime())
        TempSFrame=TempSFrame.unpack(col+"_OneHot",na_value='0')
        print('Unpack')
        print(time.ctime())
    TempSFrame.remove_columns(cols)
    return TempSFrame

In [ ]:
def ToFloat(value):
    try:
        return float(value)
    except:
        return 0

In [ ]:
def DataPreper (srcSFrame,headers,depColName,contFeatures,categFeatures) :
    TempSFrame=sframe.SFrame({depColName:srcSFrame[depColName]}) 
    print ('Adding continuous features')
    for k,c in enumerate(contFeatures) :
        print(c)
        TempSFrame=TempSFrame.add_column(srcSFrame[c].apply(lambda x: ToFloat(x)))
    categHeaders=[]
    if len(categFeatures) > 0 :
        TempSFrame2=OneHotEncoder(srcSFrame[categFeatures],categHeaders,categFeatures)
    TempSFrame=TempSFrame.add_columns(TempSFrame2)
    print(TempSFrame.head())
    headers.extend([depColName])
    headers.extend(contFeatures)
    headers.extend(categHeaders)
    TempSFrame.rename(dict((v,k) for k,v in enumerate(TempSFrame.column_names())))
    print(TempSFrame.head())
    return TempSFrame

In [ ]:
conn = psycopg2.connect('postgresql://username@serveraddress:port/database')

In [ ]:
%%time
data01=sframe.SFrame.from_sql(conn, "select * from tablename", cursor_arraysize=100000)

In [ ]:
#data01.save('data01')

In [ ]:
#data01 = sframe.load_sframe('data01')

In [ ]:
data01.shape

In [ ]:
#data01.head

In [ ]:
%%time
headers=[]
data02=DataPreper(data01,headers,'DependentCol',['ContinuousCol1','ContinuousCol2'],['CategoricalCol1','CategoricalCol2'])

In [ ]:
#data02.save('data02')

In [ ]:
#data02 = sframe.load_sframe('data02')

In [ ]:
%%time
array03= data02.apply(lambda x: sorted({int(k1):float(v1) for k1,v1 in x.iteritems() if v1!=None}.iteritems()))

In [ ]:
%%time
array03.materialize()

In [ ]:
%%time
array03.save('XGB_Data', format='text')

In [ ]:
%%time
!sed -e 's/^.\{5\}//g' -e 's/ /:/g' -e 's/,/ /g' -e 's/"//g' -e 's/]//g' -e 's/[[]//g' XGB_Data > XGB_Data.libsvm

In [ ]:
headers