Save to .arff (Weka format)


In [7]:
import pandas as pd
import numpy as np

def pandas2arff(df,filename,wekaname = "pandasdata",cleanstringdata=True,cleannan=True):
    """
    converts the pandas dataframe to a weka compatible file
    df: dataframe in pandas format
    filename: the filename you want the weka compatible file to be in
    wekaname: the name you want to give to the weka dataset (this will be visible to you when you open it in Weka)
    cleanstringdata: clean up data which may have spaces and replace with "_", special characters etc which seem to annoy Weka. 
                     To suppress this, set this to False
    cleannan: replaces all nan values with "?" which is Weka's standard for missing values. 
              To suppress this, set this to False
    """
    import re
    
    def cleanstring(s):
        if s!="?":
            return re.sub('[^A-Za-z0-9]+', "_", str(s))
        else:
            return "?"
            
    dfcopy = df #all cleaning operations get done on this copy

    
    if cleannan!=False:
        dfcopy = dfcopy.fillna(-999999999) #this is so that we can swap this out for "?"
        #this makes sure that certain numerical columns with missing values don't get stuck with "object" type
 
    f = open(filename,"w")
    arffList = []
    arffList.append("@relation " + wekaname + "\n")
    #look at each column's dtype. If it's an "object", make it "nominal" under Weka for now (can be changed in source for dates.. etc)
    for i in range(df.shape[1]):
        if dfcopy.dtypes[i]=='O' or (df.columns[i] in ["Class","CLASS","class"]):
            if cleannan!=False:
                dfcopy.iloc[:,i] = dfcopy.iloc[:,i].replace(to_replace=-999999999, value="?")
            if cleanstringdata!=False:
                dfcopy.iloc[:,i] = dfcopy.iloc[:,i].apply(cleanstring)
            _uniqueNominalVals = [str(_i) for _i in np.unique(dfcopy.iloc[:,i])]
            _uniqueNominalVals = ",".join(_uniqueNominalVals)
            _uniqueNominalVals = _uniqueNominalVals.replace("[","")
            _uniqueNominalVals = _uniqueNominalVals.replace("]","")
            _uniqueValuesString = "{" + _uniqueNominalVals +"}" 
            arffList.append("@attribute " + df.columns[i] + _uniqueValuesString + "\n")
        else:
            arffList.append("@attribute " + df.columns[i] + " real\n") 
            #even if it is an integer, let's just deal with it as a real number for now
    arffList.append("@data\n")           
    for i in range(dfcopy.shape[0]):#instances
        _instanceString = ""
        for j in range(df.shape[1]):#features
                if dfcopy.dtypes[j]=='O':
                    _instanceString+="\"" + str(dfcopy.iloc[i,j]) + "\""
                else:
                    _instanceString+=str(dfcopy.iloc[i,j])
                if j!=dfcopy.shape[1]-1:#if it's not the last feature, add a comma
                    _instanceString+=","
        _instanceString+="\n"
        if cleannan!=False:
            _instanceString = _instanceString.replace("-999999999.0","?") #for numeric missing values
            _instanceString = _instanceString.replace("\"?\"","?") #for categorical missing values
        arffList.append(_instanceString)
    f.writelines(arffList)
    f.close()
    del dfcopy
    return True

In [8]:
df = pd.read_csv("test.csv",na_values="NA")
pandas2arff(df,"test.arff")


Out[8]:
True

In [11]:
#Thanks for the script, I fixed a few bugs in the first function.

def getCSVFromArff(fileName):

    with open(fileName + '.arff', 'r') as fin:
        data = fin.read().splitlines(True)

        i = 0
        cols = []
        for line in data:
        line = line.lower()
        if ('@data' in line):
        i+= 1
        break
        else:
        #print line
        i+= 1
        if (line.startswith('@attribute')):
        if('{' in line):
        cols.append(line[11:line.index('{')-1])
        else:
        cols.append(line[11:line.index(' ', 11)])

        headers = ",".join(cols)

        with open(fileName + '.csv', 'w') as fout:
        fout.write(headers)
        fout.write('\n')
        fout.writelines(data[i:])


  File "<ipython-input-11-4a00b1c8844d>", line 11
    line = line.lower()
       ^
IndentationError: expected an indented block

In [12]:
df = pd.read_csv("test.csv",na_values="NA")
getCSVFromArff(df,"test.arff")


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-ed1b936a5e13> in <module>()
      1 df = pd.read_csv("test.csv",na_values="NA")
----> 2 getCSVFromArff(df,"test.arff")

NameError: name 'getCSVFromArff' is not defined

In [13]:
def getArffFile(fullDataSetFileName,trainFileName,target_nominal_column_name,colsExclude=[],maxNonimalCardinality=100,dateformat='%Y-%m-%d'):
    
    # Here fullDataSetFileName is the full dataset while trainFileName is a part used for training or testing
    # The reason you need fullDataSetFileName, is you need to make sure you have all the unique values for a feature
    # 
    # The train file being a subset, might not have all the unique values of a feature and hence
    # while using the same for creating a test file, you might get different values. In such cases Weka
    # will not let you test on that set.
    
    # If you do not need this functionality, such remove fullDataSetFileName from the arguments 
    # and work only with the trainFileName

    # target_nominal_column_name is the column which will generally be the last
    # feature in the arff while, which you want to predict
    
    # colsExclude is a list of columns you want to exclude from the arff file
    # pass an [] list if you want them all
    
    # I also added a maxNonimalCardinality flag. 
    # This is used to make sure that numeric values can also be considered as nominal
    # if there are very few of them
    
    
    # read full data set
    dfResourceFull = pd.read_csv(fullDataSetFileName)
    
    # read train/test data set and temporily same a csv verion of only included columns    
    dfResourceTrain = pd.read_csv(trainFileName)
    colsInclude = [col for col in dfResourceTrain.columns if col not in colsExclude]
    
    dfResourceTrain = dfResourceTrain[colsInclude]
    dfResourceTrain.to_csv('tmp.csv',index=False)

    # read the temporary csv file    
    with open('tmp.csv', 'r') as fin:
        data = fin.read().splitlines(True)
    
    # first line in arff
    output =  '@relation ' + fullDataSetFileName + '\n'
    #print
    for col in colsInclude:
        unqValues = pd.Series(dfResourceFull[col].values.ravel()).unique()
        # check type of values
        
        # use date if all dates
        if( str(dfResourceTrain.dtypes[col]).startswith('datetime64') or (str(dfResourceTrain.dtypes[col]) == 'object' and alldates(unqValues,dateformat))):
            x =  '@attribute' + ' ' + col + ' date "' + dateformat + '"'
            output =  output + '\n' + x

        # using string, when there are strings and the cardinality is high
        elif( (len(unqValues) > maxNonimalCardinality) and (str(dfResourceTrain.dtypes[col]) == 'object')):
            x =  '@attribute' + ' ' + col + ' string'
            output =  output + '\n' + x
        
        # use numeric if numeric and the cardinality is high
        elif( (len(unqValues) > maxNonimalCardinality) and ((str(dfResourceTrain.dtypes[col]) == 'float64') or (str(dfResourceTrain.dtypes[col]) == 'int64'))):
            x =  '@attribute' + ' ' + col + ' numeric'
            output =  output + '\n' + x
        
        # use nominal
        else:
            x = ''
            for s in unqValues:
                x = x + str(s) + ','
            x =  '@attribute' + ' ' + col + ' {' + x + '}'
            x = x.replace(',}', '}')
            #print x
            output =  output + '\n' + x


    output =  output + '\n' 
    #print '@data'
    output =  output + '\n' + '@data' + '\n'

    with open(trainFileName+'.arff', 'w') as fout:
        fout.write(output)
        fout.writelines(data[1:])

In [ ]: