In [7]:
import pandas as pd
import numpy as np
def pandas2arff(df, filename, wekaname="pandasdata", cleanstringdata=True, cleannan=True):
    """
    Converts a pandas dataframe to a Weka-compatible ARFF file.
    df: dataframe in pandas format
    filename: the filename you want the Weka-compatible file written to
    wekaname: the name you want to give to the Weka dataset (this will be visible to you when you open it in Weka)
    cleanstringdata: cleans up data which may have spaces, special characters etc. (which seem to annoy Weka)
                     and replaces them with "_". To suppress this, set it to False.
    cleannan: replaces all NaN values with "?", which is Weka's standard for missing values.
              To suppress this, set it to False.
    """
    import re

    def cleanstring(s):
        if s != "?":
            return re.sub('[^A-Za-z0-9]+', "_", str(s))
        else:
            return "?"

    dfcopy = df.copy()  # all cleaning operations get done on this copy, so the caller's dataframe stays untouched
    if cleannan != False:
        # fill with a sentinel that gets swapped out for "?" later; this also makes sure that
        # numerical columns with missing values don't get stuck with "object" type
        dfcopy = dfcopy.fillna(-999999999)
    f = open(filename, "w")
    arffList = []
    arffList.append("@relation " + wekaname + "\n")
    # look at each column's dtype. If it's an "object", make it "nominal" under Weka for now
    # (can be changed in the source for dates etc.)
    for i in range(df.shape[1]):
        if dfcopy.dtypes[i] == 'O' or (df.columns[i] in ["Class", "CLASS", "class"]):
            if cleannan != False:
                dfcopy.iloc[:, i] = dfcopy.iloc[:, i].replace(to_replace=-999999999, value="?")
            if cleanstringdata != False:
                dfcopy.iloc[:, i] = dfcopy.iloc[:, i].apply(cleanstring)
            _uniqueNominalVals = [str(_i) for _i in np.unique(dfcopy.iloc[:, i])]
            _uniqueValuesString = "{" + ",".join(_uniqueNominalVals) + "}"
            arffList.append("@attribute " + df.columns[i] + " " + _uniqueValuesString + "\n")
        else:
            # even if it is an integer, let's just deal with it as a real number for now
            arffList.append("@attribute " + df.columns[i] + " real\n")
    arffList.append("@data\n")
    for i in range(dfcopy.shape[0]):  # instances
        _instanceString = ""
        for j in range(df.shape[1]):  # features
            if dfcopy.dtypes[j] == 'O':
                _instanceString += "\"" + str(dfcopy.iloc[i, j]) + "\""
            else:
                _instanceString += str(dfcopy.iloc[i, j])
            if j != dfcopy.shape[1] - 1:  # if it's not the last feature, add a comma
                _instanceString += ","
        _instanceString += "\n"
        if cleannan != False:
            _instanceString = _instanceString.replace("-999999999.0", "?")  # for numeric missing values
            _instanceString = _instanceString.replace("\"?\"", "?")  # for categorical missing values
        arffList.append(_instanceString)
    f.writelines(arffList)
    f.close()
    del dfcopy
    return True
In [8]:
df = pd.read_csv("test.csv",na_values="NA")
pandas2arff(df,"test.arff")
Out[8]:
True
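A quick self-contained demo; the toy dataframe below is made up just to show the cleaning behaviour:
In [ ]:
df_demo = pd.DataFrame({"height": [1.7, np.nan, 1.6],
                        "color": ["light blue", "red", None],
                        "class": ["yes", "no", "yes"]})
pandas2arff(df_demo, "demo.arff", wekaname="demo")
# "light blue" is written as light_blue, and both missing values come out as "?"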
In [11]:
# Thanks for the script, I fixed a few bugs in the first function.
def getCSVFromArff(fileName):
    # reads fileName + '.arff' and writes its data section to fileName + '.csv',
    # using the @attribute declarations to build the CSV header
    with open(fileName + '.arff', 'r') as fin:
        data = fin.read().splitlines(True)

    i = 0
    cols = []
    for line in data:
        i += 1
        lower = line.lower()
        if '@data' in lower:
            break
        if lower.startswith('@attribute'):
            # keep the original case of the column name; only the keyword check is case-insensitive
            if '{' in line:
                cols.append(line[11:line.index('{') - 1])
            else:
                cols.append(line[11:line.index(' ', 11)])

    headers = ",".join(cols)
    with open(fileName + '.csv', 'w') as fout:
        fout.write(headers)
        fout.write('\n')
        fout.writelines(data[i:])
In [12]:
# note: getCSVFromArff takes the base file name (without extension); this reads
# test.arff and writes the regenerated data back out as test.csv
getCSVFromArff("test")
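A quick round-trip check, assuming the cells above have been run so that test.arff and the regenerated test.csv exist:
In [ ]:
pd.read_csv("test.csv", na_values="?").head()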
In [13]:
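# getArffFile below calls an alldates() helper that is not defined anywhere in this
# notebook; here is a minimal sketch of it, assuming it should return True exactly
# when every value parses with the given strptime format:
from datetime import datetime

def alldates(values, dateformat):
    try:
        for v in values:
            datetime.strptime(str(v), dateformat)
        return True
    except (ValueError, TypeError):
        return False
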
def getArffFile(fullDataSetFileName, trainFileName, target_nominal_column_name, colsExclude=[], maxNominalCardinality=100, dateformat='%Y-%m-%d'):
    # Here fullDataSetFileName is the full dataset, while trainFileName is the part used for training or testing.
    # The reason you need fullDataSetFileName is to make sure you have all the unique values for a feature:
    # the train file, being a subset, might not have all the unique values of a feature, and hence,
    # when using the same code to create a test file, you might get different value sets. In such cases Weka
    # will not let you test on that set.
    # If you do not need this functionality, simply remove fullDataSetFileName from the arguments
    # and work only with trainFileName.
    # target_nominal_column_name is the column you want to predict, generally the last
    # feature in the arff file (note: it is not otherwise used in the body below).
    # colsExclude is a list of columns you want to exclude from the arff file;
    # pass an empty [] list if you want them all.
    # I also added a maxNominalCardinality flag.
    # This is used to make sure that numeric values can also be considered nominal
    # if there are very few of them.

    # read the full data set
    dfResourceFull = pd.read_csv(fullDataSetFileName)
    # read the train/test data set and temporarily save a csv version of only the included columns
    dfResourceTrain = pd.read_csv(trainFileName)
    colsInclude = [col for col in dfResourceTrain.columns if col not in colsExclude]
    dfResourceTrain = dfResourceTrain[colsInclude]
    dfResourceTrain.to_csv('tmp.csv', index=False)
    # read the temporary csv file back in
    with open('tmp.csv', 'r') as fin:
        data = fin.read().splitlines(True)

    # first line in the arff
    output = '@relation ' + fullDataSetFileName + '\n'
    for col in colsInclude:
        unqValues = pd.Series(dfResourceFull[col].values.ravel()).unique()
        # check the type of the values:
        # use date if all values are dates
        if str(dfResourceTrain.dtypes[col]).startswith('datetime64') or (str(dfResourceTrain.dtypes[col]) == 'object' and alldates(unqValues, dateformat)):
            x = '@attribute' + ' ' + col + ' date "' + dateformat + '"'
            output = output + '\n' + x
        # use string when there are strings and the cardinality is high
        elif (len(unqValues) > maxNominalCardinality) and (str(dfResourceTrain.dtypes[col]) == 'object'):
            x = '@attribute' + ' ' + col + ' string'
            output = output + '\n' + x
        # use numeric if numeric and the cardinality is high
        elif (len(unqValues) > maxNominalCardinality) and ((str(dfResourceTrain.dtypes[col]) == 'float64') or (str(dfResourceTrain.dtypes[col]) == 'int64')):
            x = '@attribute' + ' ' + col + ' numeric'
            output = output + '\n' + x
        # otherwise use nominal
        else:
            x = ''
            for s in unqValues:
                x = x + str(s) + ','
            x = '@attribute' + ' ' + col + ' {' + x + '}'
            x = x.replace(',}', '}')
            output = output + '\n' + x
    output = output + '\n'
    output = output + '\n' + '@data' + '\n'
    with open(trainFileName + '.arff', 'w') as fout:
        fout.write(output)
        fout.writelines(data[1:])
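A usage sketch; "full.csv", "train.csv", the "class" target column and the excluded "id" column are all hypothetical names:
In [ ]:
getArffFile("full.csv", "train.csv", "class",
            colsExclude=["id"], maxNominalCardinality=50)
# writes train.csv.arff, drawing each nominal attribute's value set from full.csv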
In [ ]: