In [1]:
import pandas as pd

In [2]:
# Maximum number of rows written to each split file.
SPLIT_SIZE = 10000
# Name of the tab-separated input dump. It is given as a bare relative path,
# so the file must be in the directory the notebook is run from.
dbpedia_file = "politician-data"

In [3]:
# Parse the tab-separated input file into a DataFrame.
dbpedia_data = pd.read_csv(dbpedia_file, delimiter='\t')
# Show the first rows as the cell output to check the parsed columns.
dbpedia_data.head()


Out[3]:
#DBpURL ID WikiURL gender name birthDate deathDate occupation nationality party
0 http://dbpedia.org/resource/Quincy_Timberlake 11 http://en.wikipedia.org/wiki/Quincy_Timberlake male [' quincy timberlake ', ' timberlake quincy '] [' 1980-04-22 '] NaN ['politician'] ['kenyan'] NaN
1 http://dbpedia.org/resource/Fanny_Edelman 29 http://en.wikipedia.org/wiki/Fanny_Edelman female [' fanny edelman ', ' edelman fanny '] [' 1911-02-27 '] [' 2011-11-01 '] ['politician'] ['argentine'] NaN
2 http://dbpedia.org/resource/Stojan_Novaković 65 http://en.wikipedia.org/wiki/Stojan_Novaković NaN [' stojan novaković ', ' (стојан новаковић) ',... [' 1842-11-01 '] [' 1915-02-18 '] ['historian', 'philology', 'politician', 'dipl... ['serbian'] [' serbian progressive party (historical) ']
3 http://dbpedia.org/resource/Paul_Scheffer 102 http://en.wikipedia.org/wiki/Paul_Scheffer male [' scheffer paul '] [' 1954-09-03 '] NaN ['politician'] NaN NaN
4 http://dbpedia.org/resource/Ed_Robb 127 http://en.wikipedia.org/wiki/Ed_Robb male NaN NaN NaN ['politician'] ['american'] NaN

In [4]:
# Split the full table into consecutive chunks of at most SPLIT_SIZE rows and
# write each chunk to its own tab-separated file named
# '<dbpedia_file>-<start>-<stop>', where <start> is the first row position in
# the chunk and <stop> is one past the last (the same end-exclusive convention
# that .iloc slicing uses).
#
# Fixes over the previous version of this cell:
#   * no longer rebinds the builtin `len` to an integer;
#   * the final row is now written (the old bound was row count - 1, and
#     because .iloc slices exclude their end position the last row was lost);
#   * <stop> is clamped to the row count for every chunk, including the first;
#   * a non-positive SPLIT_SIZE fails fast instead of looping forever.
if SPLIT_SIZE <= 0:
    raise ValueError('SPLIT_SIZE must be a positive integer, got {!r}'.format(SPLIT_SIZE))

total_rows = dbpedia_data.shape[0]
file_name = ''
for start in range(0, total_rows, SPLIT_SIZE):
    # Clamp so the last chunk (and its file name) ends exactly at the row count.
    stop = min(start + SPLIT_SIZE, total_rows)
    file_name = '{}-{}-{}'.format(dbpedia_file, start, stop)
    # to_csv keeps its default index=True, so each split file carries the
    # original row labels as its first column, as before.
    dbpedia_data.iloc[start:stop, :].to_csv(file_name, sep='\t')

In [ ]: