Example Usage for DataFrame



In [1]:

    
# use the latest development version by putting the parent directory first on the import path;
# comment out the next line to use the installed raccoon package instead
import sys; sys.path.insert(0, '../')



In [2]:

    
# import libraries
import raccoon as rc

Initialize



In [3]:

    
# empty DataFrame
df = rc.DataFrame()
df









    Out[3]:





object id: 2305959579080
columns:
[]
data:
[]
index:
[]



In [4]:

    
# with columns and indexes but no data
df = rc.DataFrame(columns=['a', 'b', 'c'], index=[1, 2, 3])
df









    Out[4]:





object id: 2305959578792
columns:
['a', 'b', 'c']
data:
[[None, None, None], [None, None, None], [None, None, None]]
index:
[1, 2, 3]



In [5]:

    
# with data
df = rc.DataFrame(data={'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[10, 11, 12], columns=['a', 'b'])
df









    Out[5]:





object id: 2305959818248
columns:
['a', 'b']
data:
[[1, 2, 3], [4, 5, 6]]
index:
[10, 11, 12]

Print



In [6]:

    
df.print()









    



  index    a    b
-------  ---  ---
     10    1    4
     11    2    5
     12    3    6



In [7]:

    
print(df)









    



  index    a    b
-------  ---  ---
     10    1    4
     11    2    5
     12    3    6

Setters and Getters



In [8]:

    
# columns
df.columns









    Out[8]:





['a', 'b']



In [9]:

    
df.columns = ['first', 'second']
print(df)









    



  index    first    second
-------  -------  --------
     10        1         4
     11        2         5
     12        3         6



In [10]:

    
# columns can be renamed with a dict()
df.rename_columns({'second': 'b', 'first': 'a'})
df.columns









    Out[10]:





['a', 'b']



In [11]:

    
# index
df.index









    Out[11]:





[10, 11, 12]



In [12]:

    
# index values can be any unique (non-repeating) values, including a mix of types
df.index = ['apple', 'pear', 7.7]
df.print()









    



index      a    b
-------  ---  ---
apple      1    4
pear       2    5
7.7        3    6



In [13]:

    
df.index = [10, 11, 12]
print(df)









    



  index    a    b
-------  ---  ---
     10    1    4
     11    2    5
     12    3    6



In [14]:

    
# the index can also have a name; by default it is "index"
df.index_name









    Out[14]:





'index'



In [15]:

    
df.index_name = 'units'
df.index_name









    Out[15]:





'units'



In [16]:

    
# restore the index name to "index" after renaming it to 'units' in the previous cell
df.index_name = 'index'
# data is a shallow copy, be careful on how this is used
df.data









    Out[16]:





[[1, 2, 3], [4, 5, 6]]

Select Index



In [17]:

    
# select_index returns a boolean list, one entry per index value, marking which ones match
df.select_index(11)









    Out[17]:





[False, True, False]

Set Values



In [18]:

    
# set a single cell
df.set(10, 'a', 100)
print(df)









    



  index    a    b
-------  ---  ---
     10  100    4
     11    2    5
     12    3    6



In [19]:

    
# setting a value outside the current range creates a new row and/or column. Can also use [] for setting
df[13, 'c'] = 9
df.print()









    



  index    a    b    c
-------  ---  ---  ---
     10  100    4
     11    2    5
     12    3    6
     13              9



In [20]:

    
# set column
df['b'] = 55
print(df)









    



  index    a    b    c
-------  ---  ---  ---
     10  100   55
     11    2   55
     12    3   55
     13        55    9



In [21]:

    
# set a subset of column
df[[10, 12], 'b'] = 66
print(df)









    



  index    a    b    c
-------  ---  ---  ---
     10  100   66
     11    2   55
     12    3   66
     13        55    9



In [22]:

    
# using boolean list
df.set([True, False, True, False], 'b', [88, 99])
print(df)









    



  index    a    b    c
-------  ---  ---  ---
     10  100   88
     11    2   55
     12    3   99
     13        55    9



In [23]:

    
# setting with slices; the slice is by index value and includes the end value (12 and 13 are both set)
df[12:13, 'a'] = 33
print(df)









    



  index    a    b    c
-------  ---  ---  ---
     10  100   88
     11    2   55
     12   33   99
     13   33   55    9



In [24]:

    
# a slice can be assigned a list of values, one per row; 10:12 covers the three rows 10, 11 and 12
df[10:12, 'c'] = [1, 2, 3]
print(df)









    



  index    a    b    c
-------  ---  ---  ---
     10  100   88    1
     11    2   55    2
     12   33   99    3
     13   33   55    9



In [25]:

    
# append a row; DANGEROUS as there is no validation checking, but can be used for speed
df.append_row(14, {'a': 44, 'c': 100, 'd': 99})
print(df)









    



  index    a    b    c    d
-------  ---  ---  ---  ---
     10  100   88    1
     11    2   55    2
     12   33   99    3
     13   33   55    9
     14   44       100   99



In [26]:

    
# append rows, again use caution
df.append_rows([15, 16], {'a': [55, 56], 'd': [100,101]})
print(df)









    



  index    a    b    c    d
-------  ---  ---  ---  ---
     10  100   88    1
     11    2   55    2
     12   33   99    3
     13   33   55    9
     14   44       100   99
     15   55            100
     16   56            101

Get Values



In [27]:

    
# get a single cell
df[10, 'a']









    Out[27]:





100



In [28]:

    
# get an entire column
df['c'].print()









    



  index    c
-------  ---
     10    1
     11    2
     12    3
     13    9
     14  100
     15
     16



In [29]:

    
# get list of columns
df[['a', 'c']].print()









    



  index    a    c
-------  ---  ---
     10  100    1
     11    2    2
     12   33    3
     13   33    9
     14   44  100
     15   55
     16   56



In [30]:

    
# get subset of the index
df[[11, 12, 13], 'b'].print()









    



  index    b
-------  ---
     11   55
     12   99
     13   55



In [31]:

    
# get using slices
df[11:13, 'b'].print()









    



  index    b
-------  ---
     11   55
     12   99
     13   55



In [32]:

    
# get a matrix
df[10:11, ['a', 'c']].print()









    



  index    a    c
-------  ---  ---
     10  100    1
     11    2    2



In [33]:

    
# get a column, return as a list
df.get(columns='a', as_list=True)









    Out[33]:





[100, 2, 33, 33, 44, 55, 56]



In [34]:

    
# get a row and return as a dictionary
df.get_columns(index=13, columns=['a', 'b'], as_dict=True)









    Out[34]:





{'a': 33, 'b': 55, 'index': 13}

Set and Get by Location

Locations are positions within the index, in other words integer offsets from 0 to len(index) - 1. Negative locations count back from the end, so -1 is the last row.



In [35]:

    
# get a single cell
df.get_location(2, 'a')









    Out[35]:





33



In [36]:

    
# get an entire row by leaving out the columns argument
print(df.get_location(2))









    



  index    a    b    c  d
-------  ---  ---  ---  ---
     12   33   99    3



In [37]:

    
print(df.get_location(0, ['b', 'c'], as_dict=True))









    



{'b': 88, 'c': 1, 'index': 10}



In [38]:

    
# a negative location counts from the end; -1 is the last row
df.get_location(-1).print()









    



  index    a  b    c      d
-------  ---  ---  ---  ---
     16   56            101



In [39]:

    
df.get_locations(locations=[0, 2]).print()









    



  index    a    b    c  d
-------  ---  ---  ---  ---
     10  100   88    1
     12   33   99    3



In [40]:

    
df.set_locations(locations=[0, 2], column='a', values=-9)
df.print()









    



  index    a    b    c    d
-------  ---  ---  ---  ---
     10   -9   88    1
     11    2   55    2
     12   -9   99    3
     13   33   55    9
     14   44       100   99
     15   55            100
     16   56            101

Head and Tail



In [41]:

    
df.head(2).print()









    



  index    a    b    c  d
-------  ---  ---  ---  ---
     10   -9   88    1
     11    2   55    2



In [42]:

    
df.tail(2).print()









    



  index    a  b    c      d
-------  ---  ---  ---  ---
     15   55            100
     16   56            101

Delete columns and rows



In [43]:

    
df.delete_rows([10, 13])
print(df)









    



  index    a    b    c    d
-------  ---  ---  ---  ---
     11    2   55    2
     12   -9   99    3
     14   44       100   99
     15   55            100
     16   56            101



In [44]:

    
df.delete_columns('b')
print(df)









    



  index    a    c    d
-------  ---  ---  ---
     11    2    2
     12   -9    3
     14   44  100   99
     15   55       100
     16   56       101

Convert



In [45]:

    
# return a dict
df.to_dict()









    Out[45]:





{'index': [11, 12, 14, 15, 16],
 'a': [2, -9, 44, 55, 56],
 'c': [2, 3, 100, None, None],
 'd': [None, None, 99, 100, 101]}



In [46]:

    
# exclude the index
df.to_dict(index=False)









    Out[46]:





{'a': [2, -9, 44, 55, 56],
 'c': [2, 3, 100, None, None],
 'd': [None, None, 99, 100, 101]}



In [47]:

    
# return an OrderedDict()
df.to_dict(ordered=True)









    Out[47]:





OrderedDict([('index', [11, 12, 14, 15, 16]),
             ('a', [2, -9, 44, 55, 56]),
             ('c', [2, 3, 100, None, None]),
             ('d', [None, None, 99, 100, 101])])



In [48]:

    
# return a list of just one column
df['c'].to_list()









    Out[48]:





[2, 3, 100, None, None]



In [49]:

    
# convert to JSON
string = df.to_json()
print(string)









    



{"data": {"a": [2, -9, 44, 55, 56], "c": [2, 3, 100, null, null], "d": [null, null, 99, 100, 101]}, "index": [11, 12, 14, 15, 16], "meta_data": {"index_name": "index", "columns": ["a", "c", "d"], "sort": false, "dropin": null}}



In [50]:

    
# construct DataFrame from JSON
df_from_json = rc.DataFrame.from_json(string)
print(df_from_json)









    



  index    a    c    d
-------  ---  ---  ---
     11    2    2
     12   -9    3
     14   44  100   99
     15   55       100
     16   56       101

Sort by Index and Column



In [51]:

    
df = rc.DataFrame({'a': [4, 3, 2, 1], 'b': [6, 7, 8, 9]}, index=[25, 24, 23, 22])
print(df)









    



  index    a    b
-------  ---  ---
     25    4    6
     24    3    7
     23    2    8
     22    1    9



In [52]:

    
# sort by index. Sorts are inplace
df.sort_index()
print(df)









    



  index    a    b
-------  ---  ---
     22    1    9
     23    2    8
     24    3    7
     25    4    6



In [53]:

    
# sort by column
df.sort_columns('b')
print(df)









    



  index    a    b
-------  ---  ---
     25    4    6
     24    3    7
     23    2    8
     22    1    9



In [54]:

    
# sort by column in reverse order
df.sort_columns('b', reverse=True)
print(df)









    



  index    a    b
-------  ---  ---
     22    1    9
     23    2    8
     24    3    7
     25    4    6



In [55]:

    
# sorting with a key function is available, see tests for examples

Append



In [56]:

    
df1 = rc.DataFrame({'a': [1, 2], 'b': [5, 6]}, index=[1, 2])
df1.print()









    



  index    a    b
-------  ---  ---
      1    1    5
      2    2    6



In [57]:

    
df2 = rc.DataFrame({'b': [7, 8], 'c': [11, 12]}, index=[3, 4])
print(df2)









    



  index    b    c
-------  ---  ---
      3    7   11
      4    8   12



In [58]:

    
df1.append(df2)
print(df1)









    



  index    a    b    c
-------  ---  ---  ---
      1    1    5
      2    2    6
      3         7   11
      4         8   12

Math Methods



In [59]:

    
df = rc.DataFrame({'a': [1, 2, 3], 'b': [2, 8, 9]})



In [60]:

    
# test for equality
df.equality('a', value=3)









    Out[60]:





[False, False, True]



In [61]:

    
# all math methods can operate on a subset of the index
df.equality('b', indexes=[1, 2], value=2)









    Out[61]:





[False, False]



In [62]:

    
# add two columns
df.add('a', 'b')









    Out[62]:





[3, 10, 12]



In [63]:

    
# subtract
df.subtract('b', 'a')









    Out[63]:





[1, 6, 6]



In [64]:

    
# multiply two columns, here restricted to the rows at index values 0 and 2
df.multiply('a', 'b', [0, 2])









    Out[64]:





[2, 27]



In [65]:

    
# divide
df.divide('b', 'a')









    Out[65]:





[2.0, 4.0, 3.0]

Multi-Index

Raccoon does not have true hierarchical multi-index capabilities like Pandas, but attempts to mimic some of the capabilities with the use of tuples as the index. Raccoon does not check that the index tuples are all the same length, nor does it perform any other integrity checking.



In [66]:

    
tuples = [('a', 1, 3), ('a', 1, 4), ('a', 2, 3), ('b', 1, 4), ('b', 2, 1), ('b', 3, 3)]
df = rc.DataFrame({'a': [1, 2, 3, 4, 5, 6]}, index=tuples)
print(df)









    



index          a
-----------  ---
('a', 1, 3)    1
('a', 1, 4)    2
('a', 2, 3)    3
('b', 1, 4)    4
('b', 2, 1)    5
('b', 3, 3)    6

The select_index method works with tuples by allowing None to act as a wildcard for matching: a None in any position of the compare tuple matches every value in that position.



In [67]:

    
compare = ('a', None, None)
df.select_index(compare)









    Out[67]:





[True, True, True, False, False, False]



In [68]:

    
compare = ('a', None, 3)
df.select_index(compare, 'boolean')









    Out[68]:





[True, False, True, False, False, False]



In [69]:

    
# passing 'value' returns the matching index values themselves instead of a boolean list
compare = (None, 2, None)
df.select_index(compare, 'value')









    Out[69]:





[('a', 2, 3), ('b', 2, 1)]



In [70]:

    
compare = (None, None, 3)
df.select_index(compare, 'value')









    Out[70]:





[('a', 1, 3), ('a', 2, 3), ('b', 3, 3)]



In [71]:

    
compare = (None, None, None)
df.select_index(compare)









    Out[71]:





[True, True, True, True, True, True]

Reset Index



In [72]:

    
df = rc.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, columns=['a', 'b'])
print(df)









    



  index    a    b
-------  ---  ---
      0    1    4
      1    2    5
      2    3    6



In [73]:

    
# reset_index works in place: the old index values are kept as a new column (named 'index_0' here)
# and the index becomes 0, 1, 2, ...
df.reset_index()
df









    Out[73]:





object id: 2305960012584
columns:
['a', 'b', 'index_0']
data:
[[1, 2, 3], [4, 5, 6], [0, 1, 2]]
index:
[0, 1, 2]



In [74]:

    
df = rc.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, columns=['a', 'b'], index=['x', 'y', 'z'], index_name='jelo')
print(df)









    



jelo      a    b
------  ---  ---
x         1    4
y         2    5
z         3    6



In [75]:

    
df.reset_index()
print(df)









    



  index    a    b  jelo
-------  ---  ---  ------
      0    1    4  x
      1    2    5  y
      2    3    6  z



In [76]:

    
df = rc.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, columns=['a', 'b'],
                   index=[('a', 10, 'x'), ('b', 11, 'y'), ('c', 12, 'z')], index_name=('melo', 'helo', 'gelo'))
print(df)









    



('melo', 'helo', 'gelo')      a    b
--------------------------  ---  ---
('a', 10, 'x')                1    4
('b', 11, 'y')                2    5
('c', 12, 'z')                3    6



In [77]:

    
df.reset_index()
print(df)









    



  index    a    b  melo      helo  gelo
-------  ---  ---  ------  ------  ------
      0    1    4  a           10  x
      1    2    5  b           11  y
      2    3    6  c           12  z



In [78]:

    
df = rc.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, columns=['a', 'b'], index=['x', 'y', 'z'], index_name='jelo')
print(df)









    



jelo      a    b
------  ---  ---
x         1    4
y         2    5
z         3    6



In [79]:

    
# with drop=True the old index is discarded rather than kept as a column
df.reset_index(drop=True)
print(df)









    



  index    a    b
-------  ---  ---
      0    1    4
      1    2    5
      2    3    6

Iterators



In [80]:

    
df = rc.DataFrame({'a': [1, 2, 'c'], 'b': [5, 6, 'd']}, index=[1, 2, 3])



In [81]:

    
# iterrows yields one dict per row, holding the index value and every column value
for row in df.iterrows():
    print(row)









    



{'index': 1, 'a': 1, 'b': 5}
{'index': 2, 'a': 2, 'b': 6}
{'index': 3, 'a': 'c', 'b': 'd'}



In [82]:

    
# itertuples yields one tuple-like object per row (printed as Raccoon(...)) with the index and
# columns as named fields
for row in df.itertuples():
    print(row)









    



Raccoon(index=1, a=1, b=5)
Raccoon(index=2, a=2, b=6)
Raccoon(index=3, a='c', b='d')

Sorted DataFrames

DataFrames will be set to sorted by default if no index is given at initialization. If an index is given at initialization then the parameter sort must be set to True to get a sorted DataFrame.



In [83]:

    
df = rc.DataFrame({'a': [3, 5, 4], 'b': [6, 8, 7]}, index=[12, 15, 14], sort=True)

When sort=True on initialization the data will be sorted by index to start



In [84]:

    
df.print()









    



  index    a    b
-------  ---  ---
     12    3    6
     14    4    7
     15    5    8



In [85]:

    
df[16, 'b'] = 9
print(df)









    



  index    a    b
-------  ---  ---
     12    3    6
     14    4    7
     15    5    8
     16         9



In [86]:

    
# on a sorted DataFrame a new index value is inserted at its sorted position (13 lands between
# 12 and 14) rather than appended at the end; values can be given as a dict keyed by column
df.set(indexes=13, values={'a': 3.5, 'b': 6.5})
print(df)









    



  index    a    b
-------  ---  ---
     12  3    6
     13  3.5  6.5
     14  4    7
     15  5    8
     16       9