High dimensional data notebook: pandas and CSV files

As part of my demonstration of the effect of packet loss rates on TCP performance, I build a comma separated variable (CSV) file which has data rate as a function of several variables: 1 packet loss rate % 1 Network delay 1 File size 1protocol (HTTP, HTTPS, FTP)


In [6]:
import pandas as pd

In [2]:
d=dict()
d={ 'gas': { 'a': { 7: {'IPv4': 20.7, 'IPv6': -20.7},
                  8: {'IPv4': 20.8, 'IPv6': -20.8},
                  9: {'IPv4': 20.9, 'IPv6': -20.9}},
           'b': { 7: {'IPv4': 21.7, 'IPv6': -21.7},
                  8: {'IPv4': 21.8, 'IPv6': -21.8},
                  9: {'IPv4': 21.9, 'IPv6': -21.9}},
           'c': { 7: {'IPv4': 22.7, 'IPv6': -22.7},
                  8: {'IPv4': 22.8, 'IPv6': -22.8},
                  9: {'IPv4': 22.9, 'IPv6': -22.9}}},
  'liquid': { 'a': { 7: {'IPv4': 10.7, 'IPv6': -10.7},
                     8: {'IPv4': 10.8, 'IPv6': -10.8},
                     9: {'IPv4': 10.9, 'IPv6': -10.9}},
              'b': { 7: {'IPv4': 11.7, 'IPv6': -11.7},
                     8: {'IPv4': 11.8, 'IPv6': -11.8},
                     9: {'IPv4': 11.8, 'IPv6': -11.8}},
              'c': { 7: {'IPv4': 12.7, 'IPv6': -12.7},
                     8: {'IPv4': 12.8, 'IPv6': -12.8},
                     9: {'IPv4': 12.9, 'IPv6': -12.9}}},
  'solid': { 'a': { 7: {'IPv4': 0.7, 'IPv6': -0.7},
                    8: {'IPv4': 0.8, 'IPv6': -0.8},
                    9: {'IPv4': 0.9, 'IPv6': -0.9}},
             'b': { 7: {'IPv4': 1.7, 'IPv6': -1.7},
                    8: {'IPv4': 1.8, 'IPv6': -1.8},
                    9: {'IPv4': 1.8, 'IPv6': -1.8}},
             'c': { 7: {'IPv4': 2.7, 'IPv6': -2.7},
                    8: {'IPv4': 2.8, 'IPv6': -2.8},
                    9: {'IPv4': 2.9, 'IPv6': -2.9}}}}

In [3]:
import pprint
pp=pprint.PrettyPrinter(indent=2, width=80)

In [4]:
pp.pprint(d)


{ 'gas': { 'a': { 7: {'IPv4': 20.7, 'IPv6': -20.7},
                  8: {'IPv4': 20.8, 'IPv6': -20.8},
                  9: {'IPv4': 20.9, 'IPv6': -20.9}},
           'b': { 7: {'IPv4': 21.7, 'IPv6': -21.7},
                  8: {'IPv4': 21.8, 'IPv6': -21.8},
                  9: {'IPv4': 21.9, 'IPv6': -21.9}},
           'c': { 7: {'IPv4': 22.7, 'IPv6': -22.7},
                  8: {'IPv4': 22.8, 'IPv6': -22.8},
                  9: {'IPv4': 22.9, 'IPv6': -22.9}}},
  'liquid': { 'a': { 7: {'IPv4': 10.7, 'IPv6': -10.7},
                     8: {'IPv4': 10.8, 'IPv6': -10.8},
                     9: {'IPv4': 10.9, 'IPv6': -10.9}},
              'b': { 7: {'IPv4': 11.7, 'IPv6': -11.7},
                     8: {'IPv4': 11.8, 'IPv6': -11.8},
                     9: {'IPv4': 11.8, 'IPv6': -11.8}},
              'c': { 7: {'IPv4': 12.7, 'IPv6': -12.7},
                     8: {'IPv4': 12.8, 'IPv6': -12.8},
                     9: {'IPv4': 12.9, 'IPv6': -12.9}}},
  'solid': { 'a': { 7: {'IPv4': 0.7, 'IPv6': -0.7},
                    8: {'IPv4': 0.8, 'IPv6': -0.8},
                    9: {'IPv4': 0.9, 'IPv6': -0.9}},
             'b': { 7: {'IPv4': 1.7, 'IPv6': -1.7},
                    8: {'IPv4': 1.8, 'IPv6': -1.8},
                    9: {'IPv4': 1.8, 'IPv6': -1.8}},
             'c': { 7: {'IPv4': 2.7, 'IPv6': -2.7},
                    8: {'IPv4': 2.8, 'IPv6': -2.8},
                    9: {'IPv4': 2.9, 'IPv6': -2.9}}}}

In [23]:
df=pd.DataFrame.from_dict(d, orient="index").rename_axis('phase')
print(df)
print(df.names)


                                                        a  \
phase                                                       
gas     {7: {'IPv4': 20.7, 'IPv6': -20.7}, 8: {'IPv4':...   
liquid  {7: {'IPv4': 10.7, 'IPv6': -10.7}, 8: {'IPv4':...   
solid   {7: {'IPv4': 0.7, 'IPv6': -0.7}, 8: {'IPv4': 0...   

                                                        b  \
phase                                                       
gas     {7: {'IPv4': 21.7, 'IPv6': -21.7}, 8: {'IPv4':...   
liquid  {7: {'IPv4': 11.7, 'IPv6': -11.7}, 8: {'IPv4':...   
solid   {7: {'IPv4': 1.7, 'IPv6': -1.7}, 8: {'IPv4': 1...   

                                                        c  
phase                                                      
gas     {7: {'IPv4': 22.7, 'IPv6': -22.7}, 8: {'IPv4':...  
liquid  {7: {'IPv4': 12.7, 'IPv6': -12.7}, 8: {'IPv4':...  
solid   {7: {'IPv4': 2.7, 'IPv6': -2.7}, 8: {'IPv4': 2...  
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-23-72d962d65fe7> in <module>()
      1 df=pd.DataFrame.from_dict(d, orient="index").rename_axis('phase')
      2 print(df)
----> 3 print(df.names)

/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   3612             if name in self._info_axis:
   3613                 return self[name]
-> 3614             return object.__getattribute__(self, name)
   3615 
   3616     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'names'

In [30]:
df=pd.DataFrame.from_dict(d, orient="index").rename_axis('phase')
print(df)
print(40*'=')
df.rename_axis('alpha', axis=1, inplace=True)
print(df)


                                                        a  \
phase                                                       
gas     {7: {'IPv4': 20.7, 'IPv6': -20.7}, 8: {'IPv4':...   
liquid  {7: {'IPv4': 10.7, 'IPv6': -10.7}, 8: {'IPv4':...   
solid   {7: {'IPv4': 0.7, 'IPv6': -0.7}, 8: {'IPv4': 0...   

                                                        b  \
phase                                                       
gas     {7: {'IPv4': 21.7, 'IPv6': -21.7}, 8: {'IPv4':...   
liquid  {7: {'IPv4': 11.7, 'IPv6': -11.7}, 8: {'IPv4':...   
solid   {7: {'IPv4': 1.7, 'IPv6': -1.7}, 8: {'IPv4': 1...   

                                                        c  
phase                                                      
gas     {7: {'IPv4': 22.7, 'IPv6': -22.7}, 8: {'IPv4':...  
liquid  {7: {'IPv4': 12.7, 'IPv6': -12.7}, 8: {'IPv4':...  
solid   {7: {'IPv4': 2.7, 'IPv6': -2.7}, 8: {'IPv4': 2...  
========================================
alpha                                                   a  \
phase                                                       
gas     {7: {'IPv4': 20.7, 'IPv6': -20.7}, 8: {'IPv4':...   
liquid  {7: {'IPv4': 10.7, 'IPv6': -10.7}, 8: {'IPv4':...   
solid   {7: {'IPv4': 0.7, 'IPv6': -0.7}, 8: {'IPv4': 0...   

alpha                                                   b  \
phase                                                       
gas     {7: {'IPv4': 21.7, 'IPv6': -21.7}, 8: {'IPv4':...   
liquid  {7: {'IPv4': 11.7, 'IPv6': -11.7}, 8: {'IPv4':...   
solid   {7: {'IPv4': 1.7, 'IPv6': -1.7}, 8: {'IPv4': 1...   

alpha                                                   c  
phase                                                      
gas     {7: {'IPv4': 22.7, 'IPv6': -22.7}, 8: {'IPv4':...  
liquid  {7: {'IPv4': 12.7, 'IPv6': -12.7}, 8: {'IPv4':...  
solid   {7: {'IPv4': 2.7, 'IPv6': -2.7}, 8: {'IPv4': 2...  

In [34]:
# This is adapted from https://stackoverflow.com/questions/47416113/how-to-build-a-multiindex-pandas-dataframe-from-a-nested-dictionary-with-lists
d2 = {(i,j,k,l): d[i][j][k][l] 
         for i in d.keys()
              for j in d[i].keys()
                  for k in d[i][j].keys()
                      for l in d[i][j][k].keys()
     
     }

In [35]:
d2


Out[35]:
{('gas', 'a', 7, 'IPv4'): 20.7,
 ('gas', 'a', 7, 'IPv6'): -20.7,
 ('gas', 'a', 8, 'IPv4'): 20.8,
 ('gas', 'a', 8, 'IPv6'): -20.8,
 ('gas', 'a', 9, 'IPv4'): 20.9,
 ('gas', 'a', 9, 'IPv6'): -20.9,
 ('gas', 'b', 7, 'IPv4'): 21.7,
 ('gas', 'b', 7, 'IPv6'): -21.7,
 ('gas', 'b', 8, 'IPv4'): 21.8,
 ('gas', 'b', 8, 'IPv6'): -21.8,
 ('gas', 'b', 9, 'IPv4'): 21.9,
 ('gas', 'b', 9, 'IPv6'): -21.9,
 ('gas', 'c', 7, 'IPv4'): 22.7,
 ('gas', 'c', 7, 'IPv6'): -22.7,
 ('gas', 'c', 8, 'IPv4'): 22.8,
 ('gas', 'c', 8, 'IPv6'): -22.8,
 ('gas', 'c', 9, 'IPv4'): 22.9,
 ('gas', 'c', 9, 'IPv6'): -22.9,
 ('liquid', 'a', 7, 'IPv4'): 10.7,
 ('liquid', 'a', 7, 'IPv6'): -10.7,
 ('liquid', 'a', 8, 'IPv4'): 10.8,
 ('liquid', 'a', 8, 'IPv6'): -10.8,
 ('liquid', 'a', 9, 'IPv4'): 10.9,
 ('liquid', 'a', 9, 'IPv6'): -10.9,
 ('liquid', 'b', 7, 'IPv4'): 11.7,
 ('liquid', 'b', 7, 'IPv6'): -11.7,
 ('liquid', 'b', 8, 'IPv4'): 11.8,
 ('liquid', 'b', 8, 'IPv6'): -11.8,
 ('liquid', 'b', 9, 'IPv4'): 11.8,
 ('liquid', 'b', 9, 'IPv6'): -11.8,
 ('liquid', 'c', 7, 'IPv4'): 12.7,
 ('liquid', 'c', 7, 'IPv6'): -12.7,
 ('liquid', 'c', 8, 'IPv4'): 12.8,
 ('liquid', 'c', 8, 'IPv6'): -12.8,
 ('liquid', 'c', 9, 'IPv4'): 12.9,
 ('liquid', 'c', 9, 'IPv6'): -12.9,
 ('solid', 'a', 7, 'IPv4'): 0.7,
 ('solid', 'a', 7, 'IPv6'): -0.7,
 ('solid', 'a', 8, 'IPv4'): 0.8,
 ('solid', 'a', 8, 'IPv6'): -0.8,
 ('solid', 'a', 9, 'IPv4'): 0.9,
 ('solid', 'a', 9, 'IPv6'): -0.9,
 ('solid', 'b', 7, 'IPv4'): 1.7,
 ('solid', 'b', 7, 'IPv6'): -1.7,
 ('solid', 'b', 8, 'IPv4'): 1.8,
 ('solid', 'b', 8, 'IPv6'): -1.8,
 ('solid', 'b', 9, 'IPv4'): 1.8,
 ('solid', 'b', 9, 'IPv6'): -1.8,
 ('solid', 'c', 7, 'IPv4'): 2.7,
 ('solid', 'c', 7, 'IPv6'): -2.7,
 ('solid', 'c', 8, 'IPv4'): 2.8,
 ('solid', 'c', 8, 'IPv6'): -2.8,
 ('solid', 'c', 9, 'IPv4'): 2.9,
 ('solid', 'c', 9, 'IPv6'): -2.9}

In [39]:
mux = pd.MultiIndex.from_tuples(d2.keys())
df = pd.DataFrame(list(d2.values()), index=mux)
print(mux)


MultiIndex(levels=[['gas', 'liquid', 'solid'], ['a', 'b', 'c'], [7, 8, 9], ['IPv4', 'IPv6']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], [0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])

In [38]:
df


Out[38]:
0
gas a 7 IPv4 20.7
IPv6 -20.7
8 IPv4 20.8
IPv6 -20.8
9 IPv4 20.9
IPv6 -20.9
b 7 IPv4 21.7
IPv6 -21.7
8 IPv4 21.8
IPv6 -21.8
9 IPv4 21.9
IPv6 -21.9
c 7 IPv4 22.7
IPv6 -22.7
8 IPv4 22.8
IPv6 -22.8
9 IPv4 22.9
IPv6 -22.9
liquid a 7 IPv4 10.7
IPv6 -10.7
8 IPv4 10.8
IPv6 -10.8
9 IPv4 10.9
IPv6 -10.9
b 7 IPv4 11.7
IPv6 -11.7
8 IPv4 11.8
IPv6 -11.8
9 IPv4 11.8
IPv6 -11.8
c 7 IPv4 12.7
IPv6 -12.7
8 IPv4 12.8
IPv6 -12.8
9 IPv4 12.9
IPv6 -12.9
solid a 7 IPv4 0.7
IPv6 -0.7
8 IPv4 0.8
IPv6 -0.8
9 IPv4 0.9
IPv6 -0.9
b 7 IPv4 1.7
IPv6 -1.7
8 IPv4 1.8
IPv6 -1.8
9 IPv4 1.8
IPv6 -1.8
c 7 IPv4 2.7
IPv6 -2.7
8 IPv4 2.8
IPv6 -2.8
9 IPv4 2.9
IPv6 -2.9

In [46]:
# This is adapted from https://stackoverflow.com/questions/47416113/how-to-build-a-multiindex-pandas-dataframe-from-a-nested-dictionary-with-lists
d3 = {(i,j,k): d[i][j][k] 
         for i in d.keys()
              for j in d[i].keys()  
                  for k in d[i][j].keys()
     }
mux = pd.MultiIndex.from_tuples(d3.keys())
mux
df = pd.DataFrame(list(d3.values()), index=mux)
df


Out[46]:
IPv4 IPv6
gas a 7 20.7 -20.7
8 20.8 -20.8
9 20.9 -20.9
b 7 21.7 -21.7
8 21.8 -21.8
9 21.9 -21.9
c 7 22.7 -22.7
8 22.8 -22.8
9 22.9 -22.9
liquid a 7 10.7 -10.7
8 10.8 -10.8
9 10.9 -10.9
b 7 11.7 -11.7
8 11.8 -11.8
9 11.8 -11.8
c 7 12.7 -12.7
8 12.8 -12.8
9 12.9 -12.9
solid a 7 0.7 -0.7
8 0.8 -0.8
9 0.9 -0.9
b 7 1.7 -1.7
8 1.8 -1.8
9 1.8 -1.8
c 7 2.7 -2.7
8 2.8 -2.8
9 2.9 -2.9

In [60]:
# Adapted from https://stackoverflow.com/questions/50665996/pandas-read-in-multiindex-data-from-csv-file
# See also https://stackoverflow.com/questions/24519304/reading-csv-files-w-multiindex for another way to do it.
dfcsv = pd.read_csv("high_dimension_data.csv", index_col=[0,1,2,3])
dfcsv


Out[60]:
value
phase alpha idx protocol
solid a 1 ipv4 1.1
IPV6 -1.1
2 ipv4 1.2
IPV6 -1.2
3 ipv4 1.3
IPV6 -1.3
b 1 ipv4 1.6
IPV6 -1.6
2 ipv4 1.7
IPV6 -1.7
3 ipv4 1.8
IPV6 -1.8
c 1 ipv4 2.1
IPV6 -2.1
2 ipv4 2.2
IPV6 -2.2
3 ipv4 2.3
IPV6 -2.3
liquid a 1 ipv4 11.1
IPV6 -11.1
2 ipv4 11.2
IPV6 -11.2
3 ipv4 11.3
IPV6 -11.3
b 1 ipv4 11.6
IPV6 -11.6
2 ipv4 11.7
IPV6 -11.7
3 ipv4 11.8
IPV6 -11.8
c 1 ipv4 12.1
IPV6 -12.1
2 ipv4 12.2
IPV6 -12.2
3 ipv4 12.3
IPV6 -12.3
gas a 1 ipv4 21.1
IPV6 -21.1
2 ipv4 21.2
IPV6 -21.2
3 ipv4 21.3
IPV6 -21.3
b 1 ipv4 21.6
IPV6 -21.6
2 ipv4 21.7
IPV6 -21.7
3 ipv4 21.8
IPV6 -21.8
c 1 ipv4 22.1
IPV6 -22.1
2 ipv4 22.2
IPV6 -22.2
3 ipv4 22.3
IPV6 -22.3

In [2]:
lk=["solid","b",17,"IPv2"]
lk
tk=tuple(lk)
tk


Out[2]:
('solid', 'b', 17, 'IPv2')