In [47]:
# from utils.pandas import read_text
import pandas as pd
import numpy as np
from io import StringIO

data = StringIO("""
    A   B   C     D  E F
0  foo one small  1  1 200
1  foo one large  2  1
2  foo one large  2  1
5  bar one large  4  1
6  bar one small  5  1
7  bar two small  6  1
8  bar two large  7  1
9  foo th1 small  3  1
10  foo th2 small  3  1
11  foo th3 small  3  1
12  foo th4 small  3  1
""")

df = pd.read_csv(data,sep=r'\s+')
df


Out[47]:
A B C D E F
0 foo one small 1 1 200.0
1 foo one large 2 1 NaN
2 foo one large 2 1 NaN
5 bar one large 4 1 NaN
6 bar one small 5 1 NaN
7 bar two small 6 1 NaN
8 bar two large 7 1 NaN
9 foo th1 small 3 1 NaN
10 foo th2 small 3 1 NaN
11 foo th3 small 3 1 NaN
12 foo th4 small 3 1 NaN

In [60]:
# columns = None
columns = ['C','D']
index=['A', 'B']
# index=None
aggfunc=np.mean
values=['E']
# values = None
# index = None

In [61]:
from scipy.sparse import csr_matrix
from pandas.core.groupby import Grouper
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.index import Index
from pandas.core.dtypes.common import is_scalar

def _convert_by(by):
    if by is None:
        by = []
    elif (is_scalar(by) or
          isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) or
          hasattr(by, '__call__')):
        by = [by]
    else:
        by = list(by)
    return by

# should use  _convert_by(by)
# https://github.com/pandas-dev/pandas/blob/master/pandas/core/reshape/pivot.py
index = _convert_by(index)
columns = _convert_by(columns)

keys = index + columns
grouped = df.groupby(keys)
keys_list = list(zip(*grouped.grouper.groups.keys()))

if not isinstance(aggfunc, dict):
    if values is None:
        values = df.columns
        for key in keys:
            try:
                values = values.drop(key)
            except (TypeError, ValueError):
                pass
        values = list(values)
    if not isinstance(values, list):
        values = [values]
    aggfunc = {value: aggfunc for value in values}


# final pivot_table
pt = []
for value, func in aggfunc.items():
    
    agged = grouped.agg({value: func})

    # determine the index, column coordinate for each data_values
    index_len = len(index)
    col_len = len(columns)

    index_zip = zip(*keys_list[:index_len])
    col_zip = zip(*keys_list[index_len:])

    index_set = []
    col_set = []
    for i in zip(*keys_list[:index_len]):
        if i not in index_set:
            index_set.append(i)
    for i in zip(*keys_list[index_len:]):
        if i not in col_set:
            col_set.append(i)

    index_dict = {name: idx for idx, name in enumerate(index_set)}
    col_dict = {name: idx for idx, name in enumerate(col_set)}

    index_coord = [index_dict[i] for i in index_zip]
    col_coord = [col_dict[i] for i in col_zip]
    data_values = agged[value].tolist()

    # use data_values, (index_coord, col_coord) to build csr_matrix
    sparse_matrix = csr_matrix((data_values, (index_coord, col_coord)),
                                 shape=(len(index_set), len(col_set)))
    
    sparse_df = pd.SparseDataFrame([
        pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=np.NaN) 
                                  for i in np.arange(sparse_matrix.shape[0])
    ], default_fill_value=np.NaN)
    
    # col_idx_data = col_set.copy()
    # if isinstance(values, list):
    #     col_idx_data = map(lambda x: tuple(values) + x, col_set)

    # prepare index for pivot_table
    col_idx_data = map(lambda x: tuple(value) + x, col_set)
    col_idx = pd.MultiIndex.from_arrays(list(zip(*col_idx_data)), names=[None] + columns)
    index_idx = pd.MultiIndex.from_arrays(list(zip(*index_set)), names=index)
    sparse_df.columns = col_idx
    sparse_df.index = index_idx
    
    pt.append(sparse_df)
    
pt = pd.concat(pt, axis=1).sort_index(axis=1)

In [62]:
pt


Out[62]:
E
C large small
D 2 4 7 1 3 5 6
A
bar 0 1 1 0 0 1 1
foo 1 0 0 1 1 0 0

In [54]:


In [43]:
temp = pd.pivot_table(df, index=index, columns=columns)

temp


Out[43]:
   C      D
E  large  2      1.0
          4      1.0
          7      1.0
   small  1      1.0
          3      1.0
          5      1.0
          6      1.0
F  large  2      NaN
          4      NaN
          7      NaN
   small  1    200.0
          3      NaN
          5      NaN
          6      NaN
dtype: float64

In [ ]:


In [ ]:


In [ ]:


In [112]:
keys = index+columns
grouped = df.groupby(keys)
agged = grouped.agg(aggfunc)

print(grouped.grouper.groups)
# print(grouped.grouper.get_group_levels())
# print(grouped.grouper.get_iterator(df))
print(agged)

for i in grouped.grouper.get_iterator(df):
    print(i)

table = agged
if table.index.nlevels > 1:
    # Related GH #17123
    # If index_names are integers, determine whether the integers refer
    # to the level position or name.
    index_names = agged.index.names[:len(index)]
    to_unstack = []
    for i in range(len(index), len(keys)):
        name = agged.index.names[i]
        if name is None or name in index_names:
            to_unstack.append(i)
        else:
            to_unstack.append(name)
    table = agged.unstack(to_unstack)

from pandas.core.reshape.util import cartesian_product
from pandas import MultiIndex
try:
    m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
                               names=table.index.names)
    table = table.reindex_axis(m, axis=0)
except AttributeError:
    pass  # it's a single level

try:
    m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
                               names=table.columns.names)
    table = table.reindex_axis(m, axis=1)
except AttributeError:
    pass  # it's a single level or a series
        
table


{('bar', 'one', 'large', 4): Int64Index([5], dtype='int64'), ('bar', 'one', 'small', 5): Int64Index([6], dtype='int64'), ('bar', 'two', 'large', 7): Int64Index([8], dtype='int64'), ('bar', 'two', 'small', 6): Int64Index([7], dtype='int64'), ('foo', 'one', 'large', 2): Int64Index([1, 2], dtype='int64'), ('foo', 'one', 'small', 1): Int64Index([0], dtype='int64'), ('foo', 'th1', 'small', 3): Int64Index([9], dtype='int64'), ('foo', 'th2', 'small', 3): Int64Index([10], dtype='int64'), ('foo', 'th3', 'small', 3): Int64Index([11], dtype='int64'), ('foo', 'th4', 'small', 3): Int64Index([12], dtype='int64')}
                 E
A   B   C     D   
bar one large 4  1
        small 5  1
    two large 7  1
        small 6  1
foo one large 2  1
        small 1  1
    th1 small 3  1
    th2 small 3  1
    th3 small 3  1
    th4 small 3  1
(('bar', 'one', 'large', 4),      A    B      C  D  E
5  bar  one  large  4  1)
(('bar', 'one', 'small', 5),      A    B      C  D  E
6  bar  one  small  5  1)
(('bar', 'two', 'large', 7),      A    B      C  D  E
8  bar  two  large  7  1)
(('bar', 'two', 'small', 6),      A    B      C  D  E
7  bar  two  small  6  1)
(('foo', 'one', 'large', 2),      A    B      C  D  E
1  foo  one  large  2  1
2  foo  one  large  2  1)
(('foo', 'one', 'small', 1),      A    B      C  D  E
0  foo  one  small  1  1)
(('foo', 'th1', 'small', 3),      A    B      C  D  E
9  foo  th1  small  3  1)
(('foo', 'th2', 'small', 3),       A    B      C  D  E
10  foo  th2  small  3  1)
(('foo', 'th3', 'small', 3),       A    B      C  D  E
11  foo  th3  small  3  1)
(('foo', 'th4', 'small', 3),       A    B      C  D  E
12  foo  th4  small  3  1)
Out[112]:
E
C large small
D 1 2 3 4 5 6 7 1 2 3 4 5 6 7
A B
bar one NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN
th1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
th2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
th3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
th4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
two NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN 1.0 NaN
foo one NaN 1.0 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN
th1 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN
th2 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN
th3 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN
th4 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN
two NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [117]:
temp = df.groupby(keys).agg(aggfunc).apply(csr_matrix)

pd.SparseDataFrame(temp)


---------------------------------------------------------------------------
UnboundLocalError                         Traceback (most recent call last)
<ipython-input-117-ea336e7a113b> in <module>()
      1 temp = df.groupby(keys).agg(aggfunc).apply(csr_matrix)
      2 
----> 3 pd.SparseDataFrame(temp)

~/anaconda/envs/python3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in __init__(self, data, index, columns, default_kind, default_fill_value, dtype, copy)
    121                 mgr = mgr.astype(dtype)
    122 
--> 123         generic.NDFrame.__init__(self, mgr)
    124 
    125     @property

UnboundLocalError: local variable 'mgr' referenced before assignment

In [97]:
temp = {'A': 1,'B': 2}

for i in temp.keys():
    print(i)


A
B

In [98]:
x, y = zip(*temp.items())

In [99]:
x


Out[99]:
('A', 'B')

In [100]:
y


Out[100]:
(1, 2)

In [ ]: