notebook.community

Edit and run



In [47]:

    
# from utils.pandas import read_text
import pandas as pd
import numpy as np
from io import StringIO

data = StringIO("""
    A   B   C     D  E F
0  foo one small  1  1 200
1  foo one large  2  1
2  foo one large  2  1
5  bar one large  4  1
6  bar one small  5  1
7  bar two small  6  1
8  bar two large  7  1
9  foo th1 small  3  1
10  foo th2 small  3  1
11  foo th3 small  3  1
12  foo th4 small  3  1
""")

df = pd.read_csv(data,sep=r'\s+')
df



In [60]:

    
# columns = None
columns = ['C','D']
index=['A', 'B']
# index=None
aggfunc=np.mean
values=['E']
# values = None
# index = None



In [61]:

    
from scipy.sparse import csr_matrix
from pandas.core.groupby import Grouper
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.index import Index
from pandas.core.dtypes.common import is_scalar

def _convert_by(by):
    if by is None:
        by = []
    elif (is_scalar(by) or
          isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) or
          hasattr(by, '__call__')):
        by = [by]
    else:
        by = list(by)
    return by

# should use  _convert_by(by)
# https://github.com/pandas-dev/pandas/blob/master/pandas/core/reshape/pivot.py
index = _convert_by(index)
columns = _convert_by(columns)

keys = index + columns
grouped = df.groupby(keys)
keys_list = list(zip(*grouped.grouper.groups.keys()))

if not isinstance(aggfunc, dict):
    if values is None:
        values = df.columns
        for key in keys:
            try:
                values = values.drop(key)
            except (TypeError, ValueError):
                pass
        values = list(values)
    if not isinstance(values, list):
        values = [values]
    aggfunc = {value: aggfunc for value in values}


# final pivot_table
pt = []
for value, func in aggfunc.items():
    
    agged = grouped.agg({value: func})

    # determine the index, column coordinate for each data_values
    index_len = len(index)
    col_len = len(columns)

    index_zip = zip(*keys_list[:index_len])
    col_zip = zip(*keys_list[index_len:])

    index_set = []
    col_set = []
    for i in zip(*keys_list[:index_len]):
        if i not in index_set:
            index_set.append(i)
    for i in zip(*keys_list[index_len:]):
        if i not in col_set:
            col_set.append(i)

    index_dict = {name: idx for idx, name in enumerate(index_set)}
    col_dict = {name: idx for idx, name in enumerate(col_set)}

    index_coord = [index_dict[i] for i in index_zip]
    col_coord = [col_dict[i] for i in col_zip]
    data_values = agged[value].tolist()

    # use data_values, (index_coord, col_coord) to build csr_matrix
    sparse_matrix = csr_matrix((data_values, (index_coord, col_coord)),
                                 shape=(len(index_set), len(col_set)))
    
    sparse_df = pd.SparseDataFrame([
        pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=np.NaN) 
                                  for i in np.arange(sparse_matrix.shape[0])
    ], default_fill_value=np.NaN)
    
    # col_idx_data = col_set.copy()
    # if isinstance(values, list):
    #     col_idx_data = map(lambda x: tuple(values) + x, col_set)

    # prepare index for pivot_table
    col_idx_data = map(lambda x: tuple(value) + x, col_set)
    col_idx = pd.MultiIndex.from_arrays(list(zip(*col_idx_data)), names=[None] + columns)
    index_idx = pd.MultiIndex.from_arrays(list(zip(*index_set)), names=index)
    sparse_df.columns = col_idx
    sparse_df.index = index_idx
    
    pt.append(sparse_df)
    
pt = pd.concat(pt, axis=1).sort_index(axis=1)



In [62]:

    
pt



In [54]:



In [43]:

    
temp = pd.pivot_table(df, index=index, columns=columns)

temp









    Out[43]:





   C      D
E  large  2      1.0
          4      1.0
          7      1.0
   small  1      1.0
          3      1.0
          5      1.0
          6      1.0
F  large  2      NaN
          4      NaN
          7      NaN
   small  1    200.0
          3      NaN
          5      NaN
          6      NaN
dtype: float64



In [ ]:



In [ ]:



In [ ]:



In [112]:

    
keys = index+columns
grouped = df.groupby(keys)
agged = grouped.agg(aggfunc)

print(grouped.grouper.groups)
# print(grouped.grouper.get_group_levels())
# print(grouped.grouper.get_iterator(df))
print(agged)

for i in grouped.grouper.get_iterator(df):
    print(i)

table = agged
if table.index.nlevels > 1:
    # Related GH #17123
    # If index_names are integers, determine whether the integers refer
    # to the level position or name.
    index_names = agged.index.names[:len(index)]
    to_unstack = []
    for i in range(len(index), len(keys)):
        name = agged.index.names[i]
        if name is None or name in index_names:
            to_unstack.append(i)
        else:
            to_unstack.append(name)
    table = agged.unstack(to_unstack)

from pandas.core.reshape.util import cartesian_product
from pandas import MultiIndex
try:
    m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
                               names=table.index.names)
    table = table.reindex_axis(m, axis=0)
except AttributeError:
    pass  # it's a single level

try:
    m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
                               names=table.columns.names)
    table = table.reindex_axis(m, axis=1)
except AttributeError:
    pass  # it's a single level or a series
        
table









    



{('bar', 'one', 'large', 4): Int64Index([5], dtype='int64'), ('bar', 'one', 'small', 5): Int64Index([6], dtype='int64'), ('bar', 'two', 'large', 7): Int64Index([8], dtype='int64'), ('bar', 'two', 'small', 6): Int64Index([7], dtype='int64'), ('foo', 'one', 'large', 2): Int64Index([1, 2], dtype='int64'), ('foo', 'one', 'small', 1): Int64Index([0], dtype='int64'), ('foo', 'th1', 'small', 3): Int64Index([9], dtype='int64'), ('foo', 'th2', 'small', 3): Int64Index([10], dtype='int64'), ('foo', 'th3', 'small', 3): Int64Index([11], dtype='int64'), ('foo', 'th4', 'small', 3): Int64Index([12], dtype='int64')}
                 E
A   B   C     D   
bar one large 4  1
        small 5  1
    two large 7  1
        small 6  1
foo one large 2  1
        small 1  1
    th1 small 3  1
    th2 small 3  1
    th3 small 3  1
    th4 small 3  1
(('bar', 'one', 'large', 4),      A    B      C  D  E
5  bar  one  large  4  1)
(('bar', 'one', 'small', 5),      A    B      C  D  E
6  bar  one  small  5  1)
(('bar', 'two', 'large', 7),      A    B      C  D  E
8  bar  two  large  7  1)
(('bar', 'two', 'small', 6),      A    B      C  D  E
7  bar  two  small  6  1)
(('foo', 'one', 'large', 2),      A    B      C  D  E
1  foo  one  large  2  1
2  foo  one  large  2  1)
(('foo', 'one', 'small', 1),      A    B      C  D  E
0  foo  one  small  1  1)
(('foo', 'th1', 'small', 3),      A    B      C  D  E
9  foo  th1  small  3  1)
(('foo', 'th2', 'small', 3),       A    B      C  D  E
10  foo  th2  small  3  1)
(('foo', 'th3', 'small', 3),       A    B      C  D  E
11  foo  th3  small  3  1)
(('foo', 'th4', 'small', 3),       A    B      C  D  E
12  foo  th4  small  3  1)






    Out[112]:







  
    
      
      
      E
    
    
      
      C
      large
      small
    
    
      
      D
      1
      2
      3
      4
      5
      6
      7
      1
      2
      3
      4
      5
      6
      7
    
    
      A
      B
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      bar
      one
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
    
    
      th1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      th2
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      th3
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      th4
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      two
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
    
    
      foo
      one
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      th1
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
    
    
      th2
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
    
    
      th3
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
    
    
      th4
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      NaN
      NaN
      NaN
      NaN
    
    
      two
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN



In [117]:

    
temp = df.groupby(keys).agg(aggfunc).apply(csr_matrix)

pd.SparseDataFrame(temp)









    



---------------------------------------------------------------------------
UnboundLocalError                         Traceback (most recent call last)
<ipython-input-117-ea336e7a113b> in <module>()
      1 temp = df.groupby(keys).agg(aggfunc).apply(csr_matrix)
      2 
----> 3 pd.SparseDataFrame(temp)

~/anaconda/envs/python3/lib/python3.6/site-packages/pandas/core/sparse/frame.py in __init__(self, data, index, columns, default_kind, default_fill_value, dtype, copy)
    121                 mgr = mgr.astype(dtype)
    122 
--> 123         generic.NDFrame.__init__(self, mgr)
    124 
    125     @property

UnboundLocalError: local variable 'mgr' referenced before assignment



In [97]:

    
temp = {'A': 1,'B': 2}

for i in temp.keys():
    print(i)

A
B



In [98]:

    
x, y = zip(*temp.items())



In [99]:

    
x









    Out[99]:





('A', 'B')



In [100]:

    
y









    Out[100]:





(1, 2)



In [ ]:

	A	B	C	D	E	F
0	foo	one	small	1	1	200.0
1	foo	one	large	2	1	NaN
2	foo	one	large	2	1	NaN
5	bar	one	large	4	1	NaN
6	bar	one	small	5	1	NaN
7	bar	two	small	6	1	NaN
8	bar	two	large	7	1	NaN
9	foo	th1	small	3	1	NaN
10	foo	th2	small	3	1	NaN
11	foo	th3	small	3	1	NaN
12	foo	th4	small	3	1	NaN