In [47]:
import pandas as pd
import numpy as np
from io import StringIO

# Small whitespace-separated sample table for the pivot experiments.
# The header names six columns (A-F); every data row begins with an extra,
# unnamed integer label. Only the first row supplies a value for F (200).
RAW_TABLE = """
A B C D E F
0 foo one small 1 1 200
1 foo one large 2 1
2 foo one large 2 1
5 bar one large 4 1
6 bar one small 5 1
7 bar two small 6 1
8 bar two large 7 1
9 foo th1 small 3 1
10 foo th2 small 3 1
11 foo th3 small 3 1
12 foo th4 small 3 1
"""
data = StringIO(RAW_TABLE)
# Fields are split on runs of whitespace.
df = pd.read_csv(data, sep=r'\s+')
df
Out[47]:
In [60]:
# Pivot specification read by the cells below.
# (Earlier experiments also tried None for index, columns and values.)
index, columns = ['A', 'B'], ['C', 'D']
values = ['E']
aggfunc = np.mean
In [61]:
from scipy.sparse import csr_matrix
from pandas.core.groupby import Grouper
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.index import Index
from pandas.core.dtypes.common import is_scalar
def _convert_by(by):
if by is None:
by = []
elif (is_scalar(by) or
isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) or
hasattr(by, '__call__')):
by = [by]
else:
by = list(by)
return by
# Build a pivot table of `df` as a SparseDataFrame (`pt`), one column block per
# aggregated value column.
#
# Key normalisation follows pandas' own pivot implementation:
# https://github.com/pandas-dev/pandas/blob/master/pandas/core/reshape/pivot.py
#
# NOTE(review): pd.SparseDataFrame / pd.SparseSeries were removed in pandas 1.0,
# so this cell only runs on pandas < 1.0.
index = _convert_by(index)
columns = _convert_by(columns)
keys = index + columns
grouped = df.groupby(keys)

# Normalise `aggfunc` into a {value column: function} mapping.
if not isinstance(aggfunc, dict):
    if values is None:
        # Default to every column that is not itself a grouping key.
        values = df.columns
        for key in keys:
            try:
                values = values.drop(key)
            except (TypeError, ValueError, KeyError):
                # The key is not a droppable column label; nothing to remove.
                pass
        values = list(values)
    if not isinstance(values, list):
        values = [values]
    aggfunc = {value: aggfunc for value in values}

index_len = len(index)

# final pivot_table
pt = []
for value, func in aggfunc.items():
    agged = grouped.agg({value: func})

    # Take the group keys from the aggregated frame itself so that each key is
    # guaranteed to line up with its aggregated value.
    if agged.index.nlevels > 1:
        key_tuples = list(agged.index)
    else:
        # A single grouping key yields scalar labels; wrap them so they can be
        # sliced like the multi-key tuples.
        key_tuples = [(k,) for k in agged.index]
    index_keys = [k[:index_len] for k in key_tuples]
    col_keys = [k[index_len:] for k in key_tuples]

    # Assign each distinct row / column key a position in first-seen order.
    index_dict, index_set = {}, []
    for k in index_keys:
        if k not in index_dict:
            index_dict[k] = len(index_set)
            index_set.append(k)
    col_dict, col_set = {}, []
    for k in col_keys:
        if k not in col_dict:
            col_dict[k] = len(col_set)
            col_set.append(k)

    index_coord = [index_dict[k] for k in index_keys]
    col_coord = [col_dict[k] for k in col_keys]
    data_values = agged[value].tolist()

    # use data_values, (index_coord, col_coord) to build csr_matrix
    sparse_matrix = csr_matrix((data_values, (index_coord, col_coord)),
                               shape=(len(index_set), len(col_set)))

    # Expand each CSR row into a NaN-filled array and write back only the
    # stored entries. Going through toarray() would turn every missing
    # (index, column) combination into 0.0, which the NaN fill value would
    # then keep as if it were real data.
    n_rows, n_cols = sparse_matrix.shape
    indptr = sparse_matrix.indptr
    indices = sparse_matrix.indices
    stored = sparse_matrix.data
    rows = []
    for i in range(n_rows):
        lo, hi = indptr[i], indptr[i + 1]
        dense = np.full(n_cols, np.nan)
        dense[indices[lo:hi]] = stored[lo:hi]
        rows.append(pd.SparseSeries(dense, fill_value=np.nan))
    sparse_df = pd.SparseDataFrame(rows, default_fill_value=np.nan)

    # prepare index for pivot_table
    # Prefix each column key with the value name. `(value,)` keeps the name as
    # one level; `tuple(value)` would split a string name into characters.
    col_idx_data = [(value,) + c for c in col_set]
    col_idx = pd.MultiIndex.from_arrays(list(zip(*col_idx_data)), names=[None] + columns)
    index_idx = pd.MultiIndex.from_arrays(list(zip(*index_set)), names=index)
    sparse_df.columns = col_idx
    sparse_df.index = index_idx
    pt.append(sparse_df)

# Join the per-value blocks side by side and sort the column labels.
pt = pd.concat(pt, axis=1).sort_index(axis=1)
In [62]:
pt
Out[62]:
In [54]:
In [43]:
# pandas' built-in pivot over the same index / columns keys, with its default
# values and aggregation -- presumably the reference to compare `pt` against.
temp = df.pivot_table(index=index, columns=columns)
temp
Out[43]:
In [ ]:
In [ ]:
In [ ]:
In [112]:
from pandas import MultiIndex

# Step-by-step version of the dense pivot: group, aggregate, unstack the column
# keys, then reindex both axes to the full cartesian product of their levels.
keys = index + columns
grouped = df.groupby(keys)
agged = grouped.agg(aggfunc)

# Debug output: group membership, the aggregated frame, then each
# (key, sub-frame) pair. Uses the public GroupBy API rather than the private
# `grouped.grouper` object.
print(grouped.groups)
print(agged)
for i in grouped:
    print(i)

table = agged
if table.index.nlevels > 1:
    # Related GH #17123
    # If index_names are integers, determine whether the integers refer
    # to the level position or name.
    index_names = agged.index.names[:len(index)]
    to_unstack = []
    for i in range(len(index), len(keys)):
        name = agged.index.names[i]
        if name is None or name in index_names:
            # Unnamed or ambiguous level: refer to it by position.
            to_unstack.append(i)
        else:
            to_unstack.append(name)
    table = agged.unstack(to_unstack)

# Expand each multi-level axis to every combination of its level values.
# Explicit isinstance checks replace the former `try / except AttributeError`,
# which also swallowed the AttributeError raised by `reindex_axis` on pandas
# versions that no longer have it, silently skipping the reindex.
if isinstance(table.index, MultiIndex):
    m = MultiIndex.from_product(table.index.levels,
                                names=table.index.names)
    table = table.reindex(index=m)

# A Series has no `columns`; a single-level columns index needs no expansion.
table_columns = getattr(table, 'columns', None)
if isinstance(table_columns, MultiIndex):
    m = MultiIndex.from_product(table_columns.levels,
                                names=table_columns.names)
    table = table.reindex(columns=m)
table
Out[112]:
In [117]:
# Experiment: aggregate per group, run csr_matrix over the aggregated frame via
# DataFrame.apply, and hand the result to SparseDataFrame.
agg_by_key = df.groupby(keys).agg(aggfunc)
temp = agg_by_key.apply(csr_matrix)
pd.SparseDataFrame(temp)
In [97]:
# Scratch check: iterating a dict yields its keys.
temp = {'A': 1, 'B': 2}
for i in temp:
    print(i)
In [98]:
# Transpose the (key, value) pairs of `temp`: x receives the keys, y the values.
x, y = zip(*temp.items())
In [99]:
# Display x, the first tuple produced by the zip-unpack of `temp.items()`.
x
Out[99]:
In [100]:
# Display y, the second tuple produced by the zip-unpack of `temp.items()`.
y
Out[100]:
In [ ]: