In [6]:
import pandas as pd

# Load the training set, then report its dimensions and the number of
# missing values in each column.
# A bare `describe()` call in the middle of a cell is evaluated and thrown
# away without being displayed, so it is intentionally not called here.
train_data = pd.read_csv('Data/Train_UWu5bXk.csv')

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print(train_data.shape)
print(train_data.isnull().sum())


(8523, 12)
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [7]:
# Load the test set, then report its dimensions and the number of missing
# values in each column.
# A bare `describe()` call in the middle of a cell is evaluated and thrown
# away without being displayed, so it is intentionally not called here.
test_data = pd.read_csv('Data/Test_u94Q5KV.csv')

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print(test_data.shape)
print(test_data.isnull().sum())


(5681, 11)
Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [49]:
# Frequency of each recorded outlet size and each outlet type in the
# training rows (value_counts skips missing values by default).
print(train_data['Outlet_Size'].value_counts())
print(train_data['Outlet_Type'].value_counts())

# Break the size counts down per (outlet type, outlet identifier) pair.
print(train_data.groupby(['Outlet_Type', 'Outlet_Identifier'])['Outlet_Size'].value_counts())


Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64
Outlet_Type        Outlet_Identifier  Outlet_Size
Grocery Store      OUT019             Small          528
Supermarket Type1  OUT013             High           932
                   OUT035             Small          930
                   OUT046             Small          930
                   OUT049             Medium         930
Supermarket Type2  OUT018             Medium         928
Supermarket Type3  OUT027             Medium         935
dtype: int64

In [46]:
# Keep only the rows whose Outlet_Size is missing, then count the
# Outlet_Type values observed for each outlet among those rows.
missing_size_rows = train_data[train_data['Outlet_Size'].isnull()]
print(missing_size_rows.groupby('Outlet_Identifier')['Outlet_Type'].value_counts())


Outlet_Identifier  Outlet_Type      
OUT010             Grocery Store        555
OUT017             Supermarket Type1    926
OUT045             Supermarket Type1    929
dtype: int64

In [23]:
# Small toy frame used to experiment with groupby/apply.
# Written as plain Python: no pasted interactive continuation prompts and no
# trailing semicolon, so the cell also runs outside an IPython shell.
df = pd.DataFrame({
    'animal': 'cat dog cat fish dog cat cat'.split(),
    'size': list('SSMMMLL'),
    'weight': [8, 10, 11, 1, 20, 12, 12],
    'adult': [False] * 5 + [True] * 2,
})

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print(df)
# Group the toy frame by 'animal' and hand back each group's own 'animal'
# column; left as the cell's final expression so the notebook displays it.
(
    df
    .groupby('animal')
    .apply(lambda group_frame: group_frame['animal'])
)


   adult animal size  weight
0  False    cat    S       8
1  False    dog    S      10
2  False    cat    M      11
3  False   fish    M       1
4  False    dog    M      20
5   True    cat    L      12
6   True    cat    L      12
Out[23]:
animal   
cat     0     cat
        2     cat
        5     cat
        6     cat
dog     1     dog
        4     dog
fish    3    fish
Name: animal, dtype: object