In [9]:
import pandas as pd

In [10]:
data_total = pd.read_csv('uci_retail.csv', encoding = "ISO-8859-1")

In [11]:
data_total.head(10)


Out[11]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
5 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010 8:26 7.65 17850.0 United Kingdom
6 536365 21730 GLASS STAR FROSTED T-LIGHT HOLDER 6 12/1/2010 8:26 4.25 17850.0 United Kingdom
7 536366 22633 HAND WARMER UNION JACK 6 12/1/2010 8:28 1.85 17850.0 United Kingdom
8 536366 22632 HAND WARMER RED POLKA DOT 6 12/1/2010 8:28 1.85 17850.0 United Kingdom
9 536367 84879 ASSORTED COLOUR BIRD ORNAMENT 32 12/1/2010 8:34 1.69 13047.0 United Kingdom

In [65]:
data_geo_dist = data_total.groupby('Country')['CustomerID'].count()
data_geo_dist.sort_values(ascending=False)


Out[65]:
Country
United Kingdom          361878
Germany                   9495
France                    8491
EIRE                      7485
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               1877
Portugal                  1480
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
USA                        291
Israel                     250
Unspecified                244
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58
Lebanon                     45
Lithuania                   35
Brazil                      32
Czech Republic              30
Bahrain                     17
Saudi Arabia                10
Hong Kong                    0
Name: CustomerID, dtype: int64

In [72]:
data_cust_geo = data_total[data_total['Country'].str.contains("United Kingdom")]
data_cust_geo.head(10)


Out[72]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
5 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010 8:26 7.65 17850.0 United Kingdom
6 536365 21730 GLASS STAR FROSTED T-LIGHT HOLDER 6 12/1/2010 8:26 4.25 17850.0 United Kingdom
7 536366 22633 HAND WARMER UNION JACK 6 12/1/2010 8:28 1.85 17850.0 United Kingdom
8 536366 22632 HAND WARMER RED POLKA DOT 6 12/1/2010 8:28 1.85 17850.0 United Kingdom
9 536367 84879 ASSORTED COLOUR BIRD ORNAMENT 32 12/1/2010 8:34 1.69 13047.0 United Kingdom

In [73]:
data_cust = data_cust_geo.groupby('CustomerID')['StockCode'].apply(','.join)
data_cust.head(10)


Out[73]:
CustomerID
12346.0                                          23166,23166
12747.0    85124C,85124B,84879,85062,85064,82484,21136,22...
12748.0    22854,22187,20894,85018D,21413,79149B,21682,78...
12749.0    22768,22766,22767,82482,82484,22170,22171,2245...
12820.0    21977,84992,21733,21985,22195,22196,22435,2280...
12821.0                  23068,22156,22295,22595,22335,22752
12822.0    22418,22367,23292,23290,22614,22899,23377,2266...
12823.0                        48138,48138,48138,48138,48138
12824.0    21506,22717,23541,23543,23234,22734,23274,2327...
12826.0    22086,22355,21481,20977,21479,22837,20978,2255...
Name: StockCode, dtype: object

In [75]:
data_cust_dist = data_cust_geo.groupby('CustomerID')['StockCode'].count()
data_cust_dist.sort_values(ascending=False)


Out[75]:
CustomerID
17841.0    7983
14096.0    5128
12748.0    4642
14606.0    2782
15311.0    2491
13089.0    1857
13263.0    1677
14298.0    1640
15039.0    1508
18118.0    1284
14159.0    1212
14796.0    1165
15005.0    1160
16033.0    1152
14056.0    1128
14769.0    1094
17511.0    1076
13081.0    1061
14527.0    1011
16549.0     981
14456.0     977
15719.0     938
15555.0     925
16931.0     898
17811.0     872
14505.0     803
17338.0     757
18283.0     756
17757.0     742
12921.0     741
           ... 
18141.0       1
18233.0       1
17925.0       1
18113.0       1
16148.0       1
16579.0       1
13120.0       1
14351.0       1
15389.0       1
12943.0       1
15753.0       1
13099.0       1
17846.0       1
17956.0       1
15524.0       1
16323.0       1
16078.0       1
13185.0       1
18068.0       1
13841.0       1
17752.0       1
13154.0       1
14090.0       1
14679.0       1
16881.0       1
17948.0       1
17102.0       1
17307.0       1
15195.0       1
15590.0       1
Name: StockCode, dtype: int64

In [76]:
print(data_cust_dist.count())
print(data_cust_dist.max())


3950
7983

In [ ]: