Chapter_07_Part_03



In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

String manipulation

String object methods


In [2]:
val = 'a,b,  guido'
val.split(',')


Out[2]:
['a', 'b', '  guido']

In [3]:
pieces = [x.strip() for x in val.split(',')]
pieces


Out[3]:
['a', 'b', 'guido']

In [4]:
first, second, third = pieces
first + '::' + second + '::' + third


Out[4]:
'a::b::guido'

In [5]:
'::'.join(pieces)


Out[5]:
'a::b::guido'

In [6]:
'guido' in val


Out[6]:
True

In [7]:
val.index(',')


Out[7]:
1

In [8]:
val.find(':')


Out[8]:
-1

In [9]:
val.index(':')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-280f8b2856ce> in <module>()
----> 1 val.index(':')

ValueError: substring not found

In [10]:
val.count(',')


Out[10]:
2

In [11]:
val.replace(',', '::')


Out[11]:
'a::b::  guido'

In [12]:
val.replace(',', '')


Out[12]:
'ab  guido'

Regular expressions


In [14]:
import re
text = "foo    bar\t baz  \tqux"
re.split('\s+', text)


Out[14]:
['foo', 'bar', 'baz', 'qux']

In [15]:
regex = re.compile('\s+')
regex.split(text)


Out[15]:
['foo', 'bar', 'baz', 'qux']

In [16]:
regex.findall(text)


Out[16]:
['    ', '\t ', '  \t']

In [18]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)


Out[18]:
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [19]:
m = regex.search(text)
m


Out[19]:
<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [21]:
text[m.start():m.end()]


Out[21]:
'dave@google.com'

In [23]:
print(regex.match(text))


None

In [24]:
print(regex.sub('REDACTED', text))


Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


In [25]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()


Out[25]:
('wesm', 'bright', 'net')

In [26]:
regex.findall(text)


Out[26]:
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [27]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))


Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


In [28]:
regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()


Out[28]:
{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

Vectorized string functions in pandas


In [29]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
data


Out[29]:
Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

In [30]:
data.isnull()


Out[30]:
Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool

In [31]:
data.str.contains('gmail')


Out[31]:
Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [32]:
pattern


Out[32]:
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [34]:
data.str.findall(pattern, flags=re.IGNORECASE)


Out[34]:
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

In [35]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches


/Users/alexkirnas/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
  if __name__ == '__main__':
Out[35]:
Dave     (dave, google, com)
Rob        (rob, gmail, com)
Steve    (steve, gmail, com)
Wes                      NaN
dtype: object

In [36]:
matches.str.get(1)


Out[36]:
Dave     google
Rob       gmail
Steve     gmail
Wes         NaN
dtype: object

In [37]:
matches.str[0]


Out[37]:
Dave      dave
Rob        rob
Steve    steve
Wes        NaN
dtype: object

In [38]:
data.str[:5]


Out[38]:
Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object

Example: USDA Food Database


In [39]:
import json
db = json.load(open('foods-2011-10-03.json'))
len(db)


Out[39]:
6636

In [40]:
db[0].keys()


Out[40]:
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

In [41]:
db[0]['nutrients'][0]


Out[41]:
{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}

In [42]:
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]


Out[42]:
description group units value
0 Protein Composition g 25.18
1 Total lipid (fat) Composition g 29.20
2 Carbohydrate, by difference Composition g 3.06
3 Ash Other g 3.28
4 Energy Energy kcal 376.00
5 Water Composition g 39.28
6 Energy Energy kJ 1573.00

In [43]:
info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
info


Out[43]:
description group id manufacturer
0 Cheese, caraway Dairy and Egg Products 1008
1 Cheese, cheddar Dairy and Egg Products 1009
2 Cheese, edam Dairy and Egg Products 1018
3 Cheese, feta Dairy and Egg Products 1019
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
5 Cheese, mozzarella, part skim milk, low moisture Dairy and Egg Products 1029
6 Cheese, romano Dairy and Egg Products 1038
7 Cheese, roquefort Dairy and Egg Products 1039
8 Cheese spread, pasteurized process, american, ... Dairy and Egg Products 1048
9 Cream, fluid, half and half Dairy and Egg Products 1049
10 Sour dressing, non-butterfat, cultured, filled... Dairy and Egg Products 1058
11 Milk, filled, fluid, with blend of hydrogenate... Dairy and Egg Products 1059
12 Cream substitute, liquid, with lauric acid oil... Dairy and Egg Products 1068
13 Cream substitute, powdered Dairy and Egg Products 1069
14 Milk, producer, fluid, 3.7% milkfat Dairy and Egg Products 1078
15 Milk, reduced fat, fluid, 2% milkfat, with add... Dairy and Egg Products 1079 None
16 Milk, reduced fat, fluid, 2% milkfat, with add... Dairy and Egg Products 1080
17 Milk, reduced fat, fluid, 2% milkfat, protein ... Dairy and Egg Products 1081
18 Milk, lowfat, fluid, 1% milkfat, with added vi... Dairy and Egg Products 1082
19 Milk, lowfat, fluid, 1% milkfat, with added no... Dairy and Egg Products 1083
20 Milk, lowfat, fluid, 1% milkfat, protein forti... Dairy and Egg Products 1084
21 Milk, nonfat, fluid, with added vitamin A and ... Dairy and Egg Products 1085
22 Milk, nonfat, fluid, with added nonfat milk so... Dairy and Egg Products 1086
23 Milk, nonfat, fluid, protein fortified, with a... Dairy and Egg Products 1087
24 Milk, buttermilk, fluid, cultured, lowfat Dairy and Egg Products 1088
25 Milk, low sodium, fluid Dairy and Egg Products 1089
26 Milk, dry, whole, with added vitamin D Dairy and Egg Products 1090
27 Milk, dry, nonfat, regular, without added vita... Dairy and Egg Products 1091
28 Milk, dry, nonfat, instant, with added vitamin... Dairy and Egg Products 1092
29 Milk, dry, nonfat, calcium reduced Dairy and Egg Products 1093
... ... ... ... ...
6606 Beef, tenderloin, steak, separable lean only, ... Beef Products 23628
6607 Beef, top sirloin, steak, separable lean only,... Beef Products 23629
6608 Beef, short loin, top loin, steak, separable l... Beef Products 23630
6609 Beef, chuck, arm pot roast, separable lean onl... Beef Products 23631
6610 Beef, brisket, flat half, separable lean only,... Beef Products 23632
6611 Beef, chuck, arm pot roast, separable lean onl... Beef Products 23633
6612 Beef, brisket, flat half, separable lean only,... Beef Products 23634
6613 Beef, round, eye of round, roast, separable le... Beef Products 23635
6614 Beef, round, top round, steak, separable lean ... Beef Products 23636
6615 Beef, round, bottom round, roast, separable le... Beef Products 23637
6616 Beef, rib, small end (ribs 10-12), separable l... Beef Products 23638
6617 CAMPBELL Soup Company, CAMPBELL'S Red and Whit... Soups, Sauces, and Gravies 27015 Campbell Soup Co.
6618 CAMPBELL Soup Company, CAMPBELL's Red and Whit... Soups, Sauces, and Gravies 27016 Campbell Soup Co.
6619 CAMPBELL Soup Company, CAMPBELL'S SELECT Soups... Soups, Sauces, and Gravies 27021 Campbell Soup Co.
6620 CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND... Soups, Sauces, and Gravies 27022 Campbell Soup Co.
6621 CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND... Soups, Sauces, and Gravies 27023 Campbell Soup Co.
6622 CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ... Soups, Sauces, and Gravies 27024 Campbell Soup Co.
6623 CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ... Soups, Sauces, and Gravies 27025 Campbell Soup Co.
6624 CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ... Soups, Sauces, and Gravies 27026 Campbell Soup Co.
6625 CAMPBELL Soup Company, CAMPBELL'S Red and Whit... Soups, Sauces, and Gravies 27032 Campbell Soup Co.
6626 CAMPBELL Soup Company, V8 Vegetable Juice, Ess... Vegetables and Vegetable Products 31010 Campbell Soup Co.
6627 CAMPBELL Soup Company, V8 Vegetable Juice, Spi... Vegetables and Vegetable Products 31013 Campbell Soup Co.
6628 CAMPBELL Soup Company, PACE, Jalapenos Nacho S... Vegetables and Vegetable Products 31014 Campbell Soup Co.
6629 CAMPBELL Soup Company, V8 60% Vegetable Juice,... Vegetables and Vegetable Products 31016 Campbell Soup Co.
6630 CAMPBELL Soup Company, V8 Vegetable Juice, Low... Vegetables and Vegetable Products 31017 Campbell Soup Co.
6631 Bologna, beef, low fat Sausages and Luncheon Meats 42161
6632 Turkey and pork sausage, fresh, bulk, patty or... Sausages and Luncheon Meats 42173
6633 Babyfood, juice, pear Baby Foods 43408 None
6634 Babyfood, dessert, banana yogurt, strained Baby Foods 43539 None
6635 Babyfood, banana no tapioca, strained Baby Foods 43546 None

6636 rows × 4 columns


In [44]:
pd.value_counts(info.group)[:10]


Out[44]:
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Pork Products                        328
Fruits and Fruit Juices              328
Name: group, dtype: int64

In [45]:
nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)
nutrients


Out[45]:
description group units value id
0 Protein Composition g 25.180 1008
1 Total lipid (fat) Composition g 29.200 1008
2 Carbohydrate, by difference Composition g 3.060 1008
3 Ash Other g 3.280 1008
4 Energy Energy kcal 376.000 1008
5 Water Composition g 39.280 1008
6 Energy Energy kJ 1573.000 1008
7 Fiber, total dietary Composition g 0.000 1008
8 Calcium, Ca Elements mg 673.000 1008
9 Iron, Fe Elements mg 0.640 1008
10 Magnesium, Mg Elements mg 22.000 1008
11 Phosphorus, P Elements mg 490.000 1008
12 Potassium, K Elements mg 93.000 1008
13 Sodium, Na Elements mg 690.000 1008
14 Zinc, Zn Elements mg 2.940 1008
15 Copper, Cu Elements mg 0.024 1008
16 Manganese, Mn Elements mg 0.021 1008
17 Selenium, Se Elements mcg 14.500 1008
18 Vitamin A, IU Vitamins IU 1054.000 1008
19 Retinol Vitamins mcg 262.000 1008
20 Vitamin A, RAE Vitamins mcg_RAE 271.000 1008
21 Vitamin C, total ascorbic acid Vitamins mg 0.000 1008
22 Thiamin Vitamins mg 0.031 1008
23 Riboflavin Vitamins mg 0.450 1008
24 Niacin Vitamins mg 0.180 1008
25 Pantothenic acid Vitamins mg 0.190 1008
26 Vitamin B-6 Vitamins mg 0.074 1008
27 Folate, total Vitamins mcg 18.000 1008
28 Vitamin B-12 Vitamins mcg 0.270 1008
29 Folic acid Vitamins mcg 0.000 1008
... ... ... ... ... ...
389325 Selenium, Se Elements mcg 1.100 43546
389326 Vitamin A, IU Vitamins IU 5.000 43546
389327 Retinol Vitamins mcg 0.000 43546
389328 Vitamin A, RAE Vitamins mcg_RAE 0.000 43546
389329 Carotene, beta Vitamins mcg 2.000 43546
389330 Carotene, alpha Vitamins mcg 2.000 43546
389331 Vitamin E (alpha-tocopherol) Vitamins mg 0.250 43546
389332 Vitamin D Vitamins IU 0.000 43546
389333 Vitamin D (D2 + D3) Vitamins mcg 0.000 43546
389334 Cryptoxanthin, beta Vitamins mcg 0.000 43546
389335 Lycopene Vitamins mcg 0.000 43546
389336 Lutein + zeaxanthin Vitamins mcg 20.000 43546
389337 Vitamin C, total ascorbic acid Vitamins mg 21.900 43546
389338 Thiamin Vitamins mg 0.020 43546
389339 Riboflavin Vitamins mg 0.060 43546
389340 Niacin Vitamins mg 0.540 43546
389341 Vitamin B-6 Vitamins mg 0.260 43546
389342 Folate, total Vitamins mcg 17.000 43546
389343 Vitamin B-12 Vitamins mcg 0.000 43546
389344 Choline, total Vitamins mg 4.100 43546
389345 Vitamin K (phylloquinone) Vitamins mcg 0.500 43546
389346 Folic acid Vitamins mcg 0.000 43546
389347 Folate, food Vitamins mcg 17.000 43546
389348 Folate, DFE Vitamins mcg_DFE 17.000 43546
389349 Vitamin E, added Vitamins mg 0.000 43546
389350 Vitamin B-12, added Vitamins mcg 0.000 43546
389351 Cholesterol Other mg 0.000 43546
389352 Fatty acids, total saturated Other g 0.072 43546
389353 Fatty acids, total monounsaturated Other g 0.028 43546
389354 Fatty acids, total polyunsaturated Other g 0.041 43546

389355 rows × 5 columns


In [46]:
nutrients.duplicated().sum()


Out[46]:
14179

In [47]:
nutrients = nutrients.drop_duplicates()

In [48]:
col_mapping = {'description' : 'food',
               'group'       : 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info


Out[48]:
food fgroup id manufacturer
0 Cheese, caraway Dairy and Egg Products 1008
1 Cheese, cheddar Dairy and Egg Products 1009
2 Cheese, edam Dairy and Egg Products 1018
3 Cheese, feta Dairy and Egg Products 1019
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
5 Cheese, mozzarella, part skim milk, low moisture Dairy and Egg Products 1029
6 Cheese, romano Dairy and Egg Products 1038
7 Cheese, roquefort Dairy and Egg Products 1039
8 Cheese spread, pasteurized process, american, ... Dairy and Egg Products 1048
9 Cream, fluid, half and half Dairy and Egg Products 1049
10 Sour dressing, non-butterfat, cultured, filled... Dairy and Egg Products 1058
11 Milk, filled, fluid, with blend of hydrogenate... Dairy and Egg Products 1059
12 Cream substitute, liquid, with lauric acid oil... Dairy and Egg Products 1068
13 Cream substitute, powdered Dairy and Egg Products 1069
14 Milk, producer, fluid, 3.7% milkfat Dairy and Egg Products 1078
15 Milk, reduced fat, fluid, 2% milkfat, with add... Dairy and Egg Products 1079 None
16 Milk, reduced fat, fluid, 2% milkfat, with add... Dairy and Egg Products 1080
17 Milk, reduced fat, fluid, 2% milkfat, protein ... Dairy and Egg Products 1081
18 Milk, lowfat, fluid, 1% milkfat, with added vi... Dairy and Egg Products 1082
19 Milk, lowfat, fluid, 1% milkfat, with added no... Dairy and Egg Products 1083
20 Milk, lowfat, fluid, 1% milkfat, protein forti... Dairy and Egg Products 1084
21 Milk, nonfat, fluid, with added vitamin A and ... Dairy and Egg Products 1085
22 Milk, nonfat, fluid, with added nonfat milk so... Dairy and Egg Products 1086
23 Milk, nonfat, fluid, protein fortified, with a... Dairy and Egg Products 1087
24 Milk, buttermilk, fluid, cultured, lowfat Dairy and Egg Products 1088
25 Milk, low sodium, fluid Dairy and Egg Products 1089
26 Milk, dry, whole, with added vitamin D Dairy and Egg Products 1090
27 Milk, dry, nonfat, regular, without added vita... Dairy and Egg Products 1091
28 Milk, dry, nonfat, instant, with added vitamin... Dairy and Egg Products 1092
29 Milk, dry, nonfat, calcium reduced Dairy and Egg Products 1093
... ... ... ... ...
6606 Beef, tenderloin, steak, separable lean only, ... Beef Products 23628
6607 Beef, top sirloin, steak, separable lean only,... Beef Products 23629
6608 Beef, short loin, top loin, steak, separable l... Beef Products 23630
6609 Beef, chuck, arm pot roast, separable lean onl... Beef Products 23631
6610 Beef, brisket, flat half, separable lean only,... Beef Products 23632
6611 Beef, chuck, arm pot roast, separable lean onl... Beef Products 23633
6612 Beef, brisket, flat half, separable lean only,... Beef Products 23634
6613 Beef, round, eye of round, roast, separable le... Beef Products 23635
6614 Beef, round, top round, steak, separable lean ... Beef Products 23636
6615 Beef, round, bottom round, roast, separable le... Beef Products 23637
6616 Beef, rib, small end (ribs 10-12), separable l... Beef Products 23638
6617 CAMPBELL Soup Company, CAMPBELL'S Red and Whit... Soups, Sauces, and Gravies 27015 Campbell Soup Co.
6618 CAMPBELL Soup Company, CAMPBELL's Red and Whit... Soups, Sauces, and Gravies 27016 Campbell Soup Co.
6619 CAMPBELL Soup Company, CAMPBELL'S SELECT Soups... Soups, Sauces, and Gravies 27021 Campbell Soup Co.
6620 CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND... Soups, Sauces, and Gravies 27022 Campbell Soup Co.
6621 CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND... Soups, Sauces, and Gravies 27023 Campbell Soup Co.
6622 CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ... Soups, Sauces, and Gravies 27024 Campbell Soup Co.
6623 CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ... Soups, Sauces, and Gravies 27025 Campbell Soup Co.
6624 CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ... Soups, Sauces, and Gravies 27026 Campbell Soup Co.
6625 CAMPBELL Soup Company, CAMPBELL'S Red and Whit... Soups, Sauces, and Gravies 27032 Campbell Soup Co.
6626 CAMPBELL Soup Company, V8 Vegetable Juice, Ess... Vegetables and Vegetable Products 31010 Campbell Soup Co.
6627 CAMPBELL Soup Company, V8 Vegetable Juice, Spi... Vegetables and Vegetable Products 31013 Campbell Soup Co.
6628 CAMPBELL Soup Company, PACE, Jalapenos Nacho S... Vegetables and Vegetable Products 31014 Campbell Soup Co.
6629 CAMPBELL Soup Company, V8 60% Vegetable Juice,... Vegetables and Vegetable Products 31016 Campbell Soup Co.
6630 CAMPBELL Soup Company, V8 Vegetable Juice, Low... Vegetables and Vegetable Products 31017 Campbell Soup Co.
6631 Bologna, beef, low fat Sausages and Luncheon Meats 42161
6632 Turkey and pork sausage, fresh, bulk, patty or... Sausages and Luncheon Meats 42173
6633 Babyfood, juice, pear Baby Foods 43408 None
6634 Babyfood, dessert, banana yogurt, strained Baby Foods 43539 None
6635 Babyfood, banana no tapioca, strained Baby Foods 43546 None

6636 rows × 4 columns


In [49]:
col_mapping = {'description' : 'nutrient',
               'group' : 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients


Out[49]:
nutrient nutgroup units value id
0 Protein Composition g 25.180 1008
1 Total lipid (fat) Composition g 29.200 1008
2 Carbohydrate, by difference Composition g 3.060 1008
3 Ash Other g 3.280 1008
4 Energy Energy kcal 376.000 1008
5 Water Composition g 39.280 1008
6 Energy Energy kJ 1573.000 1008
7 Fiber, total dietary Composition g 0.000 1008
8 Calcium, Ca Elements mg 673.000 1008
9 Iron, Fe Elements mg 0.640 1008
10 Magnesium, Mg Elements mg 22.000 1008
11 Phosphorus, P Elements mg 490.000 1008
12 Potassium, K Elements mg 93.000 1008
13 Sodium, Na Elements mg 690.000 1008
14 Zinc, Zn Elements mg 2.940 1008
15 Copper, Cu Elements mg 0.024 1008
16 Manganese, Mn Elements mg 0.021 1008
17 Selenium, Se Elements mcg 14.500 1008
18 Vitamin A, IU Vitamins IU 1054.000 1008
19 Retinol Vitamins mcg 262.000 1008
20 Vitamin A, RAE Vitamins mcg_RAE 271.000 1008
21 Vitamin C, total ascorbic acid Vitamins mg 0.000 1008
22 Thiamin Vitamins mg 0.031 1008
23 Riboflavin Vitamins mg 0.450 1008
24 Niacin Vitamins mg 0.180 1008
25 Pantothenic acid Vitamins mg 0.190 1008
26 Vitamin B-6 Vitamins mg 0.074 1008
27 Folate, total Vitamins mcg 18.000 1008
28 Vitamin B-12 Vitamins mcg 0.270 1008
29 Folic acid Vitamins mcg 0.000 1008
... ... ... ... ... ...
389325 Selenium, Se Elements mcg 1.100 43546
389326 Vitamin A, IU Vitamins IU 5.000 43546
389327 Retinol Vitamins mcg 0.000 43546
389328 Vitamin A, RAE Vitamins mcg_RAE 0.000 43546
389329 Carotene, beta Vitamins mcg 2.000 43546
389330 Carotene, alpha Vitamins mcg 2.000 43546
389331 Vitamin E (alpha-tocopherol) Vitamins mg 0.250 43546
389332 Vitamin D Vitamins IU 0.000 43546
389333 Vitamin D (D2 + D3) Vitamins mcg 0.000 43546
389334 Cryptoxanthin, beta Vitamins mcg 0.000 43546
389335 Lycopene Vitamins mcg 0.000 43546
389336 Lutein + zeaxanthin Vitamins mcg 20.000 43546
389337 Vitamin C, total ascorbic acid Vitamins mg 21.900 43546
389338 Thiamin Vitamins mg 0.020 43546
389339 Riboflavin Vitamins mg 0.060 43546
389340 Niacin Vitamins mg 0.540 43546
389341 Vitamin B-6 Vitamins mg 0.260 43546
389342 Folate, total Vitamins mcg 17.000 43546
389343 Vitamin B-12 Vitamins mcg 0.000 43546
389344 Choline, total Vitamins mg 4.100 43546
389345 Vitamin K (phylloquinone) Vitamins mcg 0.500 43546
389346 Folic acid Vitamins mcg 0.000 43546
389347 Folate, food Vitamins mcg 17.000 43546
389348 Folate, DFE Vitamins mcg_DFE 17.000 43546
389349 Vitamin E, added Vitamins mg 0.000 43546
389350 Vitamin B-12, added Vitamins mcg 0.000 43546
389351 Cholesterol Other mg 0.000 43546
389352 Fatty acids, total saturated Other g 0.072 43546
389353 Fatty acids, total monounsaturated Other g 0.028 43546
389354 Fatty acids, total polyunsaturated Other g 0.041 43546

375176 rows × 5 columns


In [51]:
ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata


Out[51]:
nutrient nutgroup units value id food fgroup manufacturer
0 Protein Composition g 25.180 1008 Cheese, caraway Dairy and Egg Products
1 Total lipid (fat) Composition g 29.200 1008 Cheese, caraway Dairy and Egg Products
2 Carbohydrate, by difference Composition g 3.060 1008 Cheese, caraway Dairy and Egg Products
3 Ash Other g 3.280 1008 Cheese, caraway Dairy and Egg Products
4 Energy Energy kcal 376.000 1008 Cheese, caraway Dairy and Egg Products
5 Water Composition g 39.280 1008 Cheese, caraway Dairy and Egg Products
6 Energy Energy kJ 1573.000 1008 Cheese, caraway Dairy and Egg Products
7 Fiber, total dietary Composition g 0.000 1008 Cheese, caraway Dairy and Egg Products
8 Calcium, Ca Elements mg 673.000 1008 Cheese, caraway Dairy and Egg Products
9 Iron, Fe Elements mg 0.640 1008 Cheese, caraway Dairy and Egg Products
10 Magnesium, Mg Elements mg 22.000 1008 Cheese, caraway Dairy and Egg Products
11 Phosphorus, P Elements mg 490.000 1008 Cheese, caraway Dairy and Egg Products
12 Potassium, K Elements mg 93.000 1008 Cheese, caraway Dairy and Egg Products
13 Sodium, Na Elements mg 690.000 1008 Cheese, caraway Dairy and Egg Products
14 Zinc, Zn Elements mg 2.940 1008 Cheese, caraway Dairy and Egg Products
15 Copper, Cu Elements mg 0.024 1008 Cheese, caraway Dairy and Egg Products
16 Manganese, Mn Elements mg 0.021 1008 Cheese, caraway Dairy and Egg Products
17 Selenium, Se Elements mcg 14.500 1008 Cheese, caraway Dairy and Egg Products
18 Vitamin A, IU Vitamins IU 1054.000 1008 Cheese, caraway Dairy and Egg Products
19 Retinol Vitamins mcg 262.000 1008 Cheese, caraway Dairy and Egg Products
20 Vitamin A, RAE Vitamins mcg_RAE 271.000 1008 Cheese, caraway Dairy and Egg Products
21 Vitamin C, total ascorbic acid Vitamins mg 0.000 1008 Cheese, caraway Dairy and Egg Products
22 Thiamin Vitamins mg 0.031 1008 Cheese, caraway Dairy and Egg Products
23 Riboflavin Vitamins mg 0.450 1008 Cheese, caraway Dairy and Egg Products
24 Niacin Vitamins mg 0.180 1008 Cheese, caraway Dairy and Egg Products
25 Pantothenic acid Vitamins mg 0.190 1008 Cheese, caraway Dairy and Egg Products
26 Vitamin B-6 Vitamins mg 0.074 1008 Cheese, caraway Dairy and Egg Products
27 Folate, total Vitamins mcg 18.000 1008 Cheese, caraway Dairy and Egg Products
28 Vitamin B-12 Vitamins mcg 0.270 1008 Cheese, caraway Dairy and Egg Products
29 Folic acid Vitamins mcg 0.000 1008 Cheese, caraway Dairy and Egg Products
... ... ... ... ... ... ... ... ...
375146 Selenium, Se Elements mcg 1.100 43546 Babyfood, banana no tapioca, strained Baby Foods None
375147 Vitamin A, IU Vitamins IU 5.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375148 Retinol Vitamins mcg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375149 Vitamin A, RAE Vitamins mcg_RAE 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375150 Carotene, beta Vitamins mcg 2.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375151 Carotene, alpha Vitamins mcg 2.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375152 Vitamin E (alpha-tocopherol) Vitamins mg 0.250 43546 Babyfood, banana no tapioca, strained Baby Foods None
375153 Vitamin D Vitamins IU 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375154 Vitamin D (D2 + D3) Vitamins mcg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375155 Cryptoxanthin, beta Vitamins mcg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375156 Lycopene Vitamins mcg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375157 Lutein + zeaxanthin Vitamins mcg 20.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375158 Vitamin C, total ascorbic acid Vitamins mg 21.900 43546 Babyfood, banana no tapioca, strained Baby Foods None
375159 Thiamin Vitamins mg 0.020 43546 Babyfood, banana no tapioca, strained Baby Foods None
375160 Riboflavin Vitamins mg 0.060 43546 Babyfood, banana no tapioca, strained Baby Foods None
375161 Niacin Vitamins mg 0.540 43546 Babyfood, banana no tapioca, strained Baby Foods None
375162 Vitamin B-6 Vitamins mg 0.260 43546 Babyfood, banana no tapioca, strained Baby Foods None
375163 Folate, total Vitamins mcg 17.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375164 Vitamin B-12 Vitamins mcg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375165 Choline, total Vitamins mg 4.100 43546 Babyfood, banana no tapioca, strained Baby Foods None
375166 Vitamin K (phylloquinone) Vitamins mcg 0.500 43546 Babyfood, banana no tapioca, strained Baby Foods None
375167 Folic acid Vitamins mcg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375168 Folate, food Vitamins mcg 17.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375169 Folate, DFE Vitamins mcg_DFE 17.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375170 Vitamin E, added Vitamins mg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375171 Vitamin B-12, added Vitamins mcg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375172 Cholesterol Other mg 0.000 43546 Babyfood, banana no tapioca, strained Baby Foods None
375173 Fatty acids, total saturated Other g 0.072 43546 Babyfood, banana no tapioca, strained Baby Foods None
375174 Fatty acids, total monounsaturated Other g 0.028 43546 Babyfood, banana no tapioca, strained Baby Foods None
375175 Fatty acids, total polyunsaturated Other g 0.041 43546 Babyfood, banana no tapioca, strained Baby Foods None

375176 rows × 8 columns


In [52]:
ndata.ix[30000]


Out[52]:
nutrient                                       Glycine
nutgroup                                   Amino Acids
units                                                g
value                                             0.04
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer                                          
Name: 30000, dtype: object

In [55]:
%matplotlib inline
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].sort_values().plot(kind='barh')


Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f28cf28>

In [58]:
by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

max_foods.food = max_foods.food.str[:50]

In [59]:
max_foods.ix['Amino Acids']['food']


Out[59]:
nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object

In [ ]: