Creating a flat file for data.world

We are reading the USDA National Nutrient Database as created by this project (https://github.com/CraigKelly/nndb-import). The original data can be found at http://www.ars.usda.gov/Services/docs.htm?docid=8964

The US RDA columns are created using data from Dietary Reference Intakes: The Essential Guide to Nutrient Requirements available from the National Academies Press at http://www.nap.edu/catalog/11537.html

Note every RDA value in the text was used. For instance, the US RDA for Iron varies significantly by age and sex, so I deemed it easier to just leave out RDA information for Iron.

And finally, every effort has been made to keep this data as clean and accurate as possible, but no medical warranty, advice, etc should be construed. This was collected by a computer scientist who might have made an error that would not be immediately apparent. Caveat emptor!

Important! The mongo access below assumes that the data was loaded in to a Docker container named mainmongo, and that this notebook is running in a Jupyter container that was linked (via --link on the command line). If you are running MongoDB and Jupyter "natively", you should be able to change 'mainmongo' to 'localhost' in the code below. Also note that you should have already installed pandas.



In [1]:

    
!pip install pymongo

import pandas as pd
import pymongo









    



Collecting pymongo
  Downloading pymongo-3.4.0-cp35-cp35m-manylinux1_x86_64.whl (359kB)
    100% |████████████████████████████████| 368kB 1.3MB/s 
Installing collected packages: pymongo
Successfully installed pymongo-3.4.0
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.



In [2]:

    
COLUMNS = [
    'ID', 'FoodGroup', 'ShortDescrip', 'Descrip', 'CommonName', 'MfgName', 'ScientificName',

    'Energy_kcal', 'Protein_g', 'Fat_g', 'Carb_g', 'Sugar_g', 'Fiber_g',

    'VitA_mcg', 'VitB6_mg', 'VitB12_mcg', 'VitC_mg', 'VitE_mg', 'Folate_mcg',
    'Niacin_mg', 'Riboflavin_mg', 'Thiamin_mg', 'Calcium_mg', 'Copper_mcg',
    'Iron_mg', 'Magnesium_mg', 'Manganese_mg', 'Phosphorus_mg', 'Selenium_mcg',
    'Zinc_mg',
]


def flat(r):
    nutr = dict([
        ( int(n['nutrient_id']), n['nutrient_val'] )
        for n in r.get('nutrients', [])
    ])
    def n(i):
        return float(nutr.get(i, 0.0))
    
    return {
        'ID': r['ndb_num'],
        'FoodGroup': r['food_group_descrip'],
        'ShortDescrip': r['short_descrip'],
        'Descrip': r['descrip'],
        'CommonName': r['common_name'],
        'MfgName': r['mfg_name'],
        'ScientificName': r['scientific_name'],
        
        'Energy_kcal': n(208),
        'Protein_g': n(203),
        'Fat_g': n(204),
        'Carb_g': n(205),
        'Sugar_g': n(269),
        'Fiber_g': n(291),
        
        'VitA_mcg': n(320),
        'VitB6_mg': n(415),
        'VitB12_mcg': n(418),
        'VitC_mg': n(401),
        'VitE_mg': n(323),
        'Folate_mcg': n(435),
        'Niacin_mg': n(406),
        'Riboflavin_mg': n(405),
        'Thiamin_mg': n(404),
        'Calcium_mg': n(301),
        'Copper_mcg': n(312),
        'Iron_mg': n(303),
        'Magnesium_mg': n(304),
        'Manganese_mg': n(315),
        'Phosphorus_mg': n(305),
        'Selenium_mcg': n(317),
        'Zinc_mg': n(309),
    }

data = pd.DataFrame.from_records(
    (flat(r) for r in pymongo.MongoClient('mainmongo', 27017).nutrition.nndb.find()), 
    columns=COLUMNS
)



In [3]:

    
data['VitA_USRDA']       = data['VitA_mcg']      / 900.0
data['VitB6_USRDA']      = data['VitB6_mg']        / 1.7
data['VitB12_USRDA']     = data['VitB12_mcg']      / 2.4
data['VitC_USRDA']       = data['VitC_mg']        / 90.0
data['VitE_USRDA']       = data['VitE_mg']        / 15.0
data['Folate_USRDA']     = data['Folate_mcg']    / 400.0
data['Niacin_USRDA']     = data['Niacin_mg']      / 16.0
data['Riboflavin_USRDA'] = data['Riboflavin_mg']   / 1.3
data['Thiamin_USRDA']    = data['Thiamin_mg']      / 1.2
data['Calcium_USRDA']    = data['Calcium_mg']   / 1200.0
data['Copper_USRDA']     = data['Copper_mcg']    / 900.0
data['Magnesium_USRDA']  = data['Magnesium_mg']  / 420.0
data['Phosphorus_USRDA'] = data['Phosphorus_mg'] / 700.0
data['Selenium_USRDA']   = data['Selenium_mcg']   / 55.0
data['Zinc_USRDA']       = data['Zinc_mg']        / 11.0



In [4]:

    
data.describe()









    Out[4]:






  
    
      
      Energy_kcal
      Protein_g
      Fat_g
      Carb_g
      Sugar_g
      Fiber_g
      VitA_mcg
      VitB6_mg
      VitB12_mcg
      VitC_mg
      ...
      Folate_USRDA
      Niacin_USRDA
      Riboflavin_USRDA
      Thiamin_USRDA
      Calcium_USRDA
      Copper_USRDA
      Magnesium_USRDA
      Phosphorus_USRDA
      Selenium_USRDA
      Zinc_USRDA
    
  
  
    
      count
      8618.000000
      8618.00000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      ...
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
      8618.000000
    
    
      mean
      226.438617
      11.52391
      10.647024
      21.819062
      6.560253
      2.023242
      93.968786
      0.264369
      1.225260
      7.925377
      ...
      0.125765
      0.213222
      0.182499
      0.174881
      0.061176
      0.000191
      0.077965
      0.222838
      0.229363
      0.179111
    
    
      std
      169.388910
      10.55059
      15.866353
      27.239000
      13.602098
      4.313670
      779.362205
      0.478614
      4.319183
      57.582758
      ...
      0.466402
      0.302161
      0.346062
      0.431974
      0.167803
      0.000615
      0.133496
      0.290133
      0.514406
      0.305292
    
    
      min
      0.000000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      93.000000
      2.47000
      0.992500
      0.040000
      0.000000
      0.000000
      0.000000
      0.034250
      0.000000
      0.000000
      ...
      0.000000
      0.023875
      0.035385
      0.025000
      0.007500
      0.000033
      0.023810
      0.052857
      0.001818
      0.020909
    
    
      50%
      191.000000
      8.28500
      5.235000
      8.945000
      0.370000
      0.300000
      1.500000
      0.120000
      0.080000
      0.000000
      ...
      0.017500
      0.131250
      0.115385
      0.064583
      0.015833
      0.000088
      0.047619
      0.190000
      0.070909
      0.076818
    
    
      75%
      336.000000
      19.97750
      13.900000
      32.900000
      5.300000
      2.400000
      21.000000
      0.355000
      1.297500
      2.500000
      ...
      0.062500
      0.314328
      0.200000
      0.185833
      0.051667
      0.000161
      0.069048
      0.308571
      0.387273
      0.245455
    
    
      max
      902.000000
      88.32000
      100.000000
      100.000000
      99.800000
      79.000000
      30000.000000
      12.000000
      98.890000
      2400.000000
      ...
      14.702500
      7.968750
      13.461538
      19.479167
      6.136667
      0.016722
      1.859524
      14.168571
      34.854545
      8.268182
    
  

8 rows × 38 columns



In [7]:

    
data.to_csv('./nndb_flat.csv', index=False)

	Energy_kcal	Protein_g	Fat_g	Carb_g	Sugar_g	Fiber_g	VitA_mcg	VitB6_mg	VitB12_mcg	VitC_mg	...	Folate_USRDA	Niacin_USRDA	Riboflavin_USRDA	Thiamin_USRDA	Calcium_USRDA	Copper_USRDA	Magnesium_USRDA	Phosphorus_USRDA	Selenium_USRDA	Zinc_USRDA
count	8618.000000	8618.00000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	...	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000	8618.000000
mean	226.438617	11.52391	10.647024	21.819062	6.560253	2.023242	93.968786	0.264369	1.225260	7.925377	...	0.125765	0.213222	0.182499	0.174881	0.061176	0.000191	0.077965	0.222838	0.229363	0.179111
std	169.388910	10.55059	15.866353	27.239000	13.602098	4.313670	779.362205	0.478614	4.319183	57.582758	...	0.466402	0.302161	0.346062	0.431974	0.167803	0.000615	0.133496	0.290133	0.514406	0.305292
min	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	93.000000	2.47000	0.992500	0.040000	0.000000	0.000000	0.000000	0.034250	0.000000	0.000000	...	0.000000	0.023875	0.035385	0.025000	0.007500	0.000033	0.023810	0.052857	0.001818	0.020909
50%	191.000000	8.28500	5.235000	8.945000	0.370000	0.300000	1.500000	0.120000	0.080000	0.000000	...	0.017500	0.131250	0.115385	0.064583	0.015833	0.000088	0.047619	0.190000	0.070909	0.076818
75%	336.000000	19.97750	13.900000	32.900000	5.300000	2.400000	21.000000	0.355000	1.297500	2.500000	...	0.062500	0.314328	0.200000	0.185833	0.051667	0.000161	0.069048	0.308571	0.387273	0.245455
max	902.000000	88.32000	100.000000	100.000000	99.800000	79.000000	30000.000000	12.000000	98.890000	2400.000000	...	14.702500	7.968750	13.461538	19.479167	6.136667	0.016722	1.859524	14.168571	34.854545	8.268182