In [1]:
#2018-08-24 11:07:51.530419
%load_ext metapack.jupyter.magic

In [2]:
CACHE_DIR='/Users/eric/Library/Application Support/metapack/'
description='CNSS 2017, with categorical values'

In [3]:

In [4]:
import matplotlib.pyplot as plt 
import metapack as mp
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [5]:
pkg = mp.jupyter.open_source_package()


Cornell National Social Survey (CNSS), 2017

Cornell National Social Survey is a random-sample survey of adults aged 18 and over. In 2017, participants were asked their opinions on a range of topics. from metapack+file:///Users/eric/proj/virt-proj/data-project/sdrdl-data-projects/cornell-social-survey/

To use this package, you will have to download the research file manually, placing the file in the data directory. Cornell requires you to accept terms and conditions before downloading the file, so we can't redistribute it.



Wrangler: Eric Busboom Civic Knowledge


  1. cnss_2017 - file:///Users/eric/proj/virt-proj/data-project/sdrdl-data-projects/cornell-social-survey/ CNSS 2017, with categorical values


  1. cnss_2017 - file:///Users/eric/proj/virt-proj/data-project/sdrdl-data-projects/cornell-social-survey/ Cornell National Social Survey 2017, in converted Stata forma

In [6]:
fp = pkg.reference('cnss_2017').resolved_url.fspath

In [7]:
# The variable KRq1, Number of Dogs, has a duplicate label problem when the file is converted with StatTransfer; 
# StatTransfer seperates out the number from the text "dogs", and pandas
# interprets this as multiple labels with the value "dogs". The easiest way to
# handle this is to remove the variable. 

itr = pd.read_stata(fp, iterator=True)

columns = list(itr.varlist)

df = pd.read_stata('../data/cnss2017.dta', columns = columns)
df.labels = dict(itr.variable_labels()) # Store variable labels as a dict

caseid survid timezone state msa msc censusr censusd cbsamsa cbsamcsa ... RACE_E numraces relig church hhince hhinc50k hhincu hhinco hhinc gender
0 80007 80007 C TX 2920 1 3 7 3 5 ... No 1.0 No religion / Atheist / Agnostic A few times a year NaN $50,000 or over NaN $150,000 or more $150,000 or more Male
1 80027 80027 C AL 5 3 6 5 1 ... No 1.0 Protestant A few times a year NaN $50,000 or over NaN 50 to under $75,000 50 to under $75,000 Female
2 80029 80029 C LA 5560 3 3 7 3 5 ... No 1.0 Catholic A few times a year 196000 NaN NaN NaN $150,000 or more Male
3 80037 80037 C IN 5 2 3 5 1 ... No 1.0 No religion / Atheist / Agnostic Never 75000 NaN NaN NaN 75 to under $100,000 Male
4 80041 80041 C MO 3760 1 2 4 1 5 ... No 1.0 No religion / Atheist / Agnostic Never 60000 NaN NaN NaN 50 to under $75,000 Female

5 rows × 113 columns

In [8]:

['Case identification number (assigned by SRI)',
 'Case identification number (assigned by SRI)',
 'Time zone (provided by MSG)',
 'State (provided by MSG)',
 'Metropolitan Statistical Area (provided by MSG)',
 'Metropolitan Status Code (provided by MSG)',
 'Census Region (provided by MSG)',
 'Census Division (provided by MSG)',
 'CBSA MSA Met Status Code (provided by MSG)',
 'CBSA MCSA Met Status Code (provided by MSG)',
 'College degrees for prisoners',
 'State funded college for prisoners',
 'Prisoners should repay education costs',
 'Country needs strong leader',
 'Courts get in way of leaders',
 'Media get in way of leaders',
 'Randomized text within JSq1',
 'Contraceptive policy under Trump',
 'China rise a threat or opportunity',
 'Mens rights participant',
 'Red pill - Heard of term',
 'Alt right - Heard of term',
 'Men should be alpha',
 'Resource to find new surgeon',
 'Used internet to find physician',
 'Helpfulness of internet w/ finding physician',
 '# of healthcare visits in the past year',
 'Rate customer service last healthcare visit',
 'Most urgent healthcare issue',
 'Employer offers wellness program',
 'Describe workplace wellness program',
 'Rate workplace wellness program',
 'Teen access to patient portal',
 'JAq2, JAq3 - Sequence Index Variable',
 'Parent access teen medical record',
 'Sensitive issue avoidance if parent views record',
 'Cosmetic or health reasons - Bariatric surgery',
 'Quick fix - Bariatric surgery',
 'Should insurance cover - Bariatric surgery',
 '# of dogs owned',
 'Canine clinical study interest',
 'Aware of bee health concern',
 'Personal concern about bee health',
 'Produce protecting bees - Pay more',
 'Organic food - Pay more',
 'What do antibiotics kill',
 'Cow antibiotics threaten human health',
 'Milk without antibiotics - Pay more',
 'Cow treatment on conventional/organic farms',
 'Attractive natural sights - Neighborhood',
 'People look out for each other - Neighborhood',
 'Mental health a priority - Community',
 'Sufficient mental health services - Community',
 'Physical environment impacts mental health',
 'Worry about crime - Workplace',
 'Worry about crime - Neighborhood',
 'Recidivism due to criminal record',
 'Marijuana legalization',
 'Rate distraction at work',
 'Work from home frequency',
 'Homework completion rate in high school',
 'Work harassment - Experienced',
 'Domestic violence - Experienced',
 'Experience w/ violence impacted work',
 'Harder for men to be successful',
 'Men should protect women',
 'Feminism good or bad for America',
 'Asked to do favor outside of work',
 'Who asked for favor outside of work',
 'Time spent doing favor outside of work',
 'Rules about doing favors outside of work',
 'Restless - Past 30 days',
 'Everything an effort - Past 30 days',
 'Undocumented farmworkers community impact',
 'Occupational category',
 'Labor union member',
 'Voted in 2016 presidential election',
 'Elected officials represent the rich',
 'Minorities get government advantages',
 'Main job type',
 'Looking for new work',
 'Performance raise - Eligible at work',
 'Performance bonus - Eligible at work',
 '# adults 65+ in household',
 '# adults 18-64 in household',
 '# children in household',
 '# phones for household',
 'Cell/Landline for survey',
 'Year born',
 'Age (computed from yob)',
 'Born in US',
 'Marital status',
 'Social ideology',
 'Political party',
 'Education level',
 'Home ownership status',
 'Hispanic or Latino',
 'Caucasian - Race',
 'African-American - Race',
 'Native American - Race',
 'Asian - Race',
 'Other - Race',
 'Religious affiliation',
 'How often attend religious services',
 'Exact household income 2016',
 'Over/Under $50k - Household income 2016',
 'Range under $50k - Household income 2016',
 'Range over $50k - Household income 2016',
 'Household income 2016 - Coded value',

In [9]:
%mt_materialize df '/Users/eric/Library/Application Support/metapack/_materialized_data/'

ValueError                                Traceback (most recent call last)
<ipython-input-9-7c167cccefdd> in <module>()
----> 1 get_ipython().run_line_magic('mt_materialize', "df '/Users/eric/Library/Application Support/metapack/_materialized_data/'")

~/proj/virt/data-project/lib/python3.6/site-packages/IPython/core/ in run_line_magic(self, magic_name, line, _stack_depth)
   2129                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2130             with self.builtin_trap:
-> 2131                 result = fn(*args,**kwargs)
   2132             return result

<decorator-gen-132> in mt_materialize(self, line)

~/proj/virt/data-project/lib/python3.6/site-packages/IPython/core/ in <lambda>(f, *a, **k)
    185     # but it's overkill for just that one bit of state.
    186     def magic_deco(arg):
--> 187         call = lambda f, *a, **k: f(*a, **k)
    189         if callable(arg):

~/proj/virt/data-project/lib/python3.6/site-packages/metapack/jupyter/ in mt_materialize(self, line)
    295         path = join(dr, args.df_name + ".csv")
--> 296         df =[args.df_name].fillna('')
    297         gen = PandasDataframeSource(parse_app_url(path), df, cache=cache)

~/proj/virt/data-project/lib/python3.6/site-packages/pandas/core/ in fillna(self, value, method, axis, inplace, limit, downcast, **kwargs)
   3033                      self).fillna(value=value, method=method, axis=axis,
   3034                                   inplace=inplace, limit=limit,
-> 3035                                   downcast=downcast, **kwargs)
   3037     @Appender(_shared_docs['shift'] % _shared_doc_kwargs)

~/proj/virt/data-project/lib/python3.6/site-packages/pandas/core/ in fillna(self, value, method, axis, inplace, limit, downcast)
   4346                 new_data = self._data.fillna(value=value, limit=limit,
   4347                                              inplace=inplace,
-> 4348                                              downcast=downcast)
   4349             elif isinstance(value, DataFrame) and self.ndim == 2:
   4350                 new_data = self.where(self.notna(), value)

~/proj/virt/data-project/lib/python3.6/site-packages/pandas/core/ in fillna(self, **kwargs)
   3450     def fillna(self, **kwargs):
-> 3451         return self.apply('fillna', **kwargs)
   3453     def downcast(self, **kwargs):

~/proj/virt/data-project/lib/python3.6/site-packages/pandas/core/ in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3323             kwargs['mgr'] = self
-> 3324             applied = getattr(b, f)(**kwargs)
   3325             result_blocks = _extend_blocks(applied, result_blocks)

~/proj/virt/data-project/lib/python3.6/site-packages/pandas/core/ in fillna(self, value, limit, inplace, downcast, mgr)
   2374         values = self.values if inplace else self.values.copy()
   2375         values = self._try_coerce_result(values.fillna(value=value,
-> 2376                                                        limit=limit))
   2377         return [self.make_block(values=values)]

~/proj/virt/data-project/lib/python3.6/site-packages/pandas/util/ in wrapper(*args, **kwargs)
    116                 else:
    117                     kwargs[new_arg_name] = new_arg_value
--> 118             return func(*args, **kwargs)
    119         return wrapper
    120     return _deprecate_kwarg

~/proj/virt/data-project/lib/python3.6/site-packages/pandas/core/ in fillna(self, value, method, limit)
   1681             if not isna(value) and value not in self.categories:
-> 1682                 raise ValueError("fill value must be in categories")
   1684             mask = values == -1

ValueError: fill value must be in categories

In [ ]:
%mt_materialize_all '/Users/eric/Library/Application Support/metapack/_materialized_data/'

In [ ]:

In [ ]: