In [74]:
%reset


Once deleted, variables cannot be recovered. Proceed (y/[n])? y

In [1]:
import pandas as pd
import re # for regular expressions
import ast
import json

In [2]:
business = pd.read_csv('../../../data/interim/US_cities_only/business_US.csv', encoding='utf-8')

In [3]:
business.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105893 entries, 0 to 105892
Data columns (total 16 columns):
address         101952 non-null object
attributes      92729 non-null object
business_id     105893 non-null object
categories      105628 non-null object
city            105893 non-null object
hours           77042 non-null object
is_open         105893 non-null int64
latitude        105893 non-null float64
longitude       105893 non-null float64
name            105893 non-null object
neighborhood    32444 non-null object
postal_code     105355 non-null float64
review_count    105893 non-null int64
stars           105893 non-null float64
state           105893 non-null object
type            105893 non-null object
dtypes: float64(4), int64(2), object(10)
memory usage: 12.9+ MB

In [79]:
len(business['attributes'][business['attributes'] == '{}'])


Out[79]:
0

In [78]:
business.head()


Out[78]:
address attributes business_id categories city hours is_open latitude longitude name neighborhood postal_code review_count stars state type
0 227 E Baseline Rd, Ste J2 [BikeParking: True, BusinessAcceptsBitcoin: Fa... 0DI8Dt2PJp07XkVvIElIcQ [Tobacco Shops, Nightlife, Vape Shops, Shopping] Tempe [Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda... 0 33.378214 -111.936102 Innovative Vapors NaN 85283.0 17 4.5 AZ business
1 495 S Grand Central Pkwy [BusinessAcceptsBitcoin: False, BusinessAccept... LTlCaCGZE14GuaUXUGbamg [Caterers, Grocery, Food, Event Planning & Ser... Las Vegas [Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:... 1 36.192284 -115.159272 Cut and Taste NaN 89106.0 9 5.0 NV business
2 7014 Steubenville Pike [AcceptsInsurance: False, BusinessAcceptsCredi... cnGIivYRLxpF7tBVR_JwWA [Hair Removal, Beauty & Spas, Blow Dry/Out Ser... Oakdale [Tuesday 10:0-21:0, Wednesday 10:0-21:0, Thurs... 1 40.444544 -80.174540 Plush Salon and Spa NaN 15071.0 4 4.0 PA business
3 10875 N Frankloyd Wright Blvd [BikeParking: True, BusinessAcceptsCreditCards... Cu4_Fheh7IrzGiK-Pc79ig [Baby Gear & Furniture, Shopping] Scottsdale [Monday 10:0-18:0, Tuesday 10:0-18:0, Wednesda... 1 33.585271 -111.834954 Boomerang Baby NaN 85259.0 8 3.5 AZ business
4 11072 No Frank Lloyd Wright [Alcohol: none, Ambience: {'romantic': False, ... GDnbt3isfhd57T1QqU6flg [Tex-Mex, Mexican, Fast Food, Restaurants] Scottsdale [Monday 10:0-22:0, Tuesday 10:0-22:0, Wednesda... 1 33.586710 -111.835410 Taco Bell NaN 85259.0 9 2.5 AZ business

In [3]:
business['attributes'][0]


Out[3]:
u"[BikeParking: True, BusinessAcceptsBitcoin: False, BusinessAcceptsCreditCards: True, BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}, DogsAllowed: False, RestaurantsPriceRange2: 2, WheelchairAccessible: True]"

In [4]:
# Cleaning business with Dave's tips

business['attributes'] = business['attributes'].fillna('[]')
business['attributes'] = business['attributes'].map(lambda x: x.replace('[','{'))
business['attributes'] = business['attributes'].map(lambda x: x.replace(']','}'))
business['attributes'] = business['attributes'].map(lambda x: x.replace('True', 'true'))
business['attributes'] = business['attributes'].map(lambda x: x.replace('False', 'false'))
business['attributes'] = business['attributes'].map(lambda x: x.replace('\'', '"'))

business.head(2)


Out[4]:
address attributes business_id categories city hours is_open latitude longitude name neighborhood postal_code review_count stars state type
0 227 E Baseline Rd, Ste J2 {BikeParking: true, BusinessAcceptsBitcoin: fa... 0DI8Dt2PJp07XkVvIElIcQ [Tobacco Shops, Nightlife, Vape Shops, Shopping] Tempe [Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda... 0 33.378214 -111.936102 Innovative Vapors NaN 85283.0 17 4.5 AZ business
1 495 S Grand Central Pkwy {BusinessAcceptsBitcoin: false, BusinessAccept... LTlCaCGZE14GuaUXUGbamg [Caterers, Grocery, Food, Event Planning & Ser... Las Vegas [Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:... 1 36.192284 -115.159272 Cut and Taste NaN 89106.0 9 5.0 NV business

In [5]:
# Function to make python understand business['attributes'] is a json-type dict

def regex_match(test_str):

    matches = re.findall("([A-Za-z0-9]+)(?=:)", test_str)

    if len(matches):
        for match in matches:
            test_str = test_str.replace(match, '"%s"' % match)
    
    return test_str

In [6]:
business['attributes'] = business['attributes'].map(regex_match)

In [86]:
type(business['attributes'][0])


Out[86]:
unicode

In [98]:
business['attributes'][0]


Out[98]:
'{"BikeParking": true, "BusinessAcceptsBitcoin": false, "BusinessAcceptsCreditCards": true, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": true, "valet": false}, "DogsAllowed": false, "RestaurantsPriceRange2": 2, "WheelchairAccessible": true}'

In [110]:
def expand_features(row):
    try:
        extracted = json.loads(row['attributes'])
        for key, value in extracted.items():
            print key, type(value)
            if type(value) != dict:
                row["attribute_" + key] = value
            else:
                for attr_key, attr_value in value.items():
                    row["attribute_" + key + "_" + attr_key] = attr_value
    except:
        print "could not decode:", row['attributes']
    
    return row

business.head(10).apply(expand_features, axis=1).columns


DogsAllowed <type 'bool'>
BusinessParking <type 'dict'>
BusinessAcceptsCreditCards <type 'bool'>
RestaurantsPriceRange2 <type 'int'>
BusinessAcceptsBitcoin <type 'bool'>
BikeParking <type 'bool'>
WheelchairAccessible <type 'bool'>
DogsAllowed <type 'bool'>
BusinessParking <type 'dict'>
BusinessAcceptsCreditCards <type 'bool'>
RestaurantsPriceRange2 <type 'int'>
BusinessAcceptsBitcoin <type 'bool'>
BikeParking <type 'bool'>
WheelchairAccessible <type 'bool'>
BusinessAcceptsCreditCards <type 'bool'>
BusinessAcceptsBitcoin <type 'bool'>
BusinessParking <type 'dict'>
HairSpecializesIn <type 'dict'>
BusinessAcceptsCreditCards <type 'bool'>
RestaurantsPriceRange2 <type 'int'>
AcceptsInsurance <type 'bool'>
ByAppointmentOnly <type 'bool'>
BusinessAcceptsCreditCards <type 'bool'>
RestaurantsPriceRange2 <type 'int'>
BusinessParking <type 'dict'>
BikeParking <type 'bool'>
WheelchairAccessible <type 'bool'>
could not decode: {"Alcohol": none, "Ambience": {"romantic": false, "intimate": false, "classy": false, "hipster": false, "divey": false, "touristy": false, "trendy": false, "upscale": false, "casual": false}, "BikeParking": true, "BusinessAcceptsCreditCards": true, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": false, "valet": false}, "DriveThru": true, "GoodForKids": true, "GoodForMeal": {"dessert": false, "latenight": false, "lunch": false, "dinner": false, "breakfast": false, "brunch": false}, "HasTV": false, "NoiseLevel": quiet, "OutdoorSeating": false, "RestaurantsAttire": casual, "RestaurantsDelivery": false, "RestaurantsGoodForGroups": true, "RestaurantsPriceRange2": 1, "RestaurantsReservations": false, "RestaurantsTableService": false, "RestaurantsTakeOut": true, "WiFi": no}
BusinessAcceptsCreditCards <type 'bool'>
could not decode: {"Alcohol": none, "Ambience": {"romantic": false, "intimate": false, "classy": false, "hipster": false, "divey": false, "touristy": false, "trendy": false, "upscale": false, "casual": true}, "BusinessAcceptsCreditCards": true, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": true, "valet": false}, "Caters": true, "GoodForKids": true, "GoodForMeal": {"dessert": false, "latenight": false, "lunch": true, "dinner": true, "breakfast": false, "brunch": false}, "HasTV": true, "NoiseLevel": quiet, "OutdoorSeating": false, "RestaurantsAttire": casual, "RestaurantsDelivery": false, "RestaurantsGoodForGroups": true, "RestaurantsPriceRange2": 1, "RestaurantsReservations": false, "RestaurantsTableService": false, "RestaurantsTakeOut": true, "WiFi": no}
ByAppointmentOnly <type 'bool'>
Out[110]:
Index([u'address', u'attribute_AcceptsInsurance', u'attribute_BikeParking',
       u'attribute_BusinessAcceptsBitcoin',
       u'attribute_BusinessAcceptsCreditCards',
       u'attribute_BusinessParking_garage', u'attribute_BusinessParking_lot',
       u'attribute_BusinessParking_street', u'attribute_BusinessParking_valet',
       u'attribute_BusinessParking_validated', u'attribute_ByAppointmentOnly',
       u'attribute_DogsAllowed',
       u'attribute_HairSpecializesIn_africanamerican',
       u'attribute_HairSpecializesIn_asian',
       u'attribute_HairSpecializesIn_coloring',
       u'attribute_HairSpecializesIn_curly',
       u'attribute_HairSpecializesIn_extensions',
       u'attribute_HairSpecializesIn_kids',
       u'attribute_HairSpecializesIn_perms',
       u'attribute_HairSpecializesIn_straightperms',
       u'attribute_RestaurantsPriceRange2', u'attribute_WheelchairAccessible',
       u'attributes', u'business_id', u'categories', u'city', u'hours',
       u'is_open', u'latitude', u'longitude', u'name', u'neighborhood',
       u'postal_code', u'review_count', u'stars', u'state', u'type'],
      dtype='object')

In [96]:
business['attributes'] = business['attributes'].map(json.)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-96-443f4a080afe> in <module>()
----> 1 business['attributes'] = business['attributes'].map(lambda x: json.loads(x.encode('utf-8')))

/home/amlanlimaye/anaconda2/lib/python2.7/site-packages/pandas/core/series.pyc in map(self, arg, na_action)
   2175             new_values = algos.take_1d(arg._values, indexer)
   2176         else:
-> 2177             new_values = map_f(values, arg)
   2178 
   2179         return self._constructor(new_values,

pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:66124)()

<ipython-input-96-443f4a080afe> in <lambda>(x)
----> 1 business['attributes'] = business['attributes'].map(lambda x: json.loads(x.encode('utf-8')))

/home/amlanlimaye/anaconda2/lib/python2.7/json/__init__.pyc in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    337             parse_int is None and parse_float is None and
    338             parse_constant is None and object_pairs_hook is None and not kw):
--> 339         return _default_decoder.decode(s)
    340     if cls is None:
    341         cls = JSONDecoder

/home/amlanlimaye/anaconda2/lib/python2.7/json/decoder.pyc in decode(self, s, _w)
    362 
    363         """
--> 364         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    365         end = _w(s, end).end()
    366         if end != len(s):

/home/amlanlimaye/anaconda2/lib/python2.7/json/decoder.pyc in raw_decode(self, s, idx)
    380             obj, end = self.scan_once(s, idx)
    381         except StopIteration:
--> 382             raise ValueError("No JSON object could be decoded")
    383         return obj, end

ValueError: No JSON object could be decoded

In [7]:
# Cleaning 'categories' column

business['categories'] = business['categories'].fillna(' ')
business['categories'] = business['categories'].map(lambda x: x[1:-1].split(','))

In [8]:
# Cleaning 'hours' column

business['hours'] = business['hours'].fillna(' ')
business['hours'] = business['hours'].map(lambda x: x[1:-1].split(','))

In [9]:
# Cleaning 'neighborhoods' column

business['neighborhood'] = business['neighborhood'].fillna(' ')
business['neighborhood'] = business['neighborhood'].map(lambda x: x[1:-1].split(','))

In [10]:
# Cleaning 'postal_code' column

business['postal_code'] = business['postal_code'].map(lambda x: x[:-2])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-2cf7e52127ab> in <module>()
      1 # Cleaning 'postal_code' column
      2 
----> 3 business['postal_code'] = business['postal_code'].map(lambda x: x[:-2])

/home/amlanlimaye/anaconda2/lib/python2.7/site-packages/pandas/core/series.pyc in map(self, arg, na_action)
   2175             new_values = algos.take_1d(arg._values, indexer)
   2176         else:
-> 2177             new_values = map_f(values, arg)
   2178 
   2179         return self._constructor(new_values,

pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:66124)()

<ipython-input-10-2cf7e52127ab> in <lambda>(x)
      1 # Cleaning 'postal_code' column
      2 
----> 3 business['postal_code'] = business['postal_code'].map(lambda x: x[:-2])

TypeError: 'float' object has no attribute '__getitem__'

In [ ]:
# Writing clean 'business' dataframe to csv

business.to_csv('../../../data/interim/clean_US_cities/business.csv', encoding='utf-8', index=False)