To-Do: exploratory cleaning of the Yelp Urbana–Champaign business dataset
In [1]:
import seaborn
import pandas as pd
import pylab as pl
import yaml
%pylab inline
In [2]:
# Load the Urbana-Champaign business table and give it a clean 0..n-1 index.
# NOTE(review): read_pickle can execute arbitrary code — only load pickles
# from trusted sources.
df = pd.read_pickle("../yelp-challenge/data_urbana_champaign/business_urbana_champaign.p")
df.reset_index(drop=True, inplace=True)
# print() calls replace the Python-2-only print statements.
print(df.shape)
print(df.columns.values)
# Sanity check: business_id should be unique per row.
len(df.business_id.unique())
Out[2]:
In [3]:
# Preview the first two rows to inspect the available columns.
df.head(2)
Out[3]:
In [4]:
# List every distinct city value and how many there are; print() replaces
# the Python-2-only print statement.
print(df.city.unique(), '\n', len(df.city.unique()))
In [28]:
# Businesses per city. value_counts() replaces the manual groupby/apply
# round-trip (same per-city totals, returned sorted descending).
city_count = df.city.value_counts()
# Percentage of all businesses located in Champaign or Urbana.
uc_sum = city_count[city_count.index.isin(['Champaign', 'Urbana'])].sum()
100.0 * uc_sum / city_count.sum()
Out[28]:
We only consider businesses located in Champaign and Urbana as our target in this dataset.
In [3]:
# Restrict the frame to the two target cities.
target_cities = ['Champaign', 'Urbana']
df_uc = df[df.city.isin(target_cities)]
df_uc.shape
Out[3]:
In [33]:
print "{}% ({}/{}) business_id(s) are open".format(100.0 * sum(df_uc.is_open == 1) / len(df_uc), sum(df_uc.is_open == 1), len(df_uc))
We only keep businesses with is_open == 1.
In [11]:
# Keep only businesses that are still open. Chaining reset_index avoids the
# inplace mutation of a filtered copy, which can trigger
# SettingWithCopyWarning in pandas.
df_uc_open = df_uc[df_uc.is_open == 1].reset_index(drop=True)
df_uc_open.shape
Out[11]:
In [112]:
# Percentage of open businesses that have an 'hours' entry
# (mean of the not-null mask == fraction of non-missing rows).
100.0 * df_uc_open.hours.notna().mean()
Out[112]:
Drop all records with a missing 'hours' field.
In [12]:
# Discard businesses without operating-hours information and re-index.
df_uc_open = df_uc_open.dropna(subset=['hours']).reset_index(drop=True)
df_uc_open.shape
Out[12]:
In [116]:
# Number of distinct postal codes; dropna=False counts NaN as a value,
# matching len(unique()).
df_uc_open.postal_code.nunique(dropna=False)
Out[116]:
In [117]:
# Review-count summary for the FULL dataset (baseline before filtering).
df.review_count.describe()
Out[117]:
In [118]:
# Review-count summary for the filtered (open, target-city) subset — compare
# with the baseline above.
df_uc_open.review_count.describe()
Out[118]:
In [119]:
# The business with the most reviews. .loc + idxmax replaces the removed
# DataFrame.ix indexer and the deprecated label-returning Series.argmax.
df_uc_open.loc[df_uc_open.review_count.idxmax()]
Out[119]:
In [121]:
# Tally how often each category label appears across open businesses.
# `cat` maps category -> occurrence count; `none_num` counts rows whose
# 'categories' field is empty/None. (Names kept — later cells read them.)
cat = {}
none_num = 0
for categories in df_uc_open.categories:
    if not categories:
        none_num += 1
    else:
        for label in categories:
            cat[label] = cat.get(label, 0) + 1
In [122]:
# Number of open businesses with an empty/None 'categories' field.
none_num
Out[122]:
In [123]:
# Categories ranked by frequency. dict.items() replaces the Python-2-only
# iteritems(), which does not exist in Python 3.
sorted(cat.items(), key=lambda kv: kv[1], reverse=True)
Out[123]:
Here we should decide on the list of categories we are interested in.
In [ ]:
In [124]:
# Check the 'neighborhood' column — expected to be entirely None for this
# dataset (confirmed by the markdown note below the output).
df_uc_open.neighborhood.unique()
Out[124]:
All 'neighborhood' values are None, so the column carries no information.
First of all, remove records with a None 'attributes' field.
In [13]:
# Remove rows whose 'attributes' field is missing, then re-index.
df_uc_open = df_uc_open.dropna(subset=['attributes']).reset_index(drop=True)
df_uc_open.shape
# row count before this filter: 814 -> ?
Out[13]:
In [204]:
# NOTE(review): dead scratch cell (commented-out exploration) — safe to delete.
# df_uc_open.attributes
# import yaml
In [202]:
# Tally how often each attribute name appears across open businesses.
# Each row's 'attributes' is a list of YAML-ish "key: value" strings that we
# join into one YAML list and parse.
att_count = {}
for attrs in df_uc_open.attributes:
    # safe_load replaces bare yaml.load, which is deprecated without an
    # explicit Loader and can construct arbitrary objects from the input.
    data = yaml.safe_load('[' + ','.join(attrs) + ']')
    for d in data:
        # Each parsed entry is a single-key dict; next(iter(d)) replaces the
        # Python-2-only d.keys()[0] (dict views are not subscriptable).
        key = next(iter(d))
        att_count[key] = att_count.get(key, 0) + 1
In [203]:
# Attribute names ranked by frequency; dict.items() replaces the
# Python-2-only iteritems().
sorted(att_count.items(), key=lambda kv: kv[1], reverse=True)
Out[203]:
In [207]:
# Current number of rows — denominator for deciding an attribute-coverage
# threshold below.
df_uc_open.shape[0]
Out[207]:
We need a coverage threshold to decide which attributes we use to build the model.
In [212]:
# Parse one example 'attributes' record (row 452 picked manually as a row
# containing both binary and nested attributes — TODO confirm it still
# exists after upstream filtering). safe_load replaces deprecated/unsafe
# bare yaml.load.
data = yaml.safe_load('[' + ','.join(df_uc_open.attributes[452]) + ']')
data
Out[212]:
There are 2 kinds of attributes:
* Binary
* 0 or 1
* True or False
* Multiple
* dict, e.g. 'GoodForMeal'
* if ALL values are False, how should we encode it?
* if more than ONE value is True, how should we encode it? e.g. 'BestNights'
* multi-valued, e.g. 'RestaurantsPriceRange2' (1, 2, 3, ...) or 'Alcohol'
* Anything else
* How do we deal with missing values?
In [252]:
# Distribution of star ratings; title/labels added so the figure stands
# alone, and the trailing ';' suppresses the bin-array repr.
pl.hist(df_uc_open.stars)
pl.title('Star ratings of open Urbana-Champaign businesses')
pl.xlabel('stars')
pl.ylabel('count');
Out[252]:
In [228]:
# Summary statistics of the star ratings (supports choosing a class
# threshold below).
df_uc_open.stars.describe()
Out[228]:
In [242]:
# Fraction of businesses rated above s stars. The mask's mean() replaces
# the hard-coded denominator 729 (the row count when this was written), so
# the cell stays correct if upstream filtering changes.
s = 3.9
(df_uc_open.stars > s).mean()
Out[242]:
If we would like to build a binary classification model, we could set the threshold on 'stars' at 4.0 (i.e. positive class: stars > 3.9).
In [241]:
# Share of businesses at each star level; normalize=True replaces the
# hard-coded denominator 729, so the result tracks the actual row count.
df_uc_open.stars.value_counts(normalize=True)
Out[241]:
In [275]:
# NOTE(review): leftover interactive help lookup — it only renders the
# pl.hist docstring; remove from the final notebook.
pl.hist?
In [276]:
# Star-rating distribution for Champaign (c[0]); `c` is reused by the next
# cell for Urbana.
c = ['Champaign', 'Urbana']
champaign_stars = df_uc_open[df_uc_open.city == c[0]].stars
print(c[0], '\n', champaign_stars.describe())
# matplotlib >= 3 removed the `normed` kwarg; `density` is the replacement.
pl.hist(champaign_stars, density=True)
Out[276]:
In [277]:
# Star-rating distribution for Urbana (c[1], defined two cells above).
urbana_stars = df_uc_open[df_uc_open.city == c[1]].stars
print(c[1], '\n', urbana_stars.describe())
# matplotlib >= 3 removed the `normed` kwarg; `density` is the replacement.
pl.hist(urbana_stars, density=True)
Out[277]:
In [253]:
# Row/column counts at each filtering stage:
# all businesses -> target cities -> open with hours & attributes.
print(df.shape, df_uc.shape, df_uc_open.shape)
In [15]:
# Persist the cleaned frame for downstream notebooks ("UC01" = this
# notebook's stage); path is relative to the notebook's working directory.
df_uc_open.to_pickle("UC01_df_uc_open.p")