In [1]:
%pylab inline
import pandas as pd
import json
import sqlite3
import ast
In [2]:
reviews = pd.DataFrame(json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'))
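In [ ]:
# Equivalent sketch, assuming pandas >= 0.19: read_json can parse
# line-delimited JSON directly, replacing the explicit json.loads loop above.
#reviews = pd.read_json('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json', lines=True)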
In [3]:
reviews['useful'] = reviews['votes'].apply(lambda v: v['useful'])
In [2]:
location_db = '/home/russ/Documents/DDL/Projects/03-censusables/databases/Censusables.db'
conn = sqlite3.connect(location_db)
reviews_db = pd.read_sql("SELECT * FROM 'Yelp.Reviews'", con=conn)
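In [ ]:
# Sketch: list the tables in the SQLite file to confirm the quoted
# 'Yelp.Reviews' name used in the SELECT above.
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", con=conn)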
In [7]:
reviews_db.head(1)
Out[7]:
In [26]:
import ast
def json_convert(v):
    d = json.loads(v)
    return d['useful']
my_dict = ast.literal_eval("{u'funny': 0, u'useful': 2, u'cool': 1}")
print my_dict['useful']
#reviews_db ['useful'] = reviews_db['votes'].apply(json_convert)
In [3]:
reviews_db['useful'] = reviews_db['votes'].apply(lambda v: ast.literal_eval(v)['useful'])
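In [ ]:
# Why literal_eval rather than json.loads: the votes column came back from
# SQLite as a Python dict repr, and its u'...' single-quoted keys are not
# valid JSON. A minimal demonstration:
s = "{u'funny': 0, u'useful': 2, u'cool': 1}"
print(ast.literal_eval(s)['useful'])   # 2
#json.loads(s)                         # would raise ValueError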
In [4]:
reviews.head(1)
Out[4]:
In [31]:
reviews_db.head(1)
Out[31]:
In [4]:
ytract = pd.DataFrame(json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/business_tract.json'))
business = pd.DataFrame(json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json'))
In [3]:
business.head(1)
Out[3]:
In [5]:
business = business.merge(ytract)
In [46]:
categories = {}
for d in (json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json')):
    for c in d['categories']:
        if c in categories:
            categories[c] += 1
        else:
            categories[c] = 1
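In [ ]:
# Equivalent sketch using collections.Counter, which replaces the manual
# if/else bookkeeping above with one update call per business.
from collections import Counter
categories_alt = Counter()
for d in (json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json')):
    categories_alt.update(d['categories'])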
In [30]:
category_df = pd.DataFrame(categories.items(),columns=['Category','Count'])
category_df = category_df.sort('Count',ascending=False)
category_df
Out[30]:
In [ ]:
category_df.to_csv('counts_by_category.csv')
In [11]:
"""cats = []
for ind, row in business.iterrows():
#cats.append(dict(business_id = business['business_id'], category=c))
print row
break
#cats = pd.DataFrame(cats) """
In [93]:
"""def SeriesFromSubList(aList):
return pd.Series(aList)
df_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in business.iterrows() ]))"""
Out[93]:
In [6]:
#Create one (business_id, category) row per category a business lists
cats = []
for d in (json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json')):
    for c in d['categories']:
        cats.append(dict(business_id=d['business_id'], category=c))
cats = pd.DataFrame(cats)
In [9]:
cat_lookup = pd.read_csv('/home/russ/Documents/DDL/Data/YelpData/counts_by_category_recat.csv', header=0, usecols=['Category','Count','New'])
cat_lookup.columns = ['OldCategory','Count','Category']
In [6]:
cat_lookup.head(1)
Out[6]:
In [10]:
cat_combined = pd.merge(cat_lookup[['OldCategory','Count']], cat_lookup[['OldCategory','Category']],left_on='OldCategory', right_on='Category')
In [9]:
cat_combined.head(3)
Out[9]:
In [11]:
cat_combined.drop(['OldCategory_x'],axis=1, inplace=True)
cat_combined.columns = ['Count','OldCategory','Category']
cat_combined
Out[11]:
In [ ]:
#cats = cats.merge(ytract)
#counts_by_geocat.to_csv('counts_by_geocat.csv')
In [12]:
business_category_count = pd.merge(
    cats[['business_id', 'category']],
    cat_combined[['OldCategory', 'Category', 'Count']],
    left_on='category', right_on='OldCategory'
).sort(['Count', 'OldCategory'], ascending=[0, 1])
In [13]:
business_category_count.drop(['category'],axis=1,inplace=True)
In [14]:
business_category_count
Out[14]:
In [171]:
#business_grouping = business_category_count[['business_id','Category','Count']].groupby(['business_id','Category']).max()
In [182]:
#business_grouping
In [74]:
business_category_count[business_category_count['business_id'] == '--1emggGHgoG6ipd_RMb-g']
Out[74]:
In [16]:
business_category_count.groupby('business_id').apply(lambda t: t[t.Count==t.Count.max()])
Out[16]:
In [17]:
#Get the record with the max Count in each business_id group
business_new_cat = business_category_count[['business_id','Category','Count']].groupby('business_id').apply(lambda t: t[t.Count==t.Count.max()])
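In [ ]:
# Alternative sketch: a transform-based mask selects the same max-Count rows
# without the per-group apply and leaves the index flat (the _alt names are
# illustrative only).
max_counts = business_category_count.groupby('business_id')['Count'].transform(np.max)
business_new_cat_alt = business_category_count[business_category_count['Count'] == max_counts]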
In [18]:
#Drop Duplicates
business_new_cat[['business_id','Category']].drop_duplicates()
Out[18]:
In [19]:
business_cat_dedup = business_new_cat[['business_id','Category']].drop_duplicates()
In [20]:
business_cat_dedup[business_cat_dedup['business_id'] == 'rgSSCx0tb9AQyCkAvMSIoQ']
Out[20]:
In [77]:
#TESTING UNSTACK
business_cat_dedup.unstack(level=0)
In [21]:
business = business.merge(ytract)
In [23]:
business = business.merge(business_cat_dedup)
In [25]:
business.head(1)
Out[25]:
In [26]:
cat_count = business[['business_id','Category']].groupby(['Category']).count()
In [27]:
cat_count
Out[27]:
In [28]:
cat_count.sort('business_id',ascending=False).plot(kind='bar')
Out[28]:
In [29]:
business.head(1)
Out[29]:
In [78]:
reviews.head(1)
Out[78]:
In [30]:
business[['business_id','GISJOIN','Category','open']].merge(reviews[['business_id', 'date','stars','useful']])
Out[30]:
In [31]:
geo_review = business[['business_id','GISJOIN','Category','open']].merge(reviews[['business_id', 'date','stars','useful']])
In [ ]:
#Needs cats merged with ytract (see the commented cell above) to carry GISJOIN
#before the geo groupings below will run
#review_cats = reviews[['business_id', 'stars', 'useful']].merge(cats)
In [37]:
business_review_years = geo_review[['business_id','date']].groupby('business_id').agg({'date': [np.min,np.max]})
In [69]:
business_review_years
Out[69]:
In [ ]:
review_cats['wstars'] = review_cats.stars * (review_cats.useful + 1)
In [73]:
#reviews['useful'] = reviews['votes'].apply(lambda v: v['useful'])
business_review_years['years'] = business_review_years[['date']].stack(level=0)[['amax']]
#business_review_years[['date']].stack(level=0)[['amin','amax']]
#business_review_years.stack(level=0)[['amin']]
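In [ ]:
# Sketch of a cleaner span-in-years computation, assuming the Yelp 'date'
# field is a yyyy-mm-dd string so the amin/amax aggregates order correctly as text.
first = pd.to_datetime(business_review_years['date']['amin'])
last = pd.to_datetime(business_review_years['date']['amax'])
business_review_years['years'] = (last - first).dt.days / 365.25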
In [66]:
business_review_years
Out[66]:
In [ ]:
stars_by_geocat = review_cats[['stars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
stars_by_geocat = stars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=99)
stars_by_geocat.columns = stars_by_geocat.columns.get_level_values(1)
stars_by_geocat.reset_index(inplace=True)
stars_by_geocat.to_csv('stars_by_geocat.csv')
wstars_by_geocat = review_cats[['wstars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
wstars_by_geocat = wstars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=99)
wstars_by_geocat.columns = wstars_by_geocat.columns.get_level_values(1)
wstars_by_geocat.reset_index(inplace=True)
wstars_by_geocat.to_csv('wstars_by_geocat.csv')
counts_by_geocat = cats[['business_id']].groupby([cats.category, cats.GISJOIN]).count()
counts_by_geocat.columns = ['business_count']
counts_by_geocat = counts_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=99)
counts_by_geocat.columns = counts_by_geocat.columns.get_level_values(1)
counts_by_geocat.reset_index(inplace=True)
counts_by_geocat.to_csv('counts_by_geocat.csv')
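In [ ]:
# Alternative sketch for the pivots above: pivot_table collapses each
# groupby/mean/unstack chain into one call (assumes review_cats from the
# commented-out merge has been built and carries a GISJOIN column).
#stars_alt = review_cats.pivot_table(values='stars', index='GISJOIN', columns='category', aggfunc='mean')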
In [ ]:
data = pd.read_csv('/home/russ/Documents/DDL/ACS-5/nhgis0002_csv/nhgis0002_ds201_20135_2013_tract.csv', skiprows=[1])
In [ ]:
data['pc_income'] = data['UJAE001']
data['employed_percent'] = data['UJ8E002'] / data['UJ8E001']
data['median_age'] = data['UEFE001']
data['median_gross_rent'] = data['UL9E001']
data['median_family_income'] = data['UINE001']
econ = data[['GISJOIN', 'pc_income', 'employed_percent', 'median_age', 'median_gross_rent', 'median_family_income']]
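In [ ]:
# Quick sanity-check sketch: employed_percent is a ratio of two ACS columns,
# so values outside [0, 1] would flag zero or missing denominators.
econ[['pc_income', 'employed_percent', 'median_age']].describe()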
In [ ]:
wcorrs = econ.merge(wstars_by_geocat).corr().unstack().reset_index()
wcorrs.columns = ['econ', 'cat', 'r']
wcorrs = wcorrs[wcorrs.econ.apply(lambda e: e.islower())]
wcorrs = wcorrs[wcorrs.cat.apply(lambda e: not e.islower())]
wcorrs[wcorrs.r.apply(lambda v: abs(v) > .3)]
In [ ]:
scorrs = econ.merge(stars_by_geocat).corr().unstack().reset_index()
scorrs.columns = ['econ', 'cat', 'r']
scorrs = scorrs[scorrs.econ.apply(lambda e: e.islower())]
scorrs = scorrs[scorrs.cat.apply(lambda e: not e.islower())]
scorrs[scorrs.r.apply(lambda v: abs(v) > .3)]
In [ ]:
ccorrs = econ.merge(counts_by_geocat).corr().unstack().reset_index()
ccorrs.columns = ['econ', 'cat', 'r']
ccorrs = ccorrs[ccorrs.econ.apply(lambda e: e.islower())]
ccorrs = ccorrs[ccorrs.cat.apply(lambda e: not e.islower())]
ccorrs[ccorrs.r.apply(lambda v: abs(v) > .3)]
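In [ ]:
# The three correlation cells above repeat one pattern; a sketch of a helper
# that factors it out (the function name and threshold default are illustrative).
def econ_correlations(geocat, threshold=.3):
    corrs = econ.merge(geocat).corr().unstack().reset_index()
    corrs.columns = ['econ', 'cat', 'r']
    # derived econ columns are lowercase; Yelp category names are not
    corrs = corrs[corrs.econ.apply(lambda e: e.islower())]
    corrs = corrs[corrs.cat.apply(lambda e: not e.islower())]
    return corrs[corrs.r.apply(lambda v: abs(v) > threshold)]
#wcorrs = econ_correlations(wstars_by_geocat)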
In [ ]: