In [1]:
from pandas import Series, DataFrame
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
%pylab inline
In [2]:
restaurants = pd.read_csv("NYC_Restaurants.csv", dtype=unicode)
for index, item in enumerate(restaurants.columns.values):
print index, item
Select the DBA, BUILDING, STREET and ZIPCODE columns as a dataframe and call the apply() function on it; with axis=1, apply() passes each row in as a series. Inside the apply() function, format placeholders combine the 4 values of each row into a single unique restaurant name.
In [3]:
#use .apply() method to combine the 4 columns to get the unique restaurant name
restaurants["RESTAURANT"] = restaurants[["DBA", "BUILDING", "STREET", "ZIPCODE"]].\
apply(lambda x: "{} {} {} {}".format(x[0], x[1], x[2], x[3]), axis=1)
#in case the RESTAURANT names contain leading or trailing spaces, strip them off
restaurants["RESTAURANT"] = restaurants["RESTAURANT"].map(lambda y: y.strip())
print restaurants["RESTAURANT"][:10]
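As a side note, here is a minimal sketch (on a made-up toy frame, not the inspection data) of how apply() with axis=1 hands each row to the lambda as a series:
import pandas as pd

# toy frame with invented values; axis=1 passes each row to the lambda
# as a series indexed by the column names, so positional access works
toy = pd.DataFrame({"DBA": ["A", "B"],
                    "BUILDING": ["1", "2"],
                    "STREET": ["X AVE", "Y ST"],
                    "ZIPCODE": ["10001", "10002"]})
print toy[["DBA", "BUILDING", "STREET", "ZIPCODE"]].apply(
    lambda row: "{} {} {} {}".format(row[0], row[1], row[2], row[3]), axis=1)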
Since each RESTAURANT appears only once after dropping duplicates, the value_counts() series has one entry per restaurant, so applying len() to it returns the number of restaurants in the whole dataset.
In [4]:
print "There are", len(restaurants.drop_duplicates(subset="RESTAURANT")["RESTAURANT"].value_counts()), "restaurants in the data."
"Chains" are brands having at least 2 different RESTAURANT
. After drop_duplicates(subset="RESTAURANT")
, extractingvalue_count()
on DBA
will give how many RESTAURANT
each DBA
has. Converting each value into logical with evaluation value_count()>=2
and then summing up the how series will give the number of True
records, which is the number of chains.
In [5]:
num_chain = sum(restaurants.drop_duplicates(subset="RESTAURANT")["DBA"].value_counts()>=2)
print "There are", num_chain, "chain restaurants."
"Popularity" is here understood as number of RESAURANT
of each DBA
.
DBA
chain
to identify if a given DBA
is a chain.DBA
.RESTAURANT
, the value_counts()
will give the number of locations of each DBA
In [6]:
chains = restaurants.drop_duplicates(subset="RESTAURANT")["DBA"].value_counts()[: num_chain].index.values
def chain(restaurant):
    return (restaurant in chains)
mask = restaurants["DBA"].map(chain)
restaurants[mask].drop_duplicates(subset="RESTAURANT")["DBA"].value_counts()[:20].plot(kind="bar")
Out[6]:
To calculate the fraction of chains among all restaurants, we again evaluate value_counts()>=2 on DBA (True if the brand is a chain). Summing the True values gives the number of chains, which is divided by the total number of unique RESTAURANT to get the fraction.
In [7]:
print "The percentage of chain restaurants is",
print "{:.2%}".format(sum(restaurants.drop_duplicates(subset="RESTAURANT")["DBA"].value_counts()>=2)/float(len(restaurants["RESTAURANT"].value_counts())))
A helper function lower_case converts a string to lower case. The chain helper from before builds a mask selecting chains, and negating that mask returns the non-chains. The lower_case function is used to filter out records with a missing BORO, whatever its capitalisation. After dropping duplicate RESTAURANT and removing missing BORO, value_counts() gives the number of non-chain restaurants in each borough.
In [8]:
def lower_case(X):
    return X.lower()
mask_1 = restaurants["DBA"].map(chain)
mask_2 = restaurants["BORO"].map(lower_case) != "missing"
restaurants[-mask_1].drop_duplicates(subset="RESTAURANT")[mask_2]["BORO"].value_counts().sort_values(ascending=False).plot(kind="bar")
Out[8]:
The goal is to calculate the ratio $\frac{N_{\text{non-chain}}}{N_{\text{total}}}$ within each borough. This is the element-wise division of two series: the value_counts() of non-chain restaurants per BORO (missing boroughs excluded) and the value_counts() of all unique RESTAURANT per BORO. Depending on which borough has the highest ratio, a message reports whether it is the same borough as the one with the most non-chain restaurants.
In [9]:
series_tmp_1 = restaurants[mask_2].drop_duplicates(subset="RESTAURANT")["BORO"].value_counts()
series_tmp_2 = restaurants[-mask_1][mask_2].drop_duplicates(subset="RESTAURANT")["BORO"].value_counts()
series_tmp_ratio = series_tmp_2/series_tmp_1
series_tmp_ratio.sort_values(ascending=False).plot(kind="bar")
print "The highest non-chain/total ratio is:", "{:0.2%} ({})".format(series_tmp_ratio.sort_values(ascending=False)[0],\
series_tmp_ratio.sort_values(ascending=False).index.values[0])
if series_tmp_ratio.sort_values(ascending=False).index.values[0] !=\
restaurants[-mask_1].drop_duplicates(subset="RESTAURANT")[mask_2]["BORO"].value_counts().sort_values(ascending=False).index.values[0]:
print "It is not the same borough."
else:
print "It is the same borough."
Drop duplicate RESTAURANT records and plot the top 20 of the sorted value_counts() of CUISINE DESCRIPTION.
In [10]:
restaurants.drop_duplicates(subset="RESTAURANT")["CUISINE DESCRIPTION"].value_counts()\
.sort_values(ascending=False)[:20].plot(kind="bar")
Out[10]:
Here we first collect the names of restaurants with at least one non-missing VIOLATION CODE, then use a mask to keep only the restaurants outside that list, i.e. restaurants whose every record has a missing VIOLATION CODE.
In [17]:
non_clean_restaurants = restaurants[-restaurants["VIOLATION CODE"].isnull()]["RESTAURANT"].value_counts().index.values
def is_clean(restaurant, blacklist=non_clean_restaurants):
    return restaurant not in blacklist
mask_clean = restaurants["RESTAURANT"].map(is_clean)
restaurants[mask_clean]["CUISINE DESCRIPTION"].value_counts().sort_values(ascending=False)[:20].plot(kind="bar")
Out[17]:
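To make the blacklist mechanics concrete, a minimal sketch on invented records (restaurant "A" has one violation on file, "B" has none):
import pandas as pd
import numpy as np

# invented records; "A" lands on the blacklist because it has a
# non-null violation code, so only the "B" row survives the clean mask
toy = pd.DataFrame({"RESTAURANT": ["A", "A", "B"],
                    "VIOLATION CODE": ["04L", np.nan, np.nan]})
blacklist = toy[toy["VIOLATION CODE"].notnull()]["RESTAURANT"].value_counts().index.values
print toy[toy["RESTAURANT"].map(lambda r: r not in blacklist)]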
Cuisines served by at least 20 restaurants are selected first. One value_counts() series contains the violation-free inspection records for those cuisines; a second value_counts() series contains how many inspection records were made for them in total. Dividing the two gives the fraction of clean inspections per cuisine, and the top 10 are plotted.
In [12]:
top_cuisine_series = restaurants.drop_duplicates(subset=["RESTAURANT","CUISINE DESCRIPTION"])["CUISINE DESCRIPTION"].value_counts()
def is_top_cuisine(cuisine):
    return top_cuisine_series[cuisine]>=20
mask_3 = restaurants["VIOLATION CODE"].isnull()
mask_4 = restaurants["CUISINE DESCRIPTION"].map(is_top_cuisine)
series_tmp_3 = restaurants[mask_4][mask_3]["CUISINE DESCRIPTION"].value_counts()
series_tmp_4 = restaurants[mask_4]["CUISINE DESCRIPTION"].value_counts()
(series_tmp_3/series_tmp_4).sort_values(ascending=False)[:10].plot(kind="bar")
Out[12]:
Use crosstab to create a dataframe with VIOLATION DESCRIPTION as the index and BORO (without "Missing" boroughs) as the columns. dropna is set to True so NaN values are not recorded. Each cell of the crosstab is the number of occurrences of a violation in a certain borough. The idxmax() method then retrieves, for each BORO, the violation with the highest occurrence.
In [13]:
violation_boro_tab = pd.crosstab(
    index=restaurants["VIOLATION DESCRIPTION"],
    columns=restaurants[restaurants["BORO"]!="Missing"]["BORO"],
    dropna=True
)
print "The most common violation in each borough is summarised below:"
violation_boro_tab.idxmax()
Out[13]:
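For reference, a minimal sketch (with made-up violation records) of what crosstab builds and what idxmax() returns:
import pandas as pd

# invented records: counts of each violation per borough,
# then the most frequent violation in each borough column
toy = pd.DataFrame({"VIOLATION": ["mice", "mice", "flies", "mice"],
                    "BORO": ["BRONX", "BRONX", "QUEENS", "QUEENS"]})
tab = pd.crosstab(index=toy["VIOLATION"], columns=toy["BORO"])
print tab            # rows: violations, columns: boroughs, cells: counts
print tab.idxmax()   # for each borough, the row label with the largest count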
The apply() function applies lambda x: x.map(float)/violation_frequency_series with axis=0 to each column of the above crosstab, where violation_frequency_series is the value_counts() of all VIOLATION DESCRIPTION records. The result is each borough's violation counts normalized by the city-wide frequency of each violation; mapping to float ensures the division returns a fraction rather than truncating.
In [14]:
violation_frequency_series = restaurants["VIOLATION DESCRIPTION"].value_counts()
violation_boro_norm_tab = violation_boro_tab.apply(lambda x: x.map(float)/violation_frequency_series, axis=0)
print "After normalization, the most common violation in each borough is summarised below:"
violation_boro_norm_tab.idxmax()
Out[14]:
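A minimal sketch of the normalization step, again on invented data, dividing each borough column by the city-wide frequency of each violation:
import pandas as pd

# invented records; the crosstab columns are divided by the overall
# frequency of each violation, aligned on the VIOLATION index
toy = pd.DataFrame({"VIOLATION": ["mice", "mice", "mice", "flies"],
                    "BORO": ["BRONX", "BRONX", "QUEENS", "QUEENS"]})
tab = pd.crosstab(index=toy["VIOLATION"], columns=toy["BORO"])
overall = toy["VIOLATION"].value_counts()
norm = tab.apply(lambda col: col.map(float) / overall, axis=0)
print norm            # each cell is that borough's share of the violation's total
print norm.idxmax()   # the most over-represented violation per borough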
A new column AREA stores the first 3 digits of PHONE, which is the area code. After dropping duplicates on AREA and ZIPCODE, evaluating value_counts()==1 on AREA returns True for each AREA paired with a single ZIPCODE. Summing the True values gives the total number of such area codes.
In [15]:
restaurants["AREA"] = restaurants["PHONE"].map(lambda x: x[:3])
print "There are",
print sum(restaurants.drop_duplicates(subset=["AREA", "ZIPCODE"])["AREA"].value_counts() == 1),
print "area codes corresponding to only 1 zipcode"
Map str.split() over STREET to break each string into a list of words, and take the last word as STREET TYPE; the remaining words, joined back together, form STREET BASE. STREET BASE and ZIPCODE are then joined, separated by a space, as STREET BASE & ZIP. The three series are combined with concat, where axis=1 means concatenating horizontally as columns. Rows where STREET BASE is empty are dropped, along with duplicates. The dataframe is merged with itself on STREET BASE & ZIP, keeping only rows where the two STREET TYPE values differ. A crosstab on the merged dataframe, with one STREET TYPE as the index and the other as the columns, records the occurrences of alternative STREET TYPE spellings in its cells, and the most common alias of each type is obtained with idxmax().
In [16]:
restaurants["STREET TYPE"] = restaurants["STREET"].map(lambda s: s.split()[-1])
restaurants["STREET BASE"] = restaurants["STREET"].map(lambda s: " ".join(s.split()[:-1]))
restaurants["STREET BASE & ZIP"] = restaurants["STREET BASE"].map(lambda s: s+" ") + restaurants["ZIPCODE"]
new_dataframe = pd.concat(
    [restaurants["STREET BASE"], restaurants["STREET TYPE"], restaurants["STREET BASE & ZIP"]],
    axis=1
)
new_dataframe = new_dataframe[new_dataframe["STREET BASE"].map(lambda s: len(s)>0)].drop_duplicates()
merged_new_dataframe = pd.merge(
    new_dataframe,
    new_dataframe,
    left_on="STREET BASE & ZIP",
    right_on="STREET BASE & ZIP",
    suffixes=[" 1", " 2"]
)
merged_new_dataframe = merged_new_dataframe[merged_new_dataframe["STREET TYPE 1"] != merged_new_dataframe["STREET TYPE 2"]]
street_name = pd.crosstab(
    index=merged_new_dataframe["STREET TYPE 1"],
    columns=merged_new_dataframe["STREET TYPE 2"],
    dropna=True
)
print "The most common alias for each of the following street type is listed"
street_name.idxmax()[
    ["AVE", "ST", "RD", "PL", "BOULEARD", "BOULEVARD"]
]
Out[16]:
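To make the self-merge step concrete, here is a minimal sketch on made-up street records, where the same base and zip appears with two different street types:
import pandas as pd

# invented rows: "WEST 4 ST" and "WEST 4 STREET" share the same base and zip,
# so the self-merge pairs them up and the filter keeps the differing types
toy = pd.DataFrame({"STREET BASE": ["WEST 4", "WEST 4"],
                    "STREET TYPE": ["ST", "STREET"],
                    "STREET BASE & ZIP": ["WEST 4 10012", "WEST 4 10012"]})
pairs = pd.merge(toy, toy, on="STREET BASE & ZIP", suffixes=[" 1", " 2"])
pairs = pairs[pairs["STREET TYPE 1"] != pairs["STREET TYPE 2"]]
print pairs[["STREET TYPE 1", "STREET TYPE 2"]]   # ST <-> STREET in both directions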