In [1]:
import pandas as pd

In [2]:
# Load the review table from the interim data directory into `review`.
review = pd.read_csv(
    filepath_or_buffer='../../../data/interim/US_cities_only/review_US.csv',
)

In [45]:
# Peek at the first five rows of `review`.
# NOTE(review): the stored output carries execution count 45, i.e. it was
# captured after the cleaning cells further down (counts 3 and 4) had run.
review.head(n=5)


Out[45]:
business_id cool date funny review_id stars text type useful user_id
0 2aFiy99vNLklCx3T_tGS9A 0 2011-10-10 0 NxL8SIC5yqOdnlXCg18IBg 5.0 If you enjoy service by someone who is as comp... review 0 KpkOkG6RIf4Ra25Lhhxf1A
1 2aFiy99vNLklCx3T_tGS9A 0 2010-12-29 0 pXbbIgOXvLuTi_SPs1hQEQ 5.0 After being on the phone with Verizon Wireless... review 1 bQ7fQq1otn9hKX-gXRsrgA
2 2aFiy99vNLklCx3T_tGS9A 0 2011-04-29 0 wslW2Lu4NYylb1jEapAGsw 5.0 Great service! Corey is very service oriented.... review 0 r1NUhdNmL6yU9Bn-Yx6FTw
3 2LfIuF3_sX6uwe-IR-P0jQ 1 2014-07-14 0 GP6YEearUWrzPtQYSF1vVg 5.0 Highly recommended. Went in yesterday looking ... review 0 aW3ix1KNZAvoM8q-WghA3Q
4 2LfIuF3_sX6uwe-IR-P0jQ 0 2014-01-15 0 25RlYGq2s5qShi-pn3ufVA 4.0 I walked in here looking for a specific piece ... review 0 YOo-Cip8HqvKp_p9nEGphw

In [44]:
# Print the row count, per-column dtypes and memory usage of `review`.
# NOTE(review): this cell's execution count (44) is later than the cleaning
# cells further down (3 and 4), and the stored output shows `date` as
# datetime64[ns]. On a fresh top-to-bottom run this cell executes before the
# date conversion, so the displayed dtypes may differ from the stored output.
review.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3457015 entries, 0 to 3457014
Data columns (total 10 columns):
business_id    object
cool           int64
date           datetime64[ns]
funny          int64
review_id      object
stars          float64
text           object
type           object
useful         int64
user_id        object
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 263.7+ MB

In [38]:
# Summary statistics of `review`; the stored output covers the numeric
# columns cool, funny, stars and useful.
review.describe()


Out[38]:
cool funny stars useful
count 3.457015e+06 3.457015e+06 3.457015e+06 3.457015e+06
mean 5.461839e-01 4.455893e-01 3.741661e+00 1.048886e+00
std 2.035434e+00 1.832262e+00 1.428476e+00 2.744492e+00
min 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00
50% 0.000000e+00 0.000000e+00 4.000000e+00 0.000000e+00
75% 0.000000e+00 0.000000e+00 5.000000e+00 1.000000e+00
max 5.130000e+02 6.320000e+02 5.000000e+00 1.125000e+03

In [3]:
# Cleaning the 'date' column: parse the stored values into pandas datetimes
# and write the converted Series back onto the same column.

review['date'] = review['date'].pipe(pd.to_datetime)

In [4]:
# Cleaning the 'useful' column
#
# Missing vote counts are replaced with 0 and the column is then cast to a
# 64-bit integer dtype. The cast is done with a vectorised `astype` rather
# than `map(int)`, which would call Python's `int` once per row. The dtype is
# spelled out as 'int64' so the result does not depend on the platform's
# default integer width.

review['useful'] = review['useful'].fillna(0).astype('int64')

In [13]:
# Reading clean vegas_three_years reviews and writing 2016 reviews
#
# The Vegas data is held in its own variables so that the `review` frame
# loaded and cleaned in the cells above is NOT rebound here; otherwise a
# top-to-bottom run would export this filtered subset under the
# `review_clean.csv` name in the final cell.

review_vegas = pd.read_csv('../../../data/interim/clean_US_cities/reviews_vegas_threeyears.csv')
review_vegas['date'] = pd.to_datetime(review_vegas['date'])

# Keep rows dated 2016-01-01 or later. Comparing against the first day of 2016
# (instead of "> 2015-12-31") also excludes any row that carries a time of day
# on 2015-12-31.
# NOTE(review): there is no upper bound, so rows dated after 2016 are kept too
# if the input contains any -- confirm that is intended for a "2016" file.
review_2016 = review_vegas.loc[review_vegas['date'] >= '2016-01-01']
review_2016.to_csv('../../../data/interim/clean_US_cities/2016_review.csv', encoding='utf-8', index=False)

In [46]:
# Writing clean 'review' dataframe to csv
# The frame currently bound to `review` is exported as UTF-8 without the
# index column.

review.to_csv(
    path_or_buf='../../../data/interim/clean_US_cities/review_clean.csv',
    index=False,
    encoding='utf-8',
)