In [1]:
import pandas as pd

In [2]:
# Load the US-only user table from the interim data directory.
user = pd.read_csv(
    filepath_or_buffer='../../../data/interim/US_cities_only/user_US.csv',
)

In [3]:
# Print the column names, non-null counts, dtypes and memory usage of `user`.
user.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901709 entries, 0 to 901708
Data columns (total 23 columns):
average_stars         901709 non-null float64
compliment_cool       901709 non-null int64
compliment_cute       901709 non-null int64
compliment_funny      901709 non-null int64
compliment_hot        901709 non-null int64
compliment_list       901709 non-null int64
compliment_more       901709 non-null int64
compliment_note       901709 non-null int64
compliment_photos     901709 non-null int64
compliment_plain      901709 non-null int64
compliment_profile    901709 non-null int64
compliment_writer     901709 non-null int64
cool                  901709 non-null int64
elite                 901709 non-null object
fans                  901709 non-null int64
friends               901709 non-null object
funny                 901709 non-null int64
name                  901535 non-null object
review_count          901709 non-null int64
type                  901709 non-null object
useful                901709 non-null int64
user_id               901709 non-null object
yelping_since         901709 non-null object
dtypes: float64(1), int64(16), object(6)
memory usage: 158.2+ MB

In [4]:
# Preview the first five rows to see how each column is formatted.
user.head(n=5)


Out[4]:
average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain ... elite fans friends funny name review_count type useful user_id yelping_since
0 3.59 4192 79 4192 3904 19 305 4705 1347 2617 ... [2017, 2015, 2016, 2014, 2011, 2013, 2012] 298 [iJg9ekPzF9lkMuvjKYX6uA, ctWAuzS04Xu0lke2Rop4l... 12316 Rob 761 user 18456 EZmocAborM6z66rTzeZxzQ 2009-09-12
1 4.29 144 11 144 64 1 4 97 24 129 ... [None] 34 [r2UUCzGxqI6WPsiWPgqG2A, qewG3X2O4X6JKskxyyqFw... 28 Vivian 80 user 117 myql3o3x22_ygECb8gVo7A 2009-06-27
2 4.15 36 1 36 14 2 10 21 4 23 ... [2011, 2016, 2017, 2013, 2014, 2015, 2012] 48 [qewG3X2O4X6JKskxyyqFwQ, -50XWnmQGqBgEI-9ANvLl... 6 Carol 841 user 58 FIk4lQQu1eTe2EpzQ4xhBA 2010-08-26
3 3.82 54 6 54 32 0 5 13 1 29 ... [2014, 2010, 2017, 2015, 2011, 2016, 2013, 201... 28 [AIhfuFmX62k7a22gXXAB2Q, kQ1jU2rDpFD_q54edEmwI... 51 Miss Maggie 376 user 42 ojovtd9c8GIeDiB8e0mq2w 2008-05-31
4 3.76 19 0 19 7 2 1 9 5 14 ... [2011, 2016, 2014, 2017, 2012, 2013, 2015] 9 [YZ4dz1QD-muThmGMYoTnjA, qewG3X2O4X6JKskxyyqFw... 0 Ramsey 194 user 5 TprC8sujz8MkwuomrqUSiw 2009-01-06

5 rows × 23 columns


In [5]:
# Cleaning 'elite' column
#
# Each raw cell is the text form of a list, e.g. "[2017, 2015, 2016]".
# Items are separated by ", ", so every token is stripped after splitting;
# otherwise all items but the first keep a leading space (' 2015').

def _split_elite(raw):
    """Turn one raw 'elite' cell into a list of trimmed string tokens.

    The first and last characters (the surrounding brackets) are dropped,
    the remainder is split on commas, and each token is stripped of
    surrounding whitespace. Empty tokens are discarded, so "[]" gives [].
    A value that is already a list is returned unchanged, which makes the
    cell safe to re-run.
    """
    if isinstance(raw, list):
        return raw
    tokens = (token.strip() for token in raw[1:-1].split(','))
    return [token for token in tokens if token]

# NOTE(review): cells shown as "[None]" in the preview still become ['None'];
# confirm whether downstream code expects that placeholder or an empty list.
user['elite'] = user['elite'].map(_split_elite)

In [6]:
# Cleaning 'friends' column
#
# Each raw cell is the text form of a list of user ids, e.g.
# "[iJg9ekPzF9lkMuvjKYX6uA, ctWAuzS04Xu0lke2Rop4lQ]". Items are separated
# by ", ", so every id is stripped after splitting; otherwise all ids but
# the first keep a leading space and will not match values in 'user_id'.

def _split_friends(raw):
    """Turn one raw 'friends' cell into a list of trimmed id strings.

    The first and last characters (the surrounding brackets) are dropped,
    the remainder is split on commas, and each id is stripped of
    surrounding whitespace. Empty tokens are discarded, so "[]" gives [].
    A value that is already a list is returned unchanged, which makes the
    cell safe to re-run.
    """
    if isinstance(raw, list):
        return raw
    ids = (friend_id.strip() for friend_id in raw[1:-1].split(','))
    return [friend_id for friend_id in ids if friend_id]

# NOTE(review): if users without friends are stored as "[None]", they still
# become ['None'] here; confirm what downstream code expects.
user['friends'] = user['friends'].map(_split_friends)

In [7]:
# Cleaning 'yelping_since' column
#
# Parse the date strings (e.g. "2009-09-12") into pandas datetimes.
user['yelping_since'] = user['yelping_since'].pipe(pd.to_datetime)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
user.describe()


Out[8]:
average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain compliment_profile compliment_writer cool fans funny review_count useful
count 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000 901709.000000
mean 3.728413 4.373300 0.310129 4.373300 3.259165 0.134816 0.467713 2.074431 1.353958 4.355089 0.315737 1.650038 22.794375 1.652420 17.350433 25.266857 33.363234
std 1.090164 95.099742 16.354968 95.099742 82.789833 14.106165 16.861228 60.809347 110.579144 102.713015 19.814286 37.273630 693.805448 14.378906 509.071391 84.945665 728.294151
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 3.170000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000
50% 3.890000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.000000 0.000000
75% 4.590000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 17.000000 3.000000
max 5.000000 28500.000000 13352.000000 28500.000000 23324.000000 12383.000000 13035.000000 45129.000000 79652.000000 35399.000000 13616.000000 14703.000000 195160.000000 4691.000000 170896.000000 11284.000000 186543.000000

In [9]:
# Writing clean 'user' dataframe to csv
#
# The row index is omitted and the file is encoded as UTF-8.
# NOTE(review): 'elite' and 'friends' hold Python lists at this point;
# presumably they are written as their string representation. Confirm that
# whatever reads this file parses them back.
user.to_csv(
    path_or_buf='../../../data/interim/clean_US_cities/user_clean.csv',
    index=False,
    encoding='utf-8',
)