In [1]:
import pandas as pd
In [2]:
# Load the interim US-only user table into the notebook-level `user` frame.
user = pd.read_csv(
    filepath_or_buffer='../../../data/interim/US_cities_only/user_US.csv',
)
In [3]:
# Print the frame summary: per-column non-null counts, dtypes and memory usage.
user.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901709 entries, 0 to 901708
Data columns (total 23 columns):
average_stars 901709 non-null float64
compliment_cool 901709 non-null int64
compliment_cute 901709 non-null int64
compliment_funny 901709 non-null int64
compliment_hot 901709 non-null int64
compliment_list 901709 non-null int64
compliment_more 901709 non-null int64
compliment_note 901709 non-null int64
compliment_photos 901709 non-null int64
compliment_plain 901709 non-null int64
compliment_profile 901709 non-null int64
compliment_writer 901709 non-null int64
cool 901709 non-null int64
elite 901709 non-null object
fans 901709 non-null int64
friends 901709 non-null object
funny 901709 non-null int64
name 901535 non-null object
review_count 901709 non-null int64
type 901709 non-null object
useful 901709 non-null int64
user_id 901709 non-null object
yelping_since 901709 non-null object
dtypes: float64(1), int64(16), object(6)
memory usage: 158.2+ MB
In [4]:
# Preview the first five rows; at this point 'elite' and 'friends' are still
# bracketed strings and 'yelping_since' is still an object column.
user.head()
Out[4]:
average_stars
compliment_cool
compliment_cute
compliment_funny
compliment_hot
compliment_list
compliment_more
compliment_note
compliment_photos
compliment_plain
...
elite
fans
friends
funny
name
review_count
type
useful
user_id
yelping_since
0
3.59
4192
79
4192
3904
19
305
4705
1347
2617
...
[2017, 2015, 2016, 2014, 2011, 2013, 2012]
298
[iJg9ekPzF9lkMuvjKYX6uA, ctWAuzS04Xu0lke2Rop4l...
12316
Rob
761
user
18456
EZmocAborM6z66rTzeZxzQ
2009-09-12
1
4.29
144
11
144
64
1
4
97
24
129
...
[None]
34
[r2UUCzGxqI6WPsiWPgqG2A, qewG3X2O4X6JKskxyyqFw...
28
Vivian
80
user
117
myql3o3x22_ygECb8gVo7A
2009-06-27
2
4.15
36
1
36
14
2
10
21
4
23
...
[2011, 2016, 2017, 2013, 2014, 2015, 2012]
48
[qewG3X2O4X6JKskxyyqFwQ, -50XWnmQGqBgEI-9ANvLl...
6
Carol
841
user
58
FIk4lQQu1eTe2EpzQ4xhBA
2010-08-26
3
3.82
54
6
54
32
0
5
13
1
29
...
[2014, 2010, 2017, 2015, 2011, 2016, 2013, 201...
28
[AIhfuFmX62k7a22gXXAB2Q, kQ1jU2rDpFD_q54edEmwI...
51
Miss Maggie
376
user
42
ojovtd9c8GIeDiB8e0mq2w
2008-05-31
4
3.76
19
0
19
7
2
1
9
5
14
...
[2011, 2016, 2014, 2017, 2012, 2013, 2015]
9
[YZ4dz1QD-muThmGMYoTnjA, qewG3X2O4X6JKskxyyqFw...
0
Ramsey
194
user
5
TprC8sujz8MkwuomrqUSiw
2009-01-06
5 rows × 23 columns
In [5]:
# Cleaning 'elite' column
def _parse_elite(raw):
    """Turn a bracketed, comma-separated string into a list of clean tokens.

    The raw values look like '[2017, 2015, 2016]' or '[None]'. The outer
    brackets are dropped, the remainder is split on commas, and every token
    is stripped of surrounding whitespace (the separator in the data is
    ', ', so a bare split(',') would leave ' 2015' with a leading space).
    Empty tokens are discarded, so '[]' yields [] rather than [''].

    The 'None' placeholder is kept as the single token 'None' so that any
    downstream check for it keeps working.

    Non-string values (e.g. a list produced by a previous run of this cell)
    are returned unchanged, which makes the cell safe to re-run.
    """
    if not isinstance(raw, str):
        return raw
    stripped_tokens = (token.strip() for token in raw[1:-1].split(','))
    return [token for token in stripped_tokens if token]


user['elite'] = user['elite'].map(_parse_elite)
In [6]:
# Cleaning 'friends' column
def _parse_friends(raw):
    """Turn a bracketed, comma-separated string of user ids into a list.

    The raw values look like '[iJg9ekPzF9lkMuvjKYX6uA, ctWAuzS04Xu0...]'.
    The outer brackets are dropped, the remainder is split on commas, and
    every id is stripped of surrounding whitespace (the separator in the
    data is ', ', so a bare split(',') would leave a leading space on every
    id after the first and break exact-match lookups against 'user_id').
    Empty tokens are discarded, so '[]' yields [] rather than [''].

    Any 'None' placeholder is kept as the single token 'None'.

    Non-string values (e.g. a list produced by a previous run of this cell)
    are returned unchanged, which makes the cell safe to re-run.
    """
    if not isinstance(raw, str):
        return raw
    stripped_ids = (friend_id.strip() for friend_id in raw[1:-1].split(','))
    return [friend_id for friend_id in stripped_ids if friend_id]


user['friends'] = user['friends'].map(_parse_friends)
In [7]:
# Cleaning 'yelping_since' column: parse the date strings into datetime values
user['yelping_since'] = pd.to_datetime(
    arg=user['yelping_since'],
)
In [8]:
# Summary statistics (count/mean/std/min/quartiles/max) for the numeric columns.
# NOTE(review): in the recorded run 'compliment_cool' and 'compliment_funny'
# show identical mean, std and max -- looks like one column duplicates the
# other in the source data; confirm before using both as features.
user.describe()
Out[8]:
average_stars
compliment_cool
compliment_cute
compliment_funny
compliment_hot
compliment_list
compliment_more
compliment_note
compliment_photos
compliment_plain
compliment_profile
compliment_writer
cool
fans
funny
review_count
useful
count
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
901709.000000
mean
3.728413
4.373300
0.310129
4.373300
3.259165
0.134816
0.467713
2.074431
1.353958
4.355089
0.315737
1.650038
22.794375
1.652420
17.350433
25.266857
33.363234
std
1.090164
95.099742
16.354968
95.099742
82.789833
14.106165
16.861228
60.809347
110.579144
102.713015
19.814286
37.273630
693.805448
14.378906
509.071391
84.945665
728.294151
min
1.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
25%
3.170000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
2.000000
0.000000
50%
3.890000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
5.000000
0.000000
75%
4.590000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
1.000000
17.000000
3.000000
max
5.000000
28500.000000
13352.000000
28500.000000
23324.000000
12383.000000
13035.000000
45129.000000
79652.000000
35399.000000
13616.000000
14703.000000
195160.000000
4691.000000
170896.000000
11284.000000
186543.000000
In [9]:
# Persist the cleaned 'user' frame as CSV (UTF-8, without the row index).
# NOTE: 'elite' and 'friends' hold Python lists at this point, so they are
# written as the string representation of those lists.
user.to_csv(
    path_or_buf='../../../data/interim/clean_US_cities/user_clean.csv',
    encoding='utf-8',
    index=False,
)
Content source: amlanlimaye/yelp-dataset-challenge
Similar notebooks: