In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
In [2]:
%mkdir data/movielens
mkdir: cannot create directory ‘data/movielens’: File exists
In [3]:
# Switch the kernel's working directory to the dataset folder so later cells
# can use short relative paths.
# NOTE(review): the path is relative, so this cell only works from the
# directory the notebook started in — re-running it after it has already
# succeeded would look for data/movielens inside data/movielens.
%cd data/movielens
/home/ubuntu/courses/kevin_files/data/movielens
In [4]:
import pandas as pd
In [5]:
# List the current directory to confirm what is available.
%ls
# Load the sample ratings table; the path is relative to the current working
# directory.
df = pd.read_csv('sample/ratings.csv')
sample/
In [6]:
# Preview the first five rows of the ratings table.
df.head(n=5)
Out[6]:
userId
movieId
rating
timestamp
0
1
31
2.5
1260759144
1
1
1029
3.0
1260759179
2
1
1061
3.0
1260759182
3
1
1129
2.0
1260759185
4
1
1172
4.0
1260759205
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the ratings.
rating_column = df['rating']
rating_column.describe()
Out[7]:
count 100004.000000
mean 3.543608
std 1.058064
min 0.500000
25% 3.000000
50% 4.000000
75% 4.000000
max 5.000000
Name: rating, dtype: float64
In [8]:
# Group the ratings rows by user. Despite its name, `userID` holds the grouped
# object returned by `groupby`, not a single id; the name is kept because the
# following cell calls `userID.describe()`.
userID = df.groupby(by='userId')
In [10]:
# Per-user summary statistics for each remaining column of the grouped table.
# NOTE(review): this also summarises movieId and timestamp, which look like
# identifiers / epoch values rather than quantities — consider restricting the
# summary to the 'rating' column (e.g. userID['rating'].describe()).
userID.describe()
Out[10]:
movieId
rating
timestamp
userId
1
count
20.000000
20.000000
2.000000e+01
mean
1675.600000
2.550000
1.260759e+09
std
804.800492
0.887041
3.367628e+01
min
31.000000
1.000000
1.260759e+09
25%
1240.250000
2.000000
1.260759e+09
50%
1357.000000
2.500000
1.260759e+09
75%
2160.750000
3.000000
1.260759e+09
max
3671.000000
4.000000
1.260759e+09
2
count
76.000000
76.000000
7.600000e+01
mean
354.828947
3.486842
8.353557e+08
std
177.427723
0.901753
2.125090e+02
min
10.000000
1.000000
8.353554e+08
25%
224.500000
3.000000
8.353555e+08
50%
356.500000
3.000000
8.353557e+08
75%
508.250000
4.000000
8.353558e+08
max
720.000000
5.000000
8.353562e+08
3
count
51.000000
51.000000
5.100000e+01
mean
8207.901961
3.568627
1.298890e+09
std
17414.493258
0.741752
3.131689e+04
min
60.000000
2.000000
1.298862e+09
25%
594.000000
3.000000
1.298862e+09
50%
2028.000000
3.500000
1.298863e+09
75%
4649.000000
4.000000
1.298922e+09
max
84236.000000
5.000000
1.298933e+09
4
count
204.000000
204.000000
2.040000e+02
mean
1721.514706
4.348039
9.498806e+08
std
887.823028
0.947616
5.713218e+04
min
10.000000
1.000000
9.497787e+08
25%
1118.000000
4.000000
9.498116e+08
50%
1887.500000
5.000000
9.498963e+08
...
...
...
...
...
668
std
1520.797537
1.292692
1.109447e+02
min
296.000000
1.000000
9.936132e+08
25%
996.750000
3.000000
9.936133e+08
50%
1424.000000
4.000000
9.936134e+08
75%
2997.750000
5.000000
9.936135e+08
max
6425.000000
5.000000
9.936135e+08
669
count
37.000000
37.000000
3.700000e+01
mean
2316.054054
3.351351
1.015829e+09
std
1093.798807
0.919427
2.184557e+02
min
223.000000
2.000000
1.015829e+09
25%
1304.000000
3.000000
1.015829e+09
50%
2702.000000
3.000000
1.015829e+09
75%
2976.000000
4.000000
1.015830e+09
max
4015.000000
5.000000
1.015830e+09
670
count
31.000000
31.000000
3.100000e+01
mean
1267.677419
3.806452
9.390611e+08
std
1110.821269
1.222548
7.366792e+05
min
1.000000
1.000000
9.387813e+08
25%
130.000000
3.000000
9.387820e+08
50%
1183.000000
4.000000
9.387822e+08
75%
2431.000000
5.000000
9.387823e+08
max
2912.000000
5.000000
9.409440e+08
671
count
115.000000
115.000000
1.150000e+02
mean
2840.860870
3.917391
1.064845e+09
std
1897.120259
0.763563
1.838197e+06
min
1.000000
1.000000
1.063501e+09
25%
1213.000000
3.500000
1.063504e+09
50%
2683.000000
4.000000
1.064891e+09
75%
4170.000000
4.000000
1.065112e+09
max
6565.000000
5.000000
1.074785e+09
5368 rows × 3 columns
In [ ]:
In [ ]:
Content source: kmclaugh/fastai_courses
Similar notebooks: