In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
%mkdir data/movielens


mkdir: cannot create directory ‘data/movielens’: File exists

In [3]:
# Switch the kernel's working directory to the dataset folder; the relative
# path passed to pd.read_csv further down is resolved against it.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel looks for data/movielens inside data/movielens.
%cd data/movielens


/home/ubuntu/courses/kevin_files/data/movielens

In [4]:
import pandas as pd

In [5]:
# List the working directory to confirm the sample/ folder is present.
%ls
# Load the sample ratings table (relative to the directory entered by %cd).
df = pd.read_csv('sample/ratings.csv')


sample/

In [6]:
# Preview the first five rows to check the column layout.
df.head(n=5)


Out[6]:
userId movieId rating timestamp
0 1 31 2.5 1260759144
1 1 1029 3.0 1260759179
2 1 1061 3.0 1260759182
3 1 1129 2.0 1260759185
4 1 1172 4.0 1260759205

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating column.
df.loc[:, 'rating'].describe()


Out[7]:
count    100004.000000
mean          3.543608
std           1.058064
min           0.500000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [8]:
# Group the ratings by user. Despite its name, `userID` holds a pandas GroupBy
# object keyed on the userId column, not an identifier.
userID = df.groupby(by='userId')

In [10]:
# Per-user summary statistics for every remaining numeric column.
# NOTE(review): this also summarises movieId and timestamp, where mean/std
# carry little meaning; consider userID['rating'].describe() if only the
# rating distribution per user is of interest.
userID.describe()


Out[10]:
movieId rating timestamp
userId
1 count 20.000000 20.000000 2.000000e+01
mean 1675.600000 2.550000 1.260759e+09
std 804.800492 0.887041 3.367628e+01
min 31.000000 1.000000 1.260759e+09
25% 1240.250000 2.000000 1.260759e+09
50% 1357.000000 2.500000 1.260759e+09
75% 2160.750000 3.000000 1.260759e+09
max 3671.000000 4.000000 1.260759e+09
2 count 76.000000 76.000000 7.600000e+01
mean 354.828947 3.486842 8.353557e+08
std 177.427723 0.901753 2.125090e+02
min 10.000000 1.000000 8.353554e+08
25% 224.500000 3.000000 8.353555e+08
50% 356.500000 3.000000 8.353557e+08
75% 508.250000 4.000000 8.353558e+08
max 720.000000 5.000000 8.353562e+08
3 count 51.000000 51.000000 5.100000e+01
mean 8207.901961 3.568627 1.298890e+09
std 17414.493258 0.741752 3.131689e+04
min 60.000000 2.000000 1.298862e+09
25% 594.000000 3.000000 1.298862e+09
50% 2028.000000 3.500000 1.298863e+09
75% 4649.000000 4.000000 1.298922e+09
max 84236.000000 5.000000 1.298933e+09
4 count 204.000000 204.000000 2.040000e+02
mean 1721.514706 4.348039 9.498806e+08
std 887.823028 0.947616 5.713218e+04
min 10.000000 1.000000 9.497787e+08
25% 1118.000000 4.000000 9.498116e+08
50% 1887.500000 5.000000 9.498963e+08
... ... ... ... ...
668 std 1520.797537 1.292692 1.109447e+02
min 296.000000 1.000000 9.936132e+08
25% 996.750000 3.000000 9.936133e+08
50% 1424.000000 4.000000 9.936134e+08
75% 2997.750000 5.000000 9.936135e+08
max 6425.000000 5.000000 9.936135e+08
669 count 37.000000 37.000000 3.700000e+01
mean 2316.054054 3.351351 1.015829e+09
std 1093.798807 0.919427 2.184557e+02
min 223.000000 2.000000 1.015829e+09
25% 1304.000000 3.000000 1.015829e+09
50% 2702.000000 3.000000 1.015829e+09
75% 2976.000000 4.000000 1.015830e+09
max 4015.000000 5.000000 1.015830e+09
670 count 31.000000 31.000000 3.100000e+01
mean 1267.677419 3.806452 9.390611e+08
std 1110.821269 1.222548 7.366792e+05
min 1.000000 1.000000 9.387813e+08
25% 130.000000 3.000000 9.387820e+08
50% 1183.000000 4.000000 9.387822e+08
75% 2431.000000 5.000000 9.387823e+08
max 2912.000000 5.000000 9.409440e+08
671 count 115.000000 115.000000 1.150000e+02
mean 2840.860870 3.917391 1.064845e+09
std 1897.120259 0.763563 1.838197e+06
min 1.000000 1.000000 1.063501e+09
25% 1213.000000 3.500000 1.063504e+09
50% 2683.000000 4.000000 1.064891e+09
75% 4170.000000 4.000000 1.065112e+09
max 6565.000000 5.000000 1.074785e+09

5368 rows × 3 columns


In [ ]:


In [ ]: