This demo is based on an example in the book Python for Data Analysis by William Wesley McKinney (O'Reilly), https://www.amazon.com/dp/1449319793/. Copyright 2012 William McKinney, ISBN 978-1-449-31979-3.
The data is downloadable from http://grouplens.org/datasets/movielens/
Set the environment variables SF_ACCOUNT, SF_USER and SF_PASSWORD for the Snowflake connection, and SF_S3_BUCKET, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to load the data from the given location, then run jupyter notebook.
Since a multi-character field delimiter is not supported, I had to transform the data first:
zcat users.dat.gz | perl -nle '@arr=split/::/; print join("\t", @arr)' | gzip -c > users.dat.tsv.gz
zcat movies.dat.gz | perl -nle '@arr=split/::/; print join("\t", @arr)' | native2ascii -encoding ISO-8859-1 | native2ascii -encoding utf8 -reverse | gzip -c > movies.dat.tsv.gz
zcat ratings.dat.gz | perl -nle '@arr=split/::/; print join("\t", @arr)' | gzip -c > ratings.dat.tsv.gz
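The same transformation could also be done in Python; a minimal sketch (assuming the .dat.gz files sit in the working directory, and ISO-8859-1 as the source encoding, which also covers the plain-ASCII users and ratings files):

import gzip

def to_tsv(src, dst, encoding='ISO-8859-1'):
    # Hypothetical helper: rewrite a ::-delimited .dat.gz file as a tab-separated,
    # UTF-8 encoded .tsv.gz file (roughly equivalent to the perl/native2ascii pipeline above).
    with gzip.open(src, 'rt', encoding=encoding) as fin, \
         gzip.open(dst, 'wt', encoding='utf-8') as fout:
        for line in fin:
            fout.write(line.rstrip('\n').replace('::', '\t') + '\n')

to_tsv('users.dat.gz', 'users.dat.tsv.gz')
to_tsv('movies.dat.gz', 'movies.dat.tsv.gz')
to_tsv('ratings.dat.gz', 'ratings.dat.tsv.gz')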
In [1]:
import os
account=os.getenv('SF_ACCOUNT')
user=os.getenv('SF_USER')
password=os.getenv('SF_PASSWORD')
In [2]:
from snowflake.connector import connect
con = connect(
    account=account,
    user=user,
    password=password,
    database='testdb',
    schema='public',
)
cur = con.cursor()
In [3]:
cur.execute("""
CREATE OR REPLACE SCHEMA movielense_1m
""")
Out[3]:
In [4]:
cur.execute("""
CREATE OR REPLACE TABLE users(
user_id integer,
gender varchar(1),
age integer,
occupation integer,
zip string)""")
cur.execute("""
CREATE OR REPLACE TABLE movies(
movie_id integer,
title string,
genres string)""")
cur.execute("""
CREATE OR REPLACE TABLE ratings(
user_id integer,
movie_id integer,
rating integer,
ts integer)""")
Out[4]:
In [5]:
aws_key=os.getenv("AWS_ACCESS_KEY_ID")
aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY")
sf_s3_bucket=os.getenv("SF_S3_BUCKET")
In [6]:
for tbl in ['users', 'ratings', 'movies']:
    cur.execute("""
    copy into {tbl} from s3://{s3_bucket}/movielense/1m/{tbl}.dat.tsv.gz
    credentials=(
        aws_key_id='{aws_key_id}'
        aws_secret_key='{aws_secret_key}'
    )
    file_format=(
        field_delimiter='\t'
        type='csv'
    )
    """.format(
        s3_bucket=sf_s3_bucket,
        tbl=tbl,
        aws_key_id=aws_key,
        aws_secret_key=aws_secret))
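Optionally, a quick row-count check verifies that all three tables loaded:

# Optional sanity check: print the number of rows loaded into each table.
for tbl in ['users', 'ratings', 'movies']:
    cur.execute("select count(*) from {tbl}".format(tbl=tbl))
    print(tbl, cur.fetchone()[0])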
In [7]:
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
engine = create_engine(URL(
account=account,
user=user,
password=password,
database='testdb',
schema='movielense_1m',
))
In [8]:
import pandas as pd
In [9]:
users = pd.read_sql_query("SELECT * FROM users", engine)
In [10]:
ratings = pd.read_sql_query("SELECT * FROM ratings", engine)
In [11]:
movies = pd.read_sql_query("SELECT * FROM movies", engine)
In [12]:
users[:5]
Out[12]:
In [13]:
ratings[:5]
Out[13]:
In [14]:
movies[:5]
Out[14]:
In [15]:
data = pd.merge(pd.merge(ratings, users), movies)
In [16]:
data
Out[16]:
In [17]:
data.iloc[0]
Out[17]:
In [18]:
mean_ratings = pd.pivot_table(data, values='rating', index='title', columns='gender', aggfunc='mean')
In [19]:
mean_ratings[:5]
Out[19]:
In [20]:
ratings_by_title = data.groupby('title').size()
In [21]:
ratings_by_title[:10]
Out[21]:
In [22]:
active_titles = ratings_by_title.index[ratings_by_title>=250]
In [23]:
active_titles
Out[23]:
In [24]:
mean_ratings = mean_ratings.loc[active_titles]
In [25]:
mean_ratings
Out[25]:
In [26]:
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
In [27]:
top_female_ratings[:10]
Out[27]:
In [28]:
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
In [29]:
sorted_by_diff = mean_ratings.sort_values(by='diff')
In [30]:
sorted_by_diff[:15]
Out[30]:
In [31]:
sorted_by_diff[::-1][:15]
Out[31]:
In [32]:
rating_std_by_title = data.groupby('title')['rating'].std()
In [33]:
rating_std_by_title = rating_std_by_title.loc[active_titles]
In [34]:
rating_std_by_title.sort_values(ascending=False)[:10]
Out[34]:
In [ ]:
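Optionally, the cursor, connection and SQLAlchemy engine can be closed at the end:

# Release the Snowflake and SQLAlchemy resources when finished.
cur.close()
con.close()
engine.dispose()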