In [1]:
import numpy as np
import pandas as pd
from faker import Faker
import os
In [2]:
# One seed shared by Faker and NumPy so that a fresh "Restart & Run All"
# regenerates exactly the same fake names, URIs and ratings.
SEED = 23

fake = Faker()
# Recent Faker releases raise TypeError when `.seed()` is called on an
# instance; `seed_instance()` seeds this particular generator instead.
fake.seed_instance(SEED)
np.random.seed(SEED)

num_users = 100  # number of fake users to generate
num_docs = 1000  # number of fake documents each user may rate
In [3]:
def generate_ratings(num_users, num_docs, p_na_min, p_na_max, kind="5Star"):
    """Generate random ratings of fake documents by fake users.

    Every user rates the same ``num_docs`` documents; a randomly sized,
    randomly placed share of each user's ratings is then replaced by NaN to
    mimic documents the user has not rated.

    :param num_users: number of users to generate
    :param num_docs: number of documents to rate
    :param p_na_min: min fraction (0..1) of NaN ratings per user
    :param p_na_max: max fraction (0..1) of NaN ratings per user
    :param kind: kind of rating scheme, either "5Star" (1 Star ... 5 Stars,
        drawn uniformly) or "binary" (1 = like with p=0.8, -1 = dislike
        with p=0.2)
    :return user_ratings: dictionary with the key "doc_uri" holding the list
        of document URIs plus one key per user name holding that user's list
        of ratings (NaN where unrated)
    :raises NotImplementedError: if ``kind`` is not a supported scheme
    :raises ValueError: if the NaN bounds do not satisfy
        0 <= min <= max <= num_docs after conversion to counts
    """
    # Fail fast, before any random numbers are consumed, instead of hitting
    # an undefined `ratings` variable deep inside the loop.
    if kind not in ("5Star", "binary"):
        raise NotImplementedError("Unsupported rating kind: {!r}".format(kind))

    # Bounds (inclusive) for the number of NaN entries per user.
    na_low = int(num_docs * p_na_min)
    na_high = int(num_docs * p_na_max)
    if not 0 <= na_low <= na_high <= num_docs:
        raise ValueError(
            "NaN bounds must satisfy 0 <= p_na_min <= p_na_max <= 1, "
            "got p_na_min={!r}, p_na_max={!r}".format(p_na_min, p_na_max)
        )

    # Generate fake URIs
    user_ratings = {"doc_uri": [fake.uri() for _ in range(num_docs)]}

    # Generate ratings
    for _ in range(num_users):
        if kind == "5Star":
            # Discrete uniform over 1..5 (upper bound of randint is exclusive).
            ratings = np.random.randint(1, 6, size=num_docs).tolist()
        else:  # "binary"
            ratings = np.random.choice([1, -1], num_docs, p=[.8, .2]).tolist()

        # Mask a random subset of the ratings as "not rated".
        num_na = np.random.randint(na_low, na_high + 1)
        random_ixs = np.random.choice(range(num_docs), size=num_na, replace=False)
        for i in random_ixs:
            ratings[i] = np.nan  # np.NaN alias was removed in NumPy 2.0

        # Generate fake user; redraw on a name collision so that an earlier
        # user is not silently overwritten and exactly num_users are returned.
        name = fake.name()
        while name in user_ratings:
            name = fake.name()
        user_ratings[name] = ratings
    return user_ratings
# Binary (like/dislike) ratings in which every user leaves between 70% and
# 95% of the documents unrated.
user_ratings = generate_ratings(
    num_users=num_users,
    num_docs=num_docs,
    p_na_min=0.7,
    p_na_max=0.95,
    kind="binary",
)
In [4]:
# One column per dictionary key (i.e. per user); the document URIs become
# the row index.
df = (
    pd.DataFrame.from_dict(user_ratings, orient="columns")
    .set_index("doc_uri")
)
df.head()
Out[4]:
In [5]:
# Pick the file name matching the rating scheme generated above.
#f_name = "petdata_1000_100.csv" # 5 Star ratings
f_name = "petdata_binary_1000_100.csv" # binary ratings
data_dir = "../data"
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (no-op if it is already there).
os.makedirs(data_dir, exist_ok=True)
path = os.path.join(data_dir, f_name)
df.to_csv(path)
In [ ]: