Generate Pet Dataset

Imports


In [1]:
import numpy as np
import pandas as pd
from faker import Faker
import os

Pet dataset

Assumes...

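  • Discrete uniform distribution of ratings per user for the 5-star scheme; binary ratings are drawn as like (1) with probability 0.8 and dislike (-1) with probability 0.2
  • Each user rated only a fraction of the documents: with the settings used below, 70–95% of each user's ratings are missing (NaN)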

Presets


In [2]:
# Seed both random sources so the generated dataset is reproducible.
# Seeding goes through the Faker class: current Faker releases reject the
# instance-level call `fake.seed(...)` with a TypeError.
Faker.seed(23)
fake = Faker()
np.random.seed(23)

num_users = 100  # number of fake users (one column each in the dataset)
num_docs = 1000  # number of fake documents (one row each in the dataset)

Generate fake Ratings


In [3]:
def generate_ratings(num_users, num_docs, p_na_min, p_na_max, kind="5Star"):
    """ Generate random user ratings for a set of fake documents.

    Uses the notebook-level ``fake`` Faker instance for document URIs and user
    names, and NumPy's global random state for ratings and missing-value masks.

    :param num_users:     number of users to generate
    :param num_docs:      number of documents to rate
    :param p_na_min:      min fraction (0..1) of NaN ratings per user
    :param p_na_max:      max fraction (0..1) of NaN ratings per user
    :param kind:          kind of rating scheme, either "5Star" (1 Star ... 5 Stars) or "binary" (like = 1 / dislike = -1)
    :return user_ratings: dictionary with the key "doc_uri" (list of document URIs) plus one key per
                          user name holding that user's list of ratings (NaN where the user gave no rating)
    :raises NotImplementedError: if ``kind`` is not a supported rating scheme
    """
    # Fail fast on an unknown scheme instead of continuing with undefined ratings
    if kind not in ("5Star", "binary"):
        raise NotImplementedError("Unsupported rating kind: {!r}".format(kind))

    # Generate fake URIs
    user_ratings = {"doc_uri": [fake.uri() for _ in range(num_docs)]}

    # Bounds for the number of missing ratings per user
    na_low = int(num_docs * p_na_min)
    na_high = int(num_docs * p_na_max) + 1  # randint's upper bound is exclusive

    # Generate ratings
    for _ in range(num_users):
        if kind == "5Star":
            # discrete uniform ratings over 1..5 stars (upper bound is exclusive)
            ratings = np.random.randint(1, 6, size=num_docs).tolist()
        else:
            # binary: like (1) with probability 0.8, dislike (-1) with probability 0.2
            ratings = np.random.choice([1, -1], num_docs, p=[.8, .2]).tolist()

        # Blank out a random subset of documents the user did not rate
        num_na = np.random.randint(na_low, na_high)
        random_ixs = np.random.choice(num_docs, size=num_na, replace=False)  # mask
        for i in random_ixs:
            ratings[i] = np.nan

        # Generate fake user; redraw on a name clash so that no user is overwritten
        name = fake.name()
        while name in user_ratings:
            name = fake.name()
        user_ratings[name] = ratings

    return user_ratings

# Binary (like/dislike) ratings in which 70%-95% of each user's entries are missing
user_ratings = generate_ratings(
    num_users=num_users,
    num_docs=num_docs,
    p_na_min=0.7,
    p_na_max=0.95,
    kind="binary",
)

Dataframe


In [4]:
# One column per user, one row per document; the URI column becomes the row index
df = pd.DataFrame(user_ratings).set_index(keys="doc_uri")
df.head()


Out[4]:
Aaron Keith III Aaron Mills Abigail Wong Adam Ramirez Adam Rogers Adam Williams Albert Paul Alexis Levy Alicia Garcia Alicia Wiley ... Tina Fisher Tonya Long Travis Montgomery Travis Montoya Veronica Jackson Veronica Walker Victoria Perez William Carpenter William Vaughn Zachary Miles
doc_uri
http://www.vargas.biz/login.php NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN ... NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN
http://wallace-walker.info/index/ NaN 1.0 NaN NaN -1.0 NaN 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
http://www.jimenez.biz/ NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 1.0 NaN NaN NaN 1.0 NaN NaN NaN
http://www.logan.com/about.html 1.0 NaN 1.0 NaN NaN NaN NaN -1.0 NaN 1.0 ... NaN NaN NaN NaN NaN 1.0 NaN -1.0 NaN 1.0
http://cox.org/list/tag/faq.html NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 1.0 NaN NaN NaN NaN 1.0 NaN NaN NaN 1.0

5 rows × 100 columns

Persist dataset


In [5]:
# Alternative file name for a 5-star dataset: "petdata_1000_100.csv"
f_name = "petdata_binary_1000_100.csv" # binary ratings
path = os.path.join("../data", f_name)

# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(path), exist_ok=True)
df.to_csv(path)

In [ ]: