Source code for spotlight.datasets.synthetic

"""
Module containing functions for generating synthetic
datasets with known properties, for model testing and
experimentation.
"""

import numpy as np

from spotlight.interactions import Interactions


def _build_transition_matrix(num_items,
                             concentration_parameter,
                             random_state,
                             atol=0.001):
    """
    Sample an (approximately) doubly-stochastic transition matrix.

    A ``num_items x num_items`` matrix is drawn row by row from a
    symmetric Dirichlet distribution and then repeatedly rescaled,
    columns first and rows second, until every row sum and every
    column sum is within ``atol`` of one.

    Parameters
    ----------

    num_items: int
        number of states; the matrix has this many rows and columns
    concentration_parameter: float
        value repeated ``num_items`` times to form the Dirichlet
        concentration vector
    random_state: numpy.random.RandomState
        source of randomness for the Dirichlet draw
    atol: float, optional
        absolute tolerance on the row and column sums

    Returns
    -------

    matrix: np.ndarray
        the rescaled matrix. At most 100 rescaling rounds are run, so
        the tolerance is not guaranteed to be met on return.
    """

    max_rounds = 100

    alpha = np.repeat(concentration_parameter, num_items)
    matrix = random_state.dirichlet(alpha, num_items)

    rounds_done = 0
    while rounds_done < max_rounds:

        column_error = np.abs(1.0 - matrix.sum(axis=0))
        row_error = np.abs(1.0 - matrix.sum(axis=1))

        if np.all(column_error < atol) and np.all(row_error < atol):
            break

        # One balancing round: make the columns sum to one, then the
        # rows. Rows are fixed last, so they are the better-normalised
        # axis if the loop runs out of rounds.
        matrix = matrix / matrix.sum(axis=0)
        matrix = matrix / matrix.sum(axis=1)[:, np.newaxis]

        rounds_done += 1

    return matrix


def _generate_sequences(num_steps,
                        transition_matrix,
                        order,
                        random_state):
    """
    Simulate a Markov chain of the given order.

    The next state is drawn by inverse-CDF sampling from the average
    of the cumulative transition rows of the last ``order`` states.

    Parameters
    ----------

    num_steps: int
        number of states to generate
    transition_matrix: np.ndarray
        square matrix whose rows hold the transition probabilities of
        each state; it is not modified
    order: int
        number of most recent states that determine the next one
    random_state: numpy.random.RandomState
        source of randomness

    Returns
    -------

    sequence: np.ndarray of int32
        the generated states, of length ``num_steps``
    """

    state_count = transition_matrix.shape[0]
    highest_state = state_count - 1

    # Row-wise cumulative sums turn each probability row into a CDF.
    cdf = np.cumsum(transition_matrix, axis=1)

    # Keep this draw order (uniforms, then the initial window) so a
    # seeded random_state reproduces the same sequence.
    uniforms = random_state.rand(num_steps)
    window = random_state.randint(state_count, size=order,
                                  dtype=np.int64)

    sequence = np.empty(num_steps, dtype=np.int32)

    for step, uniform in enumerate(uniforms):

        blended_cdf = cdf[window].mean(axis=0)

        # Clamp in case the draw exceeds the last cumulative value
        # (rows that sum to slightly less than one).
        drawn = np.searchsorted(blended_cdf, uniform)
        if drawn > highest_state:
            drawn = highest_state

        # Slide the history window one step and record the new state.
        window[:-1] = window[1:]
        window[-1] = drawn

        sequence[step] = drawn

    return sequence


def generate_sequential(num_users=100,
                        num_items=1000,
                        num_interactions=10000,
                        concentration_parameter=0.1,
                        order=3,
                        random_state=None):
    """
    Generate a dataset of user-item interactions where sequential
    information matters.

    The interactions are generated by a n-th order Markov chain with
    a uniform stationary distribution, where transition probabilities
    are given by doubly-stochastic transition matrix. For n-th order
    chains, transition probabilities are a convex combination of the
    transition probabilities of the last n states in the chain.

    The transition matrix is sampled from a Dirichlet distribution
    described by a constant concentration parameter. Concentration
    parameters closer to zero generate more predictable sequences.

    Parameters
    ----------

    num_users: int, optional
        number of users in the dataset
    num_items: int, optional
        number of items (Markov states) in the dataset
    num_interactions: int, optional
        number of interactions to generate
    concentration_parameter: float, optional
        Controls how predictable the sequence is. Values
        closer to zero give more predictable sequences.
    order: int, optional
        order of the Markov chain
    random_state: numpy.random.RandomState, optional
        random state used to generate the data

    Returns
    -------

    Interactions: :class:`spotlight.interactions.Interactions`
        instance of the interactions class

    Notes
    -----

    The chain is built over ``num_items - 1`` states and the generated
    ids are shifted up by one, so item id 0 is never produced
    (presumably it is reserved, e.g. for padding; confirm against the
    consumers of :class:`Interactions`).
    """

    if random_state is None:
        random_state = np.random.RandomState()

    transition_matrix = _build_transition_matrix(
        num_items - 1,
        concentration_parameter,
        random_state)

    # Sorting the user ids keeps each user's interactions contiguous.
    user_ids = np.sort(random_state.randint(0,
                                            num_users,
                                            num_interactions,
                                            dtype=np.int32))
    # States are 0-based; shift them so that item ids start at 1.
    item_ids = _generate_sequences(num_interactions,
                                   transition_matrix,
                                   order,
                                   random_state) + 1
    # Timestamps simply follow generation order.
    timestamps = np.arange(len(user_ids), dtype=np.int32)
    ratings = np.ones(len(user_ids), dtype=np.float32)

    return Interactions(user_ids,
                        item_ids,
                        ratings=ratings,
                        timestamps=timestamps,
                        num_users=num_users,
                        num_items=num_items)