# Source code for spotlight.interactions

"""
Classes describing datasets of user-item interactions. Instances of these
are returned by dataset-fetching and dataset-processing functions.
"""

import numpy as np

import scipy.sparse as sp


def _sliding_window(tensor, window_size, step_size=1):

    for i in range(len(tensor), 0, -step_size):
        yield tensor[max(i - window_size, 0):i]


def _generate_sequences(user_ids, item_ids,
                        indices,
                        max_sequence_length,
                        step_size):
    """
    Yield ``(user_id, subsequence)`` pairs, one user at a time.

    ``indices[k]`` is the offset in ``item_ids`` at which the items of
    ``user_ids[k]`` begin; that user's items run up to the next offset, or
    to the end of ``item_ids`` for the last user. Each user's slice is
    broken into windows by ``_sliding_window``.
    """
    last_position = len(indices) - 1

    for position, start_idx in enumerate(indices):

        # The final user has no following offset: slice to the end.
        stop_idx = indices[position + 1] if position < last_position else None

        user_items = item_ids[start_idx:stop_idx]

        for window in _sliding_window(user_items,
                                      max_sequence_length,
                                      step_size):

            yield (user_ids[position], window)


class Interactions(object):
    """
    Interactions object. Contains (at a minimum) pair of user-item
    interactions, but can also be enriched with ratings, timestamps,
    and interaction weights.

    For *implicit feedback* scenarios, user ids and item ids should
    only be provided for user-item pairs where an interaction was
    observed. All pairs that are not provided are treated as missing
    observations, and often interpreted as (implicit) negative
    signals.

    For *explicit feedback* scenarios, user ids, item ids, and
    ratings should be provided for all user-item-rating triplets
    that were observed in the dataset.

    Parameters
    ----------

    user_ids: array of np.int32
        array of user ids of the user-item pairs
    item_ids: array of np.int32
        array of item ids of the user-item pairs
    ratings: array of np.float32, optional
        array of ratings
    timestamps: array of np.int32, optional
        array of timestamps
    weights: array of np.float32, optional
        array of weights
    num_users: int, optional
        Number of distinct users in the dataset.
        Must be larger than the maximum user id
        in user_ids.
    num_items: int, optional
        Number of distinct items in the dataset.
        Must be larger than the maximum item id
        in item_ids.

    Attributes
    ----------

    user_ids: array of np.int32
        array of user ids of the user-item pairs
    item_ids: array of np.int32
        array of item ids of the user-item pairs
    ratings: array of np.float32, optional
        array of ratings
    timestamps: array of np.int32, optional
        array of timestamps
    weights: array of np.float32, optional
        array of weights
    num_users: int, optional
        Number of distinct users in the dataset.
    num_items: int, optional
        Number of distinct items in the dataset.

    Raises
    ------

    ValueError
        If an id is not smaller than the declared number of users/items,
        or if any provided array differs in length from ``user_ids``.
    """

    def __init__(self, user_ids, item_ids,
                 ratings=None,
                 timestamps=None,
                 weights=None,
                 num_users=None,
                 num_items=None):

        # When a count is not given, infer it from the largest id
        # (ids are used as zero-based matrix indices in tocoo()).
        self.num_users = num_users or int(user_ids.max() + 1)
        self.num_items = num_items or int(item_ids.max() + 1)

        self.user_ids = user_ids
        self.item_ids = item_ids
        self.ratings = ratings
        self.timestamps = timestamps
        self.weights = weights

        self._check()

    def __repr__(self):

        return ('<Interactions dataset ({num_users} users x {num_items} items '
                'x {num_interactions} interactions)>'
                .format(
                    num_users=self.num_users,
                    num_items=self.num_items,
                    num_interactions=len(self)
                ))

    def __len__(self):
        """
        Number of interactions (length of ``user_ids``).
        """

        return len(self.user_ids)

    def _check(self):
        """
        Validate id ranges and that all provided arrays have equal length.
        """

        if self.user_ids.max() >= self.num_users:
            raise ValueError('Maximum user id greater '
                             'than declared number of users.')
        if self.item_ids.max() >= self.num_items:
            raise ValueError('Maximum item id greater '
                             'than declared number of items.')

        num_interactions = len(self.user_ids)

        for name, value in (('item IDs', self.item_ids),
                            ('ratings', self.ratings),
                            ('timestamps', self.timestamps),
                            ('weights', self.weights)):

            # Optional arrays that were not supplied are skipped.
            if value is None:
                continue

            if len(value) != num_interactions:
                raise ValueError('Invalid {} dimensions: length '
                                 'must be equal to number of interactions'
                                 .format(name))

    @staticmethod
    def _min_length_mask(sequences, min_sequence_length):
        """
        Boolean mask of the rows of ``sequences`` that hold at least
        ``min_sequence_length`` non-padding (non-zero) entries.
        """

        # Counting non-zero entries (instead of probing the single column
        # ``-min_sequence_length``) stays correct at the edges: 0 or a
        # negative minimum keeps every row, and a minimum larger than the
        # row width keeps none instead of raising IndexError.
        return (sequences != 0).sum(axis=1) >= min_sequence_length

    def tocoo(self):
        """
        Transform to a scipy.sparse COO matrix.

        Rows are users, columns are items. Entries are the ratings when
        available, and ones otherwise.
        """

        row = self.user_ids
        col = self.item_ids
        data = self.ratings if self.ratings is not None else np.ones(len(self))

        return sp.coo_matrix((data, (row, col)),
                             shape=(self.num_users, self.num_items))

    def tocsr(self):
        """
        Transform to a scipy.sparse CSR matrix.
        """

        return self.tocoo().tocsr()

    def to_sequence(self, max_sequence_length=10, min_sequence_length=None,
                    step_size=None):
        """
        Transform to sequence form.

        User-item interaction pairs are sorted by their timestamps,
        and sequences of up to max_sequence_length events are arranged
        into a (zero-padded from the left) matrix with dimensions
        (num_sequences x max_sequence_length).

        Valid subsequences of users' interactions are returned. For
        example, if a user interacted with items [1, 2, 3, 4, 5], the
        returned interactions matrix at sequence length 5 and step size
        1 will be given by:

        .. code-block:: python

           [[1, 2, 3, 4, 5],
            [0, 1, 2, 3, 4],
            [0, 0, 1, 2, 3],
            [0, 0, 0, 1, 2],
            [0, 0, 0, 0, 1]]

        At step size 2:

        .. code-block:: python

           [[1, 2, 3, 4, 5],
            [0, 0, 1, 2, 3],
            [0, 0, 0, 0, 1]]

        Parameters
        ----------

        max_sequence_length: int, optional
            Maximum sequence length. Subsequences shorter than this
            will be left-padded with zeros.
        min_sequence_length: int, optional
            If set, only sequences with at least min_sequence_length
            non-padding elements will be returned. Values of zero or
            less keep every sequence; values larger than
            max_sequence_length leave no sequences.
        step_size: int, optional
            The returned subsequences are the effect of moving a
            sliding window over the input. This parameter
            governs the stride of that window. Increasing it will
            result in fewer subsequences being returned. Defaults to
            max_sequence_length.

        Returns
        -------

        sequence interactions: :class:`~SequenceInteractions`
            The resulting sequence interactions.

        Raises
        ------

        ValueError
            If timestamps are not available, or if 0 is used as an
            item id (it is reserved as the padding value).
        """

        if self.timestamps is None:
            raise ValueError('Cannot convert to sequences, '
                             'timestamps not available.')

        if 0 in self.item_ids:
            raise ValueError('0 is used as an item id, conflicting '
                             'with the sequence padding value.')

        if step_size is None:
            step_size = max_sequence_length

        # Sort first by user id, then by timestamp
        sort_indices = np.lexsort((self.timestamps,
                                   self.user_ids))

        user_ids = self.user_ids[sort_indices]
        item_ids = self.item_ids[sort_indices]

        # After sorting, each user's interactions are contiguous:
        # ``indices`` holds where each user's run starts, ``counts``
        # how long it is.
        user_ids, indices, counts = np.unique(user_ids,
                                              return_index=True,
                                              return_counts=True)

        # A user with c interactions contributes ceil(c / step_size) windows.
        num_subsequences = int(np.ceil(counts / float(step_size)).sum())

        sequences = np.zeros((num_subsequences, max_sequence_length),
                             dtype=np.int32)
        sequence_users = np.empty(num_subsequences,
                                  dtype=np.int32)

        for i, (uid, seq) in enumerate(_generate_sequences(user_ids,
                                                           item_ids,
                                                           indices,
                                                           max_sequence_length,
                                                           step_size)):
            # Right-align the window so the padding sits on the left.
            sequences[i][-len(seq):] = seq
            sequence_users[i] = uid

        if min_sequence_length is not None:
            long_enough = self._min_length_mask(sequences,
                                                min_sequence_length)
            sequences = sequences[long_enough]
            sequence_users = sequence_users[long_enough]

        return (SequenceInteractions(sequences,
                                     user_ids=sequence_users,
                                     num_items=self.num_items))
class SequenceInteractions(object):
    """
    Interactions encoded as a sequence matrix.

    Parameters
    ----------

    sequences: array of np.int32 of shape (num_sequences x max_sequence_length)
        The interactions sequence matrix, as produced by
        :func:`~Interactions.to_sequence`
    user_ids: array, optional
        Stored as given. :func:`~Interactions.to_sequence` passes the
        user id of each row of ``sequences``.
    num_items: int, optional
        The number of distinct items in the data. When omitted it is
        inferred as the largest value in ``sequences`` plus one.

    Attributes
    ----------

    sequences: array of np.int32 of shape (num_sequences x max_sequence_length)
        The interactions sequence matrix, as produced by
        :func:`~Interactions.to_sequence`
    user_ids: array or None
        The ``user_ids`` argument, unchanged.
    max_sequence_length: int
        Number of columns of ``sequences``.
    num_items: int
        The number of distinct items in the data.
    """

    def __init__(self, sequences, user_ids=None, num_items=None):

        self.sequences = sequences
        self.user_ids = user_ids
        self.max_sequence_length = sequences.shape[1]

        if num_items is None:
            # Convert to a plain Python int (not a numpy scalar) before
            # adding one, matching how Interactions infers its counts.
            self.num_items = int(sequences.max()) + 1
        else:
            self.num_items = num_items

    def __repr__(self):

        num_sequences, sequence_length = self.sequences.shape

        return ('<Sequence interactions dataset ({num_sequences} '
                'sequences x {sequence_length} sequence length)>'
                .format(
                    num_sequences=num_sequences,
                    sequence_length=sequence_length,
                ))