def triu(arr, value=np.nan, k=0, copy=False):
    """
    Upper triangle of an array.

    Modifies input array, such that its elements below the k-th diagonal
    are filled with value (returns modified copy, when requested instead).
    Assumes that the input array is 2D.

    Parameters
    ----------
    arr : array_like
        The 2D array to be modified.
    value : float
        The value to fill with, by default nan.
    k : int, optional
        Diagonal, below which to fill, by default 0.
    copy : bool, optional
        If True, a copy of arr is returned, False - modified in place.

    Returns
    -------
    ndarray
        The filled array.

    Notes
    -----
    Similar to numpy.triu, but allows for a custom value to be filled.
    This solution was borrowed from
    https://stackoverflow.com/questions/33029129/make-a-numpy-upper-triangular-matrix-padded-with-nan-instead-of-zero
    It is quicker than the naive one based on numpy.tril_indices.
    """
    if copy:
        arr = arr.copy()
    m, n = arr.shape
    _tril_mask = np.arange(m)[:,None] > np.arange(n) - k
    arr[_tril_mask] = value
    return arr

from scipy.sparse import issparse
def _snip(matrix, offset, bad_bins_masks, min_diag, bbox):
    """
    Extract snippet from the matrix

    Parameters
    ----------
    matrix : matrix-like
        2D matrix from which bbox will be extracted.
        Supports dense, sparse and lazy subscriptable interfaces
    offset : (int, int)
        2D offset of the matrix: (offset1, offset2)
    bad_bins : ( ndarray[bool], ndarray[bool])
        Bad-bins masks for the matrix. Corresponding values are
        filled with nans.
    min_diag : int
        Diagonal of the matrix below which values are masked with nans.
        It takes offset into account: offset2 - offset1 is its main diagonal
    bbox : (int, int, int, int)
        Bounding box of the snippet to be extracted: (lo1, hi1, lo2, hi2)
        Indices are relative to the matrix.

    Returns
    -------
    np.array
        Requested snippet.
    """

    lo1, hi1, lo2, hi2 = bbox
    offset1, offset2 = offset
    bad_bins1, bad_bins2 = bad_bins_masks

    m, n = matrix.shape
    dm, dn = hi1 - lo1, hi2 - lo2

    # when snippet is out of bound, return empty snippet
    if lo1 < 0 or lo2 < 0 or hi1 > m or hi2 > n:
        return np.full((dm, dn), np.nan)
    # TODO implement proper padding instead

    if issparse(matrix):
        snippet = matrix[lo1:hi1, lo2:hi2].toarray()
    else:
        snippet = matrix[lo1:hi1, lo2:hi2].copy()

    # fill bad_bins with nans
    snippet[bad_bins1[lo1:hi1], :] = np.nan
    snippet[:, bad_bins2[lo2:hi2]] = np.nan

    # mask tril diagonals of the snippet when requested:
    if min_diag is not None:
        # low-left diagonal of the snip
        lowleft_diag_snip = (lo2 + offset2) - (hi1 + offset1)
        # number of lowleft diags to mask, if there are any:
        if (num_tril_diags := min_diag - lowleft_diag_snip) > 0:
            diag_below_which_nan = num_tril_diags - dm
            snippet = triu( snippet, k=diag_below_which_nan )

    return snippet


def array_to_csr_data(arr, sparse_arr):
    """
    Extract elements of an array corresponding to non-zero (nnz) elements
    of a reference sparse array sparse_arr.

    arr has to have the same shape as sparse_arr. It can be dense
    or any other 2D subscriptable object, e.g. LazyToeplitz.

    Parameters
    ----------
    arr : array-like
        2D array/matrix oe subscriptable object
    sparse_arr : scipy.sparse.csr_matrix
        CSR matrix to derive nnz indices from
    
    Returns
    -------
    arr_data : ndarray
        array of the same length as sparse_arr.data

    Notes
    -----
    Such a row-by-row extraction from arr seems like a good compromise
    for performance/memory usage. Alternatives include fully dense version,
    and using COO
    """
    # assuming arr.shape == sparse_arr.shape
    num_rows = sparse_arr.shape[0]

    arr_data = np.empty_like(sparse_arr.data)
    # extract arr values matching sparse_arr nnz values, row by row
    for i in range(num_rows):
        lo, hi = sparse_arr.indptr[i], sparse_arr.indptr[i+1]
        # determine col indices, and append if there are any
        js = sparse_arr.indices[lo:hi]
        # arr @ i-th row, js cols
        arr_data[lo:hi] = arr[i].squeeze()[js]
    
    return arr_data