Skip to content

Instantly share code, notes, and snippets.

@sergpolly
Created February 14, 2024 14:50
Show Gist options
  • Save sergpolly/76566fd8d559ffad5c5f1c19c5660a1a to your computer and use it in GitHub Desktop.
Save sergpolly/76566fd8d559ffad5c5f1c19c5660a1a to your computer and use it in GitHub Desktop.
helper functions for snipping module in cooltools
def triu(arr, value=np.nan, k=0, copy=False):
"""
Upper triangle of an array.
Modifies input array, such that its elements below the k-th diagonal
are filled with value (returns modified copy, when requested instead).
Assumes that the input array is 2D.
Parameters
----------
arr : array_like
The 2D array to be modified.
value : float
The value to fill with, by default nan.
k : int, optional
Diagonal, below which to fill, by default 0.
copy : bool, optional
If True, a copy of arr is returned, False - modified in place.
Returns
-------
ndarray
The filled array.
Notes
-----
Similar to numpy.triu, but allows for a custom value to be filled.
This solution was borrowed from
https://stackoverflow.com/questions/33029129/make-a-numpy-upper-triangular-matrix-padded-with-nan-instead-of-zero
It is quicker than the naive one based on numpy.tril_indices.
"""
if copy:
arr = arr.copy()
m, n = arr.shape
_tril_mask = np.arange(m)[:,None] > np.arange(n) - k
arr[_tril_mask] = value
return arr
from scipy.sparse import issparse
def _snip(matrix, offset, bad_bins_masks, min_diag, bbox):
"""
Extract snippet from the matrix
Parameters
----------
matrix : matrix-like
2D matrix from which bbox will be extracted.
Supports dense, sparse and lazy subscriptable interfaces
offset : (int, int)
2D offset of the matrix: (offset1, offset2)
bad_bins : ( ndarray[bool], ndarray[bool])
Bad-bins masks for the matrix. Corresponding values are
filled with nans.
min_diag : int
Diagonal of the matrix below which values are masked with nans.
It takes offset into account: offset2 - offset1 is its main diagonal
bbox : (int, int, int, int)
Bounding box of the snippet to be extracted: (lo1, hi1, lo2, hi2)
Indices are relative to the matrix.
Returns
-------
np.array
Requested snippet.
"""
lo1, hi1, lo2, hi2 = bbox
offset1, offset2 = offset
bad_bins1, bad_bins2 = bad_bins_masks
m, n = matrix.shape
dm, dn = hi1 - lo1, hi2 - lo2
# when snippet is out of bound, return empty snippet
if lo1 < 0 or lo2 < 0 or hi1 > m or hi2 > n:
return np.full((dm, dn), np.nan)
# TODO implement proper padding instead
if issparse(matrix):
snippet = matrix[lo1:hi1, lo2:hi2].toarray()
else:
snippet = matrix[lo1:hi1, lo2:hi2].copy()
# fill bad_bins with nans
snippet[bad_bins1[lo1:hi1], :] = np.nan
snippet[:, bad_bins2[lo2:hi2]] = np.nan
# mask tril diagonals of the snippet when requested:
if min_diag is not None:
# low-left diagonal of the snip
lowleft_diag_snip = (lo2 + offset2) - (hi1 + offset1)
# number of lowleft diags to mask, if there are any:
if (num_tril_diags := min_diag - lowleft_diag_snip) > 0:
diag_below_which_nan = num_tril_diags - dm
snippet = triu( snippet, k=diag_below_which_nan )
return snippet
def array_to_csr_data(arr, sparse_arr):
"""
Extract elements of an array corresponding to non-zero (nnz) elements
of a reference sparse array sparse_arr.
arr has to have the same shape as sparse_arr. It can be dense
or any other 2D subscriptable object, e.g. LazyToeplitz.
Parameters
----------
arr : array-like
2D array/matrix oe subscriptable object
sparse_arr : scipy.sparse.csr_matrix
CSR matrix to derive nnz indices from
Returns
-------
arr_data : ndarray
array of the same length as sparse_arr.data
Notes
-----
Such a row-by-row extraction from arr seems like a good compromise
for performance/memory usage. Alternatives include fully dense version,
and using COO
"""
# assuming arr.shape == sparse_arr.shape
num_rows = sparse_arr.shape[0]
arr_data = np.empty_like(sparse_arr.data)
# extract arr values matching sparse_arr nnz values, row by row
for i in range(num_rows):
lo, hi = sparse_arr.indptr[i], sparse_arr.indptr[i+1]
# determine col indices, and append if there are any
js = sparse_arr.indices[lo:hi]
# arr @ i-th row, js cols
arr_data[lo:hi] = arr[i].squeeze()[js]
return arr_data
@sergpolly
Copy link
Author

here is how they can be tested:

for m, n in [(40,40), (20,70), (70,20)]:
    # test using arrays with very slight sparsity
    for arr in [
        np.round(random((m,n)), decimals=1),
        LazyToeplitz(np.r_[0.0, random(m-1)], np.r_[0.0, random(n-1)]),
    ]:
        # derive csr from arr itself
        csr = csr_matrix(arr[:])
        # check that data matches extracted values
        assert np.allclose(
            csr.data,
            array_to_csr_data(arr, csr)
        )

triu - should behave like numpy.triu, so it is tested against it:

# testing our new triu
m,n = 5,3
arr = np.ones((m,n))
# testing our triu using numpy triu: all k-s, default value=nan
for k in range(-m+1,n+1):
    _our_triu_nans = np.isnan(triu(arr, k=k, copy=True))
    _np_triu_zeros = ~np.triu(arr, k).astype("bool")
    assert (_our_triu_nans == _np_triu_zeros).all()

m,n = 6,4
arr = np.ones((m,n))
# testing our triu using numpy triu: all k-s, custom value=0
for k in range(-m+1,n+1):
    _our_triu = triu(arr, value=0, k=k, copy=True)
    _numpy_triu = np.triu(arr, k).astype("bool")
    assert (_our_triu == _numpy_triu).all()

_snip should be tested extensively as it is a core of the snipping extraction:

#testing our generalized _snip
# from that one extract medium regional ones with different offsets
# and see if everything works as expected !
min_diags = [0, 1, 2, 5]
regional_offsets = [(0, 0), (0, 30), (70,70), (70, 100)]
regional_shapes = [(30, 30), (45,35), (35,45)]  # square, tall-rect, wide-rect
snip_shapes = [(4,4), (5,4), (4,5)]  # square, tall-rect, wide-rect
snip_offsets = [(0,0), (5,5), (10,14), (18,15)]  # on-diag (2), triu, tril 
    

# create a "large" global matrix (should be a square for ease of reasoning)
# it represent a chromosome-wide or genome wide matrix
gm = 150
for garr in [
    random((gm,gm)),
    LazyToeplitz(random(gm)),
    csr_matrix(np.round(random((gm,gm)), decimals=1)),
]:
    # random mask of bad_bins - 15% bad bins
    bbm = np.zeros(gm).astype("bool")
    bbm[ randint(0, gm, int(0.15*gm)) ] = True
    
    # garr but modified: garr_mod
    # it is square, so it is still easy to reason about it
    # and apply all of the required modifications to it: triu, bad-bins
    if issparse(garr):
        garr_mod = garr[:].toarray()
    else:
        garr_mod = garr[:].copy()
    garr_mod[bbm,:] = np.nan
    garr_mod[:,bbm] = np.nan
    
    for min_diag in min_diags:
        # depending on the min_diag ...
        garr_mod = triu(garr_mod, k=min_diag, copy=True)
    
        # regional submatrix of size dm,dn with the offset
        for r_lo1, r_lo2 in regional_offsets:
            for m, n in regional_shapes:
                # regional bbox: (r_lo1, r_hi1, r_lo2, r_hi2)
                r_hi1, r_hi2 = r_lo1 + m, r_lo2 + n
                assert (r_hi1 < gm) and (r_hi2 < gm)
        
                # extract original and modified regional matrices:
                arr = garr[r_lo1:r_hi1, r_lo2:r_hi2].copy()
                arr_mod = garr_mod[r_lo1:r_hi1, r_lo2:r_hi2]
                # regional bad bin masks:
                bbm1 = bbm[r_lo1:r_hi1]
                bbm2 = bbm[r_lo2:r_hi2]
        
                # test for several bbox configurations for _snip
                for lo1, lo2 in snip_offsets:
                    for dm, dn in snip_shapes:
                        hi1, hi2 = lo1 + dm, lo2 + dn
                        ref_snip = arr_mod[lo1:hi1, lo2:hi2]
                        our_snip = _snip(
                            arr,
                            offset=(r_lo1, r_lo2),
                            bad_bins_masks=(bbm1, bbm2),
                            min_diag=min_diag,
                            bbox=(lo1, hi1, lo2, hi2)
                        )
                        assert np.allclose( ref_snip, our_snip, equal_nan=True)
                
                # test out of bound
                oob_offsets = [(-dm//2, dn), (dm , -dn), (m - dm//2, n - 2*dn), (m - 2*dm, n - dn//2)]
                for lo1, lo2 in oob_offsets:
                    for dm, dn in snip_shapes:
                        hi1, hi2 = lo1 + dm, lo2 + dn
                        our_snip = _snip(
                            arr,
                            offset=offset,
                            bad_bins_masks=(bbm1, bbm2),
                            min_diag=min_diag,
                            bbox=(lo1, hi1, lo2, hi2)
                        )
                        assert np.isnan( our_snip ).all()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment