Source code for pareidolia.detection

#!/usr/bin/env python3

"""
Functions used for change detection.
cmdoret, 20200403
"""
from typing import Iterable
import scipy.sparse as sp
import numpy as np
import chromosight.utils.detection as cud
import chromosight.utils.preprocessing as cup


[docs]def median_bg(mats: Iterable[sp.spmatrix]) -> sp.spmatrix:
    """
    Given multiple sparse matrices with the same shape, format
    and nonzero coordinates, generate a background matrix made
    up of the median signal

    Parameters
    ----------
    mats : Iterable of scipy.sparse.spmatrix:
        A list of data matrices from different samples.

    Returns
    -------
    scipy.sparse.spmatrix :
        The median background matrix, where each pixel is the median of
        the corresponding pixel from all samples.
    """
    # Check that all matrices are sparse, have the same format and sparsity
    if not np.all([sp.issparse(m) for m in mats]):
        raise ValueError("Input must be a scipy sparse matrix.")
    if not np.all([m.format == mats[0].format for m in mats]):
        raise ValueError("All input matrices must be in the same format.")
    if not np.all([m.getnnz() == mats[0].getnnz() for m in mats]):
        raise ValueError("All input matrices must have the same sparsity.")
    bg = mats[0].copy()
    bg.data = np.median([m.data for m in mats], axis=0)
    return bg


[docs]def reps_bg_diff(mats: Iterable[sp.spmatrix]) -> np.ndarray:
    """
    Given multiple sample matrices, return a 1D array of all differences
    between each sample matrix's pixels and their corresponding median
    background value. All input matrices must have the same shape, nonzero
    coordinates and format.

    Parameters
    ----------
    mats : Iterable of scipy.sparse.spmatrix
        The list of data matrices from different samples.

    Returns
    -------
    numpy.ndarray of floats :
        The distribution of pixel differences to the median background. If
        there are S matrices of P nonzero pixels, this 1D array will contain
        P*S elements.
    """
    med_bg = median_bg(mats)
    diffs = [m.data - med_bg.data for m in mats]
    diffs = np.hstack(diffs)
    return diffs


[docs]def get_sse_mat(mats: Iterable[sp.spmatrix]) -> sp.spmatrix:
    """
    Given multiple matrices, return the matrix of position-wise sum of
    squared errors to median. All input matrices must have the same shape,
    nonzero coordinates and format. Output matrix will have the same format
    as inputs.

    Parameters
    ----------
    mats : Iterable of scipy.sparse.spmatrix
        The list of data matrices from different samples.

    Returns
    -------
    scipy.sparse.spmatrix :
        A sparse matrix where each nonzero pixel is the sum of squared
        difference of all samples to their median background, at the
        corresponding nonzero pixel.

    """
    bg = median_bg(mats)
    sse = bg.copy()
    se = np.array([(m.data - bg.data) ** 2 for m in mats])
    sse.data = np.sum(se, axis=0)
    return sse


[docs]def get_win_density(
    mat: sp.csr_matrix, win_size: int = 3, sym_upper: bool = False
) -> sp.csr_matrix:
    """Compute local pixel density in sparse matrices using convolution.
    The convolution is performed in 'full' mode: Computations are performed
    all the way to the edges by trimming the kernel.

    Parameters
    ----------
    mat : scipy.sparse.csr_matrix
        The input sparse matrix to convolve.
    win_size : int
        The size of the area in which to compute the proportion of nonzero
        pixels. This will be the kernel size used in convolution.
    sym_upper : bool
        Whether the input matrix is symmetric upper, in which case only the
        upper triangle is returned.


    Returns
    -------
    scipy.sparse.csr_matrix:
        The result of the convolution of the uniform kernel
        (win_size x win_size) on the binarized input matrix.
        Each value represents the proportion of nonzero pixels
        in the neighbourhood.
    """
    ms, ns = mat.shape
    # Generate a binary matrix (pixels are either empty or full)
    bin_mat = mat.copy()
    if sym_upper:
        bin_mat = sp.triu(bin_mat)
    bin_mat.data = bin_mat.data.astype(bool)
    # Adding a frame of zeros around the signal
    tmp = sp.csr_matrix((win_size - 1, ns), dtype=bool)
    bin_mat = sp.vstack([tmp, bin_mat, tmp], format=mat.format)
    tmp = sp.csr_matrix((ms + 2 * (win_size - 1), win_size - 1), dtype=bool)
    bin_mat = sp.hstack([tmp, bin_mat, tmp], format=mat.format)
    # Convolve the uniform kernel with this matrix to get the proportion of nonzero
    # pixels in each neighbourhood
    kernel = np.ones((win_size, win_size))
    win_area = win_size ** 2
    density = cud.xcorr2(bin_mat, kernel / win_area)

    # Compute convolution of uniform kernel with a frame of ones to get number
    # of missing pixels in each window.
    frame = cup.frame_missing_mask(
        sp.csr_matrix(mat.shape, dtype=bool), kernel.shape, sym_upper=sym_upper
    )
    frame = cud.xcorr2(frame, kernel).tocoo()
    # From now on, frame.data contains the number of 'present' samples.
    # (where there is at least one missing pixel)
    frame.data = win_area - frame.data

    # Adjust the proportion for values close to the border (lower denominator)
    density[frame.row, frame.col] = (
        density[frame.row, frame.col].A1 * win_area / frame.data
    )
    # Trim the frame out from the signal
    density = density[
        win_size - 1 : -win_size + 1, win_size - 1 : -win_size + 1
    ]
    if sym_upper:
        density = sp.triu(density)
    return density