Source code for pareidolia.preprocess

#!/usr/bin/env python3

"""
Functions used to clean and prepare sparse matrices for detection.
cmdoret, 20200403
"""
from typing import Iterable, Tuple, Iterator, Set
import numpy as np
import scipy.sparse as sp
import chromosight.utils.preprocessing as cup


def get_common_valid_bins(
    mats: Iterable["sp.csr_matrix[float]"], n_mads: float = 5,
) -> "np.ndarray[int]":
    """
    Generates an array of valid bins indices, using the intersection
    of valid bins from all input sparse matrices. All input matrices must
    be square and have the same shape. Valid bins are defined based on their
    proportion of nonzero pixels.

    Parameters
    ----------
    mats : Iterable of sp.csr_matrix
        A list of sparse matrices representing Hi-C contacts, each matrix
        represents a sample. Any iterable is accepted; it is traversed once.
    n_mads : float
        A bin is considered missing if its proportion of nonzero pixels is lower
        than n_mads median absolute deviations below the median of the bin
        distribution for the whole matrix.

    Returns
    -------
    np.ndarray of ints :
        A sorted 1D array containing the indices of valid (non-missing) bins
        shared by all matrices. Empty if no matrix was given or if no bin is
        valid in every matrix.

    Raises
    ------
    NotImplementedError
        If one of the input matrices is not square.
    """
    common_valid = None
    for mat in mats:
        if mat.shape[0] != mat.shape[1]:
            raise NotImplementedError("Only square matrices are valid input.")
        # Only the first element returned by get_detectable_bins is used
        # (presumably the row bins; identical to column bins for a symmetric
        # matrix -- confirm against chromosight's documentation).
        valid = np.asarray(cup.get_detectable_bins(mat, n_mads=n_mads)[0])
        if common_valid is None:
            # Initialize the common bins with the first matrix
            common_valid = np.unique(valid)
        else:
            # Keep only bins which are also valid in the current matrix.
            # intersect1d returns sorted unique values, so the output order
            # is deterministic.
            common_valid = np.intersect1d(common_valid, valid)
    if common_valid is None:
        # No input matrix: there is no valid bin to report.
        return np.array([], dtype=int)
    return common_valid


def get_nnz_union(mats: Iterable["sp.csr_matrix[float]"]) -> "np.ndarray[int]":
    """
    Given a list of sparse matrices, return the union of their nonzero
    coordinates, in the form of a 2D numpy array with 1 coordinate per
    row, with 2 columns representing coordinates rows and columns.

    Parameters
    ----------
    mats : Iterable of sp.csr_matrix
        Iterable containing the sparse matrices from each sample. One-shot
        iterables such as generators are supported.

    Returns
    -------
    np.ndarray of int :
        A 2D numpy array containing the union of nonzero coordinates in the
        input sparse matrices. The array has shape Nx2 where N is the number of
        coordinates. The columns represent row and column coordinates.

    Raises
    ------
    TypeError
        If an input object does not expose the sparse matrix ``format``
        attribute.
    ValueError
        If no matrix is given, or if a matrix is not in csr format.
    """
    # Materialize the input once: it is traversed twice below (format check,
    # then accumulation), which would silently exhaust a generator.
    mats = list(mats)
    if not mats:
        raise ValueError("At least one sparse matrix is required")
    try:
        formats = [m.format for m in mats]
    except AttributeError as err:
        raise TypeError("Input type must be scipy.sparse.csr_matrix") from err
    if any(fmt != "csr" for fmt in formats):
        raise ValueError("input sparse matrices must be in csr format")

    # Boolean matrices: the sum acts as a logical OR of the nonzero patterns.
    union_mat = mats[0].astype(bool, copy=True)
    for mat in mats[1:]:
        union_mat = union_mat + mat.astype(bool)
    # Drop explicitly stored zeros inherited from the inputs
    union_mat.eliminate_zeros()
    # Retrieve positions of nonzero entries into an array
    all_nnz = np.ascontiguousarray(np.vstack(union_mat.nonzero()).T)

    return all_nnz


def fill_nnz(
    mat: "sp.csr_matrix", all_nnz: "np.ndarray[int]", fill_value: float = 1e-9
) -> sp.csr_matrix:
    """
    Given an input sparse matrix and a superset of nonzero coordinates, fill the
    matrix to ensure all values in the set are stored explicitly with the
    value of fill_value.

    Parameters
    ----------
    mat : sp.csr_matrix
        The sparse matrix of a single sample. It is not modified.
    all_nnz : np.ndarray of ints
        A 2D array of shape Nx2, where N is the number of nonzero elements to
        fill. The columns represent the row and column coordinates of those
        elements. Its integer dtype does not need to match the dtype of the
        matrix indices.
    fill_value : float
        The value to use when filling the nonzero elements in the matrix. Has to
        be the same datatype as the input matrix.

    Returns
    -------
    sp.csr_matrix :
        The filled sparse matrix, where all coordinates in all_nnz which were
        zero in the input have been filled with fill_value.
    """
    all_nnz = np.asarray(all_nnz)
    out = mat.copy()
    if all_nnz.size == 0:
        # Nothing to fill: only normalize explicitly stored zeros.
        out.eliminate_zeros()
        return out
    n_cols = mat.shape[1]
    # Encode each (row, col) pair as a single int64 linear index, so that
    # coordinates can be compared regardless of the index dtypes involved
    # (mat.nonzero() and all_nnz may use different integer widths).
    mat_rows, mat_cols = mat.nonzero()
    mat_flat = mat_rows.astype(np.int64) * n_cols + mat_cols
    all_flat = all_nnz[:, 0].astype(np.int64) * n_cols + all_nnz[:, 1]
    # get all all_nnz coordinates that are zero in the matrix
    add_mask = np.isin(all_flat, mat_flat, invert=True)
    if add_mask.any():
        # Replace implicit zeros by fill_value at these coordinates
        out[all_nnz[add_mask, 0], all_nnz[add_mask, 1]] = fill_value
    # Removes explicit zeros, including filled ones if fill_value is zero
    out.eliminate_zeros()
    return out