Source code for pareidolia.stats

#!/usr/bin/env python3

"""
Functions used to evaluate statistical significance of changes.
cmdoret, 20200403
"""
import numpy as np
import scipy.stats as ss


[docs]def vals_to_percentiles(vals: np.ndarray, dist: np.ndarray) -> np.ndarray:
    """
    Return the percentiles corresponding to values in a distribution.

    Parameters
    ----------
    vals : np.ndarray of floats
        The values for which percentiles should be returned.
    dist : np.ndarray of floats
        The distribution to use when computing the percentiles.

    Returns
    -------
    np.ndarray of floats :
        The array of percentiles corresponding to input values
    """
    # Get indices corresponding to vals in the distribution
    sorted_dist = np.sort(dist)
    percentiles = np.searchsorted(sorted_dist, vals) / len(sorted_dist)
    return percentiles


[docs]def vec_ttest(arr1, arr2):
    """
    Given 2 numpy arrays containing data for 2 different conditions, where
    each array has shape shape rxd where r is the number of replicates and N
    is the number of independent tests to perform. N must be equal in both array,
    however the number of replicates can differ.
    """
    if not (len(arr1.shape) == len(arr2.shape) == 2):
        raise ValueError("Both arrays must be 2-dimensional.")
    if arr1.shape[0] != arr2.shape[0]:
        raise ValueError(
            "Both arrays must have the same number of test datasets"
        )
    # Number of sample in each test (both conditions)
    n_samples = arr1.shape[1] + arr2.shape[1]
    # Compute mean and variance for each test independently
    means_1, means_2 = arr1.mean(axis=1), arr2.mean(axis=1)
    var1, var2 = arr1.var(axis=1, ddof=1), arr2.var(axis=1, ddof=1)
    stds = np.sqrt((var1 + var2) / 2)
    stds[stds == 0] = np.nan
    # alt - control
    t_stats = (means_2 - means_1) / (stds * np.sqrt(1 / n_samples))
    t_stats[np.isnan(t_stats)] = 0

    return t_stats


# TODO: Write a vectorized linear model function for cases with more than
# 2 (quantitative) conditions
[docs]def vec_lm():
    ...


[docs]def tvals_to_pvals(tvals, n_samples):
    """Compute p-values from t-values
    Given an array of test statistics from multiple t-tests and the (fixed)
    number of samples used for each test, compute p-values for all tests
    """
    # get p-values from test statistics using the cumulative distribution
    # function of the student distribution
    return 1 - ss.t.cdf(np.abs(tvals), df=n_samples - 1)


[docs]def pval_to_tval(pval, n_samples):
    """Conversion of a single p-value to corresponding t
    Given a desired p-value and sample size, return the corresponding t-value.
    The absolute value for the 2 sided hypothesis is returned.
    """
    return abs(ss.t.ppf(pval / 2, df=n_samples - 1))