Source code for hilearn.plot.dot_plot

import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
from sklearn import linear_model

def corr_plot(x, y, max_num=10000, outlier=0.01, line_on=True,
    legend_on=True, size=30, dot_color=None, outlier_color="r",
    alpha=0.8, color_rate=10, corr_on=None):
    """Correlation plot for a large number of dots, showing the density and
    Pearson's correlation coefficient.

    The dots are drawn on the current matplotlib axes, coloured by their
    Gaussian kernel density estimate unless *dot_color* is given. The
    lowest-density dots are drawn a second time, smaller and fainter, in
    *outlier_color*.

    Parameters
    ----------
    x: `array_like`, 1-D
        Values on x-axis
    y: `array_like`, 1-D
        Values on y-axis, same length as *x*
    max_num: int
        Maximum number of dots to plot; larger inputs are randomly
        subsampled with a fixed seed, so the subsample is reproducible
    outlier: float
        The proportion of dots (lowest density) drawn as outliers in a
        different color
    line_on : bool
        If True, show the least-squares regression line, labelled with
        Pearson's R
    legend_on: bool
        If True, show the legend (which carries the Pearson's correlation
        coefficient when *line_on* is True). Replacement of *corr_on*
    size: float
        The dot size
    dot_color: string
        The dot color. If None (by default), density color will be used
    outlier_color: string
        The color for outlier dots
    alpha : float
        The transparency: 0 (fully transparent) to 1
    color_rate: float
        Color rate for density: the multiple of the minimum density that is
        added to every density before taking log2
    corr_on: bool or None
        Older alias of *legend_on*; the legend is shown if either is truthy

    Returns
    -------
    ax: matplotlib Axes
        The Axes object containing the plot.

    Notes
    -----
    Pearson's R is computed on the full input, before any subsampling.
    The global matplotlib colormap is set to "Blues" as a side effect.

    Examples
    --------

    .. plot::

        >>> import numpy as np
        >>> from hilearn.plot import corr_plot
        >>> np.random.seed(1)
        >>> x = np.append(np.random.normal(size=200), 5 + np.random.normal(size=500))
        >>> y = 2 * x + 3 * np.random.normal(size=700)
        >>> corr_plot(x, y)
    """
    # Accept lists/tuples as well as arrays: fancy indexing below needs arrays.
    x = np.asarray(x)
    y = np.asarray(y)

    score = st.pearsonr(x, y)

    if len(x) > max_num:
        # A private RandomState(0) yields the same subsample as seeding the
        # global generator with 0, without clobbering the caller's RNG state.
        keep = np.random.RandomState(0).permutation(len(x))[:max_num]
        x, y = x[keep], y[keep]
    n_outlier = int(len(x) * outlier)

    # Density at every dot; sorting ascending draws the densest dots last
    # (on top) and puts the outliers at the front of the order.
    xy = np.vstack([x, y])
    z = st.gaussian_kde(xy)(xy)
    order = z.argsort()
    low_density = order[:n_outlier]

    if dot_color is None:
        c_score = np.log2(z[order] + color_rate * np.min(z[order]))
    else:
        c_score = dot_color

    plt.set_cmap("Blues")
    plt.scatter(x[order], y[order], c=c_score, edgecolor=None, s=size,
                alpha=alpha)
    plt.scatter(x[low_density], y[low_density], c=outlier_color,
                edgecolor=None, s=size / 5, alpha=alpha / 3.0)

    if line_on:
        clf = linear_model.LinearRegression()
        clf.fit(x.reshape(-1, 1), y)
        xx = np.linspace(x.min(), x.max(), 1000).reshape(-1, 1)
        yy = clf.predict(xx)
        plt.plot(xx, yy, "k--", label="R=%.3f" % score[0])

    ax = plt.gca()
    if legend_on or corr_on:
        # Only draw a legend when something is labelled (e.g. the regression
        # line); otherwise matplotlib just emits a "no handles" warning.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            plt.legend(loc="best", fancybox=True, ncol=1)

    return ax


def volcano_plot(fold_change, pval, p_min=0.00001, 
                 x_log10=True, p_threshold=0.05, 
                 h_color="red", label=None):
    """
    Volcano plot between log_fold change and p values, which is often used 
    for hypothesis test between two conditions.

    The dots are drawn on the current matplotlib axes; the inputs are not
    modified.

    Parameters
    ----------
    fold_change: `array_like`, 1-D
        Fold change of each item, shown on the x-axis
    pval: `array_like`, 1-D
        p value of each item, same length as *fold_change*; shown on the
        y-axis as -log10(p)
    p_min: float
        Lower bound for p values; smaller values are raised to it, which
        caps the height of the y-axis
    x_log10: bool
        If True, use a base-10 log scale on the x-axis
    p_threshold: float
        Dots with p value below this threshold are highlighted
    h_color: string
        The color of the highlighted dots
    label: string
        Legend label for the highlighted dots
    """
    fold_change = np.asarray(fold_change)
    # np.maximum builds a new array, so the caller's p values are left
    # untouched (the clipping used to be done in place).
    pval = np.maximum(np.asarray(pval, dtype=float), p_min)
    neg_log_p = -np.log10(pval)

    highlighted = pval < p_threshold
    # Kept as an explicit comparison (not ~highlighted) so NaN p values stay
    # out of both groups.
    background = pval >= p_threshold

    plt.scatter(fold_change[background], neg_log_p[background], 
                color="grey", alpha=0.7, label=None)
    plt.scatter(fold_change[highlighted], neg_log_p[highlighted], 
                color=h_color, alpha=0.7, label=label)
    plt.ylabel("-log10(p value)")
    plt.xlabel("Fold change")
    if x_log10: 
        # Base 10 is matplotlib's default for the log scale; the old
        # ``basex`` keyword no longer exists in current releases.
        plt.xscale('log')