|
1 | 1 | # third party imports |
2 | 2 | import pandas as pd |
3 | | -#import matplotlib.pyplot as plt |
4 | | -#import seaborn as sns |
| 3 | +import matplotlib.pyplot as plt |
| 4 | +import seaborn as sns |
| 5 | +import numpy as np |
5 | 6 |
|
6 | 7 | import cobra.utils as utils |
7 | 8 |
|
@@ -87,3 +88,84 @@ def compute_pig_table(data: pd.DataFrame, |
87 | 88 | "avg_incidence", "incidence"] |
88 | 89 |
|
89 | 90 | return res[column_order] |
| 91 | + |
| 92 | + |
def plot_incidence(df: pd.DataFrame, variable: str,
                   column_order: list = None, dim: tuple = (12, 8)):
    """Plot the Predictor Incidence Graph (PIG) of a single variable.

    The bars (left axis) show the population size of each bin, the lines
    (right axis) show the incidence per bin and the average incidence.
    Bins are ordered in descending order of bin incidence unless an
    explicit order is given with the `column_order` list.

    Parameters
    ----------
    df: pd.DataFrame
        dataframe with cleaned, binned, partitioned and prepared data;
        the columns 'variable', 'label', 'pop_size', 'avg_incidence'
        and 'incidence' are read
    variable: str
        variable for which the incidence plot will be shown
    column_order: list, default=None
        explicit order of the bin labels of `variable`; must contain
        exactly the labels present in `df` for this variable
    dim: tuple, default=(12, 8)
        tuple passed as `figsize` to matplotlib (width, height)

    Raises
    ------
    ValueError
        if `df` contains no rows for `variable`, or if `column_order`
        does not hold exactly the labels found for `variable`
    """
    # Explicit copy: the frame is modified below and must neither warn
    # (SettingWithCopyWarning) nor leak changes into the caller's data.
    df_plot = df[df['variable'] == variable].copy()

    if df_plot.empty:
        # Fail early with a clear message instead of an opaque error
        # from an empty max() further down.
        raise ValueError(
            "No rows found for variable '{}'".format(variable))

    if column_order is not None:

        if set(df_plot['label']) != set(column_order):
            raise ValueError(
                'Variables in column_order and dataframe are not equal')

        # Sorting a categorical column follows the category order, so
        # reordering the categories imposes the requested bin order.
        # The result is assigned back: `inplace=True` no longer exists
        # in recent pandas versions.
        df_plot['label'] = (df_plot['label']
                            .astype('category')
                            .cat.reorder_categories(column_order))
        df_plot = df_plot.sort_values(by=['label'], ascending=True)
    else:
        df_plot = df_plot.sort_values(by=['incidence'], ascending=False)

    # A fresh 0..n-1 index lets the line plots (drawn against the index)
    # line up with the bar positions.
    df_plot = df_plot.reset_index(drop=True)

    # The seaborn style sheets were renamed in newer matplotlib releases;
    # pick whichever name this installation provides.
    style = next((name for name in ("seaborn-v0_8-ticks", "seaborn-ticks")
                  if name in plt.style.available), "default")

    with plt.style.context(style):
        fig, ax = plt.subplots(figsize=dim)

        # First axis: population size per bin
        ax.bar(df_plot['label'], df_plot['pop_size'],
               align='center', color="cornflowerblue")
        ax.set_ylabel('population size', fontsize=16)
        ax.set_xlabel('{} bins'.format(variable), fontsize=16)
        ax.xaxis.set_tick_params(rotation=45, labelsize=14)
        ax.yaxis.set_tick_params(labelsize=14)

        # Series.max() skips NaN values, unlike the builtin max()
        max_inc = df_plot['incidence'].max()

        # Second axis: incidence per bin and average incidence
        ax2 = ax.twinx()

        ax2.plot(df_plot['incidence'], color="darkorange", marker=".",
                 markersize=20, linewidth=3, label='incidence rate per bin')
        ax2.plot(df_plot['avg_incidence'], color="dimgrey", linewidth=4,
                 linestyle='--',
                 label='average incidence rate')

        # dummy line to have label on second axis from first
        ax2.plot([], [], color="cornflowerblue", linewidth=6,
                 label='bin size')
        ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05))
        ax2.set_yticklabels(
            ['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()])
        ax2.yaxis.set_tick_params(labelsize=14)
        ax2.set_ylabel('incidence', fontsize=16)

        sns.despine(ax=ax, right=True, left=True)
        sns.despine(ax=ax2, left=True, right=False)
        ax2.spines['right'].set_color('white')

        ax2.grid(False)

        fig.suptitle('Incidence Plot - ' + variable, fontsize=22, y=1.02)
        ax2.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
                   loc=3, ncol=1, mode="expand", borderaxespad=0.,
                   prop={"size": 14})
        plt.show()
0 commit comments