Skip to content

Commit 002bc57

Browse files
Added code for plotting PIGs (#23)
* plot incidence and tests * Change plotIncidence name to comply with PEP-8. The PEP-8 naming convention for functions and class methods states that names should be in snake case (with underscore chars), so plotIncidence should be plot_incidence. Note that the former is used a lot amongst web developers (Flask apps, Django apps, ...). Co-authored-by: Matthias Roels <matthias.roels@pythonpredictions.com>
1 parent 8f442b4 commit 002bc57

4 files changed

Lines changed: 109 additions & 5 deletions

File tree

cobra/evaluation/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from .pigs_tables import generate_pig_tables
22
from .pigs_tables import compute_pig_table
3+
from .pigs_tables import plot_incidence
34

45
from .plotting_utils import plot_performance_curves
56
from .plotting_utils import plot_variable_importance
@@ -12,6 +13,7 @@
1213

1314
__all__ = ["generate_pig_tables",
1415
"compute_pig_table",
16+
"plot_incidence",
1517
"plot_performance_curves",
1618
"plot_variable_importance",
1719
"plot_univariate_predictor_quality",

cobra/evaluation/pigs_tables.py

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# third party imports
22
import pandas as pd
3-
#import matplotlib.pyplot as plt
4-
#import seaborn as sns
3+
import matplotlib.pyplot as plt
4+
import seaborn as sns
5+
import numpy as np
56

67
import cobra.utils as utils
78

@@ -87,3 +88,84 @@ def compute_pig_table(data: pd.DataFrame,
8788
"avg_incidence", "incidence"]
8889

8990
return res[column_order]
91+
92+
93+
def plot_incidence(df: pd.DataFrame, variable: str,
                   column_order: list = None, dim: tuple = (12, 8)):
    """Plot the Predictor Incidence Graph (PIG) of a single variable.

    Bars show the population size of every bin (left axis); the lines show
    the incidence rate per bin and the average incidence rate (right axis).
    Bins are ordered in descending order of bin incidence unless an explicit
    order is given with the `column_order` list.

    Parameters
    ----------
    df: pd.DataFrame
        PIG table with (at least) the columns ``variable``, ``label``,
        ``pop_size``, ``avg_incidence`` and ``incidence``. The dataframe
        itself is never modified.

    variable: str
        variable for which the incidence plot will be shown

    column_order: list, default=None
        explicit order of the bin labels of `variable`; must contain
        exactly the labels present for `variable`, each one once

    dim: tuple, default=(12, 8)
        tuple with width and length of the plot

    Raises
    ------
    ValueError
        If `variable` does not occur in ``df`` or if `column_order` does
        not match the bin labels of `variable`.
    """
    # All validation happens before any figure is created.
    df_plot = _prepare_incidence_data(df, variable, column_order)

    # Plain strings so both axes use the same categorical x positions.
    labels = df_plot['label'].astype(str).tolist()

    # The tick range has to cover the average line too, which may lie above
    # every individual bin incidence.
    max_inc = max(df_plot['incidence'].max(),
                  df_plot['avg_incidence'].max())

    # Matplotlib 3.6 renamed the bundled seaborn styles; use whichever
    # spelling this installation provides.
    style = next((name for name in ("seaborn-v0_8-ticks", "seaborn-ticks")
                  if name in plt.style.available), "default")

    with plt.style.context(style):
        fig, ax = plt.subplots(figsize=dim)

        # First axis: population size per bin
        bars = ax.bar(labels, df_plot['pop_size'],
                      align='center', color="cornflowerblue",
                      label='bin size')
        ax.set_ylabel('population size', fontsize=16)
        ax.set_xlabel('{} bins'.format(variable), fontsize=16)
        ax.xaxis.set_tick_params(rotation=45, labelsize=14)
        ax.yaxis.set_tick_params(labelsize=14)

        # Second axis: incidence rates, drawn explicitly on the twin axis
        # instead of relying on pyplot's "current axes" state.
        ax2 = ax.twinx()

        incidence_line, = ax2.plot(labels, df_plot['incidence'],
                                   color="darkorange", marker=".",
                                   markersize=20, linewidth=3,
                                   label='incidence rate per bin')
        average_line, = ax2.plot(labels, df_plot['avg_incidence'],
                                 color="dimgrey", linewidth=4,
                                 linestyle='--',
                                 label='average incidence rate')

        ax2.set_yticks(np.arange(0, max_inc + 0.05, 0.05))
        ax2.set_yticklabels(
            ['{:3.1f}%'.format(x * 100) for x in ax2.get_yticks()])
        ax2.yaxis.set_tick_params(labelsize=14)
        ax2.set_ylabel('incidence', fontsize=16)

        sns.despine(ax=ax, right=True, left=True)
        sns.despine(ax=ax2, left=True, right=False)
        ax2.spines['right'].set_color('white')

        ax2.grid(False)

        fig.suptitle('Incidence Plot - ' + variable, fontsize=22, y=1.02)
        # One legend for both axes: pass the bar container of the first
        # axis as a handle rather than plotting a dummy line.
        ax2.legend(handles=[incidence_line, average_line, bars],
                   frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
                   loc=3, ncol=1, mode="expand", borderaxespad=0.,
                   prop={"size": 14})
        plt.show()


def _prepare_incidence_data(df: pd.DataFrame, variable: str,
                            column_order: list = None) -> pd.DataFrame:
    """Return the rows of `variable` in plotting order, leaving `df` intact."""
    # Work on a copy: the boolean filter returns a slice of the caller's
    # frame that must not be modified in place.
    df_plot = df[df['variable'] == variable].copy()

    if df_plot.empty:
        raise ValueError(
            "Variable '{}' not found in dataframe".format(variable))

    if column_order is None:
        df_plot = df_plot.sort_values(by='incidence', ascending=False)
    else:
        has_duplicates = len(set(column_order)) != len(column_order)
        if has_duplicates or set(df_plot['label']) != set(column_order):
            raise ValueError(
                'Variables in column_order and dataframe are not equal')

        # An ordered categorical makes sort_values follow column_order.
        df_plot['label'] = pd.Categorical(df_plot['label'],
                                          categories=column_order,
                                          ordered=True)
        df_plot = df_plot.sort_values(by='label', ascending=True)

    return df_plot.reset_index(drop=True)

cobra/model_building/univariate_selection.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,8 @@ def compute_correlations(target_enc_train_data: pd.DataFrame,
123123
target_enc_train_data : pd.DataFrame
124124
data to compute correlation
125125
predictors : list
126-
List of column names of the DataFrame between which
127-
matrix from
128-
to compute correlations
126+
List of column names of the DataFrame between which to compute
127+
the correlation matrix
129128
130129
Returns
131130
-------
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import pytest
import pandas as pd
# The function was renamed from plotIncidence to plot_incidence (PEP-8);
# the package only exports the snake_case name.
from cobra.evaluation import plot_incidence


def mock_data():
    """Build a minimal PIG table with four bins for the 'education' variable."""
    d = {'variable': ['education', 'education', 'education', 'education'],
         'label': ['1st-4th', '5th-6th', '7th-8th', '9th'],
         'pop_size': [0.002, 0.004, 0.009, 0.019],
         'avg_incidence': [0.23, 0.23, 0.23, 0.23],
         'incidence': [0.047, 0.0434, 0.054, 0.069]}
    return pd.DataFrame(d)


class TestEvaluation:
    """Tests for the plotting helpers in cobra.evaluation."""

    def test_plot_incidence(self):
        """A column_order that omits a bin label must be rejected."""
        data = mock_data()
        # '9th' is deliberately left out of the requested order.
        column_order = ['1st-4th', '5th-6th', '7th-8th']
        # Expect the specific validation error; a bare Exception would also
        # pass on an ImportError or NameError and hide a broken test.
        with pytest.raises(ValueError):
            plot_incidence(data, 'education', column_order)

0 commit comments

Comments
 (0)