-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathancova.py
More file actions
129 lines (116 loc) · 4.52 KB
/
ancova.py
File metadata and controls
129 lines (116 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/local/bin/python3.9
#import packages
import os
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
#----------------single variable----------------
#read data
df = pd.read_csv('/Users/jessedesimone/Desktop/test_ancova.csv')
#run ancova model
model = ols('Amygdala_R_FW ~ C(grp_id) + age + C(sex) + educ + C(apoe)', data=df).fit()
model_sum = sm.stats.anova_lm(model, typ=3)
print(model_sum)
#tukey comparisons
mc = pairwise_tukeyhsd(df['Amygdala_R_FW'],df['grp_id']); mc_results = mc.summary(); print(mc_results)
#----------------loop variables----------------
'''
perform an ancova analysis with tukey post-hoc
loop through columns of a dataframe
each colume is a separate response variable to be tested
'''
#import packages
import os
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
#create output directory
root='/Users/jessedesimone/Desktop/ANCOVA_OUTPUT'
os.makedirs(root)
#read data
df = pd.read_csv('/Users/jessedesimone/Desktop/test_ancova.csv') #original dataframe
df2=df.drop(['age', 'sex', 'educ', 'apoe'], axis=1) #create a copy of dataframe without covariates
# make copy of input file in output folder
df.to_csv(os.path.join(root, "ancova_input_raw.csv"), index=False)
#define ancova function
def run_ancova_model():
import statsmodels.api as sm
from statsmodels.formula.api import ols
col_list=df2.columns.to_list()
col_list.remove('grp_id')
p_unc_list = []
loop_cols = df2.loc[:, df2.columns!='grp_id']
col_names = [col for col in loop_cols]
'''run model for each variable in col_names'''
for col in col_names:
print(col)
model = ols('{} ~ C(grp_id) + age + C(sex)'.format(col), data=df).fit()
print(model.summary())
model_sum = sm.stats.anova_lm(model, typ=2)
print(model_sum)
p=float(model_sum["PR(>F)"][0]) #get p value for group effect !!!!!0 if using type II; 1 if using type III
p=("{:.5f}".format(p))
p_unc_list.append(p)
d = {'variable':col_list, 'p_unc':p_unc_list}
df_f = pd.DataFrame(d)
df_f.to_csv(os.path.join(root, "aov_p_unc.csv"), index=False)
#define fdr function
'''run fdr correction for list of p values created above'''
def run_fdr_correct():
df = pd.read_csv(root + "/aov_p_unc.csv")
var_list=df['variable'].tolist(); var_list
p_unc_list=df['p_unc'].tolist(); p_unc_list
import statsmodels as sm
a=sm.stats.multitest.fdrcorrection(p_unc_list,
alpha=0.05,
method='indep',
is_sorted=False)
fdr_list=a[1].tolist()
d = {'variable':var_list, 'p_unc':p_unc_list, 'fdr':fdr_list}
df_f = pd.DataFrame(d)
print(df_f)
df_f.to_csv(os.path.join(root, "aov_p_fdr.csv"), index=False)
#define tukey function
def run_tukey():
import pingouin as pg
df1 = df
df2 = pd.read_csv(root + "/aov_p_fdr.csv")
var_list=df2['variable'].tolist()
p_unc_list=df2['p_unc'].tolist()
fdr_list=df2['fdr'].tolist()
col_names = [col for col in var_list]
mean_a_list=[]
mean_b_list=[]
mean_c_list=[]
diff_list_1=[]
diff_list_2=[]
diff_list_3=[]
tuk_list_1=[]
tuk_list_2=[]
tuk_list_3=[]
for col in col_names:
print(col)
pt = pg.pairwise_tukey(data=df1,dv=col,between='grp_id')
print(pt)
mean_a=pt['mean(A)'][0]; mean_a_list.append(mean_a)
mean_b=pt['mean(B)'][0]; mean_b_list.append(mean_b)
mean_c=pt['mean(B)'][1]; mean_c_list.append(mean_c)
diff1=pt["diff"][0]; diff_list_1.append(diff1)
diff2=pt["diff"][1]; diff_list_2.append(diff2)
diff3=pt["diff"][2]; diff_list_3.append(diff3)
tuk1=pt["p-tukey"][0]; tuk_list_1.append(tuk1)
tuk2=pt["p-tukey"][1]; tuk_list_2.append(tuk2)
tuk3=pt["p-tukey"][2]; tuk_list_3.append(tuk3)
'''create final dataframe'''
d = {'variable':var_list, 'p_unc':p_unc_list, 'fdr':fdr_list,
'mean_a':mean_a_list, 'mean_b':mean_b_list, 'mean_c':mean_c_list,
'a_vs_b_diff':diff_list_1, 'a_vs_c_diff':diff_list_2, 'b_vs_c_diff':diff_list_3,
'a_vs_b_tukey':tuk_list_1, 'a_vs_c_tukey':tuk_list_2, 'b_vs_c_tukey':tuk_list_3}
df_f = pd.DataFrame(d)
print(df_f)
df_f.to_csv(os.path.join(root, "aov_tukey_final.csv"), index=False)
run_ancova_model()
run_fdr_correct()
run_tukey()