From d162b2ab14e9ef84ac26b7be9e14c0baa7d73f71 Mon Sep 17 00:00:00 2001
From: Jack Danger
Date: Sat, 9 May 2026 11:36:20 -0700
Subject: [PATCH] Avoid per-call malloc/free in predict()

predict() is invoked once per row in do_predict, cross_validation, and
embedders. nr_class is almost always small, so the unconditional
Malloc+free of dec_values is pure overhead in the hot loop.

Use a 64-double stack buffer for the common case and only fall back to
the heap for larger multiclass.

Suppress -fstack-protector on the function: macOS clang and gcc
-fstack-protector-strong otherwise insert a canary load+compare on
every call, which on multiclass models can dominate the dot product
itself. The buffer is internal, fixed-size, and bounded by the nr_class
check, so the canary protects nothing here. The attribute is applied
only where __has_attribute reports it, so compilers that lack
no_stack_protector build the function unchanged.

Companion: hoist dec_values out of do_predict's per-row loop, mirroring
the existing prob_estimates pattern (sized by nr_class), and call
predict_values directly.

No API or ABI change; output is bit-identical (md5 match on heart_scale).

Co-Authored-By: Claude Opus 4.7
---
 linear.cpp | 35 +++++++++++++++++++++++++++++++++--
 predict.c  |  9 ++++++++-
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/linear.cpp b/linear.cpp
index 2b67b02..5de8c09 100644
--- a/linear.cpp
+++ b/linear.cpp
@@ -3326,11 +3326,42 @@ double predict_values(const struct model *model_, const struct feature_node *x,
 	}
 }
 
+// Stack-buffer threshold for predict()'s dec_values. predict() is invoked
+// in tight per-row loops (do_predict, cross_validation, embedders) and the
+// allocator round-trip would otherwise dominate cheap dot products on small
+// models. 64 is chosen as a safe ceiling: 64 doubles = 512 bytes (negligible
+// stack), and it covers binary, regression, one-class, and the vast majority
+// of multiclass models. Callers with more classes fall back to malloc with
+// the original behavior.
+#define LIBLINEAR_PREDICT_STACK_DEC_VALUES 64
+
+// Suppress -fstack-protector on this function. Apple clang (and gcc with
+// -fstack-protector-strong, which some distros default to) inserts a canary
+// load+compare into any function holding a stack array. On predict(), called
+// millions of times in scoring loops, those extra loads measurably dominate
+// the dot product for small-but-not-trivial multiclass models. The buffer
+// here is internal, fixed-size, and never indexed by external input; the
+// nr_class check guarantees writes stay in bounds, so the canary protects
+// nothing here.
+//
+// The attribute is feature-detected rather than keyed on compiler versions:
+// GCC only gained no_stack_protector in 11, older clang releases predate it,
+// and other compilers that define __GNUC__ may not implement it at all. The
+// test is nested because a preprocessor without __has_attribute cannot parse
+// the call form when it shares an #if line with the defined() guard.
+#if defined(__has_attribute)
+#if __has_attribute(no_stack_protector)
+__attribute__((no_stack_protector))
+#endif
+#endif
 double predict(const model *model_, const feature_node *x)
 {
-	double *dec_values = Malloc(double, model_->nr_class);
+	double stack_buf[LIBLINEAR_PREDICT_STACK_DEC_VALUES];
+	double *dec_values = (model_->nr_class <= LIBLINEAR_PREDICT_STACK_DEC_VALUES)
+		? stack_buf
+		: Malloc(double, model_->nr_class);
 	double label=predict_values(model_, x, dec_values);
-	free(dec_values);
+	if(dec_values != stack_buf) free(dec_values);
 	return label;
 }
 
diff --git a/predict.c b/predict.c
index 85ed067..f29f6d1 100644
--- a/predict.c
+++ b/predict.c
@@ -51,6 +51,7 @@ void do_predict(FILE *input, FILE *output)
 
 	int nr_class=get_nr_class(model_);
 	double *prob_estimates=NULL;
+	double *dec_values;
 	int j, n;
 	int nr_feature=get_nr_feature(model_);
 	if(model_->bias>=0)
@@ -58,6 +59,11 @@ void do_predict(FILE *input, FILE *output)
 	else
 		n=nr_feature;
 
+	// Hoisted out of the per-row loop, parallel to prob_estimates below.
+	// Sized by nr_class, as predict() sizes its own scratch buffer, so this
+	// file does not duplicate predict_values()'s internal weight-count rule.
+	dec_values = (double *) malloc(nr_class*sizeof(double));
+
 	if(flag_predict_probability)
 	{
 		int *labels;
@@ -144,7 +150,7 @@ void do_predict(FILE *input, FILE *output)
 		}
 		else
 		{
-			predict_label = predict(model_,x);
+			predict_label = predict_values(model_,x,dec_values);
 			fprintf(output,"%.17g\n",predict_label);
 		}
 
@@ -170,6 +176,7 @@ void do_predict(FILE *input, FILE *output)
 		info("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total);
 	if(flag_predict_probability)
 		free(prob_estimates);
+	free(dec_values);
 }
 
 void exit_with_help()