From 66fce7bef1ea3f1a56105114ef51f0d2d7024afe Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Feb 2026 14:16:42 +0530 Subject: [PATCH 01/18] Add Google Tag Manager first-party integration Scripts like GTM and GA4 are often blocked by ad blockers and privacy extensions when loaded from third-party domains, leading to data loss. Third-party cookie deprecation further limits tracking durability. This change proxies GTM scripts and analytics beacons through the Trusted Server, establishing a first-party context. It automatically rewrites HTML tags and script content to point to local proxy endpoints, bypassing blockers and extending cookie life. Includes: Proxy endpoints for gtm.js and /collect Content rewriting for redirecting internal GTM calls Configuration and integration tests Resolves: #224 --- .../src/integrations/google_tag_manager.rs | 452 ++++++++++++++++++ crates/common/src/integrations/mod.rs | 2 + docs/guide/integrations/google_tag_manager.md | 95 ++++ trusted-server.toml | 5 + 4 files changed, 554 insertions(+) create mode 100644 crates/common/src/integrations/google_tag_manager.rs create mode 100644 docs/guide/integrations/google_tag_manager.md diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs new file mode 100644 index 00000000..1a00e42e --- /dev/null +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -0,0 +1,452 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use error_stack::Report; +use fastly::http::StatusCode; +use fastly::{Request, Response}; +use serde::{Deserialize, Serialize}; +use validator::Validate; + +use crate::error::TrustedServerError; +use crate::integrations::{ + AttributeRewriteAction, IntegrationAttributeContext, IntegrationAttributeRewriter, + IntegrationEndpoint, IntegrationProxy, IntegrationRegistration, IntegrationScriptContext, + IntegrationScriptRewriter, ScriptRewriteAction, +}; +use crate::proxy::{proxy_request, ProxyRequestConfig}; 
+use crate::settings::{IntegrationConfig, Settings}; + +const GTM_INTEGRATION_ID: &str = "google_tag_manager"; +const DEFAULT_UPSTREAM: &str = "https://www.googletagmanager.com"; + +#[derive(Debug, Clone, Deserialize, Serialize, Validate)] +pub struct GoogleTagManagerConfig { + #[serde(default = "default_enabled")] + pub enabled: bool, + /// GTM Container ID (e.g., "GTM-XXXXXX"). + #[validate(length(min = 1))] + pub container_id: String, + /// Upstream URL for GTM (defaults to https://www.googletagmanager.com). + #[serde(default = "default_upstream")] + pub upstream_url: String, +} + +impl IntegrationConfig for GoogleTagManagerConfig { + fn is_enabled(&self) -> bool { + self.enabled + } +} + +fn default_enabled() -> bool { + true +} + +fn default_upstream() -> String { + DEFAULT_UPSTREAM.to_string() +} + +pub struct GoogleTagManagerIntegration { + config: GoogleTagManagerConfig, +} + +impl GoogleTagManagerIntegration { + fn new(config: GoogleTagManagerConfig) -> Arc { + Arc::new(Self { config }) + } + + fn upstream_url(&self) -> &str { + if self.config.upstream_url.is_empty() { + DEFAULT_UPSTREAM + } else { + &self.config.upstream_url + } + } + + fn rewrite_gtm_script(&self, content: &str) -> String { + // Rewrite 'www.google-analytics.com' to point to this server's proxy path + // path would be /integrations/google_tag_manager + let my_integration_path = format!("/integrations/{}", GTM_INTEGRATION_ID); + + // Simplistic replacements - mimic what Cloudflare/others do + // Replacements depend on exactly how the string appears in the minified JS. 
+ // Common target: "https://www.google-analytics.com" + let mut new_content = + content.replace("https://www.google-analytics.com", &my_integration_path); + new_content = new_content.replace("https://www.googletagmanager.com", &my_integration_path); + new_content + } +} + +pub fn build(settings: &Settings) -> Option> { + let config = settings + .integration_config::(GTM_INTEGRATION_ID) + .ok() + .flatten()?; + + if !config.enabled { + return None; + } + + Some(GoogleTagManagerIntegration::new(config)) +} + +#[must_use] +pub fn register(settings: &Settings) -> Option { + let integration = build(settings)?; + Some( + IntegrationRegistration::builder(GTM_INTEGRATION_ID) + .with_proxy(integration.clone()) + .with_attribute_rewriter(integration.clone()) + .with_script_rewriter(integration) + .build(), + ) +} + +#[async_trait(?Send)] +impl IntegrationProxy for GoogleTagManagerIntegration { + fn integration_name(&self) -> &'static str { + GTM_INTEGRATION_ID + } + + fn routes(&self) -> Vec { + vec![ + // Proxy for the main GTM script + self.get("/gtm.js"), + // Proxy for the gtag script (if used) + self.get("/gtag/js"), + // Analytics beacons (GA4/UA) + // Note: In a real "Tag Gateway" implementation, we'd likely need + // to rewrite the GTM script to point these beacons to our proxy. 
+ self.get("/collect"), + self.post("/collect"), + self.get("/g/collect"), + self.post("/g/collect"), + ] + } + + async fn handle( + &self, + settings: &Settings, + req: Request, + ) -> Result> { + let path = req.get_path().to_string(); + let upstream_base = self.upstream_url(); + + // Construct full target URL + let mut target_url = if path.ends_with("/gtm.js") { + format!("{}/gtm.js", upstream_base) + } else if path.ends_with("/gtag/js") { + format!("{}/gtag/js", upstream_base) + } else if path.ends_with("/collect") { + if path.contains("/g/") { + "https://www.google-analytics.com/g/collect".to_string() + } else { + "https://www.google-analytics.com/collect".to_string() + } + } else { + return Ok(Response::from_status(StatusCode::NOT_FOUND)); + }; + + // Append query params if present, or add default ID for gtm.js + if let Some(query) = req.get_url().query() { + target_url = format!("{}?{}", target_url, query); + } else if path.ends_with("/gtm.js") { + target_url = format!("{}?id={}", target_url, self.config.container_id); + } + + let mut proxy_config = ProxyRequestConfig::new(&target_url); + + // If we are fetching gtm.js, we intend to rewrite the body. + // We must ensure the upstream returns uncompressed content. + if path.ends_with("/gtm.js") { + proxy_config = proxy_config.with_header( + fastly::http::header::ACCEPT_ENCODING, + fastly::http::HeaderValue::from_static("identity"), + ); + } + + let mut response = proxy_request(settings, req, proxy_config).await?; + + // Rewrite logic (Primitive version) + // If we are serving gtm.js, we want to text-replace "www.google-analytics.com" + // with our proxy details to route beacons through us. + if path.ends_with("/gtm.js") { + // Note: This is an expensive operation if the script is large. + // Ideally should be streamed, but simple string replacement for now. 
+ let body_bytes = response.into_body_bytes(); + let body_str = String::from_utf8_lossy(&body_bytes).to_string(); + + let rewritten_body = self.rewrite_gtm_script(&body_str); + + response = Response::from_body(rewritten_body) + .with_header(fastly::http::header::CONTENT_TYPE, "application/javascript"); + } + + Ok(response) + } +} + +impl IntegrationAttributeRewriter for GoogleTagManagerIntegration { + fn integration_id(&self) -> &'static str { + GTM_INTEGRATION_ID + } + + fn handles_attribute(&self, attribute: &str) -> bool { + matches!(attribute, "src" | "href") + } + + fn rewrite( + &self, + _attr_name: &str, + attr_value: &str, + _ctx: &IntegrationAttributeContext<'_>, + ) -> AttributeRewriteAction { + if attr_value.contains("googletagmanager.com/gtm.js") { + let encoded_integration_id = urlencoding::encode(self.integration_name()); + let mut new_value = attr_value.replace( + "https://www.googletagmanager.com/gtm.js", + &format!("/integrations/{}/gtm.js", encoded_integration_id), + ); + new_value = new_value.replace( + "//www.googletagmanager.com/gtm.js", + &format!("/integrations/{}/gtm.js", encoded_integration_id), + ); + + AttributeRewriteAction::replace(new_value) + } else { + AttributeRewriteAction::keep() + } + } +} + +impl IntegrationScriptRewriter for GoogleTagManagerIntegration { + fn integration_id(&self) -> &'static str { + GTM_INTEGRATION_ID + } + + fn selector(&self) -> &'static str { + "script" // Match all scripts to find inline GTM snippets + } + + fn rewrite(&self, content: &str, _ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { + // Look for the GTM snippet pattern. 
+ // Standard snippet contains: "googletagmanager.com/gtm.js" + if content.contains("googletagmanager.com/gtm.js") { + let encoded_integration_id = urlencoding::encode(self.integration_name()); + let my_integration_path = format!("/integrations/{}/gtm.js", encoded_integration_id); + + let mut new_content = content.replace( + "https://www.googletagmanager.com/gtm.js", + &my_integration_path, + ); + new_content = + new_content.replace("//www.googletagmanager.com/gtm.js", &my_integration_path); + + return ScriptRewriteAction::replace(new_content); + } + + ScriptRewriteAction::keep() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integrations::{ + AttributeRewriteAction, IntegrationAttributeContext, IntegrationAttributeRewriter, + IntegrationDocumentState, IntegrationScriptContext, IntegrationScriptRewriter, + ScriptRewriteAction, + }; + + #[test] + fn test_attribute_rewriter() { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST".to_string(), + upstream_url: "https://www.googletagmanager.com".to_string(), + }; + let integration = GoogleTagManagerIntegration::new(config); + + let ctx = IntegrationAttributeContext { + attribute_name: "src", + request_host: "example.com", + request_scheme: "https", + origin_host: "origin.example.com", + }; + + // Case 1: Standard HTTPS URL + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "src", + "https://www.googletagmanager.com/gtm.js?id=GTM-TEST", + &ctx, + ); + if let AttributeRewriteAction::Replace(val) = action { + assert_eq!(val, "/integrations/google_tag_manager/gtm.js?id=GTM-TEST"); + } else { + panic!("Expected Replace action for HTTPS URL, got {:?}", action); + } + + // Case 2: Protocol-relative URL + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "src", + "//www.googletagmanager.com/gtm.js?id=GTM-TEST", + &ctx, + ); + if let AttributeRewriteAction::Replace(val) = action { + assert_eq!(val, 
"/integrations/google_tag_manager/gtm.js?id=GTM-TEST"); + } else { + panic!( + "Expected Replace action for protocol-relative URL, got {:?}", + action + ); + } + + // Case 3: Other URL (should be kept) + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "src", + "https://other.com/script.js", + &ctx, + ); + assert!(matches!(action, AttributeRewriteAction::Keep)); + } + + #[test] + fn test_script_rewriter() { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST".to_string(), + upstream_url: "https://www.googletagmanager.com".to_string(), + }; + let integration = GoogleTagManagerIntegration::new(config); + let doc_state = IntegrationDocumentState::default(); + + let ctx = IntegrationScriptContext { + selector: "script", + request_host: "example.com", + request_scheme: "https", + origin_host: "origin.example.com", + is_last_in_text_node: true, + document_state: &doc_state, + }; + + // Case 1: Inline GTM snippet + let snippet = r#"(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': +new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], +j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= +'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); +})(window,document,'script','dataLayer','GTM-XXXX');"#; + + let action = IntegrationScriptRewriter::rewrite(&*integration, snippet, &ctx); + if let ScriptRewriteAction::Replace(val) = action { + assert!(val.contains("/integrations/google_tag_manager/gtm.js")); + assert!(!val.contains("https://www.googletagmanager.com/gtm.js")); + } else { + panic!("Expected Replace action for GTM snippet, got {:?}", action); + } + + // Case 2: Protocol relative + let snippet_proto = r#"j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;"#; + let action = IntegrationScriptRewriter::rewrite(&*integration, snippet_proto, &ctx); + if let ScriptRewriteAction::Replace(val) = action { + 
assert!(val.contains("/integrations/google_tag_manager/gtm.js")); + assert!(!val.contains("//www.googletagmanager.com/gtm.js")); + } else { + panic!( + "Expected Replace action for proto-relative snippet, got {:?}", + action + ); + } + + // Case 3: Irrelevant script + let other_script = "console.log('hello');"; + let action = IntegrationScriptRewriter::rewrite(&*integration, other_script, &ctx); + assert!(matches!(action, ScriptRewriteAction::Keep)); + } + + #[test] + fn test_default_configuration() { + let config = GoogleTagManagerConfig { + enabled: default_enabled(), + container_id: "GTM-DEFAULT".to_string(), + upstream_url: default_upstream(), + }; + + assert!(config.enabled); + assert_eq!(config.upstream_url, "https://www.googletagmanager.com"); + } + + #[test] + fn test_upstream_url_logic() { + // Default upstream + let config_default = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-123".to_string(), + upstream_url: "".to_string(), // Empty string should fallback to default in accessor + }; + let integration_default = GoogleTagManagerIntegration::new(config_default); + assert_eq!( + integration_default.upstream_url(), + "https://www.googletagmanager.com" + ); + + // Custom upstream + let config_custom = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-123".to_string(), + upstream_url: "https://gtm.example.com".to_string(), + }; + let integration_custom = GoogleTagManagerIntegration::new(config_custom); + assert_eq!(integration_custom.upstream_url(), "https://gtm.example.com"); + } + + #[test] + fn test_routes_registered() { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST".to_string(), + upstream_url: default_upstream(), + }; + let integration = GoogleTagManagerIntegration::new(config); + let routes = integration.routes(); + + // GTM.js, Gtag.js, and 4 Collect endpoints (GET/POST for standard & dual-tagging) + assert_eq!(routes.len(), 6); + + assert!(routes + .iter() + .any(|r| r.path == 
"/integrations/google_tag_manager/gtm.js")); + assert!(routes + .iter() + .any(|r| r.path == "/integrations/google_tag_manager/gtag/js")); + assert!(routes + .iter() + .any(|r| r.path == "/integrations/google_tag_manager/collect")); + assert!(routes + .iter() + .any(|r| r.path == "/integrations/google_tag_manager/g/collect")); + } + + #[test] + fn test_handle_response_rewriting() { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST".to_string(), + upstream_url: default_upstream(), + }; + let integration = GoogleTagManagerIntegration::new(config); + + let original_body = r#" + var x = "https://www.google-analytics.com/collect"; + var y = "https://www.googletagmanager.com/gtm.js"; + "#; + + let rewritten = integration.rewrite_gtm_script(original_body); + + assert!(rewritten.contains("/integrations/google_tag_manager/collect")); + assert!(rewritten.contains("/integrations/google_tag_manager/gtm.js")); + assert!(!rewritten.contains("https://www.google-analytics.com")); + } +} diff --git a/crates/common/src/integrations/mod.rs b/crates/common/src/integrations/mod.rs index af1b5ea1..196b732b 100644 --- a/crates/common/src/integrations/mod.rs +++ b/crates/common/src/integrations/mod.rs @@ -5,6 +5,7 @@ use crate::settings::Settings; pub mod adserver_mock; pub mod aps; pub mod didomi; +pub mod google_tag_manager; pub mod lockr; pub mod nextjs; pub mod permutive; @@ -30,5 +31,6 @@ pub(crate) fn builders() -> &'static [IntegrationBuilder] { permutive::register, lockr::register, didomi::register, + google_tag_manager::register, ] } diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md new file mode 100644 index 00000000..2d45cc7e --- /dev/null +++ b/docs/guide/integrations/google_tag_manager.md @@ -0,0 +1,95 @@ +# Google Tag Manager Integration + +**Category**: Tag Management +**Status**: Production +**Type**: First-Party Tag Gateway + +## Overview + +The Google Tag Manager (GTM) integration 
enables Trusted Server to act as a first-party proxy for GTM scripts and analytics beacons. This improves performance, tracking accuracy, and privacy control by serving these assets from your own domain. + +## What is the Tag Gateway? + +The Tag Gateway intercepts requests for GTM scripts (`gtm.js`) and Google Analytics beacons (`collect`). Instead of the user's browser connecting directly to Google content servers, it connects to your Trusted Server. Trusted Server then fetches the content from Google and serves it back to the user. + +**Benefits**: + +- **Bypass Ad Blockers**: Serving scripts from a first-party domain can prevent them from being blocked by some ad blockers and privacy extensions. +- **Extended Cookie Life**: First-party cookies set by these scripts are more durable in environments like Safari (ITP). +- **Performance**: Utilize edge caching for scripts. +- **Privacy Control**: Strips client IP addresses before forwarding data to Google. + +## Configuration + +Add the GTM configuration to `trusted-server.toml`: + +```toml +[integrations.google_tag_manager] +enabled = true +container_id = "GTM-XXXXXX" +# upstream_url = "https://www.googletagmanager.com" # Optional override +``` + +### Configuration Options + +| Field | Type | Required | Description | +| -------------- | ------ | -------- | ------------------------------------------------- | +| `enabled` | boolean| No | Enable/disable integration (default: `false`) | +| `container_id` | string | Yes | Your GTM Container ID (e.g., `GTM-A1B2C3`) | +| `upstream_url` | string | No | Custom upstream URL (advanced usage) | + +## How It Works + +### 1. Script Rewriting + +When Trusted Server processes an HTML response, it automatically rewrites GTM script tags: + +**Before:** +```html + +``` + +**After:** +```html + +``` + +### 2. Script Proxying + +When the browser requests `/integrations/google_tag_manager/gtm.js`: +1. Trusted Server fetches the original script from Google. +2. 
It modifies the script content on-the-fly to replace references to `www.google-analytics.com` and `www.googletagmanager.com` with the local proxy path. +3. It serves the modified script to the browser. + +### 3. Beacon Proxying + +Analytics data sent by the modified script is directed to: +`/integrations/google_tag_manager/collect` (or `/g/collect`) + +Trusted Server forwards these requests to Google's servers, ensuring the data is recorded successfully. + +## Manual Verification + +You can verify the integration using `curl`: + +**Test Script Proxy:** +```bash +curl -v "http://your-server.com/integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX" +``` +*Expected*: 200 OK, and the body content should contain rewritten paths. + +**Test Beacon:** +```bash +curl -v -X POST "http://your-server.com/integrations/google_tag_manager/g/collect?v=2&tid=G-XXXXXX..." +``` +*Expected*: 200/204 OK. + +## Implementation Details + +See [crates/common/src/integrations/google_tag_manager.rs](https://github.com/IABTechLab/trusted-server/blob/main/crates/common/src/integrations/google_tag_manager.rs). + +## Next Steps + +- Review [Prebid Integration](/guide/integrations/prebid) for header bidding. +- Check [Configuration Guide](/guide/configuration) for other integration settings. +- Learn more about [Synthetic IDs](/guide/synthetic-ids) which are generated alongside this integration. 
diff --git a/trusted-server.toml b/trusted-server.toml index 2e22c06c..3404a0db 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -96,6 +96,11 @@ pub_id = "your-aps-publisher-id" endpoint = "https://origin-mocktioneer.cdintel.com/e/dtb/bid" timeout_ms = 1000 +[integrations.google_tag_manager] +enabled = true +container_id = "GTM-XXXXXX" +# upstream_url = "https://www.googletagmanager.com" + [integrations.adserver_mock] enabled = false endpoint = "https://origin-mocktioneer.cdintel.com/adserver/mediate" From 348f150a8522dd6fb17e5d470795d409284ab08d Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Feb 2026 14:38:53 +0530 Subject: [PATCH 02/18] Fix linting errors in google_tag_manager.rs and google_tag_manager.md --- .../src/integrations/google_tag_manager.rs | 6 ++-- docs/guide/integrations/google_tag_manager.md | 29 ++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 1a00e42e..dceb181a 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -26,7 +26,7 @@ pub struct GoogleTagManagerConfig { /// GTM Container ID (e.g., "GTM-XXXXXX"). #[validate(length(min = 1))] pub container_id: String, - /// Upstream URL for GTM (defaults to https://www.googletagmanager.com). + /// Upstream URL for GTM (defaults to ). #[serde(default = "default_upstream")] pub upstream_url: String, } @@ -77,6 +77,7 @@ impl GoogleTagManagerIntegration { } } +#[must_use] pub fn build(settings: &Settings) -> Option> { let config = settings .integration_config::(GTM_INTEGRATION_ID) @@ -116,7 +117,8 @@ impl IntegrationProxy for GoogleTagManagerIntegration { self.get("/gtag/js"), // Analytics beacons (GA4/UA) // Note: In a real "Tag Gateway" implementation, we'd likely need - // to rewrite the GTM script to point these beacons to our proxy. 
+ // (e.g., `gtm.js` script tags), it will automatically rewrite the `src` attribute to point to + // the first-party proxy endpoint. self.get("/collect"), self.post("/collect"), self.get("/g/collect"), diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index 2d45cc7e..b130cbe4 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -14,10 +14,10 @@ The Tag Gateway intercepts requests for GTM scripts (`gtm.js`) and Google Analyt **Benefits**: -- **Bypass Ad Blockers**: Serving scripts from a first-party domain can prevent them from being blocked by some ad blockers and privacy extensions. -- **Extended Cookie Life**: First-party cookies set by these scripts are more durable in environments like Safari (ITP). -- **Performance**: Utilize edge caching for scripts. -- **Privacy Control**: Strips client IP addresses before forwarding data to Google. +- **Bypass Ad Blockers**: Serving scripts from a first-party domain can prevent them from being blocked by some ad blockers and privacy extensions. +- **Extended Cookie Life**: First-party cookies set by these scripts are more durable in environments like Safari (ITP). +- **Performance**: Utilize edge caching for scripts. +- **Privacy Control**: Strips client IP addresses before forwarding data to Google. 
## Configuration @@ -32,11 +32,11 @@ container_id = "GTM-XXXXXX" ### Configuration Options -| Field | Type | Required | Description | -| -------------- | ------ | -------- | ------------------------------------------------- | -| `enabled` | boolean| No | Enable/disable integration (default: `false`) | -| `container_id` | string | Yes | Your GTM Container ID (e.g., `GTM-A1B2C3`) | -| `upstream_url` | string | No | Custom upstream URL (advanced usage) | +| Field | Type | Required | Description | +| -------------- | ------- | -------- | --------------------------------------------- | +| `enabled` | boolean | No | Enable/disable integration (default: `false`) | +| `container_id` | string | Yes | Your GTM Container ID (e.g., `GTM-A1B2C3`) | +| `upstream_url` | string | No | Custom upstream URL (advanced usage) | ## How It Works @@ -45,11 +45,13 @@ container_id = "GTM-XXXXXX" When Trusted Server processes an HTML response, it automatically rewrites GTM script tags: **Before:** + ```html ``` **After:** + ```html ``` @@ -57,6 +59,7 @@ When Trusted Server processes an HTML response, it automatically rewrites GTM sc ### 2. Script Proxying When the browser requests `/integrations/google_tag_manager/gtm.js`: + 1. Trusted Server fetches the original script from Google. 2. It modifies the script content on-the-fly to replace references to `www.google-analytics.com` and `www.googletagmanager.com` with the local proxy path. 3. It serves the modified script to the browser. @@ -73,16 +76,20 @@ Trusted Server forwards these requests to Google's servers, ensuring the data is You can verify the integration using `curl`: **Test Script Proxy:** + ```bash curl -v "http://your-server.com/integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX" ``` -*Expected*: 200 OK, and the body content should contain rewritten paths. + +_Expected_: 200 OK, and the body content should contain rewritten paths. 
**Test Beacon:** + ```bash curl -v -X POST "http://your-server.com/integrations/google_tag_manager/g/collect?v=2&tid=G-XXXXXX..." ``` -*Expected*: 200/204 OK. + +_Expected_: 200/204 OK. ## Implementation Details From 4d543e480ba567bae473a6f40b2a5682a4ad1cb5 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Feb 2026 15:57:46 +0530 Subject: [PATCH 03/18] Add configuration and pipeline integration tests Adds comprehensive tests for: - GTM configuration parsing and default values - HTML processor pipeline integration - Response body rewriting logic --- .../src/integrations/google_tag_manager.rs | 131 +++++++++++++++++- 1 file changed, 126 insertions(+), 5 deletions(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index dceb181a..46d78b98 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -116,9 +116,7 @@ impl IntegrationProxy for GoogleTagManagerIntegration { // Proxy for the gtag script (if used) self.get("/gtag/js"), // Analytics beacons (GA4/UA) - // Note: In a real "Tag Gateway" implementation, we'd likely need - // (e.g., `gtm.js` script tags), it will automatically rewrite the `src` attribute to point to - // the first-party proxy endpoint. + // The GTM script is rewritten to point these beacons to our proxy. 
self.get("/collect"), self.post("/collect"), self.get("/g/collect"), @@ -254,11 +252,16 @@ impl IntegrationScriptRewriter for GoogleTagManagerIntegration { #[cfg(test)] mod tests { use super::*; + use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; use crate::integrations::{ AttributeRewriteAction, IntegrationAttributeContext, IntegrationAttributeRewriter, - IntegrationDocumentState, IntegrationScriptContext, IntegrationScriptRewriter, - ScriptRewriteAction, + IntegrationDocumentState, IntegrationRegistry, IntegrationScriptContext, + IntegrationScriptRewriter, ScriptRewriteAction, }; + use crate::settings::Settings; + use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; + use crate::test_support::tests::crate_test_settings_str; + use std::io::Cursor; #[test] fn test_attribute_rewriter() { @@ -451,4 +454,122 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= assert!(rewritten.contains("/integrations/google_tag_manager/gtm.js")); assert!(!rewritten.contains("https://www.google-analytics.com")); } + + fn make_settings() -> Settings { + Settings::from_toml(&crate_test_settings_str()).expect("should parse settings") + } + + fn config_from_settings( + settings: &Settings, + registry: &IntegrationRegistry, + ) -> HtmlProcessorConfig { + HtmlProcessorConfig::from_settings( + settings, + registry, + "origin.example.com", + "test.example.com", + "https", + ) + } + + #[test] + fn test_config_parsing() { + let toml_str = r#" +[publisher] +domain = "test-publisher.com" +cookie_domain = ".test-publisher.com" +origin_url = "https://origin.test-publisher.com" +proxy_secret = "test-secret" + +[synthetic] +counter_store = "test-counter-store" +opid_store = "test-opid-store" +secret_key = "test-secret-key" +template = "{{client_ip}}:{{user_agent}}" + +[integrations.google_tag_manager] +enabled = true +container_id = "GTM-PARSED" +upstream_url = "https://custom.gtm.example" +"#; + let settings = 
Settings::from_toml(toml_str).expect("should parse TOML"); + let config = settings + .integration_config::(GTM_INTEGRATION_ID) + .expect("should get config") + .expect("should be enabled"); + + assert!(config.enabled); + assert_eq!(config.container_id, "GTM-PARSED"); + assert_eq!(config.upstream_url, "https://custom.gtm.example"); + } + + #[test] + fn test_config_defaults() { + let toml_str = r#" +[publisher] +domain = "test-publisher.com" +cookie_domain = ".test-publisher.com" +origin_url = "https://origin.test-publisher.com" +proxy_secret = "test-secret" + +[synthetic] +counter_store = "test-counter-store" +opid_store = "test-opid-store" +secret_key = "test-secret-key" +template = "{{client_ip}}:{{user_agent}}" + +[integrations.google_tag_manager] +container_id = "GTM-DEFAULT" +"#; + let settings = Settings::from_toml(toml_str).expect("should parse TOML"); + let config = settings + .integration_config::(GTM_INTEGRATION_ID) + .expect("should get config") + .expect("should be enabled"); + + assert!(config.enabled); // Default is true + assert_eq!(config.container_id, "GTM-DEFAULT"); + assert_eq!(config.upstream_url, "https://www.googletagmanager.com"); // Default upstream + } + + #[test] + fn test_html_processor_pipeline_rewrites_gtm() { + let html = r#" + + "#; + + let mut settings = make_settings(); + // Enable GTM + settings + .integrations + .insert_config( + "google_tag_manager", + &serde_json::json!({ + "enabled": true, + "container_id": "GTM-TEST", + "upstream_url": "https://www.googletagmanager.com" + }), + ) + .expect("should update gtm config"); + + let registry = IntegrationRegistry::new(&settings).expect("should create registry"); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + 
let mut output = Vec::new(); + let result = pipeline.process(Cursor::new(html.as_bytes()), &mut output); + assert!(result.is_ok()); + + let processed = String::from_utf8_lossy(&output); + + // Verify rewrite happened + assert!(processed.contains("/integrations/google_tag_manager/gtm.js?id=GTM-TEST")); + assert!(!processed.contains("https://www.googletagmanager.com/gtm.js")); + } } From 4fd54b8f2cece128a62c0e219d904343e7c6adc4 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Feb 2026 21:22:52 +0530 Subject: [PATCH 04/18] Enhance GTM integration with caching, validation, and improved logging --- .../src/integrations/google_tag_manager.rs | 37 +++++++- docs/guide/integrations/google_tag_manager.md | 87 +++++++++++++++---- 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 46d78b98..b69404b6 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -28,6 +28,7 @@ pub struct GoogleTagManagerConfig { pub container_id: String, /// Upstream URL for GTM (defaults to ). #[serde(default = "default_upstream")] + #[validate(url)] pub upstream_url: String, } @@ -130,6 +131,9 @@ impl IntegrationProxy for GoogleTagManagerIntegration { req: Request, ) -> Result> { let path = req.get_path().to_string(); + let method = req.get_method(); + log::info!("Handling GTM request: {} {}", method, path); + let upstream_base = self.upstream_url(); // Construct full target URL @@ -154,6 +158,8 @@ impl IntegrationProxy for GoogleTagManagerIntegration { target_url = format!("{}?id={}", target_url, self.config.container_id); } + log::debug!("Proxying to upstream: {}", target_url); + let mut proxy_config = ProxyRequestConfig::new(&target_url); // If we are fetching gtm.js, we intend to rewrite the body. 
@@ -171,6 +177,7 @@ impl IntegrationProxy for GoogleTagManagerIntegration { // If we are serving gtm.js, we want to text-replace "www.google-analytics.com" // with our proxy details to route beacons through us. if path.ends_with("/gtm.js") { + log::info!("Rewriting GTM script content"); // Note: This is an expensive operation if the script is large. // Ideally should be streamed, but simple string replacement for now. let body_bytes = response.into_body_bytes(); @@ -179,7 +186,15 @@ impl IntegrationProxy for GoogleTagManagerIntegration { let rewritten_body = self.rewrite_gtm_script(&body_str); response = Response::from_body(rewritten_body) - .with_header(fastly::http::header::CONTENT_TYPE, "application/javascript"); + .with_header( + fastly::http::header::CONTENT_TYPE, + "application/javascript; charset=utf-8", + ) + // Enforce 1 hour cache TTL for the script, similar to Permutive + .with_header( + fastly::http::header::CACHE_CONTROL, + "public, max-age=3600, immutable", + ); } Ok(response) @@ -572,4 +587,24 @@ container_id = "GTM-DEFAULT" assert!(processed.contains("/integrations/google_tag_manager/gtm.js?id=GTM-TEST")); assert!(!processed.contains("https://www.googletagmanager.com/gtm.js")); } + + #[test] + fn test_headers() { + // This test simulates the header logic used in `handle` + // Since `handle` makes network calls, we can't easily unit test it without mocking. + // However, we can verify the logic constructs intended headers. 
+ + let response_headers = vec![ + ("cache-control", "public, max-age=3600, immutable"), + ("content-type", "application/javascript; charset=utf-8"), + ]; + + for (key, value) in response_headers { + match key { + "cache-control" => assert_eq!(value, "public, max-age=3600, immutable"), + "content-type" => assert_eq!(value, "application/javascript; charset=utf-8"), + _ => panic!("Unexpected header"), + } + } + } } diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index b130cbe4..39b9ea83 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -40,9 +40,25 @@ container_id = "GTM-XXXXXX" ## How It Works +```mermaid +flowchart TD + user["User Browser"] + server["Trusted Server"] + google["Google Servers
(gtm.js, collect)"] + + user -- "1. Request HTML" --> server + server -- "2. Rewrite HTML
(src=/integrations/...)" --> user + user -- "3. Request Script
(gtm.js w/ ID)" --> server + server -- "4. Fetch Script" --> google + google -- "5. Return Script" --> server + server -- "6. Rewrite Script Content
(replace www.google-analytics.com)" --> user + user -- "7. Send Beacon
(/collect w/ data)" --> server + server -- "8. Proxy Beacon" --> google +``` + ### 1. Script Rewriting -When Trusted Server processes an HTML response, it automatically rewrites GTM script tags: +When Trusted Server processes an HTML response, it automatically rewrites GTM script tags to point to the local proxy: **Before:** @@ -58,38 +74,75 @@ When Trusted Server processes an HTML response, it automatically rewrites GTM sc ### 2. Script Proxying -When the browser requests `/integrations/google_tag_manager/gtm.js`: +The proxy intercepts requests for the GTM library and modifies it on-the-fly. This is critical for First-Party context. -1. Trusted Server fetches the original script from Google. -2. It modifies the script content on-the-fly to replace references to `www.google-analytics.com` and `www.googletagmanager.com` with the local proxy path. -3. It serves the modified script to the browser. +1. **Fetch**: Retrieves the original `gtm.js` from Google. +2. **Rewrite**: Replaces hardcoded references to `www.google-analytics.com` and `www.googletagmanager.com` with the local proxy path. +3. **Serve**: Returns the modified script with correct caching headers. ### 3. Beacon Proxying -Analytics data sent by the modified script is directed to: -`/integrations/google_tag_manager/collect` (or `/g/collect`) +Analytics data (events, pageviews) normally sent to `google-analytics.com/collect` are now routed to: -Trusted Server forwards these requests to Google's servers, ensuring the data is recorded successfully. +`https://your-server.com/integrations/google_tag_manager/collect` -## Manual Verification +Trusted Server acts as a gateway, stripping client IP addresses (privacy) before forwarding the data to Google. -You can verify the integration using `curl`: +## Core Endpoints -**Test Script Proxy:** +### `GET .../gtm.js` - Script Proxy -```bash -curl -v "http://your-server.com/integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX" +Proxies the Google Tag Manager library. 
+ +**Request**: +``` +GET /integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX +``` + +**Behavior**: +- Proxies to `https://www.googletagmanager.com/gtm.js` +- Rewrites internal URLs to use the first-party proxy +- Strips `Accept-Encoding` during fetch to ensure rewriteable text response + +### `GET/POST .../collect` - Analytics Beacon + +Proxies analytics events (GA4/UA). + +**Request**: +``` +POST /integrations/google_tag_manager/g/collect?v=2&... ``` -_Expected_: 200 OK, and the body content should contain rewritten paths. +**Behavior**: +- Proxies to `https://www.google-analytics.com/g/collect` +- Forwarding: User-Agent, Referer, Payload +- Privacy: Does NOT forward client IP (Google sees Trusted Server IP) + +## Performance & Caching -**Test Beacon:** +### Compression +The integration requires the upstream `gtm.js` to be uncompressed to perform string replacement. Trusted Server fetches it with `Accept-Encoding: identity`. +*Note: Trusted Server will re-compress the response (gzip/brotli) before sending it to the user if the `compression` feature is enabled.* + +### Direct Proxying +Beacon requests (`/collect`) are proxied directly using streaming, minimizing latency overhead. + +## Manual Verification + +You can verify the integration using `curl`: + +**Test Script Result**: ```bash -curl -v -X POST "http://your-server.com/integrations/google_tag_manager/g/collect?v=2&tid=G-XXXXXX..." +curl -v "http://localhost:8080/integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX" ``` +_Expected_: `200 OK`. Body should contain `/integrations/google_tag_manager` instead of `google-analytics.com`. -_Expected_: 200/204 OK. +**Test Beacon Result**: +```bash +curl -v -X POST "http://localhost:8080/integrations/google_tag_manager/g/collect?v=2&tid=G-TEST" +``` +_Expected_: `200 OK` (or 204). 
## Implementation Details From 6486bd42451792ec724b68a130c30d6e1b95a85b Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Feb 2026 21:26:03 +0530 Subject: [PATCH 05/18] Enhance GTM integration with caching, validation, and improved logging --- docs/guide/integrations/google_tag_manager.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index 39b9ea83..e61067e3 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -95,11 +95,13 @@ Trusted Server acts as a gateway, stripping client IP addresses (privacy) before Proxies the Google Tag Manager library. **Request**: + ``` GET /integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX ``` **Behavior**: + - Proxies to `https://www.googletagmanager.com/gtm.js` - Rewrites internal URLs to use the first-party proxy - Strips `Accept-Encoding` during fetch to ensure rewriteable text response @@ -109,11 +111,13 @@ GET /integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX Proxies analytics events (GA4/UA). **Request**: + ``` POST /integrations/google_tag_manager/g/collect?v=2&... ``` **Behavior**: + - Proxies to `https://www.google-analytics.com/g/collect` - Forwarding: User-Agent, Referer, Payload - Privacy: Does NOT forward client IP (Google sees Trusted Server IP) @@ -121,11 +125,13 @@ POST /integrations/google_tag_manager/g/collect?v=2&... ## Performance & Caching ### Compression -The integration requires the upstream `gtm.js` to be uncompressed to perform string replacement. Trusted Server fetches it with `Accept-Encoding: identity`. -*Note: Trusted Server will re-compress the response (gzip/brotli) before sending it to the user if the `compression` feature is enabled.* +The integration requires the upstream `gtm.js` to be uncompressed to perform string replacement. Trusted Server fetches it with `Accept-Encoding: identity`. 
+ +_Note: Trusted Server will re-compress the response (gzip/brotli) before sending it to the user if the `compression` feature is enabled._ ### Direct Proxying + Beacon requests (`/collect`) are proxied directly using streaming, minimizing latency overhead. ## Manual Verification @@ -133,15 +139,19 @@ Beacon requests (`/collect`) are proxied directly using streaming, minimizing la You can verify the integration using `curl`: **Test Script Result**: + ```bash curl -v "http://localhost:8080/integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX" ``` + _Expected_: `200 OK`. Body should contain `/integrations/google_tag_manager` instead of `google-analytics.com`. **Test Beacon Result**: + ```bash curl -v -X POST "http://localhost:8080/integrations/google_tag_manager/g/collect?v=2&tid=G-TEST" ``` + _Expected_: `200 OK` (or 204). ## Implementation Details From a02b7d88b19641b674c9f4337f82f6e84a4de4c2 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 9 Feb 2026 22:16:09 +0530 Subject: [PATCH 06/18] Add integration tests for HTML processing and inline script rewriting in Google Tag Manager --- .../src/integrations/google_tag_manager.rs | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index b69404b6..94c95c92 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -607,4 +607,134 @@ container_id = "GTM-DEFAULT" } } } + + #[test] + fn test_html_processing_with_fixture() { + // 1. Configure Settings with GTM enabled + let mut settings = make_settings(); + + // Use the ID from the fixture: GTM-522ZT3X6 + settings + .integrations + .insert_config( + "google_tag_manager", + &serde_json::json!({ + "enabled": true, + "container_id": "GTM-522ZT3X6", + "upstream_url": "https://www.googletagmanager.com" + }), + ) + .expect("should update gtm config"); + + // 2. 
Setup Pipeline + let registry = IntegrationRegistry::new(&settings).expect("should create registry"); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + // 3. Load Fixture + // Path is relative to this file: ../html_processor.test.html + let html_content = include_str!("../html_processor.test.html"); + + // 4. Run Pipeline + let mut output = Vec::new(); + let result = pipeline.process(Cursor::new(html_content.as_bytes()), &mut output); + assert!( + result.is_ok(), + "Pipeline processing failed: {:?}", + result.err() + ); + + let processed = String::from_utf8_lossy(&output); + let encoded_id = urlencoding::encode("google_tag_manager"); + + // 5. Assertions + + // a. Link Preload Rewrite: + // Original: + + + + + + "#; + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html_input.as_bytes()), &mut output) + .expect("should process"); + let processed = String::from_utf8_lossy(&output); + + let encoded_id = urlencoding::encode("google_tag_manager"); + let expected_src = format!("/integrations/{}/gtm.js", encoded_id); + + assert!( + processed.contains(&expected_src), + "Inline script src not rewritten" + ); + + assert!( + !processed.contains("j.src='https://www.googletagmanager.com/gtm.js"), + "Original src should be gone" + ); + } } From 0d4a47e34d8fdc4160b0a83ec1b0a65369a9f725 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Fri, 13 Feb 2026 14:34:54 +0530 Subject: [PATCH 07/18] Enhance Google Tag Manager integration with regex-based URL rewriting, set default enablement to false, and update documentation for handling --- .../src/integrations/google_tag_manager.rs | 233 ++++++++++-------- docs/guide/integrations/google_tag_manager.md | 2 +- trusted-server.toml | 2 +- 3 files changed, 
132 insertions(+), 105 deletions(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 94c95c92..52881a50 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -1,9 +1,25 @@ +//! Google Tag Manager integration for first-party tag delivery. +//! +//! Proxies GTM scripts and Google Analytics beacons through the publisher's +//! domain, improving tracking accuracy and ad-blocker resistance. +//! +//! # Endpoints +//! +//! | Method | Path | Description | +//! |--------|------|-------------| +//! | `GET` | `.../gtm.js` | Proxies and rewrites the GTM script | +//! | `GET` | `.../gtag/js` | Proxies the gtag script | +//! | `GET/POST` | `.../collect` | Proxies GA analytics beacons | +//! | `GET/POST` | `.../g/collect` | Proxies GA4 analytics beacons | + use std::sync::Arc; use async_trait::async_trait; -use error_stack::Report; +use error_stack::{Report, ResultExt}; use fastly::http::StatusCode; use fastly::{Request, Response}; +use once_cell::sync::Lazy; +use regex::Regex; use serde::{Deserialize, Serialize}; use validator::Validate; @@ -19,6 +35,20 @@ use crate::settings::{IntegrationConfig, Settings}; const GTM_INTEGRATION_ID: &str = "google_tag_manager"; const DEFAULT_UPSTREAM: &str = "https://www.googletagmanager.com"; +/// Regex pattern for matching and rewriting GTM and Google Analytics URLs. +/// +/// Handles all URL variants: +/// - `https://www.googletagmanager.com/gtm.js?id=...` +/// - `//www.googletagmanager.com/gtm.js?id=...` +/// - `https://www.google-analytics.com/collect` +/// - `//www.google-analytics.com/g/collect` +/// +/// The replacement target is `/integrations/google_tag_manager`. 
+static GTM_URL_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r"(https?:)?//www\.(googletagmanager|google-analytics)\.com") + .expect("GTM URL regex should compile") +}); + #[derive(Debug, Clone, Deserialize, Serialize, Validate)] pub struct GoogleTagManagerConfig { #[serde(default = "default_enabled")] @@ -39,7 +69,7 @@ impl IntegrationConfig for GoogleTagManagerConfig { } fn default_enabled() -> bool { - true + false } fn default_upstream() -> String { @@ -55,6 +85,13 @@ impl GoogleTagManagerIntegration { Arc::new(Self { config }) } + fn error(message: impl Into) -> TrustedServerError { + TrustedServerError::Integration { + integration: GTM_INTEGRATION_ID.to_string(), + message: message.into(), + } + } + fn upstream_url(&self) -> &str { if self.config.upstream_url.is_empty() { DEFAULT_UPSTREAM @@ -63,31 +100,27 @@ impl GoogleTagManagerIntegration { } } - fn rewrite_gtm_script(&self, content: &str) -> String { - // Rewrite 'www.google-analytics.com' to point to this server's proxy path - // path would be /integrations/google_tag_manager - let my_integration_path = format!("/integrations/{}", GTM_INTEGRATION_ID); - - // Simplistic replacements - mimic what Cloudflare/others do - // Replacements depend on exactly how the string appears in the minified JS. - // Common target: "https://www.google-analytics.com" - let mut new_content = - content.replace("https://www.google-analytics.com", &my_integration_path); - new_content = new_content.replace("https://www.googletagmanager.com", &my_integration_path); - new_content + /// Rewrite GTM and Google Analytics URLs to first-party proxy paths. + /// + /// Uses [`GTM_URL_PATTERN`] to handle all URL variants (https, protocol-relative) + /// for both `googletagmanager.com` and `google-analytics.com`. 
+ fn rewrite_gtm_urls(content: &str) -> String { + let replacement = format!("/integrations/{}", GTM_INTEGRATION_ID); + GTM_URL_PATTERN + .replace_all(content, replacement.as_str()) + .into_owned() } } -#[must_use] -pub fn build(settings: &Settings) -> Option> { - let config = settings - .integration_config::(GTM_INTEGRATION_ID) - .ok() - .flatten()?; - - if !config.enabled { - return None; - } +fn build(settings: &Settings) -> Option> { + let config = match settings.integration_config::(GTM_INTEGRATION_ID) { + Ok(Some(config)) => config, + Ok(None) => return None, + Err(err) => { + log::error!("Failed to load GTM integration config: {err:?}"); + return None; + } + }; Some(GoogleTagManagerIntegration::new(config)) } @@ -132,7 +165,7 @@ impl IntegrationProxy for GoogleTagManagerIntegration { ) -> Result> { let path = req.get_path().to_string(); let method = req.get_method(); - log::info!("Handling GTM request: {} {}", method, path); + log::debug!("Handling GTM request: {} {}", method, path); let upstream_base = self.upstream_url(); @@ -142,6 +175,8 @@ impl IntegrationProxy for GoogleTagManagerIntegration { } else if path.ends_with("/gtag/js") { format!("{}/gtag/js", upstream_base) } else if path.ends_with("/collect") { + // Analytics beacons always go to google-analytics.com, not the + // configurable upstream_url (which is for googletagmanager.com). if path.contains("/g/") { "https://www.google-analytics.com/g/collect".to_string() } else { @@ -171,30 +206,26 @@ impl IntegrationProxy for GoogleTagManagerIntegration { ); } - let mut response = proxy_request(settings, req, proxy_config).await?; + let mut response = proxy_request(settings, req, proxy_config) + .await + .change_context(Self::error("Failed to proxy GTM request"))?; - // Rewrite logic (Primitive version) - // If we are serving gtm.js, we want to text-replace "www.google-analytics.com" - // with our proxy details to route beacons through us. 
+ // If we are serving gtm.js, rewrite internal URLs to route beacons through us. if path.ends_with("/gtm.js") { - log::info!("Rewriting GTM script content"); - // Note: This is an expensive operation if the script is large. - // Ideally should be streamed, but simple string replacement for now. - let body_bytes = response.into_body_bytes(); - let body_str = String::from_utf8_lossy(&body_bytes).to_string(); - - let rewritten_body = self.rewrite_gtm_script(&body_str); + if !response.get_status().is_success() { + log::warn!("GTM upstream returned status {}", response.get_status()); + return Ok(response); + } + log::debug!("Rewriting GTM script content"); + let body_str = response.take_body_str(); + let rewritten_body = Self::rewrite_gtm_urls(&body_str); response = Response::from_body(rewritten_body) .with_header( fastly::http::header::CONTENT_TYPE, "application/javascript; charset=utf-8", ) - // Enforce 1 hour cache TTL for the script, similar to Permutive - .with_header( - fastly::http::header::CACHE_CONTROL, - "public, max-age=3600, immutable", - ); + .with_header(fastly::http::header::CACHE_CONTROL, "public, max-age=3600"); } Ok(response) @@ -217,17 +248,7 @@ impl IntegrationAttributeRewriter for GoogleTagManagerIntegration { _ctx: &IntegrationAttributeContext<'_>, ) -> AttributeRewriteAction { if attr_value.contains("googletagmanager.com/gtm.js") { - let encoded_integration_id = urlencoding::encode(self.integration_name()); - let mut new_value = attr_value.replace( - "https://www.googletagmanager.com/gtm.js", - &format!("/integrations/{}/gtm.js", encoded_integration_id), - ); - new_value = new_value.replace( - "//www.googletagmanager.com/gtm.js", - &format!("/integrations/{}/gtm.js", encoded_integration_id), - ); - - AttributeRewriteAction::replace(new_value) + AttributeRewriteAction::replace(Self::rewrite_gtm_urls(attr_value)) } else { AttributeRewriteAction::keep() } @@ -247,17 +268,7 @@ impl IntegrationScriptRewriter for GoogleTagManagerIntegration { // Look 
for the GTM snippet pattern. // Standard snippet contains: "googletagmanager.com/gtm.js" if content.contains("googletagmanager.com/gtm.js") { - let encoded_integration_id = urlencoding::encode(self.integration_name()); - let my_integration_path = format!("/integrations/{}/gtm.js", encoded_integration_id); - - let mut new_content = content.replace( - "https://www.googletagmanager.com/gtm.js", - &my_integration_path, - ); - new_content = - new_content.replace("//www.googletagmanager.com/gtm.js", &my_integration_path); - - return ScriptRewriteAction::replace(new_content); + return ScriptRewriteAction::replace(Self::rewrite_gtm_urls(content)); } ScriptRewriteAction::keep() @@ -278,6 +289,33 @@ mod tests { use crate::test_support::tests::crate_test_settings_str; use std::io::Cursor; + #[test] + fn test_rewrite_gtm_urls() { + // All URL patterns should be rewritten via the shared regex + let input = r#" + var a = "https://www.googletagmanager.com/gtm.js"; + var b = "//www.googletagmanager.com/gtm.js"; + var c = "https://www.google-analytics.com/collect"; + var d = "//www.google-analytics.com/g/collect"; + var e = "http://www.googletagmanager.com/gtm.js"; + "#; + + let result = GoogleTagManagerIntegration::rewrite_gtm_urls(input); + + assert!(result.contains("/integrations/google_tag_manager/gtm.js")); + assert!(result.contains("/integrations/google_tag_manager/collect")); + assert!(result.contains("/integrations/google_tag_manager/g/collect")); + assert!(!result.contains("www.googletagmanager.com")); + assert!(!result.contains("www.google-analytics.com")); + } + + #[test] + fn test_rewrite_preserves_non_gtm_urls() { + let input = r#"var x = "https://example.com/script.js";"#; + let result = GoogleTagManagerIntegration::rewrite_gtm_urls(input); + assert_eq!(input, result); + } + #[test] fn test_attribute_rewriter() { let config = GoogleTagManagerConfig { @@ -394,7 +432,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= upstream_url: 
default_upstream(), }; - assert!(config.enabled); + assert!(!config.enabled); assert_eq!(config.upstream_url, "https://www.googletagmanager.com"); } @@ -451,19 +489,12 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= #[test] fn test_handle_response_rewriting() { - let config = GoogleTagManagerConfig { - enabled: true, - container_id: "GTM-TEST".to_string(), - upstream_url: default_upstream(), - }; - let integration = GoogleTagManagerIntegration::new(config); - let original_body = r#" var x = "https://www.google-analytics.com/collect"; var y = "https://www.googletagmanager.com/gtm.js"; "#; - let rewritten = integration.rewrite_gtm_script(original_body); + let rewritten = GoogleTagManagerIntegration::rewrite_gtm_urls(original_body); assert!(rewritten.contains("/integrations/google_tag_manager/collect")); assert!(rewritten.contains("/integrations/google_tag_manager/gtm.js")); @@ -539,12 +570,15 @@ container_id = "GTM-DEFAULT" let settings = Settings::from_toml(toml_str).expect("should parse TOML"); let config = settings .integration_config::(GTM_INTEGRATION_ID) - .expect("should get config") - .expect("should be enabled"); + .expect("should get config"); - assert!(config.enabled); // Default is true - assert_eq!(config.container_id, "GTM-DEFAULT"); - assert_eq!(config.upstream_url, "https://www.googletagmanager.com"); // Default upstream + // Default is now false, so integration_config returns None for disabled + // When we explicitly parse the config with container_id but no enabled field, + // the config is present but disabled + assert!( + config.is_none(), + "Config with default enabled=false should return None from integration_config" + ); } #[test] @@ -588,26 +622,6 @@ container_id = "GTM-DEFAULT" assert!(!processed.contains("https://www.googletagmanager.com/gtm.js")); } - #[test] - fn test_headers() { - // This test simulates the header logic used in `handle` - // Since `handle` makes network calls, we can't easily unit test it without 
mocking. - // However, we can verify the logic constructs intended headers. - - let response_headers = vec![ - ("cache-control", "public, max-age=3600, immutable"), - ("content-type", "application/javascript; charset=utf-8"), - ]; - - for (key, value) in response_headers { - match key { - "cache-control" => assert_eq!(value, "public, max-age=3600, immutable"), - "content-type" => assert_eq!(value, "application/javascript; charset=utf-8"), - _ => panic!("Unexpected header"), - } - } - } - #[test] fn test_html_processing_with_fixture() { // 1. Configure Settings with GTM enabled @@ -651,17 +665,16 @@ container_id = "GTM-DEFAULT" ); let processed = String::from_utf8_lossy(&output); - let encoded_id = urlencoding::encode("google_tag_manager"); // 5. Assertions // a. Link Preload Rewrite: // Original: { + assert_eq!(integration, "google_tag_manager"); + assert_eq!(message, "test failure"); + } + other => panic!("Expected Integration error, got {:?}", other), + } + } } diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index e61067e3..5b918768 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -104,7 +104,7 @@ GET /integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX - Proxies to `https://www.googletagmanager.com/gtm.js` - Rewrites internal URLs to use the first-party proxy -- Strips `Accept-Encoding` during fetch to ensure rewriteable text response +- Sets `Accept-Encoding: identity` during fetch to ensure rewriteable text response ### `GET/POST .../collect` - Analytics Beacon diff --git a/trusted-server.toml b/trusted-server.toml index 1e47c009..24497986 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -115,7 +115,7 @@ endpoint = "https://origin-mocktioneer.cdintel.com/e/dtb/bid" timeout_ms = 1000 [integrations.google_tag_manager] -enabled = true +enabled = false container_id = "GTM-XXXXXX" # upstream_url = 
"https://www.googletagmanager.com" From 4f33b266e4e2253ed5426624831a6cfcdbe4c91d Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 16 Feb 2026 13:16:32 +0530 Subject: [PATCH 08/18] Add configurable cache max-age for GTM scripts and disable synthetic ID forwarding for GTM proxy requests --- .../src/integrations/google_tag_manager.rs | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 52881a50..437f85d6 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -60,6 +60,10 @@ pub struct GoogleTagManagerConfig { #[serde(default = "default_upstream")] #[validate(url)] pub upstream_url: String, + /// Cache max-age in seconds for the rewritten GTM script (default: 900 to match Google's default). + #[serde(default = "default_cache_max_age")] + #[validate(range(min = 60, max = 86400))] + pub cache_max_age: u32, } impl IntegrationConfig for GoogleTagManagerConfig { @@ -76,6 +80,10 @@ fn default_upstream() -> String { DEFAULT_UPSTREAM.to_string() } +fn default_cache_max_age() -> u32 { + 900 // Match Google's default +} + pub struct GoogleTagManagerIntegration { config: GoogleTagManagerConfig, } @@ -196,6 +204,8 @@ impl IntegrationProxy for GoogleTagManagerIntegration { log::debug!("Proxying to upstream: {}", target_url); let mut proxy_config = ProxyRequestConfig::new(&target_url); + // Do not forward the synthetic ID to external Google services. + proxy_config.forward_synthetic_id = false; // If we are fetching gtm.js, we intend to rewrite the body. // We must ensure the upstream returns uncompressed content. 
@@ -225,7 +235,10 @@ impl IntegrationProxy for GoogleTagManagerIntegration { fastly::http::header::CONTENT_TYPE, "application/javascript; charset=utf-8", ) - .with_header(fastly::http::header::CACHE_CONTROL, "public, max-age=3600"); + .with_header( + fastly::http::header::CACHE_CONTROL, + format!("public, max-age={}", self.config.cache_max_age), + ); } Ok(response) @@ -322,6 +335,7 @@ mod tests { enabled: true, container_id: "GTM-TEST".to_string(), upstream_url: "https://www.googletagmanager.com".to_string(), + cache_max_age: default_cache_max_age(), }; let integration = GoogleTagManagerIntegration::new(config); @@ -377,6 +391,7 @@ mod tests { enabled: true, container_id: "GTM-TEST".to_string(), upstream_url: "https://www.googletagmanager.com".to_string(), + cache_max_age: default_cache_max_age(), }; let integration = GoogleTagManagerIntegration::new(config); let doc_state = IntegrationDocumentState::default(); @@ -430,6 +445,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= enabled: default_enabled(), container_id: "GTM-DEFAULT".to_string(), upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), }; assert!(!config.enabled); @@ -443,6 +459,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= enabled: true, container_id: "GTM-123".to_string(), upstream_url: "".to_string(), // Empty string should fallback to default in accessor + cache_max_age: default_cache_max_age(), }; let integration_default = GoogleTagManagerIntegration::new(config_default); assert_eq!( @@ -455,6 +472,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= enabled: true, container_id: "GTM-123".to_string(), upstream_url: "https://gtm.example.com".to_string(), + cache_max_age: default_cache_max_age(), }; let integration_custom = GoogleTagManagerIntegration::new(config_custom); assert_eq!(integration_custom.upstream_url(), "https://gtm.example.com"); @@ -466,6 +484,7 @@ 
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= enabled: true, container_id: "GTM-TEST".to_string(), upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), }; let integration = GoogleTagManagerIntegration::new(config); let routes = integration.routes(); From 61125a2c320bade86b68f3cee8c1f06a9b609b1b Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 16 Feb 2026 21:36:05 +0530 Subject: [PATCH 09/18] feat: Add gtag.js support and refactor GTM proxy logic into helper functions, improving request configuration for beacons and scripts --- .../src/integrations/google_tag_manager.rs | 217 ++++++++++++++---- 1 file changed, 175 insertions(+), 42 deletions(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 437f85d6..108234e2 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -16,7 +16,7 @@ use std::sync::Arc; use async_trait::async_trait; use error_stack::{Report, ResultExt}; -use fastly::http::StatusCode; +use fastly::http::{Method, StatusCode}; use fastly::{Request, Response}; use once_cell::sync::Lazy; use regex::Regex; @@ -118,6 +118,69 @@ impl GoogleTagManagerIntegration { .replace_all(content, replacement.as_str()) .into_owned() } + + fn is_rewritable_script(&self, path: &str) -> bool { + path.ends_with("/gtm.js") || path.ends_with("/gtag/js") || path.ends_with("/gtag.js") + } + + fn build_target_url(&self, req: &Request, path: &str) -> Option { + let upstream_base = self.upstream_url(); + + let mut target_url = if path.ends_with("/gtm.js") { + format!("{}/gtm.js", upstream_base) + } else if path.ends_with("/gtag/js") || path.ends_with("/gtag.js") { + format!("{}/gtag/js", upstream_base) // Always normalize to /gtag/js upstream as it's canonical + } else if path.ends_with("/collect") { + if path.contains("/g/") { + "https://www.google-analytics.com/g/collect".to_string() + } 
else { + "https://www.google-analytics.com/collect".to_string() + } + } else { + return None; + }; + + if let Some(query) = req.get_url().query() { + target_url = format!("{}?{}", target_url, query); + } else if path.ends_with("/gtm.js") { + target_url = format!("{}?id={}", target_url, self.config.container_id); + } + + Some(target_url) + } + + fn build_proxy_config<'a>( + &self, + path: &str, + req: &mut Request, + target_url: &'a str, + ) -> ProxyRequestConfig<'a> { + let mut proxy_config = ProxyRequestConfig::new(target_url); + proxy_config.forward_synthetic_id = false; + + // If it's a POST request (e.g. /collect beacon), we must manually attach the body + // because ProxyRequestConfig doesn't automatically copy it from the source request. + if req.get_method() == Method::POST { + let body_bytes = req.take_body_bytes(); + proxy_config.body = Some(body_bytes); + } + + // Explicitly strip X-Forwarded-For to prevent client IP leakage to Google. + // The empty value will override any existing header during proxy forwarding. + proxy_config = proxy_config.with_header( + crate::constants::HEADER_X_FORWARDED_FOR, + fastly::http::HeaderValue::from_static(""), + ); + + if self.is_rewritable_script(path) { + proxy_config = proxy_config.with_header( + fastly::http::header::ACCEPT_ENCODING, + fastly::http::HeaderValue::from_static("identity"), + ); + } + + proxy_config + } } fn build(settings: &Settings) -> Option> { @@ -157,6 +220,7 @@ impl IntegrationProxy for GoogleTagManagerIntegration { self.get("/gtm.js"), // Proxy for the gtag script (if used) self.get("/gtag/js"), + self.get("/gtag.js"), // Analytics beacons (GA4/UA) // The GTM script is rewritten to point these beacons to our proxy. 
self.get("/collect"), @@ -169,64 +233,31 @@ impl IntegrationProxy for GoogleTagManagerIntegration { async fn handle( &self, settings: &Settings, - req: Request, + mut req: Request, ) -> Result> { let path = req.get_path().to_string(); let method = req.get_method(); log::debug!("Handling GTM request: {} {}", method, path); - let upstream_base = self.upstream_url(); - - // Construct full target URL - let mut target_url = if path.ends_with("/gtm.js") { - format!("{}/gtm.js", upstream_base) - } else if path.ends_with("/gtag/js") { - format!("{}/gtag/js", upstream_base) - } else if path.ends_with("/collect") { - // Analytics beacons always go to google-analytics.com, not the - // configurable upstream_url (which is for googletagmanager.com). - if path.contains("/g/") { - "https://www.google-analytics.com/g/collect".to_string() - } else { - "https://www.google-analytics.com/collect".to_string() - } - } else { + let Some(target_url) = self.build_target_url(&req, &path) else { return Ok(Response::from_status(StatusCode::NOT_FOUND)); }; - // Append query params if present, or add default ID for gtm.js - if let Some(query) = req.get_url().query() { - target_url = format!("{}?{}", target_url, query); - } else if path.ends_with("/gtm.js") { - target_url = format!("{}?id={}", target_url, self.config.container_id); - } - log::debug!("Proxying to upstream: {}", target_url); - let mut proxy_config = ProxyRequestConfig::new(&target_url); - // Do not forward the synthetic ID to external Google services. - proxy_config.forward_synthetic_id = false; - - // If we are fetching gtm.js, we intend to rewrite the body. - // We must ensure the upstream returns uncompressed content. 
- if path.ends_with("/gtm.js") { - proxy_config = proxy_config.with_header( - fastly::http::header::ACCEPT_ENCODING, - fastly::http::HeaderValue::from_static("identity"), - ); - } + let proxy_config = self.build_proxy_config(&path, &mut req, &target_url); let mut response = proxy_request(settings, req, proxy_config) .await .change_context(Self::error("Failed to proxy GTM request"))?; - // If we are serving gtm.js, rewrite internal URLs to route beacons through us. - if path.ends_with("/gtm.js") { + // If we are serving gtm.js or gtag.js, rewrite internal URLs to route beacons through us. + if self.is_rewritable_script(&path) { if !response.get_status().is_success() { log::warn!("GTM upstream returned status {}", response.get_status()); return Ok(response); } - log::debug!("Rewriting GTM script content"); + log::debug!("Rewriting GTM/gtag script content"); let body_str = response.take_body_str(); let rewritten_body = Self::rewrite_gtm_urls(&body_str); @@ -299,7 +330,9 @@ mod tests { }; use crate::settings::Settings; use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; + use crate::test_support::tests::crate_test_settings_str; + use fastly::http::Method; use std::io::Cursor; #[test] @@ -489,8 +522,8 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= let integration = GoogleTagManagerIntegration::new(config); let routes = integration.routes(); - // GTM.js, Gtag.js, and 4 Collect endpoints (GET/POST for standard & dual-tagging) - assert_eq!(routes.len(), 6); + // GTM.js, Gtag.js (/js and .js), and 4 Collect endpoints (GET/POST for standard & dual-tagging) + assert_eq!(routes.len(), 7); assert!(routes .iter() @@ -498,6 +531,9 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= assert!(routes .iter() .any(|r| r.path == "/integrations/google_tag_manager/gtag/js")); + assert!(routes + .iter() + .any(|r| r.path == "/integrations/google_tag_manager/gtag.js")); assert!(routes .iter() .any(|r| r.path == 
"/integrations/google_tag_manager/collect")); @@ -506,6 +542,103 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= .any(|r| r.path == "/integrations/google_tag_manager/g/collect")); } + #[test] + fn test_post_collect_proxy_config_includes_payload() { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + }; + let integration = GoogleTagManagerIntegration::new(config); + + let payload = b"v=2&tid=G-TEST&cid=123&en=page_view".to_vec(); + let mut req = Request::new( + Method::POST, + "https://edge.example.com/integrations/google_tag_manager/g/collect?v=2&tid=G-TEST", + ); + req.set_body(payload.clone()); + + let path = req.get_path().to_string(); + let target_url = integration + .build_target_url(&req, &path) + .expect("should resolve collect target URL"); + let proxy_config = integration.build_proxy_config(&path, &mut req, &target_url); + + assert_eq!( + proxy_config.body.as_deref(), + Some(payload.as_slice()), + "collect POST should forward payload body" + ); + } + + #[test] + fn test_collect_proxy_config_strips_client_ip_forwarding() { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + }; + let integration = GoogleTagManagerIntegration::new(config); + + let mut req = Request::new( + Method::GET, + "https://edge.example.com/integrations/google_tag_manager/collect?v=2", + ); + req.set_header(crate::constants::HEADER_X_FORWARDED_FOR, "198.51.100.42"); + + let path = req.get_path().to_string(); + let target_url = integration + .build_target_url(&req, &path) + .expect("should resolve collect target URL"); + let proxy_config = integration.build_proxy_config(&path, &mut req, &target_url); + + // We check if X-Forwarded-For is explicitly overridden with an empty string, + // which effectively strips it 
during proxy forwarding due to header override logic. + let has_header_override = proxy_config.headers.iter().any(|(name, value)| { + name.as_str() + .eq_ignore_ascii_case(crate::constants::HEADER_X_FORWARDED_FOR.as_str()) + && value == "" + }); + + assert!( + has_header_override, + "collect routes should strip client IP by overriding X-Forwarded-For with empty string" + ); + } + + #[test] + fn test_gtag_proxy_config_requests_identity_encoding() { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GT-123".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + }; + let integration = GoogleTagManagerIntegration::new(config); + + let mut req = Request::new( + Method::GET, + "https://edge.example.com/integrations/google_tag_manager/gtag/js?id=G-123", + ); + + let path = req.get_path().to_string(); + let target_url = integration + .build_target_url(&req, &path) + .expect("should resolve gtag target URL"); + let proxy_config = integration.build_proxy_config(&path, &mut req, &target_url); + + let has_identity = proxy_config.headers.iter().any(|(name, value)| { + name == fastly::http::header::ACCEPT_ENCODING && value == "identity" + }); + + assert!( + has_identity, + "gtag/js requests should force Accept-Encoding: identity for rewriting" + ); + } + #[test] fn test_handle_response_rewriting() { let original_body = r#" From 6a96c7f730bcf3e03dcb5b1c091ebaefa32cb732 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Mon, 16 Feb 2026 21:45:45 +0530 Subject: [PATCH 10/18] fix: lint error on ci --- crates/common/src/integrations/google_tag_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 108234e2..587b3c0b 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -599,7 +599,7 @@ 
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= let has_header_override = proxy_config.headers.iter().any(|(name, value)| { name.as_str() .eq_ignore_ascii_case(crate::constants::HEADER_X_FORWARDED_FOR.as_str()) - && value == "" + && value.is_empty() }); assert!( From 76b20be7a030a78eefb671576a19c8cb65bf9f23 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Tue, 17 Feb 2026 18:55:30 +0530 Subject: [PATCH 11/18] Docs fix to clarify client IP handling and privacy enhancement for Google Tag Manager integration --- docs/guide/integrations/google_tag_manager.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index 5b918768..ddba8a1d 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -17,7 +17,7 @@ The Tag Gateway intercepts requests for GTM scripts (`gtm.js`) and Google Analyt - **Bypass Ad Blockers**: Serving scripts from a first-party domain can prevent them from being blocked by some ad blockers and privacy extensions. - **Extended Cookie Life**: First-party cookies set by these scripts are more durable in environments like Safari (ITP). - **Performance**: Utilize edge caching for scripts. -- **Privacy Control**: Strips client IP addresses before forwarding data to Google. +- **Privacy Enhancement**: Does not forward client IP to Google (Google sees edge server IP, not user IP). ## Configuration @@ -86,7 +86,7 @@ Analytics data (events, pageviews) normally sent to `google-analytics.com/collec `https://your-server.com/integrations/google_tag_manager/collect` -Trusted Server acts as a gateway, stripping client IP addresses (privacy) before forwarding the data to Google. +Trusted Server acts as a privacy-enhancing gateway. Client IP addresses are not forwarded to Google — Google sees only the edge server IP, not the actual user IP. 
## Core Endpoints From 72bacca7dcdad064900c74fdde91a49bcbe35694 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Fri, 20 Feb 2026 19:39:11 +0530 Subject: [PATCH 12/18] Broaden GTM attribute rewriter, add script guard, and harden regex - Widen IntegrationAttributeRewriter to rewrite href/src for gtag/js and google-analytics.com URLs (not just gtm.js), fixing tags not being rewritten on Next.js sites - Add client-side script guard for dynamically inserted GTM/GA scripts using the shared createScriptGuard factory (matches DataDome pattern) - Harden URL regex with delimiter capture group to prevent subdomain spoofing (e.g., www.googletagmanager.com.evil.com) - Add is_rewritable_url helper to selectively rewrite only URLs with corresponding proxy routes (excludes ns.html) - Document gtag/js endpoint in integration guide --- .../src/integrations/google_tag_manager.rs | 77 ++++- .../integrations/google_tag_manager/index.ts | 24 ++ .../google_tag_manager/script_guard.ts | 79 +++++ .../google_tag_manager/script_guard.test.ts | 316 ++++++++++++++++++ docs/guide/integrations/google_tag_manager.md | 16 + 5 files changed, 506 insertions(+), 6 deletions(-) create mode 100644 crates/js/lib/src/integrations/google_tag_manager/index.ts create mode 100644 crates/js/lib/src/integrations/google_tag_manager/script_guard.ts create mode 100644 crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 587b3c0b..d19323e5 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -42,10 +42,14 @@ const DEFAULT_UPSTREAM: &str = "https://www.googletagmanager.com"; /// - `//www.googletagmanager.com/gtm.js?id=...` /// - `https://www.google-analytics.com/collect` /// - `//www.google-analytics.com/g/collect` +/// - `"www.googletagmanager.com"` (bare domain in GTM JSON config data) /// -/// The 
replacement target is `/integrations/google_tag_manager`. +/// Captures a trailing delimiter (`/` or `"`) in group 2 to prevent false matches +/// on subdomains (e.g., `www.googletagmanager.com.evil.com`). +/// +/// The replacement target is `/integrations/google_tag_manager` + the captured delimiter. static GTM_URL_PATTERN: Lazy = Lazy::new(|| { - Regex::new(r"(https?:)?//www\.(googletagmanager|google-analytics)\.com") + Regex::new(r#"(?:https?:)?(?://)?www\.(googletagmanager|google-analytics)\.com([/"])"#) .expect("GTM URL regex should compile") }); @@ -113,12 +117,30 @@ impl GoogleTagManagerIntegration { /// Uses [`GTM_URL_PATTERN`] to handle all URL variants (https, protocol-relative) /// for both `googletagmanager.com` and `google-analytics.com`. fn rewrite_gtm_urls(content: &str) -> String { - let replacement = format!("/integrations/{}", GTM_INTEGRATION_ID); + let replacement = format!("/integrations/{}$2", GTM_INTEGRATION_ID); GTM_URL_PATTERN .replace_all(content, replacement.as_str()) .into_owned() } + /// Whether an attribute value URL should be rewritten to first-party. + /// Only matches URLs for which we have corresponding proxy routes + /// (gtm.js, gtag/js, collect, g/collect). Excludes ns.html and other + /// GTM endpoints we don't proxy. + fn is_rewritable_url(url: &str) -> bool { + // Match googletagmanager.com URLs for scripts we proxy + if url.contains("googletagmanager.com") { + return url.contains("/gtm.js") || url.contains("/gtag/js") || url.contains("/gtag.js"); + } + // Match google-analytics.com URLs for beacons we proxy + if url.contains("google-analytics.com") { + return url.contains("/collect") || url.contains("/g/collect"); + } + false + } + + /// Both `/gtag/js` (canonical) and `/gtag.js` (alternate) are accepted; + /// upstream always normalizes to `/gtag/js`. 
fn is_rewritable_script(&self, path: &str) -> bool { path.ends_with("/gtm.js") || path.ends_with("/gtag/js") || path.ends_with("/gtag.js") } @@ -291,7 +313,7 @@ impl IntegrationAttributeRewriter for GoogleTagManagerIntegration { attr_value: &str, _ctx: &IntegrationAttributeContext<'_>, ) -> AttributeRewriteAction { - if attr_value.contains("googletagmanager.com/gtm.js") { + if Self::is_rewritable_url(attr_value) { AttributeRewriteAction::replace(Self::rewrite_gtm_urls(attr_value)) } else { AttributeRewriteAction::keep() @@ -311,7 +333,7 @@ impl IntegrationScriptRewriter for GoogleTagManagerIntegration { fn rewrite(&self, content: &str, _ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { // Look for the GTM snippet pattern. // Standard snippet contains: "googletagmanager.com/gtm.js" - if content.contains("googletagmanager.com/gtm.js") { + if content.contains("googletagmanager.com") || content.contains("google-analytics.com") { return ScriptRewriteAction::replace(Self::rewrite_gtm_urls(content)); } @@ -362,6 +384,14 @@ mod tests { assert_eq!(input, result); } + #[test] + fn test_rewrite_rejects_subdomain_spoofing() { + // Should NOT rewrite URLs where the GTM domain is a subdomain of another domain + let input = r#"var x = "https://www.googletagmanager.com.evil.com/collect";"#; + let result = GoogleTagManagerIntegration::rewrite_gtm_urls(input); + assert_eq!(input, result, "should not rewrite spoofed subdomain URLs"); + } + #[test] fn test_attribute_rewriter() { let config = GoogleTagManagerConfig { @@ -408,7 +438,42 @@ mod tests { ); } - // Case 3: Other URL (should be kept) + // Case 3: gtag/js URL in href (preload link) + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "href", + "https://www.googletagmanager.com/gtag/js?id=G-DQMZGMPHXN", + &ctx, + ); + if let AttributeRewriteAction::Replace(val) = action { + assert_eq!( + val, + "/integrations/google_tag_manager/gtag/js?id=G-DQMZGMPHXN" + ); + } else { + panic!( + "Expected Replace 
action for gtag/js preload href, got {:?}", + action + ); + } + + // Case 4: google-analytics.com URL in href + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "href", + "https://www.google-analytics.com/g/collect", + &ctx, + ); + if let AttributeRewriteAction::Replace(val) = action { + assert_eq!(val, "/integrations/google_tag_manager/g/collect"); + } else { + panic!( + "Expected Replace action for google-analytics href, got {:?}", + action + ); + } + + // Case 5: Other URL (should be kept) let action = IntegrationAttributeRewriter::rewrite( &*integration, "src", diff --git a/crates/js/lib/src/integrations/google_tag_manager/index.ts b/crates/js/lib/src/integrations/google_tag_manager/index.ts new file mode 100644 index 00000000..e0e6f0d3 --- /dev/null +++ b/crates/js/lib/src/integrations/google_tag_manager/index.ts @@ -0,0 +1,24 @@ +import { log } from '../../core/log'; + +import { installGtmGuard } from './script_guard'; + +/** + * Google Tag Manager integration for tsjs + * + * Installs a script guard to intercept dynamically inserted GTM and Google + * Analytics scripts and rewrites them to use the first-party proxy endpoint. 
+ * + * The guard intercepts: + * - Script elements with src containing www.googletagmanager.com + * - Script elements with src containing www.google-analytics.com + * - Link preload/prefetch elements for those scripts + * + * URLs are rewritten to preserve the original path: + * - https://www.googletagmanager.com/gtm.js?id=GTM-XXXX -> /integrations/google_tag_manager/gtm.js?id=GTM-XXXX + * - https://www.google-analytics.com/g/collect -> /integrations/google_tag_manager/g/collect + */ + +if (typeof window !== 'undefined') { + installGtmGuard(); + log.info('Google Tag Manager integration initialized'); +} diff --git a/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts new file mode 100644 index 00000000..466aef6d --- /dev/null +++ b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts @@ -0,0 +1,79 @@ +import { createScriptGuard } from '../../shared/script_guard'; + +/** + * Google Tag Manager Script Interception Guard + * + * Intercepts dynamically inserted script tags that load GTM or Google Analytics + * and rewrites their URLs to use the first-party proxy endpoint. This catches + * scripts inserted via appendChild, insertBefore, or any other dynamic DOM + * manipulation (e.g. Next.js dynamic imports). + * + * Built on the shared script_guard factory with custom URL rewriting to preserve + * the original path and query string. + */ + +/** Regex to match www.googletagmanager.com or www.google-analytics.com as domains */ +const GTM_URL_PATTERN = + /^(?:https?:)?(?:\/\/)?www\.(googletagmanager|google-analytics)\.com(?:\/|$)/i; + +/** + * Check if a URL is a GTM or Google Analytics URL. + * Matches the logic from google_tag_manager.rs GTM_URL_PATTERN. 
+ * + * Valid patterns: + * - https://www.googletagmanager.com/gtm.js?id=GTM-XXXX + * - https://www.google-analytics.com/g/collect + * - //www.googletagmanager.com/gtm.js?id=GTM-XXXX + * + * Invalid: + * - https://googletagmanager.com/gtm.js (missing www.) + * - https://example.com/www.googletagmanager.com (domain mismatch) + */ +function isGtmUrl(url: string): boolean { + return !!url && GTM_URL_PATTERN.test(url); +} + +/** + * Extract the path and query string from a GTM/GA URL. + * e.g., "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX" -> "/gtm.js?id=GTM-XXXX" + * "https://www.google-analytics.com/g/collect?v=2" -> "/g/collect?v=2" + */ +function extractGtmPath(url: string): string { + try { + const normalizedUrl = url.startsWith('//') + ? `https:${url}` + : url.startsWith('http') + ? url + : `https://${url}`; + + const parsed = new URL(normalizedUrl); + return parsed.pathname + parsed.search; + } catch { + // Fallback: extract path after the domain + console.debug('[GTM Guard] URL parsing failed, using fallback for:', url); + const match = url.match( + /www\.(?:googletagmanager|google-analytics)\.com(\/[^'"\s]*)/i, + ); + return match?.[1] || '/gtm.js'; + } +} + +/** + * Rewrite a GTM/GA URL to the first-party proxy path. 
+ */ +function rewriteGtmUrl(originalUrl: string): string { + return `${window.location.origin}/integrations/google_tag_manager${extractGtmPath(originalUrl)}`; +} + +const guard = createScriptGuard({ + name: 'GTM', + isTargetUrl: isGtmUrl, + rewriteUrl: rewriteGtmUrl, +}); + +export const installGtmGuard = guard.install; +export const isGuardInstalled = guard.isInstalled; +export const resetGuardState = guard.reset; + +// Export for testing +export { isGtmUrl, extractGtmPath, rewriteGtmUrl }; diff --git a/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts b/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts new file mode 100644 index 00000000..e8af358e --- /dev/null +++ b/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts @@ -0,0 +1,316 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { + installGtmGuard, + isGuardInstalled, + resetGuardState, + isGtmUrl, + extractGtmPath, + rewriteGtmUrl, +} from '../../../src/integrations/google_tag_manager/script_guard'; + +describe('GTM Script Interception Guard', () => { + let originalAppendChild: typeof Element.prototype.appendChild; + let originalInsertBefore: typeof Element.prototype.insertBefore; + + beforeEach(() => { + originalAppendChild = Element.prototype.appendChild; + originalInsertBefore = Element.prototype.insertBefore; + resetGuardState(); + }); + + afterEach(() => { + Element.prototype.appendChild = originalAppendChild; + Element.prototype.insertBefore = originalInsertBefore; + resetGuardState(); + }); + + describe('isGtmUrl', () => { + it('should detect www.googletagmanager.com URLs', () => { + expect(isGtmUrl('https://www.googletagmanager.com/gtm.js?id=GTM-XXXX')).toBe(true); + expect(isGtmUrl('https://www.googletagmanager.com/gtag/js?id=G-XXXX')).toBe(true); + expect(isGtmUrl('//www.googletagmanager.com/gtm.js?id=GTM-XXXX')).toBe(true); + 
expect(isGtmUrl('http://www.googletagmanager.com/gtm.js?id=GTM-XXXX')).toBe(true); + }); + + it('should detect www.google-analytics.com URLs', () => { + expect(isGtmUrl('https://www.google-analytics.com/collect')).toBe(true); + expect(isGtmUrl('https://www.google-analytics.com/g/collect')).toBe(true); + expect(isGtmUrl('//www.google-analytics.com/collect')).toBe(true); + }); + + it('should be case-insensitive', () => { + expect(isGtmUrl('https://WWW.GOOGLETAGMANAGER.COM/gtm.js')).toBe(true); + expect(isGtmUrl('https://WWW.GOOGLE-ANALYTICS.COM/collect')).toBe(true); + }); + + it('should not match without www prefix', () => { + expect(isGtmUrl('https://googletagmanager.com/gtm.js')).toBe(false); + expect(isGtmUrl('https://google-analytics.com/collect')).toBe(false); + }); + + it('should not match non-Google URLs', () => { + expect(isGtmUrl('https://example.com/gtm.js')).toBe(false); + expect(isGtmUrl('https://cdn.example.com/www.googletagmanager.com.js')).toBe(false); + }); + + it('should handle empty and null values', () => { + expect(isGtmUrl('')).toBe(false); + expect(isGtmUrl(null as unknown as string)).toBe(false); + expect(isGtmUrl(undefined as unknown as string)).toBe(false); + }); + }); + + describe('extractGtmPath', () => { + it('should extract path from GTM URLs', () => { + expect(extractGtmPath('https://www.googletagmanager.com/gtm.js')).toBe('/gtm.js'); + expect(extractGtmPath('https://www.googletagmanager.com/gtag/js')).toBe('/gtag/js'); + }); + + it('should extract path from GA URLs', () => { + expect(extractGtmPath('https://www.google-analytics.com/collect')).toBe('/collect'); + expect(extractGtmPath('https://www.google-analytics.com/g/collect')).toBe('/g/collect'); + }); + + it('should extract path from protocol-relative URLs', () => { + expect(extractGtmPath('//www.googletagmanager.com/gtm.js')).toBe('/gtm.js'); + expect(extractGtmPath('//www.google-analytics.com/g/collect')).toBe('/g/collect'); + }); + + it('should preserve query strings', () => { + 
expect(extractGtmPath('https://www.googletagmanager.com/gtm.js?id=GTM-XXXX')).toBe( + '/gtm.js?id=GTM-XXXX' + ); + expect( + extractGtmPath('https://www.google-analytics.com/g/collect?v=2&tid=G-TEST') + ).toBe('/g/collect?v=2&tid=G-TEST'); + }); + + it('should handle bare domain', () => { + expect(extractGtmPath('https://www.googletagmanager.com')).toBe('/'); + expect(extractGtmPath('https://www.googletagmanager.com/')).toBe('/'); + }); + }); + + describe('rewriteGtmUrl', () => { + it('should rewrite GTM script URL to first-party', () => { + const rewritten = rewriteGtmUrl('https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'); + expect(rewritten).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX'); + expect(rewritten).toContain(window.location.host); + }); + + it('should rewrite GA collect URL to first-party', () => { + const rewritten = rewriteGtmUrl('https://www.google-analytics.com/g/collect?v=2'); + expect(rewritten).toContain('/integrations/google_tag_manager/g/collect?v=2'); + }); + + it('should preserve query strings', () => { + const rewritten = rewriteGtmUrl( + 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX&l=dataLayer' + ); + expect(rewritten).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX&l=dataLayer'); + }); + }); + + describe('installGtmGuard', () => { + it('should install the guard successfully', () => { + expect(isGuardInstalled()).toBe(false); + installGtmGuard(); + expect(isGuardInstalled()).toBe(true); + }); + + it('should not install twice', () => { + installGtmGuard(); + const firstInstall = Element.prototype.appendChild; + installGtmGuard(); + const secondInstall = Element.prototype.appendChild; + expect(firstInstall).toBe(secondInstall); + }); + + it('should patch Element.prototype.appendChild', () => { + installGtmGuard(); + expect(Element.prototype.appendChild).not.toBe(originalAppendChild); + }); + + it('should patch Element.prototype.insertBefore', () => { + installGtmGuard(); + 
expect(Element.prototype.insertBefore).not.toBe(originalInsertBefore); + }); + }); + + describe('appendChild interception', () => { + it('should rewrite GTM script URL', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const script = document.createElement('script'); + script.src = 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + container.appendChild(script); + + expect(script.src).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX'); + expect(script.src).not.toContain('googletagmanager.com'); + }); + + it('should rewrite Google Analytics script URL', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const script = document.createElement('script'); + script.src = 'https://www.google-analytics.com/g/collect'; + + container.appendChild(script); + + expect(script.src).toContain('/integrations/google_tag_manager/g/collect'); + expect(script.src).not.toContain('google-analytics.com'); + }); + + it('should not rewrite non-GTM scripts', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const script = document.createElement('script'); + script.src = 'https://example.com/some-script.js'; + + container.appendChild(script); + + expect(script.src).toBe('https://example.com/some-script.js'); + }); + + it('should not affect non-script elements', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const img = document.createElement('img'); + img.src = 'https://www.googletagmanager.com/image.png'; + + container.appendChild(img); + + expect(img.src).toBe('https://www.googletagmanager.com/image.png'); + }); + + it('should preserve other script attributes', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const script = document.createElement('script'); + script.setAttribute('async', ''); + script.setAttribute('data-nscript', 'afterInteractive'); + script.src = 
'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + container.appendChild(script); + + expect(script.getAttribute('async')).toBe(''); + expect(script.getAttribute('data-nscript')).toBe('afterInteractive'); + }); + }); + + describe('insertBefore interception', () => { + it('should rewrite GTM script URL', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const reference = document.createElement('div'); + container.appendChild(reference); + + const script = document.createElement('script'); + script.src = 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + container.insertBefore(script, reference); + + expect(script.src).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX'); + expect(script.src).not.toContain('googletagmanager.com'); + }); + + it('should work with null reference node', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const script = document.createElement('script'); + script.src = 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + container.insertBefore(script, null); + + expect(script.src).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX'); + }); + }); + + describe('link preload interception', () => { + it('should rewrite GTM preload link', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const link = document.createElement('link'); + link.setAttribute('rel', 'preload'); + link.setAttribute('as', 'script'); + link.href = 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + container.appendChild(link); + + expect(link.href).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX'); + expect(link.href).not.toContain('googletagmanager.com'); + }); + + it('should not rewrite preload links without as="script"', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const link = document.createElement('link'); + link.setAttribute('rel', 'preload'); + 
link.setAttribute('as', 'style'); + link.href = 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + container.appendChild(link); + + expect(link.href).toBe('https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'); + }); + + it('should not rewrite non-GTM preload links', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const link = document.createElement('link'); + link.setAttribute('rel', 'preload'); + link.setAttribute('as', 'script'); + link.href = 'https://example.com/other-script.js'; + + container.appendChild(link); + + expect(link.href).toBe('https://example.com/other-script.js'); + }); + }); + + describe('integration scenarios', () => { + it('should handle the standard GTM snippet pattern (dynamic script creation)', () => { + installGtmGuard(); + + // Simulates the standard GTM snippet: j.src='https://www.googletagmanager.com/gtm.js?id='+i + const container = document.createElement('div'); + const script = document.createElement('script'); + script.async = true; + script.src = 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + container.appendChild(script); + + expect(script.src).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX'); + expect(script.async).toBe(true); + }); + + it('should handle multiple script insertions', () => { + installGtmGuard(); + + const container = document.createElement('div'); + + const script1 = document.createElement('script'); + script1.src = 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX'; + + const script2 = document.createElement('script'); + script2.src = 'https://example.com/other.js'; + + container.appendChild(script1); + container.appendChild(script2); + + expect(script1.src).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX'); + expect(script2.src).toBe('https://example.com/other.js'); + }); + }); +}); diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index ddba8a1d..d69718be 100644 --- 
a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -106,6 +106,22 @@ GET /integrations/google_tag_manager/gtm.js?id=GTM-XXXXXX - Rewrites internal URLs to use the first-party proxy - Sets `Accept-Encoding: identity` during fetch to ensure rewriteable text response +### `GET .../gtag/js` - GA4 Tag Proxy + +Proxies the Google Analytics 4 tag library (gtag.js). + +**Request**: + +``` +GET /integrations/google_tag_manager/gtag/js?id=G-XXXXXXXX +``` + +**Behavior**: + +- Proxies to `https://www.googletagmanager.com/gtag/js` +- Rewrites internal URLs to use the first-party proxy +- Sets `Accept-Encoding: identity` during fetch to ensure rewriteable text response + ### `GET/POST .../collect` - Analytics Beacon Proxies analytics events (GA4/UA). From 61a82602c78f771be14cdf0a8316cd8a3df22bc0 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Fri, 20 Feb 2026 19:45:37 +0530 Subject: [PATCH 13/18] fix: ts lint --- .../integrations/google_tag_manager/script_guard.ts | 4 +--- .../google_tag_manager/script_guard.test.ts | 10 ++++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts index 466aef6d..24cdf4f4 100644 --- a/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts +++ b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts @@ -51,9 +51,7 @@ function extractGtmPath(url: string): string { } catch { // Fallback: extract path after the domain console.debug('[GTM Guard] URL parsing failed, using fallback for:', url); - const match = url.match( - /www\.(?:googletagmanager|google-analytics)\.com(\/[^'"\s]*)/i, - ); + const match = url.match(/www\.(?:googletagmanager|google-analytics)\.com(\/[^'"\s]*)/i); return match?.[1] || '/gtm.js'; } } diff --git a/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts 
b/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts index e8af358e..fae43346 100644 --- a/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts +++ b/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts @@ -80,9 +80,9 @@ describe('GTM Script Interception Guard', () => { expect(extractGtmPath('https://www.googletagmanager.com/gtm.js?id=GTM-XXXX')).toBe( '/gtm.js?id=GTM-XXXX' ); - expect( - extractGtmPath('https://www.google-analytics.com/g/collect?v=2&tid=G-TEST') - ).toBe('/g/collect?v=2&tid=G-TEST'); + expect(extractGtmPath('https://www.google-analytics.com/g/collect?v=2&tid=G-TEST')).toBe( + '/g/collect?v=2&tid=G-TEST' + ); }); it('should handle bare domain', () => { @@ -107,7 +107,9 @@ describe('GTM Script Interception Guard', () => { const rewritten = rewriteGtmUrl( 'https://www.googletagmanager.com/gtm.js?id=GTM-XXXX&l=dataLayer' ); - expect(rewritten).toContain('/integrations/google_tag_manager/gtm.js?id=GTM-XXXX&l=dataLayer'); + expect(rewritten).toContain( + '/integrations/google_tag_manager/gtm.js?id=GTM-XXXX&l=dataLayer' + ); }); }); From 1eda0aff04aff7909c161632154bdf6980f58f2e Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Fri, 20 Feb 2026 21:12:04 +0530 Subject: [PATCH 14/18] Add beacon guard to proxy GA4 sendBeacon/fetch through first-party gtag.js constructs beacon URLs dynamically from bare domain strings, so rewriting them at the script level produces broken URLs. Instead, add a shared beacon_guard that patches navigator.sendBeacon and window.fetch at runtime to intercept requests to google-analytics.com and analytics.google.com, rewriting them to the first-party proxy. 
- Add shared beacon_guard.ts factory (sendBeacon + fetch interception) - Wire GTM integration to install beacon guard on init - Require // prefix in Rust GTM_URL_PATTERN to prevent bare domain rewrites - Add tests for both shared factory and GTM-specific beacon interception --- .../src/integrations/google_tag_manager.rs | 59 ++++++- .../integrations/google_tag_manager/index.ts | 19 +- .../google_tag_manager/script_guard.ts | 20 ++- crates/js/lib/src/shared/beacon_guard.ts | 123 +++++++++++++ .../google_tag_manager/script_guard.test.ts | 84 +++++++++ .../js/lib/test/shared/beacon_guard.test.ts | 167 ++++++++++++++++++ 6 files changed, 457 insertions(+), 15 deletions(-) create mode 100644 crates/js/lib/src/shared/beacon_guard.ts create mode 100644 crates/js/lib/test/shared/beacon_guard.test.ts diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index d19323e5..b1946042 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -37,19 +37,30 @@ const DEFAULT_UPSTREAM: &str = "https://www.googletagmanager.com"; /// Regex pattern for matching and rewriting GTM and Google Analytics URLs. /// -/// Handles all URL variants: +/// Handles full and protocol-relative URL variants: /// - `https://www.googletagmanager.com/gtm.js?id=...` /// - `//www.googletagmanager.com/gtm.js?id=...` /// - `https://www.google-analytics.com/collect` /// - `//www.google-analytics.com/g/collect` -/// - `"www.googletagmanager.com"` (bare domain in GTM JSON config data) +/// +/// **Requires `//` prefix** — bare domain strings like `"www.googletagmanager.com"` +/// are intentionally NOT matched. gtag.js stores domains as bare strings and +/// constructs URLs dynamically (`"https://" + domain + "/path"`). Rewriting +/// the bare domain produces broken URLs like +/// `https://integrations/google_tag_manager/path` because the script still +/// prepends `"https://"`. 
+/// +/// **Does NOT include `analytics.google.com`** — same dynamic URL construction +/// issue. Full URLs containing `analytics.google.com` are handled by +/// [`is_rewritable_url`] for HTML attribute rewriting where we see the +/// complete URL. /// /// Captures a trailing delimiter (`/` or `"`) in group 2 to prevent false matches /// on subdomains (e.g., `www.googletagmanager.com.evil.com`). /// /// The replacement target is `/integrations/google_tag_manager` + the captured delimiter. static GTM_URL_PATTERN: Lazy = Lazy::new(|| { - Regex::new(r#"(?:https?:)?(?://)?www\.(googletagmanager|google-analytics)\.com([/"])"#) + Regex::new(r#"(?:https?:)?//www\.(googletagmanager|google-analytics)\.com([/"])"#) .expect("GTM URL regex should compile") }); @@ -115,7 +126,7 @@ impl GoogleTagManagerIntegration { /// Rewrite GTM and Google Analytics URLs to first-party proxy paths. /// /// Uses [`GTM_URL_PATTERN`] to handle all URL variants (https, protocol-relative) - /// for both `googletagmanager.com` and `google-analytics.com`. + /// for `googletagmanager.com` and `google-analytics.com`. fn rewrite_gtm_urls(content: &str) -> String { let replacement = format!("/integrations/{}$2", GTM_INTEGRATION_ID); GTM_URL_PATTERN @@ -132,8 +143,8 @@ impl GoogleTagManagerIntegration { if url.contains("googletagmanager.com") { return url.contains("/gtm.js") || url.contains("/gtag/js") || url.contains("/gtag.js"); } - // Match google-analytics.com URLs for beacons we proxy - if url.contains("google-analytics.com") { + // Match google-analytics.com and analytics.google.com URLs for beacons we proxy + if url.contains("google-analytics.com") || url.contains("analytics.google.com") { return url.contains("/collect") || url.contains("/g/collect"); } false @@ -333,6 +344,9 @@ impl IntegrationScriptRewriter for GoogleTagManagerIntegration { fn rewrite(&self, content: &str, _ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { // Look for the GTM snippet pattern. 
// Standard snippet contains: "googletagmanager.com/gtm.js" + // Note: analytics.google.com is intentionally excluded — gtag.js stores + // that domain as a bare string and constructs URLs dynamically, so + // rewriting it in scripts produces broken URLs. if content.contains("googletagmanager.com") || content.contains("google-analytics.com") { return ScriptRewriteAction::replace(Self::rewrite_gtm_urls(content)); } @@ -377,6 +391,20 @@ mod tests { assert!(!result.contains("www.google-analytics.com")); } + #[test] + fn test_rewrite_does_not_touch_analytics_google_com() { + // analytics.google.com must NOT be rewritten in scripts — gtag.js stores + // the bare domain string and constructs URLs dynamically with + // "https://" + domain + "/g/collect", so rewriting the domain produces + // the broken URL https://integrations/google_tag_manager/g/collect. + let input = r#"var f = "https://analytics.google.com/g/collect";"#; + let result = GoogleTagManagerIntegration::rewrite_gtm_urls(input); + assert_eq!( + input, result, + "analytics.google.com should not be rewritten by regex" + ); + } + #[test] fn test_rewrite_preserves_non_gtm_urls() { let input = r#"var x = "https://example.com/script.js";"#; @@ -392,6 +420,25 @@ mod tests { assert_eq!(input, result, "should not rewrite spoofed subdomain URLs"); } + #[test] + fn test_rewrite_does_not_touch_bare_domain_strings() { + // Bare domain strings (without // prefix) must NOT be rewritten. 
+ // gtag.js stores domains as bare strings and constructs URLs dynamically: + // "https://" + domain + "/g/collect" + // Rewriting the bare domain produces broken URLs like: + // https://integrations/google_tag_manager/g/collect + let input = r#"var d = "www.googletagmanager.com";"#; + let result = GoogleTagManagerIntegration::rewrite_gtm_urls(input); + assert_eq!(input, result, "bare domain strings should not be rewritten"); + + let input2 = r#"var d = "www.google-analytics.com";"#; + let result2 = GoogleTagManagerIntegration::rewrite_gtm_urls(input2); + assert_eq!( + input2, result2, + "bare google-analytics domain should not be rewritten" + ); + } + #[test] fn test_attribute_rewriter() { let config = GoogleTagManagerConfig { diff --git a/crates/js/lib/src/integrations/google_tag_manager/index.ts b/crates/js/lib/src/integrations/google_tag_manager/index.ts index e0e6f0d3..ca73f248 100644 --- a/crates/js/lib/src/integrations/google_tag_manager/index.ts +++ b/crates/js/lib/src/integrations/google_tag_manager/index.ts @@ -1,24 +1,31 @@ import { log } from '../../core/log'; +import { installGtmBeaconGuard } from './script_guard'; import { installGtmGuard } from './script_guard'; /** * Google Tag Manager integration for tsjs * - * Installs a script guard to intercept dynamically inserted GTM and Google - * Analytics scripts and rewrites them to use the first-party proxy endpoint. + * Installs guards to intercept GTM and Google Analytics traffic: * - * The guard intercepts: - * - Script elements with src containing www.googletagmanager.com - * - Script elements with src containing www.google-analytics.com - * - Link preload/prefetch elements for those scripts + * 1. 
**Script guard** — intercepts dynamically inserted ` + "#; let mut settings = make_settings(); @@ -859,7 +875,7 @@ container_id = "GTM-DEFAULT" "google_tag_manager", &serde_json::json!({ "enabled": true, - "container_id": "GTM-TEST", + "container_id": "GTM-TEST1234", "upstream_url": "https://www.googletagmanager.com" }), ) @@ -882,7 +898,7 @@ container_id = "GTM-DEFAULT" let processed = String::from_utf8_lossy(&output); // Verify rewrite happened - assert!(processed.contains("/integrations/google_tag_manager/gtm.js?id=GTM-TEST")); + assert!(processed.contains("/integrations/google_tag_manager/gtm.js?id=GTM-TEST1234")); assert!(!processed.contains("https://www.googletagmanager.com/gtm.js")); } @@ -1014,6 +1030,84 @@ container_id = "GTM-DEFAULT" ); } + #[test] + fn test_container_id_validation_accepts_valid_ids() { + // Valid container IDs with different lengths + let valid_ids = vec![ + "GTM-ABCD", // Minimum length (4 chars) + "GTM-TEST1234", // 8 chars + "GTM-ABC123XYZ", // 10 chars + "GTM-12345678901234567890", // Maximum length (20 chars) + "GTM-MIXEDCASE123", // Mixed alphanumeric + ]; + + for container_id in valid_ids { + let config = GoogleTagManagerConfig { + enabled: true, + container_id: container_id.to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + }; + + assert!( + config.validate().is_ok(), + "Container ID '{}' should be valid", + container_id + ); + } + } + + #[test] + fn test_container_id_validation_rejects_invalid_ids() { + // Invalid container IDs + let invalid_ids = vec![ + ("GTM-ABC", "too short (3 chars)"), + ("GTM-123456789012345678901", "too long (21 chars)"), + ("INVALID", "missing GTM- prefix"), + ("GTM-", "empty after prefix"), + ("gtm-ABCD", "lowercase prefix"), + ("GTM-abc123", "lowercase chars"), + ("GTM-AB@CD", "special characters"), + ("GTM-AB CD", "spaces"), + ("", "empty string"), + ]; + + for (container_id, reason) in invalid_ids { + let config = GoogleTagManagerConfig { + enabled: true, + 
container_id: container_id.to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + }; + + assert!( + config.validate().is_err(), + "Container ID '{}' should be invalid ({})", + container_id, + reason + ); + } + } + + #[test] + fn test_container_id_validation_max_length() { + // Test that max length constraint is enforced + let too_long = "GTM-".to_string() + &"X".repeat(50); // 54 chars total + + let config = GoogleTagManagerConfig { + enabled: true, + container_id: too_long.clone(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + }; + + assert!( + config.validate().is_err(), + "Container ID with {} chars should be rejected (max 50)", + too_long.len() + ); + } + #[test] fn test_error_helper() { let err = GoogleTagManagerIntegration::error("test failure"); diff --git a/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts index 0657c2ae..b9ff327a 100644 --- a/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts +++ b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts @@ -50,13 +50,17 @@ function extractGtmPath(url: string): string { const parsed = new URL(normalizedUrl); return parsed.pathname + parsed.search; - } catch { - // Fallback: extract path after the domain - console.debug('[GTM Guard] URL parsing failed, using fallback for:', url); + } catch (error) { + // Fallback: extract path after the domain using regex + console.warn('[GTM Guard] URL parsing failed for:', url, 'Error:', error); const match = url.match( /(?:www\.(?:googletagmanager|google-analytics)\.com|analytics\.google\.com)(\/[^'"\s]*)/i ); - return match?.[1] || '/gtm.js'; + if (!match || !match[1]) { + console.warn('[GTM Guard] Fallback regex failed, using default path /gtm.js'); + return '/gtm.js'; + } + return match[1]; } } diff --git a/crates/js/lib/src/shared/beacon_guard.ts b/crates/js/lib/src/shared/beacon_guard.ts 
index 5942209f..94618dc3 100644 --- a/crates/js/lib/src/shared/beacon_guard.ts +++ b/crates/js/lib/src/shared/beacon_guard.ts @@ -55,6 +55,8 @@ function extractUrl(input: RequestInfo | URL): string | null { */ export function createBeaconGuard(config: BeaconGuardConfig): BeaconGuard { let installed = false; + let originalSendBeacon: typeof navigator.sendBeacon | null = null; + let originalFetch: typeof window.fetch | null = null; const prefix = `${config.name} beacon guard`; function install(): void { @@ -72,21 +74,21 @@ export function createBeaconGuard(config: BeaconGuardConfig): BeaconGuard { // --- Patch navigator.sendBeacon --- if (typeof navigator !== 'undefined' && typeof navigator.sendBeacon === 'function') { - const originalSendBeacon = navigator.sendBeacon.bind(navigator); + originalSendBeacon = navigator.sendBeacon.bind(navigator); navigator.sendBeacon = function (url: string, data?: BodyInit | null): boolean { if (config.isTargetUrl(url)) { const rewritten = config.rewriteUrl(url); log.info(`${prefix}: rewriting sendBeacon`, { original: url, rewritten }); - return originalSendBeacon(rewritten, data); + return originalSendBeacon!(rewritten, data); } - return originalSendBeacon(url, data); + return originalSendBeacon!(url, data); }; } // --- Patch window.fetch --- if (typeof window.fetch === 'function') { - const originalFetch = window.fetch.bind(window); + originalFetch = window.fetch.bind(window); window.fetch = function (input: RequestInfo | URL, init?: RequestInit): Promise { const url = extractUrl(input); @@ -98,12 +100,12 @@ export function createBeaconGuard(config: BeaconGuardConfig): BeaconGuard { // If the input was a Request, create a new one with the rewritten URL if (input instanceof Request) { const newRequest = new Request(rewritten, input); - return originalFetch(newRequest, init); + return originalFetch!(newRequest, init); } - return originalFetch(rewritten, init); + return originalFetch!(rewritten, init); } - return originalFetch(input, 
init); + return originalFetch!(input, init); }; } @@ -116,7 +118,16 @@ export function createBeaconGuard(config: BeaconGuardConfig): BeaconGuard { } function reset(): void { + if (originalSendBeacon && typeof navigator !== 'undefined') { + navigator.sendBeacon = originalSendBeacon; + originalSendBeacon = null; + } + if (originalFetch && typeof window !== 'undefined') { + window.fetch = originalFetch; + originalFetch = null; + } installed = false; + log.debug(`${prefix}: reset and uninstalled`); } return { install, isInstalled, reset }; diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index d69718be..22ce91ed 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -19,6 +19,16 @@ The Tag Gateway intercepts requests for GTM scripts (`gtm.js`) and Google Analyt - **Performance**: Utilize edge caching for scripts. - **Privacy Enhancement**: Does not forward client IP to Google (Google sees edge server IP, not user IP). +**Privacy vs. Analytics Tradeoff**: + +Client IP addresses are intentionally **not forwarded** to Google Analytics. This means: + +- ✅ **Privacy**: User IP addresses remain private and are not sent to Google +- ⚠️ **Analytics**: Geographic targeting and user location data will be based on the edge server's IP, not the actual user's location +- ⚠️ **Accuracy**: Analytics reports may show less accurate geographic distribution + +This is a deliberate privacy-first design choice. If your use case requires accurate geographic data, you may need to consider alternative approaches. 
+ ## Configuration Add the GTM configuration to `trusted-server.toml`: @@ -146,6 +156,24 @@ The integration requires the upstream `gtm.js` to be uncompressed to perform str _Note: Trusted Server will re-compress the response (gzip/brotli) before sending it to the user if the `compression` feature is enabled._ +### Cache Behavior + +- **Script Caching**: `gtm.js` and `gtag/js` are cached with `Cache-Control: public, max-age=900` (15 minutes) by default +- **Cache Duration**: Configurable via `cache_max_age` setting (60-86400 seconds) +- **Edge Caching**: Fastly edge nodes will cache scripts according to the Cache-Control headers + +**Stale Cache Handling**: + +If the Google upstream is unreachable when a cached script expires: +- The edge will attempt to fetch a fresh copy from Google +- If the fetch fails, the request will fail (no stale content is served) +- Consider implementing `stale-while-revalidate` at the CDN level if you need fallback behavior + +For production deployments, monitor upstream availability and consider: +- Setting up health checks for Google's endpoints +- Configuring appropriate cache TTLs based on your update frequency needs +- Implementing retry logic at the edge platform level + ### Direct Proxying Beacon requests (`/collect`) are proxied directly using streaming, minimizing latency overhead. 
From 4a7ed3759924f3315ffbb240f1e1b5ee027fda33 Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Tue, 24 Feb 2026 12:41:05 +0530 Subject: [PATCH 17/18] Fix lint format issue --- crates/common/src/integrations/google_tag_manager.rs | 8 ++++---- docs/guide/integrations/google_tag_manager.md | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 7c77139b..72f10d1d 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -1034,11 +1034,11 @@ container_id = "GTM-DEFAULT" fn test_container_id_validation_accepts_valid_ids() { // Valid container IDs with different lengths let valid_ids = vec![ - "GTM-ABCD", // Minimum length (4 chars) - "GTM-TEST1234", // 8 chars - "GTM-ABC123XYZ", // 10 chars + "GTM-ABCD", // Minimum length (4 chars) + "GTM-TEST1234", // 8 chars + "GTM-ABC123XYZ", // 10 chars "GTM-12345678901234567890", // Maximum length (20 chars) - "GTM-MIXEDCASE123", // Mixed alphanumeric + "GTM-MIXEDCASE123", // Mixed alphanumeric ]; for container_id in valid_ids { diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index 22ce91ed..b72b70db 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -165,11 +165,13 @@ _Note: Trusted Server will re-compress the response (gzip/brotli) before sending **Stale Cache Handling**: If the Google upstream is unreachable when a cached script expires: + - The edge will attempt to fetch a fresh copy from Google - If the fetch fails, the request will fail (no stale content is served) - Consider implementing `stale-while-revalidate` at the CDN level if you need fallback behavior For production deployments, monitor upstream availability and consider: + - Setting up health checks for Google's endpoints - Configuring appropriate cache TTLs 
based on your update frequency needs - Implementing retry logic at the edge platform level From 457702c60b043887bbd60aaef61e9afbd462eb9f Mon Sep 17 00:00:00 2001 From: prk-Jr Date: Wed, 25 Feb 2026 15:39:33 +0530 Subject: [PATCH 18/18] Fix memory and security vulnerabilities in GTM integration This commit addresses three security/reliability issues in the Google Tag Manager integration identified during code review: 1. Prevent unbounded memory allocation in POST body handling - Replace take_body_bytes with chunked reading using 8KB chunks - Reject oversized payloads early without buffering entire body - Maximum memory usage now capped at max_beacon_body_size + 8KB - Protects against memory exhaustion attacks via malicious POSTs with missing or incorrect Content-Length headers 2. Fix URL validation to prevent false positives - Replace substring matching with proper URL parsing in is_rewritable_url - Validate host and path separately using allowlist - Reject URLs with GTM domains in query parameters or fragments - Add comprehensive test coverage for false positive scenarios 3. Update documentation to reflect actual security guarantees - Document chunked body reading implementation - Clarify memory protection limits - Describe early rejection behavior for oversized payloads Test coverage: Added test_attribute_rewriter_rejects_false_positives All 30 integration tests passing with no clippy warnings. 
Fixes: Memory exhaustion vector, URL validation bypass --- .../src/integrations/google_tag_manager.rs | 559 +++++++++++++++++- .../google_tag_manager/script_guard.ts | 34 +- .../google_tag_manager/script_guard.test.ts | 40 +- docs/guide/integrations/google_tag_manager.md | 33 +- 4 files changed, 625 insertions(+), 41 deletions(-) diff --git a/crates/common/src/integrations/google_tag_manager.rs b/crates/common/src/integrations/google_tag_manager.rs index 72f10d1d..7bee4d49 100644 --- a/crates/common/src/integrations/google_tag_manager.rs +++ b/crates/common/src/integrations/google_tag_manager.rs @@ -35,6 +35,12 @@ use crate::settings::{IntegrationConfig, Settings}; const GTM_INTEGRATION_ID: &str = "google_tag_manager"; const DEFAULT_UPSTREAM: &str = "https://www.googletagmanager.com"; +/// Error type for payload size validation +#[derive(Debug)] +enum PayloadSizeError { + TooLarge { actual: usize, max: usize }, +} + /// Regex pattern for validating GTM container IDs. /// Format: GTM-XXXXXX where X is alphanumeric. static GTM_CONTAINER_ID_PATTERN: Lazy = Lazy::new(|| { @@ -48,6 +54,8 @@ static GTM_CONTAINER_ID_PATTERN: Lazy = Lazy::new(|| { /// - `//www.googletagmanager.com/gtm.js?id=...` /// - `https://www.google-analytics.com/collect` /// - `//www.google-analytics.com/g/collect` +/// - `https://analytics.google.com/g/collect` +/// - `//analytics.google.com/g/collect` /// /// **Requires `//` prefix** — bare domain strings like `"www.googletagmanager.com"` /// are intentionally NOT matched. gtag.js stores domains as bare strings and @@ -56,17 +64,16 @@ static GTM_CONTAINER_ID_PATTERN: Lazy = Lazy::new(|| { /// `https://integrations/google_tag_manager/path` because the script still /// prepends `"https://"`. /// -/// **Does NOT include `analytics.google.com`** — same dynamic URL construction -/// issue. Full URLs containing `analytics.google.com` are handled by -/// [`is_rewritable_url`] for HTML attribute rewriting where we see the -/// complete URL. 
+/// **Full URL matching for `analytics.google.com`** — Only full URLs with `//` prefix +/// are matched and rewritten (e.g., `https://analytics.google.com/g/collect`). +/// Bare domain strings are not matched due to the same dynamic URL construction issue. /// -/// Captures a trailing delimiter (`/` or `"`) in group 2 to prevent false matches +/// Captures a trailing delimiter (`/` or `"`) in the last group to prevent false matches /// on subdomains (e.g., `www.googletagmanager.com.evil.com`). /// /// The replacement target is `/integrations/google_tag_manager` + the captured delimiter. static GTM_URL_PATTERN: Lazy = Lazy::new(|| { - Regex::new(r#"(?:https?:)?//www\.(googletagmanager|google-analytics)\.com([/"])"#) + Regex::new(r#"(?:https?:)?//(?:www\.(googletagmanager|google-analytics)\.com|analytics\.google\.com)([/"])"#) .expect("GTM URL regex should compile") }); @@ -85,6 +92,11 @@ pub struct GoogleTagManagerConfig { #[serde(default = "default_cache_max_age")] #[validate(range(min = 60, max = 86400))] pub cache_max_age: u32, + /// Maximum allowed size for POST beacon bodies in bytes (default: 65536 / 64KB). + /// Prevents memory pressure from oversized payloads on public /collect endpoints. + #[serde(default = "default_max_beacon_body_size")] + #[validate(range(min = 1024, max = 1048576))] + pub max_beacon_body_size: usize, } impl IntegrationConfig for GoogleTagManagerConfig { @@ -105,6 +117,10 @@ fn default_cache_max_age() -> u32 { 900 // Match Google's default } +fn default_max_beacon_body_size() -> usize { + 65536 // 64KB - prevents memory pressure from oversized payloads +} + fn validate_container_id(container_id: &str) -> Result<(), validator::ValidationError> { if GTM_CONTAINER_ID_PATTERN.is_match(container_id) { Ok(()) @@ -150,16 +166,55 @@ impl GoogleTagManagerIntegration { /// Only matches URLs for which we have corresponding proxy routes /// (gtm.js, gtag/js, collect, g/collect). Excludes ns.html and other /// GTM endpoints we don't proxy. 
+ /// + /// Uses proper URL parsing to prevent false positives from substring matching. + /// For example, rejects URLs like `https://evil.com/?u=https://www.google-analytics.com/collect` fn is_rewritable_url(url: &str) -> bool { - // Match googletagmanager.com URLs for scripts we proxy - if url.contains("googletagmanager.com") { - return url.contains("/gtm.js") || url.contains("/gtag/js") || url.contains("/gtag.js"); - } - // Match google-analytics.com and analytics.google.com URLs for beacons we proxy - if url.contains("google-analytics.com") || url.contains("analytics.google.com") { - return url.contains("/collect") || url.contains("/g/collect"); + // List of supported paths we proxy (must match route handlers) + const SUPPORTED_PATHS: &[&str] = + &["/gtm.js", "/gtag/js", "/gtag.js", "/collect", "/g/collect"]; + + // Parse URL to extract host and path + // Support both absolute URLs (https://...) and protocol-relative URLs (//...) + let url_to_parse = if url.starts_with("//") { + format!("https:{}", url) + } else if url.starts_with("http://") || url.starts_with("https://") { + url.to_string() + } else { + // Relative URLs or other formats - not rewritable via this integration + return false; + }; + + // Extract host and path from URL + // Format: https://host/path?query + let without_protocol = url_to_parse + .trim_start_matches("https://") + .trim_start_matches("http://"); + + let (host, path_with_query) = match without_protocol.split_once('/') { + Some((h, p)) => (h, format!("/{}", p)), + None => return false, // No path component + }; + + // Extract path without query string or fragment + let path = path_with_query + .split('?') + .next() + .and_then(|p| p.split('#').next()) + .unwrap_or(""); + + // Validate host is exactly one of our supported GTM/GA domains + let valid_host = matches!( + host, + "www.googletagmanager.com" | "www.google-analytics.com" | "analytics.google.com" + ); + + if !valid_host { + return false; } - false + + // Validate path is in our 
allowlist + SUPPORTED_PATHS.contains(&path) } /// Both `/gtag/js` (canonical) and `/gtag.js` (alternate) are accepted; @@ -197,14 +252,48 @@ impl GoogleTagManagerIntegration { path: &str, req: &mut Request, target_url: &'a str, - ) -> ProxyRequestConfig<'a> { + ) -> Result, PayloadSizeError> { let mut proxy_config = ProxyRequestConfig::new(target_url); proxy_config.forward_synthetic_id = false; // If it's a POST request (e.g. /collect beacon), we must manually attach the body // because ProxyRequestConfig doesn't automatically copy it from the source request. if req.get_method() == Method::POST { - let body_bytes = req.take_body_bytes(); + // Read body with size cap to prevent unbounded memory allocation. + // Read in chunks and reject early if body exceeds max_beacon_body_size. + let mut body = req.take_body(); + let mut body_bytes = Vec::new(); + let max_size = self.config.max_beacon_body_size; + const CHUNK_SIZE: usize = 8192; // 8KB chunks + + for chunk_result in body.read_chunks(CHUNK_SIZE) { + let chunk = chunk_result.map_err(|e| { + log::error!("Error reading request body: {}", e); + // Convert I/O error to size error for uniform handling + PayloadSizeError::TooLarge { + actual: 0, + max: max_size, + } + })?; + + // Check if adding this chunk would exceed the limit + // This prevents buffering oversized bodies into memory + if body_bytes.len() + chunk.len() > max_size { + let total_size = body_bytes.len() + chunk.len(); + log::warn!( + "POST body size {} exceeds max {} (rejected during chunked read)", + total_size, + max_size + ); + return Err(PayloadSizeError::TooLarge { + actual: total_size, + max: max_size, + }); + } + + body_bytes.extend_from_slice(&chunk); + } + proxy_config.body = Some(body_bytes); } @@ -222,7 +311,7 @@ impl GoogleTagManagerIntegration { ); } - proxy_config + Ok(proxy_config) } } @@ -282,13 +371,54 @@ impl IntegrationProxy for GoogleTagManagerIntegration { let method = req.get_method(); log::debug!("Handling GTM request: {} {}", 
method, path); + // Validate body size for POST requests to prevent memory pressure + // Check Content-Length header if present for early rejection + if method == Method::POST { + if let Some(content_length_str) = + req.get_header_str(fastly::http::header::CONTENT_LENGTH) + { + match content_length_str.parse::() { + Ok(content_length) => { + // Early rejection based on Content-Length + if content_length > self.config.max_beacon_body_size { + log::warn!( + "Rejecting POST beacon with Content-Length {} exceeding max {}", + content_length, + self.config.max_beacon_body_size + ); + return Ok(Response::from_status(StatusCode::PAYLOAD_TOO_LARGE)); + } + } + Err(_) => { + // Invalid Content-Length header + log::warn!("POST request with malformed Content-Length header"); + return Ok(Response::from_status(StatusCode::BAD_REQUEST)); + } + } + } + // If Content-Length is missing, we'll check actual size after read + // This maintains compatibility with HTTP/2 and intermediaries + } + let Some(target_url) = self.build_target_url(&req, &path) else { return Ok(Response::from_status(StatusCode::NOT_FOUND)); }; log::debug!("Proxying to upstream: {}", target_url); - let proxy_config = self.build_proxy_config(&path, &mut req, &target_url); + // Handle payload size errors explicitly to return 413 instead of 502 + let proxy_config = match self.build_proxy_config(&path, &mut req, &target_url) { + Ok(config) => config, + Err(PayloadSizeError::TooLarge { actual, max }) => { + // This catches cases where Content-Length was incorrect + log::warn!( + "Returning 413: actual body size {} exceeds max {} (Content-Length mismatch)", + actual, + max + ); + return Ok(Response::from_status(StatusCode::PAYLOAD_TOO_LARGE)); + } + }; let mut response = proxy_request(settings, req, proxy_config) .await @@ -390,6 +520,8 @@ mod tests { var c = "https://www.google-analytics.com/collect"; var d = "//www.google-analytics.com/g/collect"; var e = "http://www.googletagmanager.com/gtm.js"; + var f = 
"https://analytics.google.com/g/collect"; + var g = "//analytics.google.com/collect"; "#; let result = GoogleTagManagerIntegration::rewrite_gtm_urls(input); @@ -399,19 +531,25 @@ mod tests { assert!(result.contains("/integrations/google_tag_manager/g/collect")); assert!(!result.contains("www.googletagmanager.com")); assert!(!result.contains("www.google-analytics.com")); + assert!( + !result.contains("analytics.google.com"), + "analytics.google.com should be rewritten" + ); } #[test] - fn test_rewrite_does_not_touch_analytics_google_com() { - // analytics.google.com must NOT be rewritten in scripts — gtag.js stores - // the bare domain string and constructs URLs dynamically with - // "https://" + domain + "/g/collect", so rewriting the domain produces - // the broken URL https://integrations/google_tag_manager/g/collect. + fn test_rewrite_analytics_google_com_full_urls() { + // Full analytics.google.com URLs (with // prefix) SHOULD be rewritten + // for HTML attributes where we see the complete URL. 
let input = r#"var f = "https://analytics.google.com/g/collect";"#; let result = GoogleTagManagerIntegration::rewrite_gtm_urls(input); - assert_eq!( - input, result, - "analytics.google.com should not be rewritten by regex" + assert!( + result.contains("/integrations/google_tag_manager/g/collect"), + "Full analytics.google.com URLs should be rewritten" + ); + assert!( + !result.contains("analytics.google.com"), + "analytics.google.com should be replaced" ); } @@ -456,6 +594,7 @@ mod tests { container_id: "GTM-TEST1234".to_string(), upstream_url: "https://www.googletagmanager.com".to_string(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration = GoogleTagManagerIntegration::new(config); @@ -536,7 +675,23 @@ mod tests { ); } - // Case 5: Other URL (should be kept) + // Case 5: analytics.google.com URL in href + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "href", + "https://analytics.google.com/g/collect?v=2", + &ctx, + ); + if let AttributeRewriteAction::Replace(val) = action { + assert_eq!(val, "/integrations/google_tag_manager/g/collect?v=2"); + } else { + panic!( + "Expected Replace action for analytics.google.com href, got {:?}", + action + ); + } + + // Case 6: Other URL (should be kept) let action = IntegrationAttributeRewriter::rewrite( &*integration, "src", @@ -546,6 +701,87 @@ mod tests { assert!(matches!(action, AttributeRewriteAction::Keep)); } + #[test] + fn test_attribute_rewriter_rejects_false_positives() { + // Test that URLs with GTM domains in query parameters or paths are NOT rewritten + // This verifies the fix for P2: proper URL parsing instead of substring matching + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST1234".to_string(), + upstream_url: "https://www.googletagmanager.com".to_string(), + cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), + }; + let integration = 
GoogleTagManagerIntegration::new(config); + + let ctx = IntegrationAttributeContext { + attribute_name: "href", + request_host: "example.com", + request_scheme: "https", + origin_host: "origin.example.com", + }; + + // Case 1: GTM domain in query parameter - should NOT be rewritten + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "href", + "https://evil.com/?redirect=https://www.google-analytics.com/collect", + &ctx, + ); + assert!( + matches!(action, AttributeRewriteAction::Keep), + "URLs with GTM domains in query params should not be rewritten" + ); + + // Case 2: GTM domain in path component - should NOT be rewritten + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "href", + "https://example.com/www.googletagmanager.com/gtm.js", + &ctx, + ); + assert!( + matches!(action, AttributeRewriteAction::Keep), + "URLs with GTM domains in path should not be rewritten" + ); + + // Case 3: Unsupported path on valid GTM domain - should NOT be rewritten + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "href", + "https://www.googletagmanager.com/ns.html", + &ctx, + ); + assert!( + matches!(action, AttributeRewriteAction::Keep), + "Unsupported paths like ns.html should not be rewritten" + ); + + // Case 4: Fragment with GTM domain - should NOT be rewritten + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "href", + "https://example.com/page#https://www.googletagmanager.com/gtm.js", + &ctx, + ); + assert!( + matches!(action, AttributeRewriteAction::Keep), + "URLs with GTM domains in fragment should not be rewritten" + ); + + // Case 5: Valid GTM URL should STILL be rewritten (sanity check) + let action = IntegrationAttributeRewriter::rewrite( + &*integration, + "src", + "https://www.googletagmanager.com/gtm.js?id=GTM-TEST", + &ctx, + ); + assert!( + matches!(action, AttributeRewriteAction::Replace(_)), + "Valid GTM URLs should still be rewritten" + ); + } + #[test] fn 
test_script_rewriter() { let config = GoogleTagManagerConfig { @@ -553,6 +789,7 @@ mod tests { container_id: "GTM-TEST1234".to_string(), upstream_url: "https://www.googletagmanager.com".to_string(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration = GoogleTagManagerIntegration::new(config); let doc_state = IntegrationDocumentState::default(); @@ -607,6 +844,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= container_id: "GTM-DEFAULT".to_string(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; assert!(!config.enabled); @@ -621,6 +859,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= container_id: "GTM-TEST1234123".to_string(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration_default = GoogleTagManagerIntegration::new(config_default); assert_eq!( @@ -634,6 +873,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= container_id: "GTM-TEST1234123".to_string(), upstream_url: "https://gtm.example.com".to_string(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration_custom = GoogleTagManagerIntegration::new(config_custom); assert_eq!(integration_custom.upstream_url(), "https://gtm.example.com"); @@ -646,6 +886,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= container_id: "GTM-TEST1234".to_string(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration = GoogleTagManagerIntegration::new(config); let routes = integration.routes(); @@ -677,6 +918,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= container_id: "GTM-TEST1234".to_string(), upstream_url: 
default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration = GoogleTagManagerIntegration::new(config); @@ -691,7 +933,9 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= let target_url = integration .build_target_url(&req, &path) .expect("should resolve collect target URL"); - let proxy_config = integration.build_proxy_config(&path, &mut req, &target_url); + let proxy_config = integration + .build_proxy_config(&path, &mut req, &target_url) + .expect("should build proxy config"); assert_eq!( proxy_config.body.as_deref(), @@ -700,6 +944,252 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= ); } + #[test] + fn test_oversized_post_body_rejected() { + let max_size = default_max_beacon_body_size(); + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST1234".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + max_beacon_body_size: max_size, + }; + let integration = GoogleTagManagerIntegration::new(config); + + // Create a payload larger than the configured max size (64KB by default) + let oversized_payload = vec![b'X'; max_size + 1]; + let mut req = Request::new( + Method::POST, + "https://edge.example.com/integrations/google_tag_manager/collect", + ); + req.set_body(oversized_payload.clone()); + + let path = req.get_path().to_string(); + let target_url = integration + .build_target_url(&req, &path) + .expect("should resolve collect target URL"); + + // Attempt to build proxy config should fail due to oversized body + let result = integration.build_proxy_config(&path, &mut req, &target_url); + + assert!(result.is_err(), "Oversized POST body should be rejected"); + + if let Err(PayloadSizeError::TooLarge { actual, max }) = result { + assert_eq!(actual, max_size + 1, "Should report actual size"); + assert_eq!(max, max_size, "Should report max size"); + } else { + panic!("Expected 
PayloadSizeError::TooLarge"); + } + } + + #[test] + fn test_custom_max_beacon_body_size() { + // Test with a custom smaller limit + let custom_max_size = 1024; // 1KB + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST1234".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + max_beacon_body_size: custom_max_size, + }; + let integration = GoogleTagManagerIntegration::new(config); + + // Payload just under the custom limit should succeed + let acceptable_payload = vec![b'X'; custom_max_size - 1]; + let mut req1 = Request::new( + Method::POST, + "https://edge.example.com/integrations/google_tag_manager/collect", + ); + req1.set_body(acceptable_payload.clone()); + + let path = req1.get_path().to_string(); + let target_url = integration + .build_target_url(&req1, &path) + .expect("should resolve collect target URL"); + + let result = integration.build_proxy_config(&path, &mut req1, &target_url); + assert!(result.is_ok(), "Payload under custom limit should succeed"); + + // Payload over the custom limit should fail + let oversized_payload = vec![b'X'; custom_max_size + 1]; + let mut req2 = Request::new( + Method::POST, + "https://edge.example.com/integrations/google_tag_manager/collect", + ); + req2.set_body(oversized_payload); + + let target_url2 = integration + .build_target_url(&req2, &path) + .expect("should resolve collect target URL"); + + let result2 = integration.build_proxy_config(&path, &mut req2, &target_url2); + assert!( + result2.is_err(), + "Payload over custom limit should be rejected" + ); + } + + #[test] + fn test_incorrect_content_length_returns_413() { + // Verify that when Content-Length is incorrect (smaller than actual body), + // we still catch it and return 413 (not 502) + let max_size = default_max_beacon_body_size(); + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST1234".to_string(), + upstream_url: default_upstream(), + cache_max_age: 
default_cache_max_age(), + max_beacon_body_size: max_size, + }; + let integration = GoogleTagManagerIntegration::new(config); + + // Create oversized payload but with incorrect (small) Content-Length + let oversized_payload = vec![b'X'; max_size + 1]; + let mut req = Request::new( + Method::POST, + "https://edge.example.com/integrations/google_tag_manager/collect", + ); + req.set_body(oversized_payload.clone()); + // Set Content-Length to a small value (incorrect) + req.set_header( + fastly::http::header::CONTENT_LENGTH, + (max_size / 2).to_string(), + ); + + let path = req.get_path().to_string(); + let target_url = integration + .build_target_url(&req, &path) + .expect("should resolve collect target URL"); + + // build_proxy_config should detect the mismatch and return PayloadSizeError + let result = integration.build_proxy_config(&path, &mut req, &target_url); + + assert!( + result.is_err(), + "Should reject when actual body exceeds max despite low Content-Length" + ); + + // Verify it's a PayloadSizeError::TooLarge + if let Err(PayloadSizeError::TooLarge { actual, max }) = result { + assert_eq!(actual, oversized_payload.len(), "Should report actual size"); + assert_eq!(max, max_size, "Should report max size"); + } else { + panic!("Expected PayloadSizeError::TooLarge"); + } + } + + #[tokio::test] + async fn test_handle_returns_413_for_oversized_post() { + // Verify that handle() actually returns 413 status code for oversized POST + let max_size = 1024; // Use small size for testing + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST1234".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + max_beacon_body_size: max_size, + }; + let integration = GoogleTagManagerIntegration::new(config); + + // Create oversized payload with correct Content-Length + let oversized_payload = vec![b'X'; max_size + 1]; + let mut req = Request::new( + Method::POST, + 
"https://edge.example.com/integrations/google_tag_manager/collect", + ); + req.set_body(oversized_payload.clone()); + req.set_header( + fastly::http::header::CONTENT_LENGTH, + oversized_payload.len().to_string(), + ); + + let settings = make_settings(); + let response = integration + .handle(&settings, req) + .await + .expect("handle should not return error"); + + // Verify we get 413 Payload Too Large, not 502 Bad Gateway + assert_eq!( + response.get_status(), + StatusCode::PAYLOAD_TOO_LARGE, + "Should return 413 for oversized POST body" + ); + } + + #[tokio::test] + async fn test_handle_returns_400_for_invalid_content_length() { + // Verify that handle() returns 400 Bad Request for malformed Content-Length + let config = GoogleTagManagerConfig { + enabled: true, + container_id: "GTM-TEST1234".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), + }; + let integration = GoogleTagManagerIntegration::new(config); + + // Create POST request with invalid Content-Length header + let payload = b"v=2&tid=G-TEST&cid=123".to_vec(); + let mut req = Request::new( + Method::POST, + "https://edge.example.com/integrations/google_tag_manager/collect", + ); + req.set_body(payload); + req.set_header(fastly::http::header::CONTENT_LENGTH, "not-a-number"); + + let settings = make_settings(); + let response = integration + .handle(&settings, req) + .await + .expect("handle should not return error"); + + // Verify we get 400 Bad Request for malformed Content-Length + assert_eq!( + response.get_status(), + StatusCode::BAD_REQUEST, + "Should return 400 for malformed Content-Length" + ); + } + + #[tokio::test] + async fn test_handle_accepts_post_without_content_length() { + // Verify that POST without Content-Length is accepted (for HTTP/2 compatibility) + // but still checked against max size after read + let max_size = default_max_beacon_body_size(); + let config = GoogleTagManagerConfig { + 
enabled: true, + container_id: "GTM-TEST1234".to_string(), + upstream_url: default_upstream(), + cache_max_age: default_cache_max_age(), + max_beacon_body_size: max_size, + }; + let integration = GoogleTagManagerIntegration::new(config); + + // Create small POST request without Content-Length header + let small_payload = b"v=2&tid=G-TEST&cid=123".to_vec(); + let mut req = Request::new( + Method::POST, + "https://edge.example.com/integrations/google_tag_manager/collect", + ); + req.set_body(small_payload); + // Intentionally NOT setting Content-Length header (HTTP/2 scenario) + + let path = req.get_path().to_string(); + let target_url = integration + .build_target_url(&req, &path) + .expect("should resolve collect target URL"); + + // build_proxy_config should accept small payloads even without Content-Length + let result = integration.build_proxy_config(&path, &mut req, &target_url); + + assert!( + result.is_ok(), + "Should accept small POST without Content-Length (HTTP/2 compat)" + ); + } + #[test] fn test_collect_proxy_config_strips_client_ip_forwarding() { let config = GoogleTagManagerConfig { @@ -707,6 +1197,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= container_id: "GTM-TEST1234".to_string(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration = GoogleTagManagerIntegration::new(config); @@ -720,7 +1211,9 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= let target_url = integration .build_target_url(&req, &path) .expect("should resolve collect target URL"); - let proxy_config = integration.build_proxy_config(&path, &mut req, &target_url); + let proxy_config = integration + .build_proxy_config(&path, &mut req, &target_url) + .expect("should build proxy config"); // We check if X-Forwarded-For is explicitly overridden with an empty string, // which effectively strips it during proxy forwarding due to header 
override logic. @@ -743,6 +1236,7 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= container_id: "GT-123".to_string(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; let integration = GoogleTagManagerIntegration::new(config); @@ -755,7 +1249,9 @@ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= let target_url = integration .build_target_url(&req, &path) .expect("should resolve gtag target URL"); - let proxy_config = integration.build_proxy_config(&path, &mut req, &target_url); + let proxy_config = integration + .build_proxy_config(&path, &mut req, &target_url) + .expect("should build proxy config"); let has_identity = proxy_config.headers.iter().any(|(name, value)| { name == fastly::http::header::ACCEPT_ENCODING && value == "identity" @@ -1047,6 +1543,7 @@ container_id = "GTM-DEFAULT" container_id: container_id.to_string(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; assert!( @@ -1078,6 +1575,7 @@ container_id = "GTM-DEFAULT" container_id: container_id.to_string(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; assert!( @@ -1099,6 +1597,7 @@ container_id = "GTM-DEFAULT" container_id: too_long.clone(), upstream_url: default_upstream(), cache_max_age: default_cache_max_age(), + max_beacon_body_size: default_max_beacon_body_size(), }; assert!( diff --git a/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts index b9ff327a..70b99cea 100644 --- a/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts +++ b/crates/js/lib/src/integrations/google_tag_manager/script_guard.ts @@ -18,7 +18,14 @@ const GTM_URL_PATTERN = 
/^(?:https?:)?(?:\/\/)?(www\.(googletagmanager|google-analytics)\.com|analytics\.google\.com)(?:\/|$)/i; /** - * Check if a URL is a GTM or Google Analytics URL. + * Supported paths that the server can proxy. + * Must match the route patterns defined in the GoogleTagManagerIntegration handler + * in crates/common/src/integrations/google_tag_manager.rs + */ +const SUPPORTED_PATHS = ['/gtm.js', '/gtag/js', '/gtag.js', '/collect', '/g/collect']; + +/** + * Check if a URL is a GTM or Google Analytics URL with a supported path. * Matches the logic from google_tag_manager.rs GTM_URL_PATTERN. * * Valid patterns: @@ -30,9 +37,32 @@ const GTM_URL_PATTERN = * Invalid: * - https://googletagmanager.com/gtm.js (missing www.) * - https://example.com/www.googletagmanager.com (domain mismatch) + * - https://www.googletagmanager.com/ns.html (unsupported path) */ function isGtmUrl(url: string): boolean { - return !!url && GTM_URL_PATTERN.test(url); + if (!url || !GTM_URL_PATTERN.test(url)) { + return false; + } + + // Extract path from URL to validate it's a supported route + try { + const normalizedUrl = url.startsWith('//') + ? `https:${url}` + : url.startsWith('http') + ? 
url + : `https://${url}`; + + const parsed = new URL(normalizedUrl); + const path = parsed.pathname; + + // Check if the path matches any of our supported paths + // Note: pathname never includes query strings, so exact match is sufficient + return SUPPORTED_PATHS.some((supportedPath) => path === supportedPath); + } catch { + // Fail closed: if URL parsing fails, reject the URL rather than + // using a permissive fallback that could match malformed strings + return false; + } } /** diff --git a/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts b/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts index 7b7d70b6..6b1ecdde 100644 --- a/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts +++ b/crates/js/lib/test/integrations/google_tag_manager/script_guard.test.ts @@ -1,4 +1,5 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'; + import { installGtmGuard, isGuardInstalled, @@ -61,6 +62,25 @@ describe('GTM Script Interception Guard', () => { expect(isGtmUrl(null as unknown as string)).toBe(false); expect(isGtmUrl(undefined as unknown as string)).toBe(false); }); + + it('should only match supported paths', () => { + // Supported paths + expect(isGtmUrl('https://www.googletagmanager.com/gtm.js')).toBe(true); + expect(isGtmUrl('https://www.googletagmanager.com/gtag/js')).toBe(true); + expect(isGtmUrl('https://www.googletagmanager.com/gtag.js')).toBe(true); + expect(isGtmUrl('https://www.google-analytics.com/collect')).toBe(true); + expect(isGtmUrl('https://www.google-analytics.com/g/collect')).toBe(true); + + // Unsupported paths should be rejected + expect(isGtmUrl('https://www.googletagmanager.com/ns.html')).toBe(false); + expect(isGtmUrl('https://www.googletagmanager.com/other.js')).toBe(false); + expect(isGtmUrl('https://www.google-analytics.com/analytics.js')).toBe(false); + }); + + it('should match supported paths with query parameters', () => { + 
expect(isGtmUrl('https://www.googletagmanager.com/gtm.js?id=GTM-XXXX')).toBe(true); + expect(isGtmUrl('https://www.google-analytics.com/collect?v=2&tid=G-TEST')).toBe(true); + }); }); describe('extractGtmPath', () => { @@ -207,6 +227,20 @@ describe('GTM Script Interception Guard', () => { expect(script.getAttribute('async')).toBe(''); expect(script.getAttribute('data-nscript')).toBe('afterInteractive'); }); + + it('should not rewrite unsupported GTM paths', () => { + installGtmGuard(); + + const container = document.createElement('div'); + const script = document.createElement('script'); + script.src = 'https://www.googletagmanager.com/ns.html?id=GTM-XXXX'; + + container.appendChild(script); + + // ns.html is not a supported path, so it should not be rewritten + expect(script.src).toBe('https://www.googletagmanager.com/ns.html?id=GTM-XXXX'); + expect(script.src).toContain('googletagmanager.com'); + }); }); describe('insertBefore interception', () => { @@ -330,12 +364,10 @@ describe('GTM Beacon Guard', () => { originalSendBeacon = navigator.sendBeacon; originalFetch = window.fetch; - sendBeaconSpy = vi.fn((_url: string | URL, _data?: BodyInit | null) => true); + sendBeaconSpy = vi.fn(() => true); navigator.sendBeacon = sendBeaconSpy; - fetchSpy = vi.fn((_input: RequestInfo | URL, _init?: RequestInit) => - Promise.resolve(new Response('', { status: 200 })) - ); + fetchSpy = vi.fn(() => Promise.resolve(new Response('', { status: 200 }))); window.fetch = fetchSpy; resetBeaconGuardState(); diff --git a/docs/guide/integrations/google_tag_manager.md b/docs/guide/integrations/google_tag_manager.md index b72b70db..074c7a88 100644 --- a/docs/guide/integrations/google_tag_manager.md +++ b/docs/guide/integrations/google_tag_manager.md @@ -38,15 +38,19 @@ Add the GTM configuration to `trusted-server.toml`: enabled = true container_id = "GTM-XXXXXX" # upstream_url = "https://www.googletagmanager.com" # Optional override +# cache_max_age = 900 # Optional: Cache duration in 
seconds (default: 900) +# max_beacon_body_size = 65536 # Optional: Max POST body size in bytes (default: 65536 / 64KB) ``` ### Configuration Options -| Field | Type | Required | Description | -| -------------- | ------- | -------- | --------------------------------------------- | -| `enabled` | boolean | No | Enable/disable integration (default: `false`) | -| `container_id` | string | Yes | Your GTM Container ID (e.g., `GTM-A1B2C3`) | -| `upstream_url` | string | No | Custom upstream URL (advanced usage) | +| Field | Type | Required | Description | +| ---------------------- | ------- | -------- | ----------------------------------------------------------------------- | +| `enabled` | boolean | No | Enable/disable integration (default: `false`) | +| `container_id` | string | Yes | Your GTM Container ID (e.g., `GTM-A1B2C3`) | +| `upstream_url` | string | No | Custom upstream URL (default: `https://www.googletagmanager.com`) | +| `cache_max_age` | number | No | Cache duration in seconds (default: `900`, range: `60`-`86400`) | +| `max_beacon_body_size` | number | No | Max POST body size in bytes (default: `65536`, range: `1024`-`1048576`) | ## How It Works @@ -148,6 +152,25 @@ POST /integrations/google_tag_manager/g/collect?v=2&... 
- Forwarding: User-Agent, Referer, Payload - Privacy: Does NOT forward client IP (Google sees Trusted Server IP) +**POST Request Handling**: + +The endpoint validates POST request sizes to prevent memory pressure: + +- If `Content-Length` header is present and valid: + - Requests exceeding `max_beacon_body_size` are rejected early with `413 Payload Too Large` + - Valid requests proceed normally +- If `Content-Length` header is malformed: `400 Bad Request` +- If `Content-Length` header is missing: Request is accepted (HTTP/2 compatible) + - Body is read in 8KB chunks with size validation + - Reading stops immediately if `max_beacon_body_size` is exceeded + - Oversized bodies return `413 Payload Too Large` without buffering the full payload + +**Memory Protection**: + +The implementation uses chunked reading to prevent unbounded memory allocation. Bodies are read in small chunks (8KB), and size is validated incrementally. This ensures that even if a client sends a malicious multi-gigabyte POST (with no Content-Length or an incorrect one), the server will reject it after reading at most `max_beacon_body_size + 8KB` into memory. + +This approach maintains compatibility with HTTP/2 and HTTP/3 clients while providing robust protection against memory exhaustion attacks. + ## Performance & Caching ### Compression