Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions backend/account_v2/serializer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

from rest_framework import serializers
from utils.input_sanitizer import validate_name_field

from account_v2.models import Organization, User

Expand All @@ -10,6 +11,12 @@ class OrganizationSignupSerializer(serializers.Serializer):
display_name = serializers.CharField(required=True, max_length=150)
organization_id = serializers.CharField(required=True, max_length=30)

def validate_name(self, value: str) -> str:
    """Field-level check for ``name``.

    Delegates to ``validate_name_field`` and returns whatever it returns;
    any validation error it raises is labelled "Organization name".
    """
    sanitized = validate_name_field(value, field_name="Organization name")
    return sanitized

def validate_display_name(self, value: str) -> str:
    """Field-level check for ``display_name``.

    Delegates to ``validate_name_field`` and returns whatever it returns;
    any validation error it raises is labelled "Display name".
    """
    sanitized = validate_name_field(value, field_name="Display name")
    return sanitized

def validate_organization_id(self, value): # type: ignore
if not re.match(r"^[a-z0-9_-]+$", value):
raise serializers.ValidationError(
Expand Down
15 changes: 15 additions & 0 deletions backend/adapter_processor_v2/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from django.conf import settings
from rest_framework import serializers
from rest_framework.serializers import ModelSerializer
from utils.input_sanitizer import validate_name_field, validate_no_html_tags

from adapter_processor_v2.adapter_processor import AdapterProcessor
from adapter_processor_v2.constants import AdapterKeys
Expand All @@ -28,6 +29,20 @@ class Meta:
model = AdapterInstance
fields = "__all__"

def validate(self, data):
    """Run the parent validation, then sanitize the free-text adapter fields.

    ``adapter_name`` goes through ``validate_name_field`` and ``description``
    through ``validate_no_html_tags``. A field is only checked when it is
    present in the validated data with a non-None value, and the value
    returned by the sanitizer is written back into the dict.

    Returns the validated data dict.
    """
    validated = super().validate(data)
    # (key, sanitizer, label used in error messages) - processed in order.
    sanitizers = (
        ("adapter_name", validate_name_field, "Adapter name"),
        ("description", validate_no_html_tags, "Description"),
    )
    for key, sanitize, label in sanitizers:
        current = validated.get(key)
        if current is not None:
            validated[key] = sanitize(current, field_name=label)
    return validated
Comment thread
hari-kuriakose marked this conversation as resolved.


class DefaultAdapterSerializer(serializers.Serializer):
llm_default = serializers.CharField(max_length=FLC.UUID_LENGTH, required=False)
Expand Down
9 changes: 9 additions & 0 deletions backend/api_v2/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
ValidationError,
)
from tags.serializers import TagParamsSerializer
from utils.input_sanitizer import validate_name_field, validate_no_html_tags
from utils.serializer.integrity_error_mixin import IntegrityErrorMixin
from workflow_manager.endpoint_v2.models import WorkflowEndpoint
from workflow_manager.workflow_v2.exceptions import ExecutionDoesNotExistError
Expand Down Expand Up @@ -62,6 +63,14 @@ def validate_api_name(self, value: str) -> str:
api_name_validator(value)
return value

def validate_display_name(self, value: str) -> str:
    """Field-level check for ``display_name``.

    Delegates to ``validate_name_field`` and returns whatever it returns;
    any validation error it raises is labelled "Display name".
    """
    sanitized = validate_name_field(value, field_name="Display name")
    return sanitized

def validate_description(self, value: str) -> str:
    """Field-level check for ``description``.

    A None value is returned untouched; any other value is passed through
    ``validate_no_html_tags`` with errors labelled "Description".
    """
    if value is not None:
        return validate_no_html_tags(value, field_name="Description")
    return value
Comment thread
hari-kuriakose marked this conversation as resolved.

def validate_workflow(self, workflow):
"""Validate that the workflow has properly configured source and destination endpoints."""
# Get all endpoints for this workflow with related data
Expand Down
1 change: 1 addition & 0 deletions backend/backend/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ def filter(self, record):
"social_django.middleware.SocialAuthExceptionMiddleware",
"middleware.remove_allow_header.RemoveAllowHeaderMiddleware",
"middleware.cache_control.CacheControlMiddleware",
"middleware.content_security_policy.ContentSecurityPolicyMiddleware",
Comment thread
vishnuszipstack marked this conversation as resolved.
]

TENANT_SUBFOLDER_PREFIX = f"{PATH_PREFIX}/unstract"
Expand Down
4 changes: 4 additions & 0 deletions backend/connector_v2/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from connector_processor.exceptions import OAuthTimeOut
from rest_framework.serializers import CharField, SerializerMethodField
from utils.fields import EncryptedBinaryFieldSerializer
from utils.input_sanitizer import validate_name_field

from backend.serializers import AuditSerializer
from connector_v2.constants import ConnectorInstanceKey as CIKey
Expand All @@ -28,6 +29,9 @@ class Meta:
model = ConnectorInstance
fields = "__all__"

def validate_connector_name(self, value: str) -> str:
    """Field-level check for ``connector_name``.

    Delegates to ``validate_name_field`` and returns whatever it returns;
    any validation error it raises is labelled "Connector name".
    """
    sanitized = validate_name_field(value, field_name="Connector name")
    return sanitized

def save(self, **kwargs): # type: ignore
user = self.context.get("request").user or None
connector_id: str = kwargs[CIKey.CONNECTOR_ID]
Expand Down
40 changes: 40 additions & 0 deletions backend/middleware/content_security_policy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from django.http import HttpRequest, HttpResponse
from django.utils.deprecation import MiddlewareMixin


class ContentSecurityPolicyMiddleware(MiddlewareMixin):
    """Attach a Content-Security-Policy header to outgoing responses.

    The policy is restrictive by default. The inline script in login.html
    is allowed through a SHA-256 hash instead of 'unsafe-inline', which keeps
    strong XSS protection for scripts. 'unsafe-inline' is granted only to
    style-src because the login page uses inline <style> blocks.

    The header is written with response.setdefault(), so a route-specific
    policy already set by a view or by earlier middleware is left untouched.
    """

    # SHA-256 hash of the inline script in login.html (form submit spinner).
    # If that script changes, this value must be regenerated from the new
    # script body (the original note points at hashlib + base64 for this).
    _SCRIPT_HASH = "sha256-GES82NvXpRYmVFDKv6vRHx2c7xuv8mgUzUaP7heKeFY="

    def process_response(
        self, request: HttpRequest, response: HttpResponse
    ) -> HttpResponse:
        """Set the default CSP header on ``response`` and return it.

        ``request`` is not read here; it is kept in the signature because
        this method follows the (request, response) middleware hook shape.
        """
        policy = (
            "default-src 'self'; "
            f"script-src 'self' '{self._SCRIPT_HASH}'; "
            "style-src 'self' 'unsafe-inline'; "
            "img-src 'self'; "
            "font-src 'self'; "
            "connect-src 'self'; "
            "object-src 'none'; "
            "frame-ancestors 'none'; "
            "base-uri 'self'; "
            "form-action 'self'"
        )
        # setdefault only writes the header when it is not already present.
        response.setdefault("Content-Security-Policy", policy)
        return response
Comment thread
kirtimanmishrazipstack marked this conversation as resolved.
3 changes: 3 additions & 0 deletions backend/notification_v2/serializers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from rest_framework import serializers
from utils.input_sanitizer import validate_name_field

from .enums import AuthorizationType, NotificationType, PlatformType
from .models import Notification
Expand Down Expand Up @@ -109,6 +110,8 @@ def validate_name(self, value):
"""Check uniqueness of the name with respect to either 'api' or
'pipeline'.
"""
value = validate_name_field(value, field_name="Notification name")

api = self.initial_data.get("api", getattr(self.instance, "api", None))
pipeline = self.initial_data.get(
"pipeline", getattr(self.instance, "pipeline", None)
Expand Down
9 changes: 9 additions & 0 deletions backend/prompt_studio/prompt_studio_core_v2/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from rest_framework import serializers
from rest_framework.exceptions import ValidationError
from utils.FileValidator import FileValidator
from utils.input_sanitizer import validate_name_field, validate_no_html_tags
from utils.serializer.integrity_error_mixin import IntegrityErrorMixin

from backend.serializers import AuditSerializer
Expand Down Expand Up @@ -54,6 +55,14 @@ class Meta:
}
}

def validate_tool_name(self, value: str) -> str:
    """Field-level check for ``tool_name``.

    Delegates to ``validate_name_field`` and returns whatever it returns;
    any validation error it raises is labelled "Tool name".
    """
    sanitized = validate_name_field(value, field_name="Tool name")
    return sanitized

def validate_description(self, value: str) -> str:
    """Field-level check for ``description``.

    A None value is returned untouched; any other value is passed through
    ``validate_no_html_tags`` with errors labelled "Description".
    """
    if value is not None:
        return validate_no_html_tags(value, field_name="Description")
    return value
Comment thread
greptile-apps[bot] marked this conversation as resolved.

def validate_summarize_llm_adapter(self, value):
"""Validate that the adapter type is LLM and is accessible to the user."""
if value is None:
Expand Down
45 changes: 45 additions & 0 deletions backend/utils/input_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import re

from rest_framework.serializers import ValidationError

# Pattern to detect HTML/script tags. Two alternatives:
#   1. "<[^>]*>"     - a "<" followed, anywhere later, by a ">" (closed tags).
#   2. "<[a-zA-Z/!]" - a "<" immediately followed by a letter, "/" or "!".
# The second alternative catches unclosed tags like "<script" or "<img src=x" that could
# be completed by adjacent content in non-React rendering contexts (emails, PDFs, logs).
# A lone "<" followed by a space or digit with no later ">" (e.g. "a < 3") is not matched.
# NOTE(review): alternative 1 also matches plain comparisons such as
# "a < 3 and b > 2" (and spans newlines) - confirm this strictness is intended.
HTML_TAG_PATTERN = re.compile(r"<[^>]*>|<[a-zA-Z/!]")
# Pattern to detect dangerous URI protocols: javascript:, vbscript:, and data: URIs.
# Matching is case-insensitive and tolerates whitespace before the colon.
# data: URIs are only matched when followed by a MIME type (word/word) to avoid
# false positives on ordinary English text like "Input data: JSON format".
# NOTE(review): prose that happens to contain "data: word/word"
# (e.g. "data: text/plain") is still rejected.
JS_PROTOCOL_PATTERN = re.compile(
    r"(?:javascript|vbscript)\s*:|data\s*:\s*\w+/\w+", re.IGNORECASE
)
Comment thread
kirtimanmishrazipstack marked this conversation as resolved.
# Pattern to detect event handlers using a vetted list of DOM event names.
# This avoids false positives on benign words like "connection=", "onboarding=", etc.
# _DOM_EVENTS is a "|"-separated regex alternation of event names (without the
# leading "on"); it is interpolated into EVENT_HANDLER_PATTERN below.
_DOM_EVENTS = (
    "abort|blur|change|click|close|contextmenu|copy|cut|dblclick|drag|dragend|"
    "dragenter|dragleave|dragover|dragstart|drop|error|focus|focusin|focusout|"
    "input|invalid|keydown|keypress|keyup|load|mousedown|mouseenter|mouseleave|"
    "mousemove|mouseout|mouseover|mouseup|paste|pointerdown|pointerenter|"
    "pointerleave|pointermove|pointerout|pointerover|pointerup|reset|resize|"
    "scroll|select|submit|toggle|touchcancel|touchend|touchmove|touchstart|"
    "unload|wheel"
)
# Matches "on<event>" starting at a word boundary, followed by optional
# whitespace and "=", case-insensitively (e.g. "onclick=", "ONLOAD =").
EVENT_HANDLER_PATTERN = re.compile(rf"\bon({_DOM_EVENTS})\s*=", re.IGNORECASE)
Comment thread
kirtimanmishrazipstack marked this conversation as resolved.


def validate_no_html_tags(value: str, field_name: str = "This field") -> str:
    """Return ``value`` unchanged unless it looks like markup or script.

    The value is checked, in order, against the HTML tag pattern, the
    dangerous-URI-protocol pattern and the event-handler pattern. The first
    pattern that matches raises ``ValidationError`` with a message prefixed
    by ``field_name``.
    """
    # (pattern, error message) pairs, evaluated in this order.
    checks = (
        (HTML_TAG_PATTERN, f"{field_name} must not contain HTML or script tags."),
        (
            JS_PROTOCOL_PATTERN,
            f"{field_name} must not contain dangerous URI protocols.",
        ),
        (
            EVENT_HANDLER_PATTERN,
            f"{field_name} must not contain event handler attributes.",
        ),
    )
    for pattern, message in checks:
        if pattern.search(value):
            raise ValidationError(message)
    return value


def validate_name_field(value: str, field_name: str = "This field") -> str:
    """Validate a name/identifier field and return its stripped value.

    Leading and trailing whitespace is removed, an empty result is rejected,
    and the remainder is checked with ``validate_no_html_tags``.

    Args:
        value: Raw field value. ``None`` is treated the same as an empty
            string instead of crashing.
        field_name: Label used as the prefix of error messages.

    Returns:
        The stripped value.

    Raises:
        ValidationError: If the value is ``None``, empty after stripping, or
            rejected by ``validate_no_html_tags``.
    """
    # Calling .strip() on None would raise AttributeError, which is not a
    # validation error; report it as an empty value instead.
    stripped = "" if value is None else value.strip()
    if not stripped:
        raise ValidationError(f"{field_name} must not be empty.")
    return validate_no_html_tags(stripped, field_name)
Empty file added backend/utils/tests/__init__.py
Empty file.
150 changes: 150 additions & 0 deletions backend/utils/tests/test_input_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import pytest
from rest_framework.serializers import ValidationError

from utils.input_sanitizer import validate_name_field, validate_no_html_tags


class TestValidateNoHtmlTags:
    """Check ``validate_no_html_tags`` against benign text and XSS-style payloads."""

    # Fragments of the three error messages, used as ``match`` patterns.
    _TAGS = "must not contain HTML or script tags"
    _PROTOCOLS = "must not contain dangerous URI protocols"
    _HANDLERS = "must not contain event handler attributes"

    @staticmethod
    def _assert_unchanged(text):
        """Assert that ``text`` is accepted and returned as-is."""
        assert validate_no_html_tags(text) == text

    @staticmethod
    def _assert_rejected(text, message):
        """Assert that ``text`` raises ValidationError matching ``message``."""
        with pytest.raises(ValidationError, match=message):
            validate_no_html_tags(text)

    def test_clean_input_passes(self):
        self._assert_unchanged("Hello World")

    def test_allows_normal_special_chars(self):
        self._assert_unchanged("My workflow (v2), test - final")

    def test_allows_numbers_and_punctuation(self):
        self._assert_unchanged("Test 123 & more!")

    def test_rejects_script_tag(self):
        self._assert_rejected("<script>alert(1)</script>", self._TAGS)

    def test_rejects_img_tag(self):
        self._assert_rejected("<img src=x onerror=alert(1)>", self._TAGS)

    def test_rejects_div_tag(self):
        self._assert_rejected("<div>content</div>", self._TAGS)

    def test_rejects_self_closing_tag(self):
        self._assert_rejected("<br/>", self._TAGS)

    def test_rejects_unclosed_script_tag(self):
        self._assert_rejected("<script", self._TAGS)

    def test_rejects_unclosed_img_tag(self):
        self._assert_rejected("<img src=x onerror", self._TAGS)

    def test_allows_less_than_with_number(self):
        self._assert_unchanged("a < 3")

    def test_rejects_javascript_protocol(self):
        self._assert_rejected("javascript:alert(1)", self._PROTOCOLS)

    def test_rejects_javascript_protocol_with_spaces(self):
        self._assert_rejected("javascript :alert(1)", self._PROTOCOLS)

    def test_rejects_javascript_protocol_case_insensitive(self):
        self._assert_rejected("JAVASCRIPT:alert(1)", self._PROTOCOLS)

    def test_rejects_data_uri(self):
        self._assert_rejected("data:text/html,alert(1)", self._PROTOCOLS)

    def test_rejects_data_uri_with_mime_type(self):
        self._assert_rejected("data:application/javascript,code", self._PROTOCOLS)

    def test_allows_benign_data_colon_in_text(self):
        self._assert_unchanged("Input data: JSON format")

    def test_allows_data_colon_without_mime(self):
        self._assert_unchanged("my data: 100 records")

    def test_rejects_vbscript_protocol(self):
        self._assert_rejected("vbscript:MsgBox", self._PROTOCOLS)

    def test_rejects_event_handler(self):
        self._assert_rejected("onclick=alert(1)", self._HANDLERS)

    def test_rejects_event_handler_with_spaces(self):
        self._assert_rejected("onerror =alert(1)", self._HANDLERS)

    def test_rejects_event_handler_case_insensitive(self):
        self._assert_rejected("ONLOAD=alert(1)", self._HANDLERS)

    def test_allows_benign_connection_text(self):
        self._assert_unchanged("connection=ok")

    def test_allows_onboarding_text(self):
        self._assert_unchanged("onboarding = enabled")

    def test_allows_oncall_text(self):
        self._assert_unchanged("oncall = primary")

    def test_allows_condition_text(self):
        self._assert_unchanged("condition = good")

    def test_rejects_event_handler_after_semicolon(self):
        self._assert_rejected("foo;onclick=alert(1)", self._HANDLERS)

    def test_rejects_event_handler_after_ampersand(self):
        self._assert_rejected("foo&onerror=alert(1)", self._HANDLERS)

    def test_custom_field_name_in_error(self):
        # The caller-supplied label must appear in the error message.
        with pytest.raises(ValidationError, match="Workflow name"):
            validate_no_html_tags("<script>", field_name="Workflow name")


class TestValidateNameField:
    """Check ``validate_name_field``: stripping, emptiness and tag rejection."""

    @staticmethod
    def _assert_result(raw, expected):
        """Assert that validating ``raw`` yields ``expected``."""
        assert validate_name_field(raw) == expected

    def test_clean_name_passes(self):
        self._assert_result("My Workflow", "My Workflow")

    def test_strips_whitespace(self):
        self._assert_result(" hello ", "hello")

    def test_rejects_empty_after_strip(self):
        with pytest.raises(ValidationError, match="must not be empty"):
            validate_name_field(" ")

    def test_rejects_html_tags(self):
        with pytest.raises(ValidationError, match="must not contain HTML or script tags"):
            validate_name_field("<script>alert(1)</script>")

    def test_allows_hyphens_and_underscores(self):
        self._assert_result("my-workflow_v2", "my-workflow_v2")

    def test_allows_periods(self):
        self._assert_result("config.v2", "config.v2")

    def test_allows_parentheses_and_commas(self):
        self._assert_result("Test (v2), final", "Test (v2), final")

    def test_custom_field_name_in_error(self):
        # The caller-supplied label must appear in the error message.
        with pytest.raises(ValidationError, match="Tool name"):
            validate_name_field(" ", field_name="Tool name")
9 changes: 9 additions & 0 deletions backend/workflow_manager/workflow_v2/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from tool_instance_v2.serializers import ToolInstanceSerializer
from tool_instance_v2.tool_instance_helper import ToolInstanceHelper
from utils.input_sanitizer import validate_name_field, validate_no_html_tags
from utils.serializer.integrity_error_mixin import IntegrityErrorMixin

from backend.constants import RequestKey
Expand Down Expand Up @@ -46,6 +47,14 @@ class Meta:
}
}

def validate_workflow_name(self, value: str) -> str:
    """Field-level check for ``workflow_name``.

    Delegates to ``validate_name_field`` and returns whatever it returns;
    any validation error it raises is labelled "Workflow name".
    """
    sanitized = validate_name_field(value, field_name="Workflow name")
    return sanitized

def validate_description(self, value: str) -> str:
    """Field-level check for ``description``.

    A None value is returned untouched; any other value is passed through
    ``validate_no_html_tags`` with errors labelled "Description".
    """
    if value is not None:
        return validate_no_html_tags(value, field_name="Description")
    return value
Comment thread
greptile-apps[bot] marked this conversation as resolved.

def to_representation(self, instance: Workflow) -> dict[str, str]:
representation: dict[str, str] = super().to_representation(instance)
representation[WorkflowKey.WF_NAME] = instance.workflow_name
Expand Down
Loading
Loading