14 changes: 14 additions & 0 deletions config.py
@@ -188,6 +188,20 @@ def load_config():
loaded.setdefault("playwright_auto_install", True)
loaded.setdefault("defender_exclusions_ack", False)
loaded.setdefault("auto_configure_nordvpn_path", True)
loaded.setdefault("enable_onion_processing", False)
loaded.setdefault("use_nordvpn_onion_only", False)
loaded.setdefault("random_user_agent", False)
loaded.setdefault("selected_user_agent", "")
loaded.setdefault("custom_user_agent", "")
loaded.setdefault("user_agent_library", [])
loaded.setdefault("tor_executable_path", "")
loaded.setdefault("proxy_scope", "both")
loaded.setdefault("threads_scope", "both")
loaded.setdefault("timeout_scope", "both")
loaded.setdefault("captcha_scope", "both")
loaded.setdefault("validation_scope", "both")
loaded.setdefault("random_user_agent_scope", "both")
loaded.setdefault("user_agent_scope", "both")
return loaded


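Note on the new *_scope defaults in config.py: they suggest each setting can apply to clearnet targets only, to .onion targets only, or to both, and extract.py reads them through get_scoped_value(config, key, default, target_url=..., scope_key=...). That helper is not part of this diff, so the following is only a minimal sketch of how such a lookup could behave, with the scope values ("both", "clearnet", "onion") assumed rather than confirmed:

from urllib.parse import urlparse


def _is_onion(url: str) -> bool:
    # Treat any host ending in .onion as a Tor hidden service.
    host = (urlparse(url).hostname or "").lower()
    return host.endswith(".onion")


def get_scoped_value(cfg, key, default, target_url=None, scope_key=None):
    # Hypothetical: return cfg[key] only when its scope covers the target,
    # otherwise fall back to the supplied default.
    scope = (cfg.get(scope_key, "both") or "both").lower() if scope_key else "both"
    if scope == "both" or not target_url:
        return cfg.get(key, default)
    if (scope == "onion") == _is_onion(target_url):
        return cfg.get(key, default)
    return default

Under that assumption, the timeout lookup in _fetch_html_for_mode would fall back to 45 seconds whenever timeout_scope excludes the target URL.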
42 changes: 30 additions & 12 deletions extract.py
@@ -12,8 +12,10 @@

from config import config, get_intercept_proxy
from fetch import HAS_PLAYWRIGHT, HAS_SELENIUM, fetch_page_playwright, fetch_page_requests, fetch_page_selenium, solve_captcha
from helpers import COMMON_LOGIN_PATHS, get_base_url, normalize_and_validate_target, validate_url
from helpers import COMMON_LOGIN_PATHS, classify_onion_reachability, get_base_url, get_scoped_value, is_onion_url, normalize_and_validate_target, redact_onion_value, resolve_user_agent, scope_applies, validate_url
from logging import write_detailed, write_privacy
from login_tester import domain_from_url, hydra_module_for_method, hydra_runtime_flags_for_method, save_hit
from tor_fetch import fetch_onion_html


def detect_failure_string(soup, url):
@@ -318,15 +320,19 @@ def _loginish_paths_from_links(soup, page_url, limit=3):


def _fetch_html_for_mode(url, proxy, mode):
preferred = (mode or "static").strip().lower()
attempts = []
if preferred == "playwright":
attempts = [fetch_page_playwright, fetch_page_requests, fetch_page_selenium]
else:
attempts = [fetch_page_requests, fetch_page_playwright, fetch_page_selenium]
if is_onion_url(url):
timeout_seconds = int(get_scoped_value(config, "extract_site_timeout_seconds", 45, target_url=url, scope_key="timeout_scope") or 45)
user_agent = resolve_user_agent(config, target_url=url)
html, error, used_ua, used_playwright = fetch_onion_html(url, timeout_seconds=max(45, timeout_seconds), user_agent=user_agent)
if html:
return html, None, bool(used_playwright), used_ua
return None, error, bool(used_playwright), used_ua

preferred = (mode or "static").strip().lower()
attempts = [fetch_page_playwright, fetch_page_requests, fetch_page_selenium] if preferred == "playwright" else [fetch_page_requests, fetch_page_playwright, fetch_page_selenium]
last_error = None
used_playwright = False
used_ua = resolve_user_agent(config, target_url=url)
for fetcher in attempts:
if fetcher == fetch_page_playwright and not HAS_PLAYWRIGHT:
continue
@@ -336,9 +342,9 @@ def _fetch_html_for_mode(url, proxy, mode):
if fetcher == fetch_page_playwright:
used_playwright = bool(html)
if html:
return html, None, used_playwright
return html, None, used_playwright, used_ua
last_error = error
return None, last_error, used_playwright
return None, last_error, used_playwright, used_ua



@@ -414,7 +420,15 @@ def extract_login_form(url, proxy=None, strict_validation=True, mode="static", a
if not url:
return None, {"status": "skipped_invalid_target", "reason": invalid_reason or "invalid target"}

html, error, used_playwright = _fetch_html_for_mode(url, proxy, mode)
if is_onion_url(url) and not bool(config.get("enable_onion_processing", False)):
return None, {"status": "skipped_disabled", "reason": ".onion processing disabled"}

if is_onion_url(url):
reachability = classify_onion_reachability(url, user_agent=resolve_user_agent(config, target_url=url))
if reachability.get("status") == "tor_error":
return None, {"status": "fetch_failed", "error_code": "tor_error", "error_hint": "Tor is not running", "error_detail": reachability.get("detail", "Tor not running")}

html, error, used_playwright, used_user_agent = _fetch_html_for_mode(url, proxy, mode)
fallback_used = False

if not html:
@@ -445,7 +459,7 @@ def extract_login_form(url, proxy=None, strict_validation=True, mode="static", a
if adv_url in checked_urls:
continue
checked_urls.append(adv_url)
html_adv, _, used_pw_adv = _fetch_html_for_mode(adv_url, proxy, "playwright")
html_adv, _, used_pw_adv, _ = _fetch_html_for_mode(adv_url, proxy, "playwright")
used_playwright = used_playwright or used_pw_adv
if not html_adv:
continue
@@ -467,7 +481,7 @@ def extract_login_form(url, proxy=None, strict_validation=True, mode="static", a
continue
seen.add(candidate)
checked_urls.append(candidate)
html2, error2, used_pw2 = _fetch_html_for_mode(candidate, proxy, mode)
html2, error2, used_pw2, _ = _fetch_html_for_mode(candidate, proxy, mode)
used_playwright = used_playwright or used_pw2
if not html2:
continue
@@ -593,6 +607,7 @@ def extract_login_form(url, proxy=None, strict_validation=True, mode="static", a
"classification": "✅ actionable native form" if status == "success_form" else "🟨 login-ish (JS-handled / non-POST / missing action)",
"method_warning": "Detected GET form; payload may need manual tuning" if method == "get" else "",
"custom_tester_required": custom_tester_required,
"user_agent": used_user_agent,
"login_metadata": {
"page_url": url,
"fields": best_candidate["fields"],
@@ -611,6 +626,9 @@ def extract_login_form(url, proxy=None, strict_validation=True, mode="static", a
enable_dummy_interaction=bool(opts.get("enable_dummy_interaction", False)),
)

log_message = f"Extracted login form {url} status={result.get('status')} ua={used_user_agent}"
write_detailed(log_message)
write_privacy(redact_onion_value(log_message))
return result, None


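The paired logging calls at the end of extract_login_form (write_detailed with the raw message, write_privacy with redact_onion_value(...)) imply the privacy log must not record raw hidden-service hostnames. redact_onion_value is not shown in this diff; a plausible sketch, offered only as an assumption, would mask v2/v3 onion hostnames with a fixed placeholder:

import re

# 16-character (v2) or 56-character (v3) base32 labels ending in .onion.
_ONION_RE = re.compile(r"\b[a-z2-7]{16}(?:[a-z2-7]{40})?\.onion\b", re.IGNORECASE)


def redact_onion_value(text: str) -> str:
    # Hypothetical: replace every .onion hostname with a placeholder so the
    # privacy log never carries the real hidden-service address.
    return _ONION_RE.sub("[redacted].onion", text or "")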
26 changes: 6 additions & 20 deletions fetch.py
@@ -16,26 +16,12 @@

from app_logging import logger, log_once
from config import config, get_intercept_proxy
from helpers import USER_AGENTS, normalize_and_validate_target
from helpers import normalize_and_validate_target, resolve_user_agent
from logging import write_detailed, write_privacy


# Dedicated request/browser UA pool used for per-request rotation.
FETCH_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.127 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14.6; rv:128.0) Gecko/20100101 Firefox/128.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.141 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edg/127.0.2651.74 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0",
]


def _pick_user_agent() -> str:
pool = FETCH_USER_AGENTS or USER_AGENTS
return random.choice(pool)
def _pick_user_agent(target_url: str | None = None, override: str | None = None) -> str:
return resolve_user_agent(config, target_url=target_url, override=override)

try:
from selenium import webdriver
@@ -228,7 +214,7 @@ def _fetch_page_playwright_once(clean_url, effective_proxy):
browser = p.chromium.launch(**launch_args)
context = browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=_pick_user_agent(),
user_agent=_pick_user_agent(clean_url),
locale="en-US",
ignore_https_errors=bool(config.get("ignore_https_errors", False)),
java_script_enabled=True,
@@ -352,7 +338,7 @@ def fetch_page_selenium(url, proxy=None):
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument(f"user-agent={_pick_user_agent()}")
options.add_argument(f"user-agent={_pick_user_agent(clean_url)}")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
if current_proxy and current_proxy.get("server"):
@@ -429,7 +415,7 @@ def fetch_page_requests(url, proxy=None, timeout=REQUEST_TIMEOUT_SECONDS):
except RuntimeError as e:
return None, build_error_payload("proxy_down", "SOCKS proxy unreachable", str(e))

headers = {"User-Agent": _pick_user_agent()}
headers = {"User-Agent": _pick_user_agent(clean_url)}
proxy_map = None
if effective_proxy and effective_proxy.get("server"):
server = effective_proxy["server"]
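fetch.py now routes every User-Agent choice through resolve_user_agent, which is likewise outside this diff. Given the new config keys (random_user_agent, user_agent_library, custom_user_agent, selected_user_agent), one plausible priority order is sketched below; the order and the fallback string are assumptions, with the fallback taken from the pool this PR removes:

import random


def resolve_user_agent(cfg, target_url=None, override=None):
    # Hypothetical resolution order: explicit override, then a random pick
    # from the configured library, then custom/selected values, then a
    # hard-coded default. target_url is accepted so scope checks could be
    # added later, but this sketch does not apply them.
    if override:
        return override
    if cfg.get("random_user_agent") and cfg.get("user_agent_library"):
        return random.choice(cfg["user_agent_library"])
    return (
        cfg.get("custom_user_agent")
        or cfg.get("selected_user_agent")
        or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    )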