Skip to content

Fetchers Classes

Here's the reference information for all fetcher-type classes' parameters, attributes, and methods.

You can import all of them directly like below:

from scrapling.fetchers import (
    Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher,
    FetcherSession, AsyncStealthySession, StealthySession, DynamicSession, AsyncDynamicSession
)

scrapling.fetchers.Fetcher

Fetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.Fetcher[Fetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.Fetcher
                


              click scrapling.fetchers.Fetcher href "" "scrapling.fetchers.Fetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"
            

A basic Fetcher class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on curl_cffi.

Source code in scrapling/engines/toolbelt/custom.py
def __init__(self, *args, **kwargs):
    """Accept legacy constructor arguments and warn that they are ignored.

    Kept for backward compatibility with releases before 0.2.99. Nothing passed
    here is stored or applied; the logged warning only spells out the equivalent
    ``configure(...)`` call built from the received arguments.

    :param args: Legacy positional arguments; only echoed in the warning.
    :param kwargs: Legacy keyword arguments; only echoed in the warning.
    """
    # For backward-compatibility before 0.2.99
    # `str.join` rejects non-string items, so every positional value is stringified
    # first; otherwise a call such as `cls(True)` raised TypeError before the warning
    # could be logged. Joining both groups at once also avoids a dangling ", " when
    # only positional arguments are given.
    call_args = ", ".join(
        [
            *(str(arg) for arg in args),
            *(f"{k}={v}" for k, v in kwargs.items()),
        ]
    )

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({call_args})` instead before fetching"
    )

__slots__ class-attribute instance-attribute

__slots__ = ()

huge_tree class-attribute instance-attribute

huge_tree = True

adaptive class-attribute instance-attribute

adaptive = False

storage class-attribute instance-attribute

storage = SQLiteStorageSystem

keep_cdata class-attribute instance-attribute

keep_cdata = False

storage_args class-attribute instance-attribute

storage_args = None

keep_comments class-attribute instance-attribute

keep_comments = False

adaptive_domain class-attribute instance-attribute

adaptive_domain = ''

parser_keywords class-attribute instance-attribute

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config classmethod

display_config()
Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def display_config(cls):
    """Return the parser settings currently set on this class.

    :return: A dictionary mapping each setting name (``huge_tree``, ``keep_comments``,
        ``keep_cdata``, ``adaptive``, ``storage``, ``storage_args``, ``adaptive_domain``)
        to the value read from the class.
    """
    # The tuple fixes the key order of the returned dictionary.
    setting_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in setting_names}

configure classmethod

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER DESCRIPTION
kwargs

The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain

DEFAULT: {}

Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Keys are stripped and lower-cased before being checked. Every key is validated
    before any value is assigned, so a rejected call leaves the class unchanged.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a key names an existing class attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a key does not name any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Yup, no fun allowed LOL
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Assign only after the whole call has been validated; applying values inside the
    # loop above would leave earlier keys set when a later key is rejected.
    for key, value in validated:
        setattr(cls, key, value)

get classmethod

get(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def get(cls, url: str, **kwargs: Unpack[GetRequestParams]) -> Response:
    """Delegate a GET call for ``url`` to ``__FetcherClientInstance__.get``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__FetcherClientInstance__.get``.
    """
    send = __FetcherClientInstance__.get
    return send(url, **_merge_selector_config(cls, kwargs))

post classmethod

post(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def post(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
    """Delegate a POST call for ``url`` to ``__FetcherClientInstance__.post``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__FetcherClientInstance__.post``.
    """
    send = __FetcherClientInstance__.post
    return send(url, **_merge_selector_config(cls, kwargs))

put classmethod

put(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def put(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
    """Delegate a PUT call for ``url`` to ``__FetcherClientInstance__.put``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__FetcherClientInstance__.put``.
    """
    send = __FetcherClientInstance__.put
    return send(url, **_merge_selector_config(cls, kwargs))

delete classmethod

delete(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def delete(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
    """Delegate a DELETE call for ``url`` to ``__FetcherClientInstance__.delete``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__FetcherClientInstance__.delete``.
    """
    send = __FetcherClientInstance__.delete
    return send(url, **_merge_selector_config(cls, kwargs))

scrapling.fetchers.AsyncFetcher

AsyncFetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.AsyncFetcher[AsyncFetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.AsyncFetcher
                


              click scrapling.fetchers.AsyncFetcher href "" "scrapling.fetchers.AsyncFetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"
            

A basic Fetcher class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on curl_cffi.

Source code in scrapling/engines/toolbelt/custom.py
def __init__(self, *args, **kwargs):
    """Accept legacy constructor arguments and warn that they are ignored.

    Kept for backward compatibility with releases before 0.2.99. Nothing passed
    here is stored or applied; the logged warning only spells out the equivalent
    ``configure(...)`` call built from the received arguments.

    :param args: Legacy positional arguments; only echoed in the warning.
    :param kwargs: Legacy keyword arguments; only echoed in the warning.
    """
    # For backward-compatibility before 0.2.99
    # `str.join` rejects non-string items, so every positional value is stringified
    # first; otherwise a call such as `cls(True)` raised TypeError before the warning
    # could be logged. Joining both groups at once also avoids a dangling ", " when
    # only positional arguments are given.
    call_args = ", ".join(
        [
            *(str(arg) for arg in args),
            *(f"{k}={v}" for k, v in kwargs.items()),
        ]
    )

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({call_args})` instead before fetching"
    )

__slots__ class-attribute instance-attribute

__slots__ = ()

huge_tree class-attribute instance-attribute

huge_tree = True

adaptive class-attribute instance-attribute

adaptive = False

storage class-attribute instance-attribute

storage = SQLiteStorageSystem

keep_cdata class-attribute instance-attribute

keep_cdata = False

storage_args class-attribute instance-attribute

storage_args = None

keep_comments class-attribute instance-attribute

keep_comments = False

adaptive_domain class-attribute instance-attribute

adaptive_domain = ''

parser_keywords class-attribute instance-attribute

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config classmethod

display_config()
Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def display_config(cls):
    """Return the parser settings currently set on this class.

    :return: A dictionary mapping each setting name (``huge_tree``, ``keep_comments``,
        ``keep_cdata``, ``adaptive``, ``storage``, ``storage_args``, ``adaptive_domain``)
        to the value read from the class.
    """
    # The tuple fixes the key order of the returned dictionary.
    setting_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in setting_names}

configure classmethod

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER DESCRIPTION
kwargs

The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain

DEFAULT: {}

Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Keys are stripped and lower-cased before being checked. Every key is validated
    before any value is assigned, so a rejected call leaves the class unchanged.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a key names an existing class attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a key does not name any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Yup, no fun allowed LOL
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Assign only after the whole call has been validated; applying values inside the
    # loop above would leave earlier keys set when a later key is rejected.
    for key, value in validated:
        setattr(cls, key, value)

get classmethod

get(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def get(cls, url: str, **kwargs: Unpack[GetRequestParams]) -> Awaitable[Response]:
    """Delegate a GET call for ``url`` to ``__AsyncFetcherClientInstance__.get``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__AsyncFetcherClientInstance__.get``, returned without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.get
    return send(url, **_merge_selector_config(cls, kwargs))

post classmethod

post(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def post(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
    """Delegate a POST call for ``url`` to ``__AsyncFetcherClientInstance__.post``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__AsyncFetcherClientInstance__.post``, returned without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.post
    return send(url, **_merge_selector_config(cls, kwargs))

put classmethod

put(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def put(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
    """Delegate a PUT call for ``url`` to ``__AsyncFetcherClientInstance__.put``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__AsyncFetcherClientInstance__.put``, returned without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.put
    return send(url, **_merge_selector_config(cls, kwargs))

delete classmethod

delete(url, **kwargs)
Source code in scrapling/fetchers/requests.py
@classmethod
def delete(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
    """Delegate a DELETE call for ``url`` to ``__AsyncFetcherClientInstance__.delete``.

    :param url: Forwarded as the first positional argument.
    :param kwargs: Passed through ``_merge_selector_config(cls, kwargs)`` and the result is forwarded as keyword arguments.
    :return: The value returned by ``__AsyncFetcherClientInstance__.delete``, returned without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.delete
    return send(url, **_merge_selector_config(cls, kwargs))

scrapling.fetchers.DynamicFetcher

DynamicFetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.DynamicFetcher[DynamicFetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.DynamicFetcher
                


              click scrapling.fetchers.DynamicFetcher href "" "scrapling.fetchers.DynamicFetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"
            

A Fetcher that provides many options to fetch/load websites' pages through chromium-based browsers.

Source code in scrapling/engines/toolbelt/custom.py
def __init__(self, *args, **kwargs):
    """Accept legacy constructor arguments and warn that they are ignored.

    Kept for backward compatibility with releases before 0.2.99. Nothing passed
    here is stored or applied; the logged warning only spells out the equivalent
    ``configure(...)`` call built from the received arguments.

    :param args: Legacy positional arguments; only echoed in the warning.
    :param kwargs: Legacy keyword arguments; only echoed in the warning.
    """
    # For backward-compatibility before 0.2.99
    # `str.join` rejects non-string items, so every positional value is stringified
    # first; otherwise a call such as `cls(True)` raised TypeError before the warning
    # could be logged. Joining both groups at once also avoids a dangling ", " when
    # only positional arguments are given.
    call_args = ", ".join(
        [
            *(str(arg) for arg in args),
            *(f"{k}={v}" for k, v in kwargs.items()),
        ]
    )

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({call_args})` instead before fetching"
    )

__slots__ class-attribute instance-attribute

__slots__ = ()

huge_tree class-attribute instance-attribute

huge_tree = True

adaptive class-attribute instance-attribute

adaptive = False

storage class-attribute instance-attribute

storage = SQLiteStorageSystem

keep_cdata class-attribute instance-attribute

keep_cdata = False

storage_args class-attribute instance-attribute

storage_args = None

keep_comments class-attribute instance-attribute

keep_comments = False

adaptive_domain class-attribute instance-attribute

adaptive_domain = ''

parser_keywords class-attribute instance-attribute

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config classmethod

display_config()
Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def display_config(cls):
    """Return the parser settings currently set on this class.

    :return: A dictionary mapping each setting name (``huge_tree``, ``keep_comments``,
        ``keep_cdata``, ``adaptive``, ``storage``, ``storage_args``, ``adaptive_domain``)
        to the value read from the class.
    """
    # The tuple fixes the key order of the returned dictionary.
    setting_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in setting_names}

configure classmethod

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER DESCRIPTION
kwargs

The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain

DEFAULT: {}

Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Keys are stripped and lower-cased before being checked. Every key is validated
    before any value is assigned, so a rejected call leaves the class unchanged.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a key names an existing class attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a key does not name any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Yup, no fun allowed LOL
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Assign only after the whole call has been validated; applying values inside the
    # loop above would leave earlier keys set when a later key is rejected.
    for key, value in validated:
        setattr(cls, key, value)

fetch classmethod

fetch(url, **kwargs)

Opens up a browser and performs your request based on your chosen options below.

PARAMETER DESCRIPTION
url

Target url.

TYPE: str

headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

block_ads

Block requests to ~3,500 known ad/tracking domains. Can be combined with blocked_domains.

dns_over_https

Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation with this request.

locale

Set the locale for the browser if wanted. Defaults to the system default locale.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request.

proxy

The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/fetchers/chrome.py
@classmethod
def fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
    """Open a browser and perform the request for ``url`` using the options below.

    The class-level parser arguments are merged with ``selector_config`` (entries given in
    ``selector_config`` win), and the page is fetched inside a ``DynamicSession`` built from
    the keyword arguments.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden mode (default), or in headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are matched too (e.g., ``"example.com"`` also blocks ``"sub.example.com"``).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: A useragent string to use. If omitted, the fetcher generates a real useragent of the same browser and uses it.
    :param cookies: Cookies to set for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default; wait for all JavaScript on the page(s) to fully load and execute.
    :param timeout: The timeout, in milliseconds, used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the Response object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object and runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
    :param locale: The locale for the browser, if wanted. Defaults to the system default locale.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default; Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request.
    :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
    :raises TypeError: If the selector configuration is not a dictionary.
    :return: A `Response` object.
    """
    user_config = kwargs.get("selector_config", {})
    if not user_config:
        # Checking `custom_config` for backward compatibility
        user_config = kwargs.get("custom_config", {})
    if not isinstance(user_config, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Start from the class-level parser arguments, then let the caller's entries override them.
    merged_config = {**cls._generate_parser_arguments()}
    merged_config.update(user_config)
    kwargs["selector_config"] = merged_config

    with DynamicSession(**kwargs) as browser_session:
        return browser_session.fetch(url)

async_fetch async classmethod

async_fetch(url, **kwargs)

Opens up a browser and performs your request based on your chosen options below.

PARAMETER DESCRIPTION
url

Target url.

TYPE: str

headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

block_ads

Block requests to ~3,500 known ad/tracking domains. Can be combined with blocked_domains.

dns_over_https

Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation with this request.

locale

Set the locale for the browser if wanted. Defaults to the system default locale.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request.

proxy

The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/fetchers/chrome.py
@classmethod
async def async_fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
    """Open a browser and perform the request for ``url`` using the options below.

    The class-level parser arguments are merged with ``selector_config`` (entries given in
    ``selector_config`` win), and the page is fetched inside an ``AsyncDynamicSession`` built
    from the keyword arguments.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden mode (default), or in headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are matched too (e.g., ``"example.com"`` also blocks ``"sub.example.com"``).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: A useragent string to use. If omitted, the fetcher generates a real useragent of the same browser and uses it.
    :param cookies: Cookies to set for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default; wait for all JavaScript on the page(s) to fully load and execute.
    :param timeout: The timeout, in milliseconds, used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the Response object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object and runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
    :param locale: The locale for the browser, if wanted. Defaults to the system default locale.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default; Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request.
    :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
    :raises TypeError: If the selector configuration is not a dictionary.
    :return: A `Response` object.
    """
    user_config = kwargs.get("selector_config", {})
    if not user_config:
        # Checking `custom_config` for backward compatibility
        user_config = kwargs.get("custom_config", {})
    if not isinstance(user_config, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Start from the class-level parser arguments, then let the caller's entries override them.
    merged_config = {**cls._generate_parser_arguments()}
    merged_config.update(user_config)
    kwargs["selector_config"] = merged_config

    async with AsyncDynamicSession(**kwargs) as browser_session:
        return await browser_session.fetch(url)

scrapling.fetchers.StealthyFetcher

StealthyFetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.StealthyFetcher[StealthyFetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.StealthyFetcher
                


              click scrapling.fetchers.StealthyFetcher href "" "scrapling.fetchers.StealthyFetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"
            

A completely stealthy Fetcher class type built on top of Chromium.

It works as real browsers passing almost all online tests/protections with many customization options.

Source code in scrapling/engines/toolbelt/custom.py
def __init__(self, *args, **kwargs):
    """Accept legacy constructor arguments and warn that they are ignored.

    Kept for backward compatibility with releases before 0.2.99. Nothing passed
    here is stored or applied; the logged warning only spells out the equivalent
    ``configure(...)`` call built from the received arguments.

    :param args: Legacy positional arguments; only echoed in the warning.
    :param kwargs: Legacy keyword arguments; only echoed in the warning.
    """
    # For backward-compatibility before 0.2.99
    # `str.join` rejects non-string items, so every positional value is stringified
    # first; otherwise a call such as `cls(True)` raised TypeError before the warning
    # could be logged. Joining both groups at once also avoids a dangling ", " when
    # only positional arguments are given.
    call_args = ", ".join(
        [
            *(str(arg) for arg in args),
            *(f"{k}={v}" for k, v in kwargs.items()),
        ]
    )

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({call_args})` instead before fetching"
    )

__slots__ class-attribute instance-attribute

__slots__ = ()

huge_tree class-attribute instance-attribute

huge_tree = True

adaptive class-attribute instance-attribute

adaptive = False

storage class-attribute instance-attribute

storage = SQLiteStorageSystem

keep_cdata class-attribute instance-attribute

keep_cdata = False

storage_args class-attribute instance-attribute

storage_args = None

keep_comments class-attribute instance-attribute

keep_comments = False

adaptive_domain class-attribute instance-attribute

adaptive_domain = ''

parser_keywords class-attribute instance-attribute

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config classmethod

display_config()
Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def display_config(cls):
    """Return the parser settings currently set on this class.

    :return: A dictionary mapping each setting name (``huge_tree``, ``keep_comments``,
        ``keep_cdata``, ``adaptive``, ``storage``, ``storage_args``, ``adaptive_domain``)
        to the value read from the class.
    """
    # The tuple fixes the key order of the returned dictionary.
    setting_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in setting_names}

configure classmethod

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER DESCRIPTION
kwargs

The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain

DEFAULT: {}

Source code in scrapling/engines/toolbelt/custom.py
@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Keys are stripped and lower-cased before being checked. Every key is validated
    before any value is assigned, so a rejected call leaves the class unchanged.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a key names an existing class attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a key does not name any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Yup, no fun allowed LOL
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Assign only after the whole call has been validated; applying values inside the
    # loop above would leave earlier keys set when a later key is rejected.
    for key, value in validated:
        setattr(cls, key, value)

fetch classmethod

fetch(url, **kwargs)

Opens up a browser and performs your request based on your chosen options below.

PARAMETER DESCRIPTION
url

Target url.

TYPE: str

headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

block_ads

Block requests to ~3,500 known ad/tracking domains. Can be combined with blocked_domains.

dns_over_https

Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation for all pages in this session.

locale

Specify user locale, for example, en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.

timezone_id

Changes the timezone of the browser. Defaults to the system timezone.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

solve_cloudflare

Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

hide_canvas

Add random noise to canvas operations to prevent fingerprinting.

block_webrtc

Forces WebRTC to respect proxy settings to prevent local IP address leak.

allow_webgl

Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

proxy

The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.

user_data_dir

Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/fetchers/stealth_chrome.py
@classmethod
def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
    """
    Opens up a browser and does your request based on your chosen options below.

    The options are forwarded to a `StealthySession` that is used as a context manager for this single request.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    :return: A `Response` object.
    :raises TypeError: If the resolved selector configuration is not a dictionary.
    """
    # `custom_config` is the backward-compatible name; it is consulted only when
    # `selector_config` is missing or falsy.
    user_config = kwargs.get("selector_config", {})
    if not user_config:
        user_config = kwargs.get("custom_config", {})
    if not isinstance(user_config, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Caller-supplied keys win over the class-level parser arguments.
    parser_defaults = cls._generate_parser_arguments()
    kwargs["selector_config"] = {**parser_defaults, **user_config}

    with StealthySession(**kwargs) as engine:
        return engine.fetch(url)

async_fetch async classmethod

async_fetch(url, **kwargs)

Opens up a browser and does your request based on your chosen options below.

PARAMETER DESCRIPTION
url

Target url.

TYPE: str

headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

block_ads

Block requests to ~3,500 known ad/tracking domains. Can be combined with blocked_domains.

dns_over_https

Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation for all pages in this session.

locale

Specify user locale, for example, en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.

timezone_id

Changes the timezone of the browser. Defaults to the system timezone.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

solve_cloudflare

Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

hide_canvas

Add random noise to canvas operations to prevent fingerprinting.

block_webrtc

Forces WebRTC to respect proxy settings to prevent local IP address leak.

allow_webgl

Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

proxy

The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.

user_data_dir

Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/fetchers/stealth_chrome.py
@classmethod
async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
    """
    Opens up a browser and does your request based on your chosen options below.

    The options are forwarded to an `AsyncStealthySession` that is used as an async context manager for this single request.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    :return: A `Response` object.
    :raises TypeError: If the resolved selector configuration is not a dictionary.
    """
    # `custom_config` is the backward-compatible name; it is consulted only when
    # `selector_config` is missing or falsy.
    user_config = kwargs.get("selector_config", {})
    if not user_config:
        user_config = kwargs.get("custom_config", {})
    if not isinstance(user_config, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Caller-supplied keys win over the class-level parser arguments.
    parser_defaults = cls._generate_parser_arguments()
    kwargs["selector_config"] = {**parser_defaults, **user_config}

    async with AsyncStealthySession(**kwargs) as engine:
        return await engine.fetch(url)

Session Classes

HTTP Sessions

scrapling.fetchers.FetcherSession

FetcherSession(
    impersonate="chrome",
    http3=False,
    stealthy_headers=True,
    proxies=None,
    proxy=None,
    proxy_auth=None,
    timeout=30,
    headers=None,
    retries=3,
    retry_delay=1,
    follow_redirects="safe",
    max_redirects=30,
    verify=True,
    cert=None,
    selector_config=None,
    proxy_rotator=None,
)

A factory context manager that provides configured Fetcher sessions.

When this manager is used in a 'with' or 'async with' block, it yields a new session configured with the manager's defaults. A single instance of this manager should ideally be used for one active session at a time (or sequentially). Re-entering a context with the same manager instance while a session is already active is disallowed.

PARAMETER DESCRIPTION
impersonate

Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)

TYPE: ImpersonateType DEFAULT: 'chrome'

http3

Whether to use HTTP3. Defaults to False. It might be problematic if used with impersonate.

TYPE: Optional[bool] DEFAULT: False

stealthy_headers

If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.

TYPE: Optional[bool] DEFAULT: True

proxies

Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.

TYPE: Optional[Dict[str, str]] DEFAULT: None

proxy

Proxy URL to use. Format: "http://username:password@localhost:8030". Cannot be used together with the proxies parameter.

TYPE: Optional[str] DEFAULT: None

proxy_auth

HTTP basic auth for proxy, tuple of (username, password).

TYPE: Optional[Tuple[str, str]] DEFAULT: None

timeout

Number of seconds to wait before timing out.

TYPE: Optional[int | float] DEFAULT: 30

headers

Headers to include in the session with every request.

TYPE: Optional[Dict[str, str]] DEFAULT: None

retries

Number of retry attempts. Defaults to 3.

TYPE: Optional[int] DEFAULT: 3

retry_delay

Number of seconds to wait between retry attempts. Defaults to 1 second.

TYPE: Optional[int] DEFAULT: 1

follow_redirects

Whether to follow redirects. Defaults to "safe", which follows redirects but rejects those targeting internal/private IPs (SSRF protection). Pass True to follow all redirects without restriction.

TYPE: FollowRedirects DEFAULT: 'safe'

max_redirects

Maximum number of redirects. Default 30, use -1 for unlimited.

TYPE: int DEFAULT: 30

verify

Whether to verify HTTPS certificates. Defaults to True.

TYPE: bool DEFAULT: True

cert

Tuple of (cert, key) filenames for the client certificate.

TYPE: Optional[str | Tuple[str, str]] DEFAULT: None

selector_config

Arguments passed when creating the final Selector class.

TYPE: Optional[Dict] DEFAULT: None

proxy_rotator

A ProxyRotator instance for automatic proxy rotation.

TYPE: Optional[ProxyRotator] DEFAULT: None

Source code in scrapling/engines/static.py
def __init__(
    self,
    impersonate: ImpersonateType = "chrome",
    http3: Optional[bool] = False,
    stealthy_headers: Optional[bool] = True,
    proxies: Optional[Dict[str, str]] = None,
    proxy: Optional[str] = None,
    proxy_auth: Optional[Tuple[str, str]] = None,
    timeout: Optional[int | float] = 30,
    headers: Optional[Dict[str, str]] = None,
    retries: Optional[int] = 3,
    retry_delay: Optional[int] = 1,
    follow_redirects: FollowRedirects = "safe",
    max_redirects: int = 30,
    verify: bool = True,
    cert: Optional[str | Tuple[str, str]] = None,
    selector_config: Optional[Dict] = None,
    proxy_rotator: Optional[ProxyRotator] = None,
):
    """
    Record the default options for this manager; no session is opened here.

    :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
    :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
    :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
    :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
    :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
                 Cannot be used together with the `proxies` parameter.
    :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
    :param timeout: Number of seconds to wait before timing out.
    :param headers: Headers to include in the session with every request.
    :param retries: Number of retry attempts. Defaults to 3.
    :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
    :param follow_redirects: Whether to follow redirects. Defaults to "safe", which follows redirects but rejects those targeting internal/private IPs (SSRF protection). Pass True to follow all redirects without restriction.
    :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
    :param verify: Whether to verify HTTPS certificates. Defaults to True.
    :param cert: Tuple of (cert, key) filenames for the client certificate.
    :param selector_config: Arguments passed when creating the final Selector class.
    :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.
    """
    # Session state: nothing is opened until the manager is entered.
    self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
    self._is_alive = False

    # Browser impersonation and protocol defaults.
    self._default_impersonate: ImpersonateType = impersonate
    self._default_http3 = http3
    self._stealth = stealthy_headers

    # Proxy defaults; falsy values are normalised to an empty dict / None.
    self._default_proxies = proxies or {}
    self._default_proxy = proxy or None
    self._default_proxy_auth = proxy_auth or None
    self._proxy_rotator = proxy_rotator

    # Request behaviour defaults.
    self._default_headers = headers or {}
    self._default_timeout = timeout
    self._default_retries = retries
    self._default_retry_delay = retry_delay
    self._default_follow_redirects = follow_redirects
    self._default_max_redirects = max_redirects

    # TLS defaults.
    self._default_verify = verify
    self._default_cert = cert

    # Parser options; a falsy value becomes an empty dict.
    self.selector_config = selector_config or {}

__slots__ class-attribute instance-attribute

__slots__ = (
    "_default_impersonate",
    "_stealth",
    "_default_proxies",
    "_default_proxy",
    "_default_proxy_auth",
    "_default_timeout",
    "_default_headers",
    "_default_retries",
    "_default_retry_delay",
    "_default_follow_redirects",
    "_default_max_redirects",
    "_default_verify",
    "_default_cert",
    "_default_http3",
    "selector_config",
    "_client",
    "_is_alive",
    "_proxy_rotator",
)

selector_config instance-attribute

selector_config = selector_config or {}

__enter__

__enter__()

Creates and returns a new synchronous Fetcher Session

Source code in scrapling/engines/static.py
def __enter__(self) -> _SyncSessionLogic:
    """Creates and returns a new synchronous Fetcher Session.

    :return: The value returned by entering the newly created `_SyncSessionLogic`.
    :raises RuntimeError: If this manager already holds a client (`_client` is not None).
    """
    if self._client is not None:
        raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

    # Each `_default_*` slot becomes the keyword argument of the same name without the prefix.
    config = {}
    for slot in self.__slots__:
        if slot.startswith("_default"):
            config[slot.replace("_default_", "")] = getattr(self, slot)
    # These three are stored under names that do not follow the `_default_` pattern.
    config.update(
        stealthy_headers=self._stealth,
        selector_config=self.selector_config,
        proxy_rotator=self._proxy_rotator,
    )

    self._client = _SyncSessionLogic(**config)
    try:
        session = self._client.__enter__()
    except Exception:
        # Drop the client that failed to start so a later entry is not rejected.
        self._client = None
        raise
    self._is_alive = True
    return session

__exit__

__exit__(exc_type, exc_val, exc_tb)
Source code in scrapling/engines/static.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Exit the active synchronous session and reset this manager's state.

    The exception details are forwarded to the underlying session's `__exit__`.
    This method always returns None, so it never suppresses an exception raised
    inside the `with` block.

    :param exc_type: Exception type raised inside the `with` block, or None.
    :param exc_val: Exception instance raised inside the `with` block, or None.
    :param exc_tb: Traceback of that exception, or None.
    :raises RuntimeError: If the manager does not currently hold a `_SyncSessionLogic` client.
    """
    # `isinstance` is already False for None, so no separate None check is needed.
    if not isinstance(self._client, _SyncSessionLogic):
        raise RuntimeError("Cannot exit invalid session")
    try:
        self._client.__exit__(exc_type, exc_val, exc_tb)
    finally:
        # Reset even when the inner exit raises; otherwise `_client` would stay set
        # and `__enter__` would reject this manager from then on.
        self._client = None
        self._is_alive = False

__aenter__ async

__aenter__()

Creates and returns a new asynchronous Session.

Source code in scrapling/engines/static.py
async def __aenter__(self) -> _ASyncSessionLogic:
    """Creates and returns a new asynchronous Session.

    :return: The value returned by entering the newly created `_ASyncSessionLogic`.
    :raises RuntimeError: If this manager already holds a client (`_client` is not None).
    """
    if self._client is not None:
        raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

    # Each `_default_*` slot becomes the keyword argument of the same name without the prefix.
    config = {}
    for slot in self.__slots__:
        if slot.startswith("_default"):
            config[slot.replace("_default_", "")] = getattr(self, slot)
    # These three are stored under names that do not follow the `_default_` pattern.
    config.update(
        stealthy_headers=self._stealth,
        selector_config=self.selector_config,
        proxy_rotator=self._proxy_rotator,
    )

    self._client = _ASyncSessionLogic(**config)
    try:
        session = await self._client.__aenter__()
    except Exception:
        # Drop the client that failed to start so a later entry is not rejected.
        self._client = None
        raise
    self._is_alive = True
    return session

__aexit__ async

__aexit__(exc_type, exc_val, exc_tb)
Source code in scrapling/engines/static.py
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Exit the active asynchronous session and reset this manager's state.

    The exception details are forwarded to the underlying session's `__aexit__`.
    This method always returns None, so it never suppresses an exception raised
    inside the `async with` block.

    :param exc_type: Exception type raised inside the `async with` block, or None.
    :param exc_val: Exception instance raised inside the `async with` block, or None.
    :param exc_tb: Traceback of that exception, or None.
    :raises RuntimeError: If the manager does not currently hold an `_ASyncSessionLogic` client.
    """
    # `isinstance` is already False for None, so no separate None check is needed.
    if not isinstance(self._client, _ASyncSessionLogic):
        raise RuntimeError("Cannot exit invalid session")
    try:
        await self._client.__aexit__(exc_type, exc_val, exc_tb)
    finally:
        # Reset even when the inner exit raises; otherwise `_client` would stay set
        # and `__aenter__` would reject this manager from then on.
        self._client = None
        self._is_alive = False

Stealth Sessions

scrapling.fetchers.StealthySession

StealthySession(**kwargs)

Bases: SyncSession, StealthySessionMixin


              flowchart TD
              scrapling.fetchers.StealthySession[StealthySession]
              scrapling.engines._browsers._base.SyncSession[SyncSession]
              scrapling.engines._browsers._base.StealthySessionMixin[StealthySessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.SyncSession --> scrapling.fetchers.StealthySession
                
                scrapling.engines._browsers._base.StealthySessionMixin --> scrapling.fetchers.StealthySession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.StealthySessionMixin
                



              click scrapling.fetchers.StealthySession href "" "scrapling.fetchers.StealthySession"
              click scrapling.engines._browsers._base.SyncSession href "" "scrapling.engines._browsers._base.SyncSession"
              click scrapling.engines._browsers._base.StealthySessionMixin href "" "scrapling.engines._browsers._base.StealthySessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"
            

A Stealthy Browser session manager with page pooling.

A Browser session manager with page pooling. It uses a persistent browser Context by default with a temporary user profile directory.

PARAMETER DESCRIPTION
headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation for all pages in this session.

locale

Specify user locale, for example, en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.

timezone_id

Changes the timezone of the browser. Defaults to the system timezone.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

solve_cloudflare

Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

hide_canvas

Add random noise to canvas operations to prevent fingerprinting.

block_webrtc

Forces WebRTC to respect proxy settings to prevent local IP address leak.

allow_webgl

Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

proxy

The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.

user_data_dir

Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_stealth.py
def __init__(self, **kwargs: Unpack[StealthSession]):
    """A Browser session manager with page pooling. It uses a persistent browser Context by default with a temporary user profile directory.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # Validation runs first: `__validate__` stores the validated config on `self._config`
    # and fills in the context options before the base initializer is called.
    self.__validate__(**kwargs)
    super().__init__()

max_pages instance-attribute

max_pages = max_pages

page_pool instance-attribute

page_pool = PagePool(max_pages)

playwright instance-attribute

playwright = None

context instance-attribute

context = None

browser instance-attribute

browser = None

__slots__ class-attribute instance-attribute

__slots__ = (
    "_config",
    "_context_options",
    "_browser_options",
    "_user_data_dir",
    "_headers_keys",
    "max_pages",
    "page_pool",
    "_max_wait_for_page",
    "playwright",
    "context",
)

__validate_routine__

__validate_routine__(params, model)
Source code in scrapling/engines/_browsers/_base.py
def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dictionaries, validate ``params`` against ``model``, and return the config.

    Side effects on ``self``: ``_context_options`` and ``_browser_options`` are re-created with their
    baseline values, and ``_headers_keys`` is set to the lower-cased names of the configured extra
    headers (an empty set when there are none).

    :param params: Raw session parameters. A ``"__max_pages"`` entry, if present, is renamed in place
        to ``"max_pages"`` before validation.
    :param model: The config class passed to ``validate``.
    :return: The validated config object.
    """
    # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
    self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
    self._browser_options: Dict[str, Any] = {"args": DEFAULT_ARGS, "ignore_default_args": HARMFUL_ARGS}

    if "__max_pages" in params:
        params["max_pages"] = params.pop("__max_pages")

    validated = validate(params, model=model)

    if validated.extra_headers:
        self._headers_keys = {name.lower() for name in validated.extra_headers.keys()}
    else:
        self._headers_keys = set()

    return validated

__generate_options__

__generate_options__(extra_flags=None)
Source code in scrapling/engines/_browsers/_base.py
def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif not config.useragent and config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if not config.cdp_url:
        flags = self._browser_options["args"]
        if config.extra_flags or extra_flags:
            flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))

        if config.dns_over_https:
            doh_flag = "--dns-over-https-templates=https://cloudflare-dns.com/dns-query"
            if isinstance(flags, list):
                flags.append(doh_flag)
            else:
                flags = list(flags) + [doh_flag]

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir
    else:
        self._browser_options = {}

    if config.additional_args:
        self._context_options.update(config.additional_args)

__validate__

__validate__(**params)
Source code in scrapling/engines/_browsers/_base.py
def __validate__(self, **params):
    """Validate the session parameters as a ``StealthConfig`` and prepare the stealth options.

    The validated config is stored on ``self._config``, a fixed set of context options is merged
    into ``self._context_options``, and finally the stealth option generator is invoked.
    """
    self._config = self.__validate_routine__(params, model=StealthConfig)

    fixed_context_options = dict(
        is_mobile=False,
        has_touch=False,
        # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
        service_workers="allow",
        ignore_https_errors=True,
        screen={"width": 1920, "height": 1080},
        viewport={"width": 1920, "height": 1080},
        permissions=["geolocation", "notifications"],
    )
    self._context_options.update(fixed_context_options)

    self.__generate_stealth_options()

close

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py
def close(self):  # pragma: no cover
    """Close all resources

    Does nothing when the session is not alive. Otherwise every resource that is currently set is
    shut down and its attribute reset to ``None``, and the session is flagged as no longer alive.
    """
    if not self._is_alive:
        return

    # Shutdown order matters: context first, then the browser, then the Playwright driver.
    teardown_plan = (("context", "close"), ("browser", "close"), ("playwright", "stop"))
    for attr_name, method_name in teardown_plan:
        resource = getattr(self, attr_name)
        if resource:
            getattr(resource, method_name)()
            setattr(self, attr_name, None)

    self._is_alive = False

__enter__

__enter__()
Source code in scrapling/engines/_browsers/_base.py
def __enter__(self):
    """Start the session on entering a ``with`` block and return the session itself."""
    self.start()
    return self

__exit__

__exit__(exc_type, exc_val, exc_tb)
Source code in scrapling/engines/_browsers/_base.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the session on leaving a ``with`` block.

    The exception details are not inspected and nothing is returned, so an exception raised inside
    the block keeps propagating after the session is closed.
    """
    self.close()

get_pool_stats

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py
def get_pool_stats(self) -> Dict[str, int]:
    """Report the page pool's usage counters.

    :return: A dictionary with ``total_pages`` and ``busy_pages`` read from the page pool, and
        ``max_pages`` read from this session.
    """
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )

start

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_stealth.py
def start(self) -> None:
    """Create a browser for this instance and context.

    Three setups are possible, chosen from ``self._config``:

    * ``cdp_url`` set: connect to that endpoint over CDP, and create a new context unless a proxy
      rotator is configured.
    * proxy rotator set (no ``cdp_url``): only launch the browser with the browser options.
    * otherwise: launch a persistent context from the browser options, the context options, and the
      user data directory.

    Any context that exists afterwards is passed through ``_initialize_context``. If setup fails,
    Playwright is stopped again and the exception is re-raised.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = sync_playwright().start()

    try:
        settings = self._config

        if settings.cdp_url:  # pragma: no cover
            self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=settings.cdp_url)
            if not settings.proxy_rotator:
                assert self.browser is not None
                self.context = self.browser.new_context(**self._context_options)
        elif settings.proxy_rotator:
            self.browser = self.playwright.chromium.launch(**self._browser_options)
        else:
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = self.playwright.chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = self._initialize_context(settings, self.context)

        self._is_alive = True
    except Exception:
        # Clean up playwright if browser setup fails
        self.playwright.stop()
        self.playwright = None
        raise

fetch

fetch(url, **kwargs)

Opens up the browser and performs your request based on your chosen options.

PARAMETER DESCRIPTION
url

The Target url.

TYPE: str

google_search

Enabled by default, Scrapling will set a Google referer header.

timeout

The timeout, in milliseconds, that is used in all operations and waits through the page. The default is 30,000 ms.

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

wait_selector

Wait for a specific CSS selector to be in a specific state.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

solve_cloudflare

Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

proxy

Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/engines/_browsers/_stealth.py
def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
    """Opens up the browser and performs your request based on your chosen options.

    The request is attempted up to ``self._config.retries`` times. A failed attempt is logged and retried
    after ``self._config.retry_delay``; the exception from the last attempt is re-raised.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, or if navigation yields no response on the final attempt.
    """
    # The per-call proxy is removed from kwargs before validation and handled separately in the retry loop.
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, StealthConfig)
    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # The Google referer is only passed to `goto` when `google_search` is on AND the extra headers do not
    # already contain a referer (compared case-insensitively).
    # NOTE(review): this looks like the opposite of the priority described for `extra_headers` in the
    # docstring above — confirm which one is intended.
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        # A static per-call proxy wins over the rotator; with neither, `proxy` stays None.
        proxy: Optional[ProxyType] = None
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # Containers handed to the response handler; `final_response[0]` and `xhr_captured` are read
            # back below when the `Response` object is built.
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            # A failing user-supplied `page_setup` callback is only logged; navigation still proceeds.
            if params.page_setup:
                try:
                    params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = page.goto(url, referer=referer)
                self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                if params.solve_cloudflare:
                    self._cloudflare_solver(page)
                    # Make sure the page is fully loaded after the captcha
                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                # A failing user-supplied `page_action` callback is only logged.
                if params.page_action:
                    try:
                        _ = params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error executing page_action: {e}")

                # A failure while waiting for the selector is only logged; the response is still built.
                if params.wait_selector:
                    try:
                        waiter: Locator = page.locator(params.wait_selector)
                        waiter.first.wait_for(state=params.wait_selector_state)
                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                page.wait_for_timeout(params.wait)

                response = ResponseFactory.from_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                page_info.mark_error()
                # Every attempt but the last one is logged and retried after a delay; the last re-raises.
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    time_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Only reached when the loop body never ran (for example, when `retries` is zero).
    raise RuntimeError("Request failed")  # pragma: no cover

scrapling.fetchers.AsyncStealthySession

AsyncStealthySession(**kwargs)

Bases: AsyncSession, StealthySessionMixin


              flowchart TD
              scrapling.fetchers.AsyncStealthySession[AsyncStealthySession]
              scrapling.engines._browsers._base.AsyncSession[AsyncSession]
              scrapling.engines._browsers._base.StealthySessionMixin[StealthySessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.AsyncSession --> scrapling.fetchers.AsyncStealthySession
                
                scrapling.engines._browsers._base.StealthySessionMixin --> scrapling.fetchers.AsyncStealthySession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.StealthySessionMixin
                



              click scrapling.fetchers.AsyncStealthySession href "" "scrapling.fetchers.AsyncStealthySession"
              click scrapling.engines._browsers._base.AsyncSession href "" "scrapling.engines._browsers._base.AsyncSession"
              click scrapling.engines._browsers._base.StealthySessionMixin href "" "scrapling.engines._browsers._base.StealthySessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"
            

An async Stealthy Browser session manager with page pooling.

A Browser session manager with page pooling. It uses a persistent browser Context by default, with a temporary user profile directory.

PARAMETER DESCRIPTION
headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation for all pages in this session.

locale

Specify user locale, for example, en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.

timezone_id

Changes the timezone of the browser. Defaults to the system timezone.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

solve_cloudflare

Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

hide_canvas

Add random noise to canvas operations to prevent fingerprinting.

block_webrtc

Forces WebRTC to respect proxy settings to prevent local IP address leak.

allow_webgl

Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

proxy

The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.

user_data_dir

Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_stealth.py
def __init__(self, **kwargs: Unpack[StealthSession]):
    """A Browser session manager with page pooling. It uses a persistent browser Context by default with a temporary user profile directory.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # Validation must run first: it stores the validated config on `self._config`,
    # which the base initializer call below reads `max_pages` from.
    self.__validate__(**kwargs)
    super().__init__(max_pages=self._config.max_pages)

max_pages instance-attribute

max_pages = max_pages

page_pool instance-attribute

page_pool = PagePool(max_pages)

playwright instance-attribute

playwright = None

context instance-attribute

context = None

browser instance-attribute

browser = None

__slots__ class-attribute instance-attribute

__slots__ = (
    "_config",
    "_context_options",
    "_browser_options",
    "_user_data_dir",
    "_headers_keys",
)

__validate_routine__

__validate_routine__(params, model)
Source code in scrapling/engines/_browsers/_base.py
def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dictionaries, validate ``params`` against ``model``, and return the config.

    Side effects on ``self``: ``_context_options`` and ``_browser_options`` are re-created with their
    baseline values, and ``_headers_keys`` is set to the lower-cased names of the configured extra
    headers (an empty set when there are none).

    :param params: Raw session parameters. A ``"__max_pages"`` entry, if present, is renamed in place
        to ``"max_pages"`` before validation.
    :param model: The config class passed to ``validate``.
    :return: The validated config object.
    """
    # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
    self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
    self._browser_options: Dict[str, Any] = {"args": DEFAULT_ARGS, "ignore_default_args": HARMFUL_ARGS}

    if "__max_pages" in params:
        params["max_pages"] = params.pop("__max_pages")

    validated = validate(params, model=model)

    if validated.extra_headers:
        self._headers_keys = {name.lower() for name in validated.extra_headers.keys()}
    else:
        self._headers_keys = set()

    return validated

__generate_options__

__generate_options__(extra_flags=None)
Source code in scrapling/engines/_browsers/_base.py
def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif not config.useragent and config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if not config.cdp_url:
        flags = self._browser_options["args"]
        if config.extra_flags or extra_flags:
            flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))

        if config.dns_over_https:
            doh_flag = "--dns-over-https-templates=https://cloudflare-dns.com/dns-query"
            if isinstance(flags, list):
                flags.append(doh_flag)
            else:
                flags = list(flags) + [doh_flag]

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir
    else:
        self._browser_options = {}

    if config.additional_args:
        self._context_options.update(config.additional_args)

__validate__

__validate__(**params)
Source code in scrapling/engines/_browsers/_base.py
def __validate__(self, **params):
    """Validate ``params`` against ``StealthConfig`` and prepare the stealth options.

    The validated config is stored on ``self._config``, a fixed set of context
    options is merged into ``self._context_options``, and the stealth-specific
    option generation is then triggered.
    """
    self._config = self.__validate_routine__(params, model=StealthConfig)

    full_hd = dict(width=1920, height=1080)
    fixed_context_options = dict(
        is_mobile=False,
        has_touch=False,
        # Disabling this would avoid the Service Workers' headaches, but it stays enabled for now
        service_workers="allow",
        ignore_https_errors=True,
        screen=dict(full_hd),
        viewport=dict(full_hd),
        permissions=["geolocation", "notifications"],
    )
    self._context_options.update(fixed_context_options)
    self.__generate_stealth_options()

close async

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py
async def close(self):
    """Release the context, the browser, and Playwright, in that order.

    Returns immediately when the session is not alive. Every handle that was
    released is reset to ``None`` and the session is marked as not alive.
    """
    if not self._is_alive:  # pragma: no cover
        return

    active_context = self.context
    if active_context:
        await active_context.close()
        self.context = None  # pyright: ignore

    active_browser = self.browser
    if active_browser:
        await active_browser.close()
        self.browser = None

    driver = self.playwright
    if driver:
        await driver.stop()
        self.playwright = None  # pyright: ignore

    self._is_alive = False

__aenter__ async

__aenter__()
Source code in scrapling/engines/_browsers/_base.py
async def __aenter__(self):
    """Start the session when entering ``async with`` and return the session itself."""
    await self.start()
    return self

__aexit__ async

__aexit__(exc_type, exc_val, exc_tb)
Source code in scrapling/engines/_browsers/_base.py
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the session when leaving ``async with``; the exception details are not inspected."""
    await self.close()

get_pool_stats

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py
def get_pool_stats(self) -> Dict[str, int]:
    """Return the page pool's total page count, busy page count, and the configured maximum."""
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )

start async

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_stealth.py
async def start(self) -> None:
    """Start Playwright and prepare the browser and/or context for this session.

    With a ``cdp_url`` the session connects to an existing browser and, unless a
    proxy rotator is configured, opens a context on it. With a proxy rotator and
    no ``cdp_url`` only the browser is launched. Otherwise a persistent context is
    launched from the combined browser and context options. Any context created
    here is passed through ``_initialize_context``.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = await async_playwright().start()
    try:
        config = self._config
        if config.cdp_url:
            self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=config.cdp_url)
            if not config.proxy_rotator:
                assert self.browser is not None
                self.context = await self.browser.new_context(**self._context_options)
        elif config.proxy_rotator:
            self.browser = await self.playwright.chromium.launch(**self._browser_options)
        else:
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = await self.playwright.chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = await self._initialize_context(self._config, self.context)

        self._is_alive = True
    except Exception:
        # Browser setup failed: stop Playwright before propagating the error
        await self.playwright.stop()
        self.playwright = None
        raise

fetch async

fetch(url, **kwargs)

Opens the browser and performs your request based on your chosen options.

PARAMETER DESCRIPTION
url

The Target url.

TYPE: str

google_search

Enabled by default, Scrapling will set a Google referer header.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

extra_headers

A dictionary of extra headers to add to the request. If it includes a referer header, the Google referer from google_search is not set for the navigation.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

wait_selector

Wait for a specific CSS selector to be in a specific state.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

solve_cloudflare

Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

proxy

Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/engines/_browsers/_stealth.py
async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
    """Open a page in the session's browser and perform the request based on the chosen options.

    The request is attempted up to ``self._config.retries`` times; between failed attempts the
    method sleeps for ``self._config.retry_delay``.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param extra_headers: A dictionary of extra headers to add to the request. _If it includes a `referer` header, the Google referer from `google_search` is not set for the navigation._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, if navigation returns no response on the
        last attempt, or if no attempt is made at all.
    """
    # The per-request proxy is taken out before validation and handled separately below.
    static_proxy = kwargs.pop("proxy", None)

    # NOTE(review): presumably combines the per-request kwargs with the session config — confirm in `_validate`.
    params = _validate(kwargs, self, StealthConfig)

    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # Header names are lower-cased so the referer lookup below is case-insensitive.
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    # The Google referer is used only when `google_search` is on and no referer header was supplied.
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        # A static proxy always wins; the rotator is consulted on every attempt otherwise.
        proxy: Optional[ProxyType] = None
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        async with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # Mutable containers handed to the response handler; their contents are read back
            # when the final Response is built.
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            # Runs before navigation; a failure here is logged and does not abort the request.
            if params.page_setup:
                try:
                    await params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = await page.goto(url, referer=referer)
                await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                # A missing navigation response is treated as a failed attempt (handled by the except below).
                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                if params.solve_cloudflare:
                    await self._cloudflare_solver(page)
                    # Make sure the page is fully loaded after the captcha
                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                # Runs after navigation; a failure here is logged and does not abort the request.
                if params.page_action:
                    try:
                        _ = await params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error executing page_action: {e}")

                # A selector that never reaches the requested state is logged, not raised.
                if params.wait_selector:
                    try:
                        waiter: AsyncLocator = page.locator(params.wait_selector)
                        await waiter.first.wait_for(state=params.wait_selector_state)
                        await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                # Final user-requested pause before the page content is captured.
                await page.wait_for_timeout(params.wait)

                response = await ResponseFactory.from_async_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                # Flag the page as failed, then either retry after a delay or re-raise on the last attempt.
                page_info.mark_error()
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    await asyncio_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Only reached when the loop body never ran (e.g. `retries` is zero).
    raise RuntimeError("Request failed")  # pragma: no cover

Dynamic Sessions

scrapling.fetchers.DynamicSession

DynamicSession(**kwargs)

Bases: SyncSession, DynamicSessionMixin


              flowchart TD
              scrapling.fetchers.DynamicSession[DynamicSession]
              scrapling.engines._browsers._base.SyncSession[SyncSession]
              scrapling.engines._browsers._base.DynamicSessionMixin[DynamicSessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.SyncSession --> scrapling.fetchers.DynamicSession
                
                scrapling.engines._browsers._base.DynamicSessionMixin --> scrapling.fetchers.DynamicSession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.DynamicSessionMixin
                



              click scrapling.fetchers.DynamicSession href "" "scrapling.fetchers.DynamicSession"
              click scrapling.engines._browsers._base.SyncSession href "" "scrapling.engines._browsers._base.SyncSession"
              click scrapling.engines._browsers._base.DynamicSessionMixin href "" "scrapling.engines._browsers._base.DynamicSessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"
            

A Browser session manager with page pooling.

A browser session manager with page pooling. By default, it uses a persistent browser context with a temporary user profile directory.

PARAMETER DESCRIPTION
headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation for all pages in this session.

locale

Specify user locale, for example, en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.

timezone_id

Changes the timezone of the browser. Defaults to the system timezone.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

proxy

The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.

user_data_dir

Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_controllers.py
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
    """A browser session manager with page pooling. By default, it uses a persistent browser context with a temporary user profile directory.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # The keyword arguments are validated (and the session options derived from them)
    # before the base-class initializer runs.
    self.__validate__(**kwargs)
    super().__init__()

max_pages instance-attribute

max_pages = max_pages

page_pool instance-attribute

page_pool = PagePool(max_pages)

playwright instance-attribute

playwright = None

context instance-attribute

context = None

browser instance-attribute

browser = None

__slots__ class-attribute instance-attribute

__slots__ = (
    "_config",
    "_context_options",
    "_browser_options",
    "_user_data_dir",
    "_headers_keys",
    "max_pages",
    "page_pool",
    "_max_wait_for_page",
    "playwright",
    "context",
)

__validate_routine__

__validate_routine__(params, model)
Source code in scrapling/engines/_browsers/_base.py
def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dictionaries and validate ``params`` against ``model``.

    ``self._context_options`` and ``self._browser_options`` are re-created with
    their starting values, a ``"__max_pages"`` entry in ``params`` is renamed to
    ``"max_pages"``, and the lower-cased names of the configured extra headers are
    stored in ``self._headers_keys``.

    :param params: The raw keyword arguments; modified in place when ``"__max_pages"`` is present.
    :param model: The config model to validate against.
    :return: The validated config object.
    """
    # A dark color scheme gets past the 'prefersLightColor' check in creepjs
    self._context_options: Dict[str, Any] = dict(color_scheme="dark", device_scale_factor=2)
    self._browser_options: Dict[str, Any] = dict(
        args=DEFAULT_ARGS,
        ignore_default_args=HARMFUL_ARGS,
    )

    prefixed_key = "__max_pages"
    if prefixed_key in params:
        params["max_pages"] = params.pop(prefixed_key)

    validated = validate(params, model=model)

    headers = validated.extra_headers
    self._headers_keys = {name.lower() for name in headers.keys()} if headers else set()

    return validated

__generate_options__

__generate_options__(extra_flags=None)
Source code in scrapling/engines/_browsers/_base.py
def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif not config.useragent and config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if not config.cdp_url:
        flags = self._browser_options["args"]
        if config.extra_flags or extra_flags:
            flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))

        if config.dns_over_https:
            doh_flag = "--dns-over-https-templates=https://cloudflare-dns.com/dns-query"
            if isinstance(flags, list):
                flags.append(doh_flag)
            else:
                flags = list(flags) + [doh_flag]

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir
    else:
        self._browser_options = {}

    if config.additional_args:
        self._context_options.update(config.additional_args)

__validate__

__validate__(**params)
Source code in scrapling/engines/_browsers/_base.py
def __validate__(self, **params):
    """Validate ``params`` against ``PlaywrightConfig``, store the result on ``self._config``, and build the session options."""
    self._config = self.__validate_routine__(params, model=PlaywrightConfig)
    self.__generate_options__()

close

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py
def close(self):  # pragma: no cover
    """Release the context, the browser, and Playwright, in that order.

    Returns immediately when the session is not alive. Every handle that was
    released is reset to ``None`` and the session is marked as not alive.
    """
    if not self._is_alive:
        return

    active_context = self.context
    if active_context:
        active_context.close()
        self.context = None

    active_browser = self.browser
    if active_browser:
        active_browser.close()
        self.browser = None

    driver = self.playwright
    if driver:
        driver.stop()
        self.playwright = None  # pyright: ignore

    self._is_alive = False

__enter__

__enter__()
Source code in scrapling/engines/_browsers/_base.py
def __enter__(self):
    """Start the session when entering a ``with`` block and return the session itself."""
    self.start()
    return self

__exit__

__exit__(exc_type, exc_val, exc_tb)
Source code in scrapling/engines/_browsers/_base.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the session when leaving a ``with`` block; the exception details are not inspected."""
    self.close()

get_pool_stats

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py
def get_pool_stats(self) -> Dict[str, int]:
    """Return the page pool's total page count, busy page count, and the configured maximum."""
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )

start

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_controllers.py
def start(self):
    """Start Playwright and prepare the browser and/or context for this session.

    With a ``cdp_url`` the session connects to an existing browser and, unless a
    proxy rotator is configured, opens a context on it. With a proxy rotator and
    no ``cdp_url`` only the browser is launched. Otherwise a persistent context is
    launched from the combined browser and context options. Any context created
    here is passed through ``_initialize_context``.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = sync_playwright().start()

    try:
        config = self._config
        if config.cdp_url:  # pragma: no cover
            self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=config.cdp_url)
            if not config.proxy_rotator and self.browser:
                self.context = self.browser.new_context(**self._context_options)
        elif config.proxy_rotator:
            self.browser = self.playwright.chromium.launch(**self._browser_options)
        else:
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = self.playwright.chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = self._initialize_context(self._config, self.context)

        self._is_alive = True
    except Exception:
        # Browser setup failed: stop Playwright before propagating the error
        self.playwright.stop()
        self.playwright = None
        raise

fetch

fetch(url, **kwargs)

Opens the browser and performs your request based on your chosen options.

PARAMETER DESCRIPTION
url

The Target url.

TYPE: str

google_search

Enabled by default, Scrapling will set a Google referer header.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

wait_selector

Wait for a specific CSS selector to be in a specific state.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

proxy

Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/engines/_browsers/_controllers.py
def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
    """Open a page in the browser, navigate to `url`, and build a `Response` from the result.

    Every attempt takes a page from `self._page_generator`. If an attempt raises, the page is marked as errored and the
    request is retried until `self._config.retries` attempts have been made, pausing for `self._config.retry_delay` between them.
    Errors raised by `page_setup`, `page_action`, or the `wait_selector` wait are only logged; they do not trigger a retry.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header (`https://www.google.com/`).
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `referer` header (matched case-insensitively),
        the Google referer from `google_search` is not applied._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, or if navigation yields no response on the last attempt.
    """
    # `proxy` is removed before validation, so it is not part of the validated parameters
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, PlaywrightConfig)
    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # Header names are compared in lower case so that any casing of "Referer" is detected
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    # The Google referer is only used when the caller did not provide a referer header of their own
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        proxy: Optional[ProxyType] = None
        # An explicit per-request proxy wins; otherwise the rotator (if configured) supplies one for this attempt
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # One-element list handed to the response handler, whose first item is read back after navigation
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            # The handler is registered before `page_setup` and before navigation
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            if params.page_setup:
                try:
                    params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    # A failing setup hook is logged and navigation still goes ahead
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = page.goto(url, referer=referer)
                self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                if params.page_action:
                    try:
                        _ = params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        # A failing automation hook is logged; the response is still built
                        log.error(f"Error executing page_action: {e}")

                if params.wait_selector:
                    try:
                        waiter: Locator = page.locator(params.wait_selector)
                        waiter.first.wait_for(state=params.wait_selector_state)
                        # Wait for stability again once the selector has reached the requested state
                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                page.wait_for_timeout(params.wait)

                response = ResponseFactory.from_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                page_info.mark_error()
                # Retry while attempts remain; on the last attempt the error is logged and re-raised
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    time_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Reached only when the loop body never ran (e.g. `retries` is 0)
    raise RuntimeError("Request failed")  # pragma: no cover

scrapling.fetchers.AsyncDynamicSession

AsyncDynamicSession(**kwargs)

Bases: AsyncSession, DynamicSessionMixin


              flowchart TD
              scrapling.fetchers.AsyncDynamicSession[AsyncDynamicSession]
              scrapling.engines._browsers._base.AsyncSession[AsyncSession]
              scrapling.engines._browsers._base.DynamicSessionMixin[DynamicSessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.AsyncSession --> scrapling.fetchers.AsyncDynamicSession
                
                scrapling.engines._browsers._base.DynamicSessionMixin --> scrapling.fetchers.AsyncDynamicSession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.DynamicSessionMixin
                



              click scrapling.fetchers.AsyncDynamicSession href "" "scrapling.fetchers.AsyncDynamicSession"
              click scrapling.engines._browsers._base.AsyncSession href "" "scrapling.engines._browsers._base.AsyncSession"
              click scrapling.engines._browsers._base.DynamicSessionMixin href "" "scrapling.engines._browsers._base.DynamicSessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"
            

An async Browser session manager with page pooling. By default, it uses a persistent browser Context with a temporary user profile directory.

A Browser session manager with page pooling

PARAMETER DESCRIPTION
headless

Run the browser in headless/hidden (default), or headful/visible mode.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

useragent

Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.

cookies

Set cookies for the next request.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

wait_selector

Wait for a specific CSS selector to be in a specific state.

init_script

An absolute path to a JavaScript file to be executed on page creation for all pages in this session.

locale

Specify user locale, for example, en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.

timezone_id

Changes the timezone of the browser. Defaults to the system timezone.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

real_chrome

If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.

cdp_url

Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.

google_search

Enabled by default, Scrapling will set a Google referer header.

extra_headers

A dictionary of extra headers to add to the request. The referer set by google_search takes priority over the referer set here if used together.

proxy

The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.

max_pages

The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.

user_data_dir

Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.

extra_flags

A list of additional browser flags to pass to the browser on launch.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

additional_args

Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_controllers.py
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
    """A Browser session manager with page pooling

    The keyword arguments are validated through `__validate__`, which stores the result in `self._config`; the parent
    initializer is then called with the validated `max_pages`.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # NOTE(review): the `extra_headers` sentence about referer priority looks stale. `fetch` skips the Google
    # referer when the request's extra headers already contain a `referer` key; confirm the intended priority.
    # Validation must run first: it sets `self._config`, which the next line reads
    self.__validate__(**kwargs)
    super().__init__(max_pages=self._config.max_pages)

max_pages instance-attribute

max_pages = max_pages

page_pool instance-attribute

page_pool = PagePool(max_pages)

playwright instance-attribute

playwright = None

context instance-attribute

context = None

browser instance-attribute

browser = None

__slots__ class-attribute instance-attribute

__slots__ = (
    "_config",
    "_context_options",
    "_browser_options",
    "_user_data_dir",
    "_headers_keys",
)

__validate_routine__

__validate_routine__(params, model)
Source code in scrapling/engines/_browsers/_base.py
def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dicts, validate `params` against `model`, and cache the lower-cased header names.

    :param params: Raw session arguments. A `"__max_pages"` key, if present, is moved to `"max_pages"` in place.
    :param model: The configuration model that `validate` checks the arguments against.
    :return: The configuration object produced by `validate`.
    """
    self._browser_options: Dict[str, Any] = dict(args=DEFAULT_ARGS, ignore_default_args=HARMFUL_ARGS)
    # A dark color scheme gets past the 'prefersLightColor' check in creepjs
    self._context_options: Dict[str, Any] = dict(color_scheme="dark", device_scale_factor=2)

    legacy_key = "__max_pages"
    if legacy_key in params:
        params["max_pages"] = params.pop(legacy_key)

    validated = validate(params, model=model)

    # Stored in lower case so later lookups do not depend on header casing
    headers = validated.extra_headers
    self._headers_keys = {name.lower() for name in headers.keys()} if headers else set()
    return validated

__generate_options__

__generate_options__(extra_flags=None)
Source code in scrapling/engines/_browsers/_base.py
def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif not config.useragent and config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if not config.cdp_url:
        flags = self._browser_options["args"]
        if config.extra_flags or extra_flags:
            flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))

        if config.dns_over_https:
            doh_flag = "--dns-over-https-templates=https://cloudflare-dns.com/dns-query"
            if isinstance(flags, list):
                flags.append(doh_flag)
            else:
                flags = list(flags) + [doh_flag]

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir
    else:
        self._browser_options = {}

    if config.additional_args:
        self._context_options.update(config.additional_args)

__validate__

__validate__(**params)
Source code in scrapling/engines/_browsers/_base.py
def __validate__(self, **params):
    """Validate the session arguments against `PlaywrightConfig` and derive the browser/context options from them."""
    # The validated configuration must be stored first: `__generate_options__` reads `self._config`
    self._config = self.__validate_routine__(params, model=PlaywrightConfig)
    self.__generate_options__()

close async

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py
async def close(self):
    """Close all resources

    Closes the context, then the browser, then stops Playwright. Each step runs even if an earlier one raised, so a
    failing `context.close()` no longer leaves the browser or the Playwright driver open. The references are cleared
    and the session is marked as not alive in every case; the raised error still propagates to the caller.
    Does nothing when the session is not alive.
    """
    if not self._is_alive:  # pragma: no cover
        return

    try:
        if self.context:
            await self.context.close()
    finally:
        self.context = None  # pyright: ignore
        try:
            if self.browser:
                await self.browser.close()
        finally:
            self.browser = None
            try:
                if self.playwright:
                    await self.playwright.stop()
            finally:
                self.playwright = None  # pyright: ignore
                self._is_alive = False

__aenter__ async

__aenter__()
Source code in scrapling/engines/_browsers/_base.py
async def __aenter__(self):
    """Start the session when entering an `async with` block and return the session itself."""
    await self.start()
    return self

__aexit__ async

__aexit__(exc_type, exc_val, exc_tb)
Source code in scrapling/engines/_browsers/_base.py
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the session when leaving an `async with` block; the exception details are not inspected."""
    await self.close()

get_pool_stats

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py
def get_pool_stats(self) -> Dict[str, int]:
    """Report the pool's total page count, its busy page count, and the configured page limit."""
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )

start async

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_controllers.py
async def start(self) -> None:
    """Start Playwright and prepare the browser and/or context for this session.

    Three setups are possible:
    - With a CDP URL, it connects to the existing browser, and opens a context unless a proxy rotator is configured.
    - With a proxy rotator and no CDP URL, it only launches a browser.
    - Otherwise, it launches a persistent context using `self._user_data_dir`.

    If any step fails, Playwright is stopped again and the error is re-raised.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = await async_playwright().start()
    try:
        cfg = self._config
        chromium = self.playwright.chromium

        if cfg.cdp_url:
            self.browser = await chromium.connect_over_cdp(endpoint_url=cfg.cdp_url)
            if not cfg.proxy_rotator and self.browser:
                self.context = await self.browser.new_context(**self._context_options)
        elif cfg.proxy_rotator:
            self.browser = await chromium.launch(**self._browser_options)
        else:
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = await chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = await self._initialize_context(cfg, self.context)

        self._is_alive = True
    except Exception:
        # Do not leave a running Playwright driver behind when the setup fails
        await self.playwright.stop()
        self.playwright = None
        raise

fetch async

fetch(url, **kwargs)

Opens the browser and performs your request based on your chosen options.

PARAMETER DESCRIPTION
url

The Target url.

TYPE: str

google_search

Enabled by default, Scrapling will set a Google referer header.

timeout

The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000

wait

The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.

page_action

Added for automation. A function that takes the page object, runs after navigation, and does the automation you need.

page_setup

A function that takes the page object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.

extra_headers

A dictionary of extra headers to add to the request. If it contains a referer header (matched case-insensitively), that referer is kept and the Google referer from google_search is not applied.

disable_resources

Drop requests for unnecessary resources for a speed boost. Requests dropped are of type font, image, media, beacon, object, imageset, texttrack, websocket, csp_report, and stylesheet.

blocked_domains

A set of domain names to block requests to. Subdomains are also matched (e.g., "example.com" blocks "sub.example.com" too).

wait_selector

Wait for a specific CSS selector to be in a specific state.

wait_selector_state

The state to wait for the selector given with wait_selector. The default state is attached.

network_idle

Wait for the page until there are no network connections for at least 500 ms.

load_dom

Enabled by default, wait for all JavaScript on page(s) to fully load and execute.

selector_config

The arguments that will be passed in the end while creating the final Selector's class.

proxy

Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS DESCRIPTION
Response

A Response object.

Source code in scrapling/engines/_browsers/_controllers.py
async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
    """Open a page in the browser, navigate to `url`, and build a `Response` from the result.

    Every attempt takes a page from `self._page_generator`. If an attempt raises, the page is marked as errored and the
    request is retried until `self._config.retries` attempts have been made, pausing for `self._config.retry_delay` between them.
    Errors raised by `page_setup`, `page_action`, or the `wait_selector` wait are only logged; they do not trigger a retry.
    `page_setup` and `page_action` are awaited, so they must return awaitables.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header (`https://www.google.com/`).
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `referer` header (matched case-insensitively),
        the Google referer from `google_search` is not applied._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, or if navigation yields no response on the last attempt.
    """
    # `proxy` is removed before validation, so it is not part of the validated parameters
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, PlaywrightConfig)

    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # Header names are compared in lower case so that any casing of "Referer" is detected
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    # The Google referer is only used when the caller did not provide a referer header of their own
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        proxy: Optional[ProxyType] = None
        # An explicit per-request proxy wins; otherwise the rotator (if configured) supplies one for this attempt
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        async with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # One-element list handed to the response handler, whose first item is read back after navigation
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            # The handler is registered before `page_setup` and before navigation
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            if params.page_setup:
                try:
                    await params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    # A failing setup hook is logged and navigation still goes ahead
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = await page.goto(url, referer=referer)
                await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                if params.page_action:
                    try:
                        _ = await params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        # A failing automation hook is logged; the response is still built
                        log.error(f"Error executing page_action: {e}")

                if params.wait_selector:
                    try:
                        waiter: AsyncLocator = page.locator(params.wait_selector)
                        await waiter.first.wait_for(state=params.wait_selector_state)
                        # Wait for stability again once the selector has reached the requested state
                        await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                await page.wait_for_timeout(params.wait)

                response = await ResponseFactory.from_async_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                page_info.mark_error()
                # Retry while attempts remain; on the last attempt the error is logged and re-raised
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    await asyncio_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Reached only when the loop body never ran (e.g. `retries` is 0)
    raise RuntimeError("Request failed")  # pragma: no cover