Fetchers Classes¶

Here's the reference information for all fetcher-type classes' parameters, attributes, and methods.

You can import all of them directly like below:

from scrapling.fetchers import (
    Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher,
    FetcherSession, AsyncStealthySession, StealthySession, DynamicSession, AsyncDynamicSession
)

scrapling.fetchers.Fetcher ¶

Fetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.Fetcher[Fetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.Fetcher
                


              click scrapling.fetchers.Fetcher href "" "scrapling.fetchers.Fetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"

A basic Fetcher class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on curl_cffi.

Source code in scrapling/engines/toolbelt/custom.py

def __init__(self, *args, **kwargs):
    """Deprecated constructor kept for backward compatibility with versions before 0.2.99.

    Nothing is stored on the instance; the only action is a warning that points the
    caller to the equivalent ``configure(...)`` call.

    :param args: Legacy positional arguments; only echoed in the warning message.
    :param kwargs: Legacy keyword arguments; only echoed in the warning message.
    """
    # `str.join` raises TypeError on non-string items, so stringify each positional
    # argument first; otherwise e.g. `Fetcher(True)` crashes instead of warning.
    args_str = ", ".join(str(arg) for arg in args)
    kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
    if args_str:
        # Separator between the positional part and the keyword part of the suggestion.
        args_str += ", "

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
    )

slots `class-attribute` `instance-attribute` ¶

__slots__ = ()

huge_tree `class-attribute` `instance-attribute` ¶

huge_tree = True

adaptive `class-attribute` `instance-attribute` ¶

adaptive = False

storage `class-attribute` `instance-attribute` ¶

storage = SQLiteStorageSystem

keep_cdata `class-attribute` `instance-attribute` ¶

keep_cdata = False

storage_args `class-attribute` `instance-attribute` ¶

storage_args = None

keep_comments `class-attribute` `instance-attribute` ¶

keep_comments = False

adaptive_domain `class-attribute` `instance-attribute` ¶

adaptive_domain = ''

parser_keywords `class-attribute` `instance-attribute` ¶

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config `classmethod` ¶

display_config()

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def display_config(cls):
    """Return the parser-related class attributes as a dictionary keyed by attribute name."""
    # Order matters only for presentation; it mirrors the order callers already see.
    reported_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in reported_names}

configure `classmethod` ¶

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER	DESCRIPTION
`kwargs`	The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain DEFAULT: `{}`

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Every keyword is validated before any of them is applied, so a rejected call
    leaves the class configuration exactly as it was.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a keyword names an existing attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a keyword does not match any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    # First pass: normalize and validate only, so a failure cannot leave a half-applied config.
    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Existing attribute, but not one that is allowed to be configured.
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Second pass: apply in the order given by the caller.
    for key, value in validated:
        setattr(cls, key, value)

get `classmethod` ¶

get(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def get(cls, url: str, **kwargs: Unpack[GetRequestParams]) -> Response:
    """Delegate a GET request for ``url`` to ``__FetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    """
    send = __FetcherClientInstance__.get
    return send(url, **_merge_selector_config(cls, kwargs))

post `classmethod` ¶

post(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def post(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
    """Delegate a POST request for ``url`` to ``__FetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    """
    send = __FetcherClientInstance__.post
    return send(url, **_merge_selector_config(cls, kwargs))

put `classmethod` ¶

put(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def put(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
    """Delegate a PUT request for ``url`` to ``__FetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    """
    send = __FetcherClientInstance__.put
    return send(url, **_merge_selector_config(cls, kwargs))

delete `classmethod` ¶

delete(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def delete(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
    """Delegate a DELETE request for ``url`` to ``__FetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    """
    send = __FetcherClientInstance__.delete
    return send(url, **_merge_selector_config(cls, kwargs))

scrapling.fetchers.AsyncFetcher ¶

AsyncFetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.AsyncFetcher[AsyncFetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.AsyncFetcher
                


              click scrapling.fetchers.AsyncFetcher href "" "scrapling.fetchers.AsyncFetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"

A basic asynchronous Fetcher class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on curl_cffi; its request methods return awaitables.

Source code in scrapling/engines/toolbelt/custom.py

def __init__(self, *args, **kwargs):
    """Deprecated constructor kept for backward compatibility with versions before 0.2.99.

    Nothing is stored on the instance; the only action is a warning that points the
    caller to the equivalent ``configure(...)`` call.

    :param args: Legacy positional arguments; only echoed in the warning message.
    :param kwargs: Legacy keyword arguments; only echoed in the warning message.
    """
    # `str.join` raises TypeError on non-string items, so stringify each positional
    # argument first; otherwise e.g. `AsyncFetcher(True)` crashes instead of warning.
    args_str = ", ".join(str(arg) for arg in args)
    kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
    if args_str:
        # Separator between the positional part and the keyword part of the suggestion.
        args_str += ", "

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
    )

slots `class-attribute` `instance-attribute` ¶

__slots__ = ()

huge_tree `class-attribute` `instance-attribute` ¶

huge_tree = True

adaptive `class-attribute` `instance-attribute` ¶

adaptive = False

storage `class-attribute` `instance-attribute` ¶

storage = SQLiteStorageSystem

keep_cdata `class-attribute` `instance-attribute` ¶

keep_cdata = False

storage_args `class-attribute` `instance-attribute` ¶

storage_args = None

keep_comments `class-attribute` `instance-attribute` ¶

keep_comments = False

adaptive_domain `class-attribute` `instance-attribute` ¶

adaptive_domain = ''

parser_keywords `class-attribute` `instance-attribute` ¶

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config `classmethod` ¶

display_config()

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def display_config(cls):
    """Return the parser-related class attributes as a dictionary keyed by attribute name."""
    # Order matters only for presentation; it mirrors the order callers already see.
    reported_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in reported_names}

configure `classmethod` ¶

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER	DESCRIPTION
`kwargs`	The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain DEFAULT: `{}`

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Every keyword is validated before any of them is applied, so a rejected call
    leaves the class configuration exactly as it was.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a keyword names an existing attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a keyword does not match any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    # First pass: normalize and validate only, so a failure cannot leave a half-applied config.
    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Existing attribute, but not one that is allowed to be configured.
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Second pass: apply in the order given by the caller.
    for key, value in validated:
        setattr(cls, key, value)

get `classmethod` ¶

get(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def get(cls, url: str, **kwargs: Unpack[GetRequestParams]) -> Awaitable[Response]:
    """Delegate a GET request for ``url`` to ``__AsyncFetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    The client's return value is handed back as-is, without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.get
    return send(url, **_merge_selector_config(cls, kwargs))

post `classmethod` ¶

post(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def post(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
    """Delegate a POST request for ``url`` to ``__AsyncFetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    The client's return value is handed back as-is, without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.post
    return send(url, **_merge_selector_config(cls, kwargs))

put `classmethod` ¶

put(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def put(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
    """Delegate a PUT request for ``url`` to ``__AsyncFetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    The client's return value is handed back as-is, without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.put
    return send(url, **_merge_selector_config(cls, kwargs))

delete `classmethod` ¶

delete(url, **kwargs)

Source code in scrapling/fetchers/requests.py

@classmethod
def delete(cls, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
    """Delegate a DELETE request for ``url`` to ``__AsyncFetcherClientInstance__``.

    ``kwargs`` are passed through ``_merge_selector_config(cls, kwargs)`` before being forwarded.
    The client's return value is handed back as-is, without being awaited here.
    """
    send = __AsyncFetcherClientInstance__.delete
    return send(url, **_merge_selector_config(cls, kwargs))

scrapling.fetchers.DynamicFetcher ¶

DynamicFetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.DynamicFetcher[DynamicFetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.DynamicFetcher
                


              click scrapling.fetchers.DynamicFetcher href "" "scrapling.fetchers.DynamicFetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"

A Fetcher that provides many options to fetch/load websites' pages through Chromium-based browsers.

Source code in scrapling/engines/toolbelt/custom.py

def __init__(self, *args, **kwargs):
    """Deprecated constructor kept for backward compatibility with versions before 0.2.99.

    Nothing is stored on the instance; the only action is a warning that points the
    caller to the equivalent ``configure(...)`` call.

    :param args: Legacy positional arguments; only echoed in the warning message.
    :param kwargs: Legacy keyword arguments; only echoed in the warning message.
    """
    # `str.join` raises TypeError on non-string items, so stringify each positional
    # argument first; otherwise e.g. `DynamicFetcher(True)` crashes instead of warning.
    args_str = ", ".join(str(arg) for arg in args)
    kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
    if args_str:
        # Separator between the positional part and the keyword part of the suggestion.
        args_str += ", "

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
    )

slots `class-attribute` `instance-attribute` ¶

__slots__ = ()

huge_tree `class-attribute` `instance-attribute` ¶

huge_tree = True

adaptive `class-attribute` `instance-attribute` ¶

adaptive = False

storage `class-attribute` `instance-attribute` ¶

storage = SQLiteStorageSystem

keep_cdata `class-attribute` `instance-attribute` ¶

keep_cdata = False

storage_args `class-attribute` `instance-attribute` ¶

storage_args = None

keep_comments `class-attribute` `instance-attribute` ¶

keep_comments = False

adaptive_domain `class-attribute` `instance-attribute` ¶

adaptive_domain = ''

parser_keywords `class-attribute` `instance-attribute` ¶

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config `classmethod` ¶

display_config()

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def display_config(cls):
    """Return the parser-related class attributes as a dictionary keyed by attribute name."""
    # Order matters only for presentation; it mirrors the order callers already see.
    reported_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in reported_names}

configure `classmethod` ¶

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER	DESCRIPTION
`kwargs`	The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain DEFAULT: `{}`

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Every keyword is validated before any of them is applied, so a rejected call
    leaves the class configuration exactly as it was.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a keyword names an existing attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a keyword does not match any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    # First pass: normalize and validate only, so a failure cannot leave a half-applied config.
    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Existing attribute, but not one that is allowed to be configured.
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Second pass: apply in the order given by the caller.
    for key, value in validated:
        setattr(cls, key, value)

fetch `classmethod` ¶

fetch(url, **kwargs)

Opens up a browser and performs your request based on the options you choose below.

PARAMETER	DESCRIPTION
`url`	Target url. TYPE: `str`
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`block_ads`	Block requests to ~3,500 known ad/tracking domains. Can be combined with `blocked_domains`.
`dns_over_https`	Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation with this request.
`locale`	Set the locale for the browser if wanted. Defaults to the system default locale.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/fetchers/chrome.py

@classmethod
def fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
    """Open a browser session, fetch ``url`` with the chosen options, and return the response.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
    :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request.
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
    :return: A `Response` object.
    :raises TypeError: If the resolved ``selector_config`` value is not a dictionary.
    """
    parser_overrides = kwargs.get("selector_config", {})
    if not parser_overrides:
        # Checking `custom_config` for backward compatibility
        parser_overrides = kwargs.get("custom_config", {})

    if not isinstance(parser_overrides, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Caller-supplied entries win over the class-level parser arguments on key clashes.
    class_defaults = cls._generate_parser_arguments()
    kwargs["selector_config"] = {**class_defaults, **parser_overrides}

    with DynamicSession(**kwargs) as session:
        response = session.fetch(url)
        return response

async_fetch `async` `classmethod` ¶

async_fetch(url, **kwargs)

Opens up a browser and performs your request based on the options you choose below.

PARAMETER	DESCRIPTION
`url`	Target url. TYPE: `str`
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`block_ads`	Block requests to ~3,500 known ad/tracking domains. Can be combined with `blocked_domains`.
`dns_over_https`	Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation with this request.
`locale`	Set the locale for the browser if wanted. Defaults to the system default locale.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/fetchers/chrome.py

@classmethod
async def async_fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
    """Open an async browser session, fetch ``url`` with the chosen options, and return the response.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
    :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request.
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
    :return: A `Response` object.
    :raises TypeError: If the resolved ``selector_config`` value is not a dictionary.
    """
    parser_overrides = kwargs.get("selector_config", {})
    if not parser_overrides:
        # Checking `custom_config` for backward compatibility
        parser_overrides = kwargs.get("custom_config", {})

    if not isinstance(parser_overrides, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Caller-supplied entries win over the class-level parser arguments on key clashes.
    class_defaults = cls._generate_parser_arguments()
    kwargs["selector_config"] = {**class_defaults, **parser_overrides}

    async with AsyncDynamicSession(**kwargs) as session:
        response = await session.fetch(url)
        return response

scrapling.fetchers.StealthyFetcher ¶

StealthyFetcher(*args, **kwargs)

Bases: BaseFetcher


              flowchart TD
              scrapling.fetchers.StealthyFetcher[StealthyFetcher]
              scrapling.engines.toolbelt.custom.BaseFetcher[BaseFetcher]

                              scrapling.engines.toolbelt.custom.BaseFetcher --> scrapling.fetchers.StealthyFetcher
                


              click scrapling.fetchers.StealthyFetcher href "" "scrapling.fetchers.StealthyFetcher"
              click scrapling.engines.toolbelt.custom.BaseFetcher href "" "scrapling.engines.toolbelt.custom.BaseFetcher"

A Fetcher class type that is completely stealthy and built on top of Chromium.

It works as real browsers passing almost all online tests/protections with many customization options.

Source code in scrapling/engines/toolbelt/custom.py

def __init__(self, *args, **kwargs):
    """Deprecated constructor kept for backward compatibility with versions before 0.2.99.

    Nothing is stored on the instance; the only action is a warning that points the
    caller to the equivalent ``configure(...)`` call.

    :param args: Legacy positional arguments; only echoed in the warning message.
    :param kwargs: Legacy keyword arguments; only echoed in the warning message.
    """
    # `str.join` raises TypeError on non-string items, so stringify each positional
    # argument first; otherwise e.g. `StealthyFetcher(True)` crashes instead of warning.
    args_str = ", ".join(str(arg) for arg in args)
    kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
    if args_str:
        # Separator between the positional part and the keyword part of the suggestion.
        args_str += ", "

    log.warning(
        f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
    )

slots `class-attribute` `instance-attribute` ¶

__slots__ = ()

huge_tree `class-attribute` `instance-attribute` ¶

huge_tree = True

adaptive `class-attribute` `instance-attribute` ¶

adaptive = False

storage `class-attribute` `instance-attribute` ¶

storage = SQLiteStorageSystem

keep_cdata `class-attribute` `instance-attribute` ¶

keep_cdata = False

storage_args `class-attribute` `instance-attribute` ¶

storage_args = None

keep_comments `class-attribute` `instance-attribute` ¶

keep_comments = False

adaptive_domain `class-attribute` `instance-attribute` ¶

adaptive_domain = ''

parser_keywords `class-attribute` `instance-attribute` ¶

parser_keywords = (
    "huge_tree",
    "adaptive",
    "storage",
    "keep_cdata",
    "storage_args",
    "keep_comments",
    "adaptive_domain",
)

display_config `classmethod` ¶

display_config()

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def display_config(cls):
    """Return the parser-related class attributes as a dictionary keyed by attribute name."""
    # Order matters only for presentation; it mirrors the order callers already see.
    reported_names = (
        "huge_tree",
        "keep_comments",
        "keep_cdata",
        "adaptive",
        "storage",
        "storage_args",
        "adaptive_domain",
    )
    return {name: getattr(cls, name) for name in reported_names}

configure `classmethod` ¶

configure(**kwargs)

Set multiple arguments for the parser at once globally

PARAMETER	DESCRIPTION
`kwargs`	The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain DEFAULT: `{}`

Source code in scrapling/engines/toolbelt/custom.py

@classmethod
def configure(cls, **kwargs):
    """Set multiple arguments for the parser at once globally

    Every keyword is validated before any of them is applied, so a rejected call
    leaves the class configuration exactly as it was.

    :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
    :raises AttributeError: If no keyword is passed, or if a keyword names an existing attribute that is not listed in ``parser_keywords``.
    :raises ValueError: If a keyword does not match any attribute of the class.
    """
    if not kwargs:
        raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    # First pass: normalize and validate only, so a failure cannot leave a half-applied config.
    validated = []
    for key, value in kwargs.items():
        key = key.strip().lower()
        if not hasattr(cls, key):
            raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        if key not in cls.parser_keywords:
            # Existing attribute, but not one that is allowed to be configured.
            raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
        validated.append((key, value))

    # Second pass: apply in the order given by the caller.
    for key, value in validated:
        setattr(cls, key, value)

fetch `classmethod` ¶

fetch(url, **kwargs)

Opens up a browser and performs your request based on the options you choose below.

PARAMETER	DESCRIPTION
`url`	Target url. TYPE: `str`
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`block_ads`	Block requests to ~3,500 known ad/tracking domains. Can be combined with `blocked_domains`.
`dns_over_https`	Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
`locale`	Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.
`timezone_id`	Changes the timezone of the browser. Defaults to the system timezone.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`solve_cloudflare`	Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`hide_canvas`	Add random noise to canvas operations to prevent fingerprinting.
`block_webrtc`	Forces WebRTC to respect proxy settings to prevent local IP address leak.
`allow_webgl`	Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`user_data_dir`	Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/fetchers/stealth_chrome.py

@classmethod
def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
    """
    Open a browser, perform the request according to the options below, and return the result.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect the navigator.language value, the Accept-Language request header value,
        as well as number and date formatting rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    :return: A `Response` object.
    :raises TypeError: If the resolved selector configuration is not a dictionary.
    """
    # Fall back to the legacy `custom_config` keyword when `selector_config` is missing or empty.
    parser_overrides = kwargs.get("selector_config", {})
    if not parser_overrides:
        parser_overrides = kwargs.get("custom_config", {})

    if not isinstance(parser_overrides, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Arguments generated by the class go first so that per-call values win on key clashes.
    parser_defaults = cls._generate_parser_arguments()
    kwargs["selector_config"] = {**parser_defaults, **parser_overrides}

    with StealthySession(**kwargs) as session:
        return session.fetch(url)

async_fetch `async` `classmethod` ¶

async_fetch(url, **kwargs)

Opens up a browser and performs your request based on the options you choose below.

PARAMETER	DESCRIPTION
`url`	Target url. TYPE: `str`
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`block_ads`	Block requests to ~3,500 known ad/tracking domains. Can be combined with `blocked_domains`.
`dns_over_https`	Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
`locale`	Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.
`timezone_id`	Changes the timezone of the browser. Defaults to the system timezone.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`solve_cloudflare`	Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`hide_canvas`	Add random noise to canvas operations to prevent fingerprinting.
`block_webrtc`	Forces WebRTC to respect proxy settings to prevent local IP address leak.
`allow_webgl`	Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`user_data_dir`	Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/fetchers/stealth_chrome.py

@classmethod
async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
    """
    Open a browser asynchronously, perform the request according to the options below, and return the result.

    :param url: Target url.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param block_ads: Block requests to ~3,500 known ad/tracking domains. Can be combined with ``blocked_domains``.
    :param dns_over_https: Route DNS queries through Cloudflare's DNS-over-HTTPS to prevent DNS leaks when using proxies.
    :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect the navigator.language value, the Accept-Language request header value,
        as well as number and date formatting rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    :return: A `Response` object.
    :raises TypeError: If the resolved selector configuration is not a dictionary.
    """
    # Fall back to the legacy `custom_config` keyword when `selector_config` is missing or empty.
    parser_overrides = kwargs.get("selector_config", {})
    if not parser_overrides:
        parser_overrides = kwargs.get("custom_config", {})

    if not isinstance(parser_overrides, dict):
        raise TypeError("Argument `selector_config` must be a dictionary.")

    # Arguments generated by the class go first so that per-call values win on key clashes.
    parser_defaults = cls._generate_parser_arguments()
    kwargs["selector_config"] = {**parser_defaults, **parser_overrides}

    async with AsyncStealthySession(**kwargs) as session:
        return await session.fetch(url)

Session Classes¶

HTTP Sessions¶

scrapling.fetchers.FetcherSession ¶

FetcherSession(
    impersonate="chrome",
    http3=False,
    stealthy_headers=True,
    proxies=None,
    proxy=None,
    proxy_auth=None,
    timeout=30,
    headers=None,
    retries=3,
    retry_delay=1,
    follow_redirects="safe",
    max_redirects=30,
    verify=True,
    cert=None,
    selector_config=None,
    proxy_rotator=None,
)

A factory context manager that provides configured Fetcher sessions.

When this manager is used in a 'with' or 'async with' block, it yields a new session configured with the manager's defaults. A single instance of this manager should ideally be used for one active session at a time (or sequentially). Re-entering a context with the same manager instance while a session is already active is disallowed.

PARAMETER	DESCRIPTION
`impersonate`	Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version) TYPE: `ImpersonateType` DEFAULT: `'chrome'`
`http3`	Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`. TYPE: `Optional[bool]` DEFAULT: `False`
`stealthy_headers`	If enabled (default), it creates and adds real browser headers. It also sets a Google referer header. TYPE: `Optional[bool]` DEFAULT: `True`
`proxies`	Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}. TYPE: `Optional[Dict[str, str]]` DEFAULT: `None`
`proxy`	Proxy URL to use. Format: "http://username:password@localhost:8030". Cannot be used together with the `proxies` parameter. TYPE: `Optional[str]` DEFAULT: `None`
`proxy_auth`	HTTP basic auth for proxy, tuple of (username, password). TYPE: `Optional[Tuple[str, str]]` DEFAULT: `None`
`timeout`	Number of seconds to wait before timing out. TYPE: `Optional[int \| float]` DEFAULT: `30`
`headers`	Headers to include in the session with every request. TYPE: `Optional[Dict[str, str]]` DEFAULT: `None`
`retries`	Number of retry attempts. Defaults to 3. TYPE: `Optional[int]` DEFAULT: `3`
`retry_delay`	Number of seconds to wait between retry attempts. Defaults to 1 second. TYPE: `Optional[int]` DEFAULT: `1`
`follow_redirects`	Whether to follow redirects. Defaults to "safe", which follows redirects but rejects those targeting internal/private IPs (SSRF protection). Pass True to follow all redirects without restriction. TYPE: `FollowRedirects` DEFAULT: `'safe'`
`max_redirects`	Maximum number of redirects. Default 30, use -1 for unlimited. TYPE: `int` DEFAULT: `30`
`verify`	Whether to verify HTTPS certificates. Defaults to True. TYPE: `bool` DEFAULT: `True`
`cert`	Tuple of (cert, key) filenames for the client certificate. TYPE: `Optional[str \| Tuple[str, str]]` DEFAULT: `None`
`selector_config`	Arguments passed when creating the final Selector class. TYPE: `Optional[Dict]` DEFAULT: `None`
`proxy_rotator`	A ProxyRotator instance for automatic proxy rotation. TYPE: `Optional[ProxyRotator]` DEFAULT: `None`

Source code in scrapling/engines/static.py

def __init__(
    self,
    impersonate: ImpersonateType = "chrome",
    http3: Optional[bool] = False,
    stealthy_headers: Optional[bool] = True,
    proxies: Optional[Dict[str, str]] = None,
    proxy: Optional[str] = None,
    proxy_auth: Optional[Tuple[str, str]] = None,
    timeout: Optional[int | float] = 30,
    headers: Optional[Dict[str, str]] = None,
    retries: Optional[int] = 3,
    retry_delay: Optional[int] = 1,
    follow_redirects: FollowRedirects = "safe",
    max_redirects: int = 30,
    verify: bool = True,
    cert: Optional[str | Tuple[str, str]] = None,
    selector_config: Optional[Dict] = None,
    proxy_rotator: Optional[ProxyRotator] = None,
):
    """
    Store the defaults used by the sessions this manager creates; no session is opened here.

    :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
    :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
    :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
    :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
    :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
                 Cannot be used together with the `proxies` parameter.
    :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
    :param timeout: Number of seconds to wait before timing out.
    :param headers: Headers to include in the session with every request.
    :param retries: Number of retry attempts. Defaults to 3.
    :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
    :param follow_redirects: Whether to follow redirects. Defaults to "safe", which follows redirects but rejects those targeting internal/private IPs (SSRF protection). Pass True to follow all redirects without restriction.
    :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
    :param verify: Whether to verify HTTPS certificates. Defaults to True.
    :param cert: Tuple of (cert, key) filenames for the client certificate.
    :param selector_config: Arguments passed when creating the final Selector class.
    :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.
    """
    # Runtime state: no client exists until the manager is entered.
    self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
    self._is_alive = False

    # Values stored exactly as given.
    self._default_impersonate: ImpersonateType = impersonate
    self._default_http3 = http3
    self._stealth = stealthy_headers
    self._default_timeout = timeout
    self._default_retries = retries
    self._default_retry_delay = retry_delay
    self._default_follow_redirects = follow_redirects
    self._default_max_redirects = max_redirects
    self._default_verify = verify
    self._default_cert = cert
    self._proxy_rotator = proxy_rotator

    # Falsy mapping arguments become a fresh empty dict; falsy proxy values become None.
    self._default_proxies = proxies if proxies else {}
    self._default_headers = headers if headers else {}
    self.selector_config = selector_config if selector_config else {}
    self._default_proxy = proxy if proxy else None
    self._default_proxy_auth = proxy_auth if proxy_auth else None

slots `class-attribute` `instance-attribute` ¶

# Attribute names assigned in `__init__`. `__enter__`/`__aenter__` iterate this tuple and pass
# every entry that starts with `_default` (with the `_default_` prefix removed) as a keyword
# argument to the session logic class, so renaming an entry here changes that keyword.
__slots__ = (
    "_default_impersonate",
    "_stealth",
    "_default_proxies",
    "_default_proxy",
    "_default_proxy_auth",
    "_default_timeout",
    "_default_headers",
    "_default_retries",
    "_default_retry_delay",
    "_default_follow_redirects",
    "_default_max_redirects",
    "_default_verify",
    "_default_cert",
    "_default_http3",
    "selector_config",
    "_client",
    "_is_alive",
    "_proxy_rotator",
)

selector_config `instance-attribute` ¶

selector_config = selector_config or {}

enter ¶

__enter__()

Creates and returns a new synchronous Fetcher Session

Source code in scrapling/engines/static.py

def __enter__(self) -> _SyncSessionLogic:
    """Creates and returns a new synchronous Fetcher Session

    :raises RuntimeError: If this manager already holds a client.
    """
    if self._client is not None:
        raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

    # Build the session keyword arguments from the `_default*` slots (prefix stripped)
    # instead of repeating every parameter by hand.
    config = {}
    for slot_name in self.__slots__:
        if slot_name.startswith("_default"):
            config[slot_name.replace("_default_", "")] = getattr(self, slot_name)
    config.update(
        stealthy_headers=self._stealth,
        selector_config=self.selector_config,
        proxy_rotator=self._proxy_rotator,
    )

    self._client = _SyncSessionLogic(**config)
    try:
        session = self._client.__enter__()
    except Exception:
        # Entering failed: drop the client so this manager can be entered again.
        self._client = None
        raise

    self._is_alive = True
    return session

exit ¶

__exit__(exc_type, exc_val, exc_tb)

Source code in scrapling/engines/static.py

def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the active synchronous session and reset this manager.

    The exception details are forwarded unchanged to the underlying client's ``__exit__``.
    This method always returns ``None``, so an exception raised inside the ``with`` body
    is never suppressed here.

    :param exc_type: Exception type raised inside the ``with`` body, or None.
    :param exc_val: Exception instance raised inside the ``with`` body, or None.
    :param exc_tb: Traceback of that exception, or None.
    :raises RuntimeError: If there is no client, or the client is not a synchronous one.
    """
    if self._client is None or not isinstance(self._client, _SyncSessionLogic):
        raise RuntimeError("Cannot exit invalid session")

    try:
        self._client.__exit__(exc_type, exc_val, exc_tb)
    finally:
        # Reset even when closing the client fails; otherwise the manager would keep
        # reporting an active session and could never be entered again.
        self._client = None
        self._is_alive = False

aenter `async` ¶

__aenter__()

Creates and returns a new asynchronous Session.

Source code in scrapling/engines/static.py

async def __aenter__(self) -> _ASyncSessionLogic:
    """Creates and returns a new asynchronous Session.

    :raises RuntimeError: If this manager already holds a client.
    """
    if self._client is not None:
        raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

    # Build the session keyword arguments from the `_default*` slots (prefix stripped)
    # instead of repeating every parameter by hand.
    config = {}
    for slot_name in self.__slots__:
        if slot_name.startswith("_default"):
            config[slot_name.replace("_default_", "")] = getattr(self, slot_name)
    config.update(
        stealthy_headers=self._stealth,
        selector_config=self.selector_config,
        proxy_rotator=self._proxy_rotator,
    )

    self._client = _ASyncSessionLogic(**config)
    try:
        session = await self._client.__aenter__()
    except Exception:
        # Entering failed: drop the client so this manager can be entered again.
        self._client = None
        raise

    self._is_alive = True
    return session

aexit `async` ¶

__aexit__(exc_type, exc_val, exc_tb)

Source code in scrapling/engines/static.py

async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the active asynchronous session and reset this manager.

    The exception details are forwarded unchanged to the underlying client's ``__aexit__``.
    This method always returns ``None``, so an exception raised inside the ``async with``
    body is never suppressed here.

    :param exc_type: Exception type raised inside the ``async with`` body, or None.
    :param exc_val: Exception instance raised inside the ``async with`` body, or None.
    :param exc_tb: Traceback of that exception, or None.
    :raises RuntimeError: If there is no client, or the client is not an asynchronous one.
    """
    if self._client is None or not isinstance(self._client, _ASyncSessionLogic):
        raise RuntimeError("Cannot exit invalid session")

    try:
        await self._client.__aexit__(exc_type, exc_val, exc_tb)
    finally:
        # Reset even when closing the client fails; otherwise the manager would keep
        # reporting an active session and could never be entered again.
        self._client = None
        self._is_alive = False

Stealth Sessions¶

scrapling.fetchers.StealthySession ¶

StealthySession(**kwargs)

Bases: SyncSession, StealthySessionMixin


              flowchart TD
              scrapling.fetchers.StealthySession[StealthySession]
              scrapling.engines._browsers._base.SyncSession[SyncSession]
              scrapling.engines._browsers._base.StealthySessionMixin[StealthySessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.SyncSession --> scrapling.fetchers.StealthySession
                
                scrapling.engines._browsers._base.StealthySessionMixin --> scrapling.fetchers.StealthySession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.StealthySessionMixin
                



              click scrapling.fetchers.StealthySession href "" "scrapling.fetchers.StealthySession"
              click scrapling.engines._browsers._base.SyncSession href "" "scrapling.engines._browsers._base.SyncSession"
              click scrapling.engines._browsers._base.StealthySessionMixin href "" "scrapling.engines._browsers._base.StealthySessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"

A Stealthy Browser session manager with page pooling.

A browser session manager with page pooling. By default, it uses a persistent browser context with a temporary user profile directory.

PARAMETER	DESCRIPTION
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
`locale`	Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.
`timezone_id`	Changes the timezone of the browser. Defaults to the system timezone.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`solve_cloudflare`	Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`hide_canvas`	Add random noise to canvas operations to prevent fingerprinting.
`block_webrtc`	Forces WebRTC to respect proxy settings to prevent local IP address leak.
`allow_webgl`	Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`user_data_dir`	Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_stealth.py

def __init__(self, **kwargs: Unpack[StealthSession]):
    """A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # Validate first: `__validate__` stores the parsed settings on `self._config`
    # and fills the context/browser option dicts used later by `start()`.
    self.__validate__(**kwargs)
    # Then let the base session class initialise itself with its own defaults.
    super().__init__()

max_pages `instance-attribute` ¶

max_pages = max_pages

page_pool `instance-attribute` ¶

page_pool = PagePool(max_pages)

playwright `instance-attribute` ¶

playwright = None

context `instance-attribute` ¶

context = None

browser `instance-attribute` ¶

browser = None

__slots__ `class-attribute` `instance-attribute` ¶

# Names of the instance attributes this class reserves via __slots__.
__slots__ = (
    "_config",
    "_context_options",
    "_browser_options",
    "_user_data_dir",
    "_headers_keys",
    "max_pages",
    "page_pool",
    "_max_wait_for_page",
    "playwright",
    "context",
)

__validate_routine__ ¶

__validate_routine__(params, model)

Source code in scrapling/engines/_browsers/_base.py

def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dicts to their defaults and validate ``params`` against ``model``.

    :param params: Raw keyword arguments. A ``"__max_pages"`` key, if present, is renamed to ``"max_pages"`` in place.
    :param model: The config model handed to ``validate``.
    :return: The validated config object.
    """
    # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
    context_defaults: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
    browser_defaults: Dict[str, Any] = {
        "args": DEFAULT_ARGS,
        "ignore_default_args": HARMFUL_ARGS,
    }
    self._context_options = context_defaults
    self._browser_options = browser_defaults

    # Accept the "__max_pages" spelling as an alias of "max_pages"
    if "__max_pages" in params:
        params["max_pages"] = params.pop("__max_pages")

    config = validate(params, model=model)

    # Keep the lower-cased names of the configured extra headers on the instance
    if config.extra_headers:
        self._headers_keys = {name.lower() for name in config.extra_headers.keys()}
    else:
        self._headers_keys = set()

    return config

__generate_options__ ¶

__generate_options__(extra_flags=None)

Source code in scrapling/engines/_browsers/_base.py

def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif not config.useragent and config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if not config.cdp_url:
        flags = self._browser_options["args"]
        if config.extra_flags or extra_flags:
            flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))

        if config.dns_over_https:
            doh_flag = "--dns-over-https-templates=https://cloudflare-dns.com/dns-query"
            if isinstance(flags, list):
                flags.append(doh_flag)
            else:
                flags = list(flags) + [doh_flag]

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir
    else:
        self._browser_options = {}

    if config.additional_args:
        self._context_options.update(config.additional_args)

__validate__ ¶

__validate__(**params)

Source code in scrapling/engines/_browsers/_base.py

def __validate__(self, **params):
    """Validate ``params`` against ``StealthConfig`` and prepare the stealth browser options.

    The validated config is stored on ``self._config``, fixed context settings are added to
    ``self._context_options``, and the stealth option generation is then triggered.
    """
    fixed_context_settings = {
        "is_mobile": False,
        "has_touch": False,
        # Service workers stay allowed for now, even though disabling them has been considered
        "service_workers": "allow",
        "ignore_https_errors": True,
        "screen": {"width": 1920, "height": 1080},
        "viewport": {"width": 1920, "height": 1080},
        "permissions": ["geolocation", "notifications"],
    }

    # Validation has to run first: it creates the option dicts that are updated below
    self._config = self.__validate_routine__(params, model=StealthConfig)
    self._context_options.update(fixed_context_settings)
    self.__generate_stealth_options()

close ¶

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py

def close(self):  # pragma: no cover
    """Close all resources

    Closes the context, the browser and Playwright, in that order. Each step runs inside
    ``try``/``finally`` so a failure while closing one resource does not prevent the
    remaining ones from being released. The exception still propagates to the caller.
    Does nothing when the session is not alive.
    """
    if not self._is_alive:
        return

    try:
        if self.context:
            self.context.close()
    finally:
        self.context = None
        try:
            if self.browser:
                self.browser.close()
        finally:
            self.browser = None
            try:
                if self.playwright:
                    self.playwright.stop()
            finally:
                self.playwright = None  # pyright: ignore
                # Mark the session as dead even if a step failed, so `close()` is not re-entered
                self._is_alive = False

__enter__ ¶

__enter__()

Source code in scrapling/engines/_browsers/_base.py

def __enter__(self):
    """Start the session when entering a ``with`` block and return the session itself."""
    self.start()
    return self

__exit__ ¶

__exit__(exc_type, exc_val, exc_tb)

Source code in scrapling/engines/_browsers/_base.py

def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the session when leaving a ``with`` block.

    Returns ``None``, so an exception raised inside the block is not suppressed.
    """
    self.close()

get_pool_stats ¶

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py

def get_pool_stats(self) -> Dict[str, int]:
    """Get statistics about the current page pool

    :return: A dict with the pool's ``total_pages`` and ``busy_pages`` counts and the session's ``max_pages``.
    """
    pool = self.page_pool
    stats: Dict[str, int] = {}
    stats["total_pages"] = pool.pages_count
    stats["busy_pages"] = pool.busy_count
    stats["max_pages"] = self.max_pages
    return stats

start ¶

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_stealth.py

def start(self) -> None:
    """Create a browser for this instance and context.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = sync_playwright().start()
    try:
        config = self._config
        chromium = self.playwright.chromium

        if config.cdp_url:  # pragma: no cover
            # Attach to an already running browser over CDP
            self.browser = chromium.connect_over_cdp(endpoint_url=config.cdp_url)
            if not config.proxy_rotator:
                assert self.browser is not None
                self.context = self.browser.new_context(**self._context_options)
        elif config.proxy_rotator:
            # With a proxy rotator only the browser is launched here; no context is created in this method
            self.browser = chromium.launch(**self._browser_options)
        else:
            # Default: a persistent context built from the browser options, context options and profile dir
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = self._initialize_context(config, self.context)

        self._is_alive = True
    except Exception:
        # Clean up playwright if browser setup fails
        self.playwright.stop()
        self.playwright = None
        raise

fetch ¶

fetch(url, **kwargs)

Opens up the browser and performs your request based on your chosen options.

PARAMETER	DESCRIPTION
`url`	The Target url. TYPE: `str`
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`solve_cloudflare`	Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`proxy`	Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/engines/_browsers/_stealth.py

def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
    """Open the browser and perform the request based on the chosen options.

    The request is attempted up to ``self._config.retries`` times. A failed attempt is logged and
    retried after ``self._config.retry_delay``; the last failure is re-raised.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, or (after the last retry) if navigation returned no response.
    """
    # `proxy` is taken out of kwargs before validation and handled separately below
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, StealthConfig)
    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # Use the Google referer only when `google_search` is on and this request's
    # extra headers do not already contain a referer (case-insensitive match)
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        # A static proxy passed to this call wins; otherwise ask the rotator (if any) for a proxy per attempt
        proxy: Optional[ProxyType] = None
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # Mutable containers handed to the response handler so it can report back:
            # slot 0 of `final_response` and the list of captured XHR responses
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            # User hook that runs before navigation; its errors are logged, not raised
            if params.page_setup:
                try:
                    params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = page.goto(url, referer=referer)
                self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                if params.solve_cloudflare:
                    self._cloudflare_solver(page)
                    # Make sure the page is fully loaded after the captcha
                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                # User hook that runs after navigation; its errors are logged, not raised
                if params.page_action:
                    try:
                        _ = params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error executing page_action: {e}")

                # A failed selector wait is logged and the fetch continues with the current page state
                if params.wait_selector:
                    try:
                        waiter: Locator = page.locator(params.wait_selector)
                        waiter.first.wait_for(state=params.wait_selector_state)
                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                page.wait_for_timeout(params.wait)

                response = ResponseFactory.from_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                # Flag the page as errored, then either retry after a delay or give up on the last attempt
                page_info.mark_error()
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    time_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Fallback for when the loop ends without returning or re-raising (e.g. `retries` is 0)
    raise RuntimeError("Request failed")  # pragma: no cover

scrapling.fetchers.AsyncStealthySession ¶

AsyncStealthySession(**kwargs)

Bases: AsyncSession, StealthySessionMixin


              flowchart TD
              scrapling.fetchers.AsyncStealthySession[AsyncStealthySession]
              scrapling.engines._browsers._base.AsyncSession[AsyncSession]
              scrapling.engines._browsers._base.StealthySessionMixin[StealthySessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.AsyncSession --> scrapling.fetchers.AsyncStealthySession
                
                scrapling.engines._browsers._base.StealthySessionMixin --> scrapling.fetchers.AsyncStealthySession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.StealthySessionMixin
                



              click scrapling.fetchers.AsyncStealthySession href "" "scrapling.fetchers.AsyncStealthySession"
              click scrapling.engines._browsers._base.AsyncSession href "" "scrapling.engines._browsers._base.AsyncSession"
              click scrapling.engines._browsers._base.StealthySessionMixin href "" "scrapling.engines._browsers._base.StealthySessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"

An async Stealthy Browser session manager with page pooling.

A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.

PARAMETER	DESCRIPTION
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
`locale`	Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.
`timezone_id`	Changes the timezone of the browser. Defaults to the system timezone.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`solve_cloudflare`	Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`hide_canvas`	Add random noise to canvas operations to prevent fingerprinting.
`block_webrtc`	Forces WebRTC to respect proxy settings to prevent local IP address leak.
`allow_webgl`	Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`user_data_dir`	Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_stealth.py

def __init__(self, **kwargs: Unpack[StealthSession]):
    """A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # Validate first: `__validate__` stores the parsed settings on `self._config`
    # and fills the context/browser option dicts used later by `start()`.
    self.__validate__(**kwargs)
    # The base session is initialised with the validated `max_pages` value.
    super().__init__(max_pages=self._config.max_pages)

max_pages `instance-attribute` ¶

max_pages = max_pages

page_pool `instance-attribute` ¶

page_pool = PagePool(max_pages)

playwright `instance-attribute` ¶

playwright = None

context `instance-attribute` ¶

context = None

browser `instance-attribute` ¶

browser = None

__slots__ `class-attribute` `instance-attribute` ¶

# Names of the instance attributes this class reserves via __slots__.
__slots__ = (
    "_config",
    "_context_options",
    "_browser_options",
    "_user_data_dir",
    "_headers_keys",
)

__validate_routine__ ¶

__validate_routine__(params, model)

Source code in scrapling/engines/_browsers/_base.py

def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dicts to their defaults and validate ``params`` against ``model``.

    :param params: Raw keyword arguments. A ``"__max_pages"`` key, if present, is renamed to ``"max_pages"`` in place.
    :param model: The config model handed to ``validate``.
    :return: The validated config object.
    """
    # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
    context_defaults: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
    browser_defaults: Dict[str, Any] = {
        "args": DEFAULT_ARGS,
        "ignore_default_args": HARMFUL_ARGS,
    }
    self._context_options = context_defaults
    self._browser_options = browser_defaults

    # Accept the "__max_pages" spelling as an alias of "max_pages"
    if "__max_pages" in params:
        params["max_pages"] = params.pop("__max_pages")

    config = validate(params, model=model)

    # Keep the lower-cased names of the configured extra headers on the instance
    if config.extra_headers:
        self._headers_keys = {name.lower() for name in config.extra_headers.keys()}
    else:
        self._headers_keys = set()

    return config

__generate_options__ ¶

__generate_options__(extra_flags=None)

Source code in scrapling/engines/_browsers/_base.py

def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    """Populate the Playwright context options and browser launch options from ``self._config``.

    Context options (proxy, locale, timezone, extra headers, user agent) are always set.
    Launch options are only built when no ``cdp_url`` is configured; otherwise
    ``self._browser_options`` is emptied.

    :param extra_flags: Additional launch flags supplied by the caller. They are merged with the
        base ``args`` already in ``self._browser_options`` and with ``config.extra_flags``.
    """
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if config.cdp_url:
        # No launch options are kept when a CDP URL is configured
        self._browser_options = {}
    else:
        # Merge every flag source instead of picking only one of `config.extra_flags` / `extra_flags`,
        # so flags passed by the caller are not silently dropped when the user also sets extra flags.
        merged = (
            *self._browser_options["args"],
            *(extra_flags or ()),
            *(config.extra_flags or ()),
        )
        if config.dns_over_https:
            merged += ("--dns-over-https-templates=https://cloudflare-dns.com/dns-query",)

        # `dict.fromkeys` de-duplicates while keeping first-seen order, so the launch command is
        # deterministic. Building a new list also leaves the original `args` sequence untouched.
        flags = list(dict.fromkeys(merged))

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir

    # User-supplied context settings are applied last so they override the ones set above
    if config.additional_args:
        self._context_options.update(config.additional_args)

__validate__ ¶

__validate__(**params)

Source code in scrapling/engines/_browsers/_base.py

def __validate__(self, **params):
    """Validate the session parameters against ``StealthConfig`` and prepare the options.

    The validated config is stored on ``self._config``, a fixed set of context
    defaults is merged into ``self._context_options``, and finally the
    stealth-specific options are generated.
    """
    self._config = self.__validate_routine__(params, model=StealthConfig)

    full_hd = {"width": 1920, "height": 1080}
    context_defaults = dict(
        is_mobile=False,
        has_touch=False,
        # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
        service_workers="allow",
        ignore_https_errors=True,
        # Separate copies so the two entries never share one dict object
        screen=dict(full_hd),
        viewport=dict(full_hd),
        permissions=["geolocation", "notifications"],
    )
    self._context_options.update(context_defaults)
    self.__generate_stealth_options()

close `async` ¶

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py

async def close(self):
    """Close all resources"""
    if not self._is_alive:  # pragma: no cover
        return

    # Tear down in order: context first, then the browser, then the Playwright driver.
    teardown_plan = (
        ("context", "close"),
        ("browser", "close"),
        ("playwright", "stop"),
    )
    for attribute, method_name in teardown_plan:
        resource = getattr(self, attribute)
        if resource:
            await getattr(resource, method_name)()
            setattr(self, attribute, None)

    self._is_alive = False

__aenter__ `async` ¶

__aenter__()

Source code in scrapling/engines/_browsers/_base.py

async def __aenter__(self):
    """Start the session when entering an ``async with`` block and hand back the session itself."""
    session = self
    await session.start()
    return session

__aexit__ `async` ¶

__aexit__(exc_type, exc_val, exc_tb)

Source code in scrapling/engines/_browsers/_base.py

async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the session when leaving an ``async with`` block; the exception details are not used."""
    await self.close()
    return None

get_pool_stats ¶

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py

def get_pool_stats(self) -> Dict[str, int]:
    """Get statistics about the current page pool"""
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )

start `async` ¶

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_stealth.py

async def start(self) -> None:
    """Create a browser for this instance and context."""
    # Guard first: a session can only be started once.
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = await async_playwright().start()
    try:
        settings = self._config
        chromium = self.playwright.chromium

        if settings.cdp_url:
            # Attach to an already running browser over CDP
            self.browser = await chromium.connect_over_cdp(endpoint_url=settings.cdp_url)
            if not settings.proxy_rotator:
                assert self.browser is not None
                self.context = await self.browser.new_context(**self._context_options)
        elif settings.proxy_rotator:
            # Only the browser is launched here; no context is created in this branch
            self.browser = await chromium.launch(**self._browser_options)
        else:
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = await chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = await self._initialize_context(settings, self.context)

        self._is_alive = True
    except Exception:
        # Clean up playwright if browser setup fails
        await self.playwright.stop()
        self.playwright = None
        raise

fetch `async` ¶

fetch(url, **kwargs)

Opens the browser and performs your request based on your chosen options.

PARAMETER	DESCRIPTION
`url`	The Target url. TYPE: `str`
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`solve_cloudflare`	Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`proxy`	Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/engines/_browsers/_stealth.py

async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
    """Opens the browser and performs your request based on your chosen options.

    The request is attempted up to ``self._config.retries`` times. Each attempt uses its own page
    from the page generator; a failed attempt is logged and retried after ``self._config.retry_delay``
    seconds, and the exception of the last attempt is re-raised.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, or if navigation returns no response on the last attempt.
    """
    # The per-request proxy is taken out before validation and handled separately below.
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, StealthConfig)

    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # The Google referer is only passed to `goto` when `google_search` is on and the validated
    # extra headers do not already contain a `referer` key (compared case-insensitively).
    # NOTE(review): the `extra_headers` docstring says the Google referer takes priority; confirm which is intended.
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        # A static proxy always wins; otherwise the rotator (if any) is asked for a proxy on every attempt.
        proxy: Optional[ProxyType] = None
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        async with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # Mutable containers handed to the response handler so it can report back to this scope.
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            # Runs before navigation; a failure here is logged and does not abort the attempt.
            if params.page_setup:
                try:
                    await params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = await page.goto(url, referer=referer)
                await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                if params.solve_cloudflare:
                    await self._cloudflare_solver(page)
                    # Make sure the page is fully loaded after the captcha
                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                # User automation errors are logged only; the response is still built afterwards.
                if params.page_action:
                    try:
                        _ = await params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error executing page_action: {e}")

                # Same best-effort policy for the selector wait: log and continue.
                if params.wait_selector:
                    try:
                        waiter: AsyncLocator = page.locator(params.wait_selector)
                        await waiter.first.wait_for(state=params.wait_selector_state)
                        await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                await page.wait_for_timeout(params.wait)

                response = await ResponseFactory.from_async_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                # Flag the page as errored, then either wait and retry or re-raise on the final attempt.
                page_info.mark_error()
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    await asyncio_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Only reached when the loop body never runs (e.g. `retries` is zero).
    raise RuntimeError("Request failed")  # pragma: no cover

Dynamic Sessions¶

scrapling.fetchers.DynamicSession ¶

DynamicSession(**kwargs)

Bases: SyncSession, DynamicSessionMixin


              flowchart TD
              scrapling.fetchers.DynamicSession[DynamicSession]
              scrapling.engines._browsers._base.SyncSession[SyncSession]
              scrapling.engines._browsers._base.DynamicSessionMixin[DynamicSessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.SyncSession --> scrapling.fetchers.DynamicSession
                
                scrapling.engines._browsers._base.DynamicSessionMixin --> scrapling.fetchers.DynamicSession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.DynamicSessionMixin
                



              click scrapling.fetchers.DynamicSession href "" "scrapling.fetchers.DynamicSession"
              click scrapling.engines._browsers._base.SyncSession href "" "scrapling.engines._browsers._base.SyncSession"
              click scrapling.engines._browsers._base.DynamicSessionMixin href "" "scrapling.engines._browsers._base.DynamicSessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"

A Browser session manager with page pooling.

A Browser session manager with page pooling. It uses a persistent browser Context by default, with a temporary user profile directory.

PARAMETER	DESCRIPTION
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
`locale`	Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.
`timezone_id`	Changes the timezone of the browser. Defaults to the system timezone.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`user_data_dir`	Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_controllers.py

def __init__(self, **kwargs: Unpack[PlaywrightSession]):
    """A Browser session manager with page pooling. It uses a persistent browser Context by default, with a temporary user profile directory.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # The keyword arguments are validated first; the parent initializer runs afterwards.
    self.__validate__(**kwargs)
    super().__init__()

max_pages `instance-attribute` ¶

max_pages = max_pages

page_pool `instance-attribute` ¶

page_pool = PagePool(max_pages)

playwright `instance-attribute` ¶

playwright = None

context `instance-attribute` ¶

context = None

browser `instance-attribute` ¶

browser = None

__slots__ `class-attribute` `instance-attribute` ¶

# Instance attribute names declared for the session class: configuration and
# option holders first, then the page-pool fields and the Playwright handles.
__slots__ = (
    "_config",
    "_context_options",
    "_browser_options",
    "_user_data_dir",
    "_headers_keys",
    "max_pages",
    "page_pool",
    "_max_wait_for_page",
    "playwright",
    "context",
)

__validate_routine__ ¶

__validate_routine__(params, model)

Source code in scrapling/engines/_browsers/_base.py

def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dicts, validate ``params`` against ``model``, and record the header names.

    :param params: Raw session keyword arguments. A ``"__max_pages"`` key, when present, is renamed
        to ``"max_pages"`` in place before validation.
    :param model: The config model class the parameters are validated against.
    :return: The validated config object.
    """
    # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
    self._context_options: Dict[str, Any] = dict(color_scheme="dark", device_scale_factor=2)
    self._browser_options: Dict[str, Any] = dict(args=DEFAULT_ARGS, ignore_default_args=HARMFUL_ARGS)

    if "__max_pages" in params:
        params["max_pages"] = params.pop("__max_pages")

    config = validate(params, model=model)

    # Lower-cased header names; an empty set when no extra headers are configured.
    if config.extra_headers:
        self._headers_keys = {name.lower() for name in config.extra_headers.keys()}
    else:
        self._headers_keys = set()

    return config

__generate_options__ ¶

__generate_options__(extra_flags=None)

Source code in scrapling/engines/_browsers/_base.py

def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif not config.useragent and config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if not config.cdp_url:
        flags = self._browser_options["args"]
        if config.extra_flags or extra_flags:
            flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))

        if config.dns_over_https:
            doh_flag = "--dns-over-https-templates=https://cloudflare-dns.com/dns-query"
            if isinstance(flags, list):
                flags.append(doh_flag)
            else:
                flags = list(flags) + [doh_flag]

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir
    else:
        self._browser_options = {}

    if config.additional_args:
        self._context_options.update(config.additional_args)

__validate__ ¶

__validate__(**params)

Source code in scrapling/engines/_browsers/_base.py

def __validate__(self, **params):
    """Validate the session parameters against ``PlaywrightConfig``, then build the browser options."""
    validated = self.__validate_routine__(params, model=PlaywrightConfig)
    self._config = validated
    self.__generate_options__()

close ¶

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py

def close(self):  # pragma: no cover
    """Close all resources"""
    if not self._is_alive:
        return

    # Tear down in order: context first, then the browser, then the Playwright driver.
    teardown_plan = (
        ("context", "close"),
        ("browser", "close"),
        ("playwright", "stop"),
    )
    for attribute, method_name in teardown_plan:
        resource = getattr(self, attribute)
        if resource:
            getattr(resource, method_name)()
            setattr(self, attribute, None)

    self._is_alive = False

__enter__ ¶

__enter__()

Source code in scrapling/engines/_browsers/_base.py

def __enter__(self):
    """Start the session when entering a ``with`` block and hand back the session itself."""
    session = self
    session.start()
    return session

__exit__ ¶

__exit__(exc_type, exc_val, exc_tb)

Source code in scrapling/engines/_browsers/_base.py

def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the session when leaving a ``with`` block; the exception details are not used."""
    self.close()
    return None

get_pool_stats ¶

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py

def get_pool_stats(self) -> Dict[str, int]:
    """Get statistics about the current page pool"""
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )

start ¶

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_controllers.py

def start(self):
    """Create a browser for this instance and context."""
    # Guard first: a session can only be started once.
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = sync_playwright().start()

    try:
        settings = self._config
        chromium = self.playwright.chromium

        if settings.cdp_url:  # pragma: no cover
            # Attach to an already running browser over CDP
            self.browser = chromium.connect_over_cdp(endpoint_url=settings.cdp_url)
            if not settings.proxy_rotator and self.browser:
                self.context = self.browser.new_context(**self._context_options)
        elif settings.proxy_rotator:
            # Only the browser is launched here; no context is created in this branch
            self.browser = chromium.launch(**self._browser_options)
        else:
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = self._initialize_context(settings, self.context)

        self._is_alive = True
    except Exception:
        # Clean up playwright if browser setup fails
        self.playwright.stop()
        self.playwright = None
        raise

fetch ¶

fetch(url, **kwargs)

Opens the browser and performs your request based on your chosen options.

PARAMETER	DESCRIPTION
`url`	The Target url. TYPE: `str`
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`proxy`	Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/engines/_browsers/_controllers.py

def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
    """Opens up the browser and performs your request based on your chosen options.

    The request is attempted up to `self._config.retries` times. A failed attempt is logged and retried after
    `self._config.retry_delay`; the exception from the last attempt is logged and re-raised.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
        An exception raised by it is logged and does not fail the request.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
        An exception raised by it is logged and does not fail the request.
    :param extra_headers: A dictionary of extra headers to add to the request. _If these headers already contain a `referer` (matched case-insensitively),
        the Google referer from `google_search` is not set._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, or if navigation yields no response on the final attempt.
    """
    # `proxy` is taken out of kwargs before validation; it is handled per attempt below.
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, PlaywrightConfig)
    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # Only send the Google referer when the caller did not supply a referer header of their own.
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        # The rotator is consulted on every attempt, unless a static proxy was passed for this request.
        proxy: Optional[ProxyType] = None
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # Mutable containers handed to the response handler so it can report back to this scope.
            # NOTE(review): presumably the handler stores the last main-document response in
            # final_response[0] and appends matching XHR responses to xhr_captured - confirm in
            # _create_response_handler.
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            # Best effort: a failing page_setup is logged and navigation still proceeds.
            if params.page_setup:
                try:
                    params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = page.goto(url, referer=referer)
                self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                # Raised inside the try block, so a missing response counts as a failed attempt and is retried.
                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                # Best effort: a failing page_action is logged and the response is still built.
                if params.page_action:
                    try:
                        _ = params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error executing page_action: {e}")

                # Wait for the first element matching the selector, then let the page settle again.
                if params.wait_selector:
                    try:
                        waiter: Locator = page.locator(params.wait_selector)
                        waiter.first.wait_for(state=params.wait_selector_state)
                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                page.wait_for_timeout(params.wait)

                response = ResponseFactory.from_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                page_info.mark_error()
                # Retry while attempts remain; on the final attempt log and re-raise the original exception.
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    time_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Every loop iteration either returns or (on the last attempt) re-raises, so this is normally
    # only reached when no attempt ran at all.
    raise RuntimeError("Request failed")  # pragma: no cover

scrapling.fetchers.AsyncDynamicSession ¶

AsyncDynamicSession(**kwargs)

Bases: AsyncSession, DynamicSessionMixin


              flowchart TD
              scrapling.fetchers.AsyncDynamicSession[AsyncDynamicSession]
              scrapling.engines._browsers._base.AsyncSession[AsyncSession]
              scrapling.engines._browsers._base.DynamicSessionMixin[DynamicSessionMixin]
              scrapling.engines._browsers._base.BaseSessionMixin[BaseSessionMixin]

                              scrapling.engines._browsers._base.AsyncSession --> scrapling.fetchers.AsyncDynamicSession
                
                scrapling.engines._browsers._base.DynamicSessionMixin --> scrapling.fetchers.AsyncDynamicSession
                                scrapling.engines._browsers._base.BaseSessionMixin --> scrapling.engines._browsers._base.DynamicSessionMixin
                



              click scrapling.fetchers.AsyncDynamicSession href "" "scrapling.fetchers.AsyncDynamicSession"
              click scrapling.engines._browsers._base.AsyncSession href "" "scrapling.engines._browsers._base.AsyncSession"
              click scrapling.engines._browsers._base.DynamicSessionMixin href "" "scrapling.engines._browsers._base.DynamicSessionMixin"
              click scrapling.engines._browsers._base.BaseSessionMixin href "" "scrapling.engines._browsers._base.BaseSessionMixin"

An async browser session manager with page pooling. By default, it uses a persistent browser context with a temporary user profile directory.

A Browser session manager with page pooling

PARAMETER	DESCRIPTION
`headless`	Run the browser in headless/hidden (default), or headful/visible mode.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`useragent`	Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
`cookies`	Set cookies for the next request.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`init_script`	An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
`locale`	Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. Defaults to the system default locale.
`timezone_id`	Changes the timezone of the browser. Defaults to the system timezone.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`real_chrome`	If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
`cdp_url`	Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`extra_headers`	A dictionary of extra headers to add to the request. The referer set by `google_search` takes priority over the referer set here if used together.
`proxy`	The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
`max_pages`	The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
`user_data_dir`	Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
`extra_flags`	A list of additional browser flags to pass to the browser on launch.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`additional_args`	Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.

Source code in scrapling/engines/_browsers/_controllers.py

def __init__(self, **kwargs: Unpack[PlaywrightSession]):
    """A Browser session manager with page pooling.

    The keyword arguments are validated first; the page pool is then sized from the validated `max_pages`.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param extra_headers: A dictionary of extra headers to add to the request. See `fetch` for how a `referer` header interacts with `google_search`.
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # Validation must run first: it populates `self._config`, which the next line reads.
    self.__validate__(**kwargs)
    super().__init__(max_pages=self._config.max_pages)

max_pages `instance-attribute` ¶

max_pages = max_pages

page_pool `instance-attribute` ¶

page_pool = PagePool(max_pages)

playwright `instance-attribute` ¶

playwright = None

context `instance-attribute` ¶

context = None

browser `instance-attribute` ¶

browser = None

`__slots__` `class-attribute` `instance-attribute` ¶

# Private per-instance attributes filled in while the session is being configured.
__slots__ = (
    "_config",  # validated configuration, assigned in `__validate__`
    "_context_options",  # browser-context options, initialised in `__validate_routine__`
    "_browser_options",  # browser launch options, initialised in `__validate_routine__`
    "_user_data_dir",  # profile directory, assigned in `__generate_options__` when no CDP URL is used
    "_headers_keys",  # lower-cased names of the configured extra headers
)

__validate_routine__ ¶

__validate_routine__(params, model)

Source code in scrapling/engines/_browsers/_base.py

def __validate_routine__(
    self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
) -> PlaywrightConfig | StealthConfig:
    """Reset the option dictionaries to their defaults and validate ``params`` against ``model``.

    :param params: Raw keyword arguments. A ``"__max_pages"`` key, if present, is renamed to ``"max_pages"`` in place.
    :param model: The configuration model class passed to ``validate``.
    :return: The validated configuration object returned by ``validate``.
    """
    # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
    self._context_options: Dict[str, Any] = dict(color_scheme="dark", device_scale_factor=2)
    self._browser_options: Dict[str, Any] = dict(args=DEFAULT_ARGS, ignore_default_args=HARMFUL_ARGS)

    # Accept the dunder-prefixed spelling as an alias of `max_pages`.
    if "__max_pages" in params:
        params["max_pages"] = params.pop("__max_pages")

    validated = validate(params, model=model)

    # Keep the lower-cased header names so later lookups can be case-insensitive.
    headers = validated.extra_headers
    if headers:
        self._headers_keys = {name.lower() for name in headers.keys()}
    else:
        self._headers_keys = set()

    return validated

__generate_options__ ¶

__generate_options__(extra_flags=None)

Source code in scrapling/engines/_browsers/_base.py

def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
    config: PlaywrightConfig | StealthConfig = self._config
    self._context_options.update(
        {
            "proxy": config.proxy,
            "locale": config.locale,
            "timezone_id": config.timezone_id,
            "extra_http_headers": config.extra_headers,
        }
    )
    # The default useragent in the headful is always correct now in the current versions of Playwright
    if config.useragent:
        self._context_options["user_agent"] = config.useragent
    elif not config.useragent and config.headless:
        self._context_options["user_agent"] = (
            __default_chrome_useragent__ if config.real_chrome else __default_useragent__
        )

    if not config.cdp_url:
        flags = self._browser_options["args"]
        if config.extra_flags or extra_flags:
            flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))

        if config.dns_over_https:
            doh_flag = "--dns-over-https-templates=https://cloudflare-dns.com/dns-query"
            if isinstance(flags, list):
                flags.append(doh_flag)
            else:
                flags = list(flags) + [doh_flag]

        self._browser_options.update(
            {
                "args": flags,
                "headless": config.headless,
                "channel": "chrome" if config.real_chrome else "chromium",
            }
        )
        if config.executable_path:
            self._browser_options["executable_path"] = config.executable_path

        self._user_data_dir = config.user_data_dir
    else:
        self._browser_options = {}

    if config.additional_args:
        self._context_options.update(config.additional_args)

`__validate__` ¶

__validate__(**params)

Source code in scrapling/engines/_browsers/_base.py

def __validate__(self, **params):
    """Validate ``params`` as a ``PlaywrightConfig``, store it, and build the browser/context options."""
    validated = self.__validate_routine__(params, model=PlaywrightConfig)
    self._config = validated
    # Option generation reads `self._config`, so it has to come after the assignment above.
    self.__generate_options__()

close `async` ¶

close()

Close all resources

Source code in scrapling/engines/_browsers/_base.py

async def close(self):
    """Close all resources.

    The browser context, the browser, and the Playwright driver are released in that order.
    Every step runs even when an earlier one raises, so a failing ``context.close()`` can no
    longer leave the browser or the driver running. The references are cleared and the session
    is marked as not alive in all cases; an exception raised while closing still propagates
    to the caller.

    Calling this on a session that is not alive is a no-op.
    """
    if not self._is_alive:  # pragma: no cover
        return

    try:
        if self.context:
            await self.context.close()
    finally:
        self.context = None  # pyright: ignore
        try:
            if self.browser:
                await self.browser.close()
        finally:
            self.browser = None
            try:
                if self.playwright:
                    await self.playwright.stop()
            finally:
                self.playwright = None  # pyright: ignore
                self._is_alive = False

`__aenter__` `async` ¶

__aenter__()

Source code in scrapling/engines/_browsers/_base.py

async def __aenter__(self):
    """Start the session on entering ``async with`` and return the session itself."""
    await self.start()
    return self

`__aexit__` `async` ¶

__aexit__(exc_type, exc_val, exc_tb)

Source code in scrapling/engines/_browsers/_base.py

async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the session on leaving ``async with``.

    The exception details are ignored and nothing is returned, so an exception raised inside
    the ``async with`` body is not suppressed.
    """
    await self.close()

get_pool_stats ¶

get_pool_stats()

Get statistics about the current page pool

Source code in scrapling/engines/_browsers/_base.py

def get_pool_stats(self) -> Dict[str, int]:
    """Report the page pool's current usage.

    :return: A dictionary with the keys ``total_pages``, ``busy_pages`` and ``max_pages``.
    """
    pool = self.page_pool
    return dict(
        total_pages=pool.pages_count,
        busy_pages=pool.busy_count,
        max_pages=self.max_pages,
    )

start `async` ¶

start()

Create a browser for this instance and context.

Source code in scrapling/engines/_browsers/_controllers.py

async def start(self) -> None:
    """Create a browser for this instance and context.

    Depending on the configuration this either attaches to a browser over CDP, launches a plain
    browser (when a proxy rotator is configured), or launches a persistent context. If any step
    after starting Playwright fails, Playwright is stopped again and the error is re-raised.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = await async_playwright().start()
    try:
        settings = self._config
        if settings.cdp_url:
            # Attach to an existing browser; a context is only created here when no rotator is configured.
            self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=settings.cdp_url)
            if not settings.proxy_rotator and self.browser:
                self.context = await self.browser.new_context(**self._context_options)
        elif settings.proxy_rotator:
            # With a rotator only the browser is launched here; no context is created in this method.
            self.browser = await self.playwright.chromium.launch(**self._browser_options)
        else:
            launch_kwargs = {
                **self._browser_options,
                **self._context_options,
                "user_data_dir": self._user_data_dir,
            }
            self.context = await self.playwright.chromium.launch_persistent_context(**launch_kwargs)

        if self.context:
            self.context = await self._initialize_context(self._config, self.context)

        self._is_alive = True
    except Exception:
        # Clean up playwright if browser setup fails
        await self.playwright.stop()
        self.playwright = None
        raise

fetch `async` ¶

fetch(url, **kwargs)

Opens up the browser and performs your request based on your chosen options.

PARAMETER	DESCRIPTION
`url`	The Target url. TYPE: `str`
`google_search`	Enabled by default, Scrapling will set a Google referer header.
`timeout`	The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
`wait`	The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
`page_action`	Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
`page_setup`	A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
`extra_headers`	A dictionary of extra headers to add to the request. If these headers already contain a `referer` (matched case-insensitively), the Google referer from `google_search` is not set.
`disable_resources`	Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
`blocked_domains`	A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too).
`wait_selector`	Wait for a specific CSS selector to be in a specific state.
`wait_selector_state`	The state to wait for the selector given with `wait_selector`. The default state is `attached`.
`network_idle`	Wait for the page until there are no network connections for at least 500 ms.
`load_dom`	Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
`selector_config`	The arguments that will be passed in the end while creating the final Selector's class.
`proxy`	Static proxy to override rotator and session proxy. A new browser context will be created and used with it.

RETURNS	DESCRIPTION
`Response`	A `Response` object.

Source code in scrapling/engines/_browsers/_controllers.py

async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
    """Opens up the browser and performs your request based on your chosen options.

    The request is attempted up to `self._config.retries` times. A failed attempt is logged and retried after
    `self._config.retry_delay`; the exception from the last attempt is logged and re-raised.

    :param url: The Target url.
    :param google_search: Enabled by default, Scrapling will set a Google referer header.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object, runs after navigation, and does the automation you need.
        Its result is awaited. An exception raised by it is logged and does not fail the request.
    :param page_setup: A function that takes the `page` object, runs before navigation. Use it to register event listeners or routes that must be set up before the page loads.
        Its result is awaited. An exception raised by it is logged and does not fail the request.
    :param extra_headers: A dictionary of extra headers to add to the request. _If these headers already contain a `referer` (matched case-insensitively),
        the Google referer from `google_search` is not set._
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :return: A `Response` object.
    :raises RuntimeError: If the session is not alive, or if navigation yields no response on the final attempt.
    """
    # `proxy` is taken out of kwargs before validation; it is handled per attempt below.
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, PlaywrightConfig)

    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # Only send the Google referer when the caller did not supply a referer header of their own.
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    referer = (
        "https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
    )

    for attempt in range(self._config.retries):
        # The rotator is consulted on every attempt, unless a static proxy was passed for this request.
        proxy: Optional[ProxyType] = None
        if self._config.proxy_rotator and static_proxy is None:
            proxy = self._config.proxy_rotator.get_proxy()
        else:
            proxy = static_proxy

        async with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # Mutable containers handed to the response handler so it can report back to this scope.
            # NOTE(review): presumably the handler stores the last main-document response in
            # final_response[0] and appends matching XHR responses to xhr_captured - confirm in
            # _create_response_handler.
            final_response: List = [None]
            xhr_captured: List = []
            page = page_info.page
            page.on(
                "response",
                self._create_response_handler(
                    page_info,
                    final_response,
                    xhr_pattern=self._config.capture_xhr,
                    xhr_container=xhr_captured,
                ),
            )

            # Best effort: a failing page_setup is logged and navigation still proceeds.
            if params.page_setup:
                try:
                    await params.page_setup(page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_setup: {e}")

            try:
                first_response = await page.goto(url, referer=referer)
                await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                # Raised inside the try block, so a missing response counts as a failed attempt and is retried.
                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                # Best effort: a failing page_action is logged and the response is still built.
                if params.page_action:
                    try:
                        _ = await params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error executing page_action: {e}")

                # Wait for the first element matching the selector, then let the page settle again.
                if params.wait_selector:
                    try:
                        waiter: AsyncLocator = page.locator(params.wait_selector)
                        await waiter.first.wait_for(state=params.wait_selector_state)
                        await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                await page.wait_for_timeout(params.wait)

                response = await ResponseFactory.from_async_playwright_response(
                    page,
                    first_response,
                    final_response[0],
                    params.selector_config,
                    meta={"proxy": proxy},
                    xhr_captured=xhr_captured,
                )
                return response

            except Exception as e:
                page_info.mark_error()
                # Retry while attempts remain; on the final attempt log and re-raise the original exception.
                if attempt < self._config.retries - 1:
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    await asyncio_sleep(self._config.retry_delay)
                else:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

    # Every loop iteration either returns or (on the last attempt) re-raises, so this is normally
    # only reached when no attempt ran at all.
    raise RuntimeError("Request failed")  # pragma: no cover

Fetchers Classes¶

scrapling.fetchers.Fetcher ¶

__slots__ class-attribute instance-attribute ¶

huge_tree class-attribute instance-attribute ¶

adaptive class-attribute instance-attribute ¶

storage class-attribute instance-attribute ¶

keep_cdata class-attribute instance-attribute ¶

storage_args class-attribute instance-attribute ¶

keep_comments class-attribute instance-attribute ¶

adaptive_domain class-attribute instance-attribute ¶

parser_keywords class-attribute instance-attribute ¶

display_config classmethod ¶

configure classmethod ¶

get classmethod ¶

post classmethod ¶

put classmethod ¶

delete classmethod ¶

scrapling.fetchers.AsyncFetcher ¶

__slots__ class-attribute instance-attribute ¶

huge_tree class-attribute instance-attribute ¶

adaptive class-attribute instance-attribute ¶

storage class-attribute instance-attribute ¶

keep_cdata class-attribute instance-attribute ¶

storage_args class-attribute instance-attribute ¶

keep_comments class-attribute instance-attribute ¶

adaptive_domain class-attribute instance-attribute ¶

parser_keywords class-attribute instance-attribute ¶

display_config classmethod ¶

configure classmethod ¶

get classmethod ¶

post classmethod ¶

put classmethod ¶

delete classmethod ¶

scrapling.fetchers.DynamicFetcher ¶

__slots__ class-attribute instance-attribute ¶

huge_tree class-attribute instance-attribute ¶

adaptive class-attribute instance-attribute ¶

storage class-attribute instance-attribute ¶

keep_cdata class-attribute instance-attribute ¶

storage_args class-attribute instance-attribute ¶

keep_comments class-attribute instance-attribute ¶

adaptive_domain class-attribute instance-attribute ¶

parser_keywords class-attribute instance-attribute ¶

display_config classmethod ¶

configure classmethod ¶

fetch classmethod ¶

async_fetch async classmethod ¶

scrapling.fetchers.StealthyFetcher ¶

__slots__ class-attribute instance-attribute ¶

huge_tree class-attribute instance-attribute ¶

adaptive class-attribute instance-attribute ¶

storage class-attribute instance-attribute ¶

keep_cdata class-attribute instance-attribute ¶

storage_args class-attribute instance-attribute ¶

keep_comments class-attribute instance-attribute ¶

adaptive_domain class-attribute instance-attribute ¶

parser_keywords class-attribute instance-attribute ¶

display_config classmethod ¶

configure classmethod ¶

fetch classmethod ¶

async_fetch async classmethod ¶

Session Classes¶

HTTP Sessions¶

scrapling.fetchers.FetcherSession ¶

__slots__ class-attribute instance-attribute ¶

selector_config instance-attribute ¶

__enter__ ¶

__exit__ ¶

__aenter__ async ¶

__aexit__ async ¶

Stealth Sessions¶

scrapling.fetchers.StealthySession ¶

max_pages instance-attribute ¶

page_pool instance-attribute ¶

playwright instance-attribute ¶

context instance-attribute ¶

browser instance-attribute ¶

__slots__ class-attribute instance-attribute ¶

__validate_routine__ ¶

__generate_options__ ¶

`__slots__` `class-attribute` `instance-attribute` ¶

huge_tree `class-attribute` `instance-attribute` ¶

adaptive `class-attribute` `instance-attribute` ¶

storage `class-attribute` `instance-attribute` ¶

keep_cdata `class-attribute` `instance-attribute` ¶

storage_args `class-attribute` `instance-attribute` ¶

keep_comments `class-attribute` `instance-attribute` ¶

adaptive_domain `class-attribute` `instance-attribute` ¶

parser_keywords `class-attribute` `instance-attribute` ¶

display_config `classmethod` ¶

configure `classmethod` ¶

get `classmethod` ¶

post `classmethod` ¶

put `classmethod` ¶

delete `classmethod` ¶

__slots__ `class-attribute` `instance-attribute` ¶

huge_tree `class-attribute` `instance-attribute` ¶

adaptive `class-attribute` `instance-attribute` ¶

storage `class-attribute` `instance-attribute` ¶

keep_cdata `class-attribute` `instance-attribute` ¶

storage_args `class-attribute` `instance-attribute` ¶

keep_comments `class-attribute` `instance-attribute` ¶

adaptive_domain `class-attribute` `instance-attribute` ¶

parser_keywords `class-attribute` `instance-attribute` ¶

display_config `classmethod` ¶

configure `classmethod` ¶

get `classmethod` ¶

post `classmethod` ¶

put `classmethod` ¶

delete `classmethod` ¶

__slots__ `class-attribute` `instance-attribute` ¶

huge_tree `class-attribute` `instance-attribute` ¶

adaptive `class-attribute` `instance-attribute` ¶

storage `class-attribute` `instance-attribute` ¶

keep_cdata `class-attribute` `instance-attribute` ¶

storage_args `class-attribute` `instance-attribute` ¶

keep_comments `class-attribute` `instance-attribute` ¶

adaptive_domain `class-attribute` `instance-attribute` ¶

parser_keywords `class-attribute` `instance-attribute` ¶

display_config `classmethod` ¶

configure `classmethod` ¶

fetch `classmethod` ¶

async_fetch `async` `classmethod` ¶

__slots__ `class-attribute` `instance-attribute` ¶

huge_tree `class-attribute` `instance-attribute` ¶

adaptive `class-attribute` `instance-attribute` ¶

storage `class-attribute` `instance-attribute` ¶

keep_cdata `class-attribute` `instance-attribute` ¶

storage_args `class-attribute` `instance-attribute` ¶

keep_comments `class-attribute` `instance-attribute` ¶

adaptive_domain `class-attribute` `instance-attribute` ¶

parser_keywords `class-attribute` `instance-attribute` ¶

display_config `classmethod` ¶

configure `classmethod` ¶

fetch `classmethod` ¶

async_fetch `async` `classmethod` ¶

__slots__ `class-attribute` `instance-attribute` ¶

selector_config `instance-attribute` ¶

__enter__ ¶

__exit__ ¶

__aenter__ `async` ¶

__aexit__ `async` ¶

max_pages `instance-attribute` ¶

page_pool `instance-attribute` ¶

playwright `instance-attribute` ¶

context `instance-attribute` ¶

browser `instance-attribute` ¶

__slots__ `class-attribute` `instance-attribute` ¶

__validate__ ¶

__enter__ ¶

__exit__ ¶

max_pages `instance-attribute` ¶

page_pool `instance-attribute` ¶

playwright `instance-attribute` ¶

context `instance-attribute` ¶

browser `instance-attribute` ¶

__slots__ `class-attribute` `instance-attribute` ¶

__validate__ ¶

close `async` ¶

__aenter__ `async` ¶