Spider Classes¶

Here's the reference information for the spider framework classes' parameters, attributes, and methods.

You can import them directly like below:

from scrapling.spiders import Spider, Request, CrawlResult, SessionManager, Response

scrapling.spiders.Spider ¶

Spider(crawldir=None, interval=300.0)

Bases: ABC


              flowchart TD
              scrapling.spiders.Spider[Spider]

              

              click scrapling.spiders.Spider href "" "scrapling.spiders.Spider"

An abstract base class for creating web spiders.

Check the documentation website for more information.

Initialize the spider.

PARAMETER	DESCRIPTION
`crawldir`	Directory for checkpoint files. If provided, enables pause/resume. TYPE: `Optional[Union[str, Path, AsyncPath]]` DEFAULT: `None`
`interval`	Seconds between periodic checkpoint saves (default 5 minutes). TYPE: `float` DEFAULT: `300.0`

Source code in scrapling/spiders/spider.py

def __init__(self, crawldir: Optional[Union[str, Path, AsyncPath]] = None, interval: float = 300.0):
    """Initialize the spider: logging, checkpoint settings, and sessions.

    :param crawldir: Directory for checkpoint files. If provided, enables pause/resume.
    :param interval: Seconds between periodic checkpoint saves (default 5 minutes).
    :raises ValueError: If the spider does not define a ``name``.
    :raises SessionConfigurationError: If ``configure_sessions()`` raises, or finishes
        without registering at least one session.
    """
    # A name is mandatory: it is used below to build the logger name and the log format.
    if self.name is None:
        raise ValueError(f"{self.__class__.__name__} must have a name.")

    # Per-spider logger, reset so that no handlers from an earlier setup remain attached.
    self.logger = logging.getLogger(f"scrapling.spiders.{self.name}")
    self.logger.setLevel(self.logging_level)
    self.logger.handlers.clear()
    self.logger.propagate = False  # Don't propagate to parent 'scrapling' logger

    # `logging_format` carries a `{spider_name}` placeholder that is filled in here;
    # the remaining %-style fields are left for the logging module.
    formatter = logging.Formatter(
        fmt=self.logging_format.format(spider_name=self.name), datefmt=self.logging_date_format
    )

    # Add a log counter handler to track log counts by level
    self._log_counter = LogCounterHandler()
    self.logger.addHandler(self._log_counter)

    # Console output is always attached.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    self.logger.addHandler(console_handler)

    # File output is optional; the parent directory is created on demand.
    if self.log_file:
        Path(self.log_file).parent.mkdir(parents=True, exist_ok=True)
        file_handler = logging.FileHandler(self.log_file)
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

    # Any truthy `crawldir` is normalized to a `Path`; a falsy value becomes None.
    self.crawldir: Optional[Path] = Path(crawldir) if crawldir else None
    self._interval = interval
    self._engine: Optional[CrawlerEngine] = None
    self._original_sigint_handler: Any = None

    self._session_manager = SessionManager()

    # Any failure inside the (possibly user-overridden) hook is re-raised as a
    # SessionConfigurationError, with the original exception chained as its cause.
    try:
        self.configure_sessions(self._session_manager)
    except Exception as e:
        raise SessionConfigurationError(f"Error in {self.__class__.__name__}.configure_sessions(): {e}") from e

    # The hook must have registered at least one session.
    if len(self._session_manager) == 0:
        raise SessionConfigurationError(f"{self.__class__.__name__}.configure_sessions() did not add any sessions")

    self.logger.info("Spider initialized")

name `class-attribute` `instance-attribute` ¶

name = None

start_urls `class-attribute` `instance-attribute` ¶

start_urls = []

allowed_domains `class-attribute` `instance-attribute` ¶

allowed_domains = set()

robots_txt_obey `class-attribute` `instance-attribute` ¶

robots_txt_obey = False

development_mode `class-attribute` `instance-attribute` ¶

development_mode = False

development_cache_dir `class-attribute` `instance-attribute` ¶

development_cache_dir = None

concurrent_requests `class-attribute` `instance-attribute` ¶

concurrent_requests = 4

concurrent_requests_per_domain `class-attribute` `instance-attribute` ¶

concurrent_requests_per_domain = 0

download_delay `class-attribute` `instance-attribute` ¶

download_delay = 0.0

max_blocked_retries `class-attribute` `instance-attribute` ¶

max_blocked_retries = 3

fp_include_kwargs `class-attribute` `instance-attribute` ¶

fp_include_kwargs = False

fp_keep_fragments `class-attribute` `instance-attribute` ¶

fp_keep_fragments = False

fp_include_headers `class-attribute` `instance-attribute` ¶

fp_include_headers = False

logging_level `class-attribute` `instance-attribute` ¶

logging_level = logging.DEBUG

logging_format `class-attribute` `instance-attribute` ¶

logging_format = "[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s"

logging_date_format `class-attribute` `instance-attribute` ¶

logging_date_format = '%Y-%m-%d %H:%M:%S'

log_file `class-attribute` `instance-attribute` ¶

log_file = None

logger `instance-attribute` ¶

logger = logging.getLogger(f"scrapling.spiders.{self.name}")

crawldir `instance-attribute` ¶

crawldir = Path(crawldir) if crawldir else None

stats `property` ¶

stats

Access current crawl stats (works during streaming).

start_requests `async` ¶

start_requests()

Generate initial requests to start the crawl.

By default, this generates Request objects for each URL in start_urls using the session manager's default session and parse() as callback.

Override this method for more control over initial requests (e.g., to add custom headers, use different callbacks, etc.)

Source code in scrapling/spiders/spider.py

async def start_requests(self) -> AsyncGenerator[Request, None]:
    """Yield the initial requests that seed the crawl.

    The default implementation emits one `Request` per URL in `start_urls`,
    each bound to the session manager's default session. No callback is passed
    explicitly here; per the framework documentation such requests are handled
    by `parse()`.

    Override this method for finer control over the initial requests
    (custom headers, different callbacks, and so on).

    :raises RuntimeError: If `start_urls` is empty.
    """
    if self.start_urls:
        for seed_url in self.start_urls:
            yield Request(seed_url, sid=self._session_manager.default_session_id)
        return

    raise RuntimeError(
        "Spider has no starting point, either set `start_urls` or override `start_requests` function."
    )

parse `abstractmethod` `async` ¶

parse(response)

Default callback for processing responses

Source code in scrapling/spiders/spider.py

@abstractmethod
async def parse(self, response: "Response") -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
    """Default callback for processing responses.

    Abstract: subclasses must provide their own implementation. The base body
    only raises.

    :param response: The response to process.
    :raises NotImplementedError: Whenever the base implementation is iterated.
    """
    raise NotImplementedError(f"{self.__class__.__name__} must implement parse() method")
    yield  # Unreachable; its presence makes this an async generator function (for type checkers)

on_start `async` ¶

on_start(resuming=False)

Called before crawling starts. Override for setup logic.

PARAMETER	DESCRIPTION
`resuming`	`True` when the spider is resuming from a checkpoint; provided for overriding implementations to use. TYPE: `bool` DEFAULT: `False`

Source code in scrapling/spiders/spider.py

async def on_start(self, resuming: bool = False) -> None:
    """Hook that runs before crawling begins; override it to add setup logic.

    The default implementation only emits a debug log line.

    :param resuming: Set when the spider is resuming from a checkpoint; available for overrides to use.
    """
    message = "Resuming spider from checkpoint" if resuming else "Starting spider"
    self.logger.debug(message)

on_close `async` ¶

on_close()

Called after crawling finishes. Override for cleanup logic.

Source code in scrapling/spiders/spider.py

async def on_close(self) -> None:
    """Hook that runs after crawling finishes; override it to add cleanup logic.

    The default implementation only emits a debug log line.
    """
    spider_logger = self.logger
    spider_logger.debug("Spider closed")

on_error `async` ¶

on_error(request, error)

Handle request errors for all spider requests.

Override for custom error handling.

Source code in scrapling/spiders/spider.py

async def on_error(self, request: Request, error: Exception) -> None:
    """Hook for request errors across all of the spider's requests.

    Does nothing by default; override it to implement custom error handling.

    :param request: The request the error is associated with.
    :param error: The exception that was raised.
    """
    return None

on_scraped_item `async` ¶

on_scraped_item(item)

A hook to be overridden by users to do some processing on scraped items, return None to drop the item silently.

Source code in scrapling/spiders/spider.py

async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
    """Post-processing hook for scraped items.

    Override to transform items; return `None` to drop the item silently.
    The default implementation hands the item back unchanged.

    :param item: The scraped item.
    :return: The item to keep, or `None` to drop it.
    """
    unchanged = item
    return unchanged

is_blocked `async` ¶

is_blocked(response)

Check if the response is blocked. Users should override this for custom detection logic.

Source code in scrapling/spiders/spider.py

async def is_blocked(self, response: "Response") -> bool:
    """Report whether the response looks blocked.

    The default check is purely status-based: the response counts as blocked
    when its status is one of `BLOCKED_CODES`. Override for custom detection logic.

    :param response: The response to inspect.
    :return: True if the response is considered blocked.
    """
    return response.status in BLOCKED_CODES

retry_blocked_request `async` ¶

retry_blocked_request(request, response)

Users should override this to prepare the blocked request before retrying, if needed.

Source code in scrapling/spiders/spider.py

async def retry_blocked_request(self, request: Request, response: "Response") -> Request:
    """Prepare a blocked request for its retry.

    The default implementation returns the request untouched; override it to
    adjust the request before it is retried.

    :param request: The request that was blocked.
    :param response: The response that was judged as blocked.
    :return: The request to retry.
    """
    retry_request = request
    return retry_request

__repr__ ¶

__repr__()

String representation of the spider.

Source code in scrapling/spiders/spider.py

def __repr__(self) -> str:
    """Return a short description of the form ``<ClassName 'spider name'>``."""
    class_name = self.__class__.__name__
    return f"<{class_name} '{self.name}'>"

configure_sessions ¶

configure_sessions(manager)

Configure sessions for this spider.

Override this method to add custom sessions. The default implementation registers a single FetcherSession under the ID "default".

The first session added becomes the default for start_requests() unless specified otherwise.

PARAMETER	DESCRIPTION
`manager`	SessionManager to configure TYPE: `SessionManager`

Source code in scrapling/spiders/spider.py

def configure_sessions(self, manager: SessionManager) -> None:
    """Register the sessions this spider will use.

    Override this method to add custom sessions.
    By default a single `FetcherSession` is registered under the ID "default".

    The first session added becomes the default for `start_requests()` unless specified otherwise.

    :param manager: SessionManager to configure
    """
    # Imported locally, as in the original implementation.
    from scrapling.fetchers import FetcherSession

    default_session = FetcherSession()
    manager.add("default", default_session)

pause ¶

pause()

Request graceful shutdown of the crawling process.

Source code in scrapling/spiders/spider.py

def pause(self):
    """Ask the running crawl to shut down gracefully.

    :raises RuntimeError: If there is no active engine, i.e. no crawl is running.
    """
    if not self._engine:
        raise RuntimeError("No active crawl to stop")
    self._engine.request_pause()

start ¶

start(use_uvloop=False, **backend_options)

Run the spider and return results.

This is the main entry point for running a spider. Handles async execution internally via anyio.

Pressing Ctrl+C will initiate graceful shutdown (waits for active tasks to complete). Pressing Ctrl+C a second time will force immediate stop.

If crawldir is set, a checkpoint will also be saved on graceful shutdown, allowing you to resume the crawl later by running the spider again.

PARAMETER	DESCRIPTION
`use_uvloop`	Whether to use the faster uvloop/winloop event loop implementation, if available. TYPE: `bool` DEFAULT: `False`
`backend_options`	Asyncio backend options to be used with `anyio.run` TYPE: `Any` DEFAULT: `{}`

Source code in scrapling/spiders/spider.py

def start(self, use_uvloop: bool = False, **backend_options: Any) -> CrawlResult:
    """Run the spider to completion and return its results.

    Main synchronous entry point; the async machinery is driven internally through anyio.

    The first Ctrl+C triggers a graceful shutdown (active tasks are allowed to finish);
    a second Ctrl+C forces an immediate stop.

    When crawldir is set, a checkpoint is also written on graceful shutdown so the
    crawl can be resumed later by running the spider again.

    :param use_uvloop: Whether to use the faster uvloop/winloop event loop implementation, if available.
    :param backend_options: Asyncio backend options to be used with `anyio.run`
    :return: The result of the crawl.
    """
    options = backend_options or {}
    if use_uvloop:
        options["use_uvloop"] = True

    # Install the SIGINT handler used for graceful shutdown; always restored afterwards.
    self._setup_signal_handler()
    try:
        result = anyio.run(self.__run, backend="asyncio", backend_options=options)
    finally:
        self._restore_signal_handler()
    return result

stream `async` ¶

stream()

Stream items as they're scraped. Ideal for long-running spiders or building applications on top of the spiders.

Must be called from an async context. Yields items one by one as they are scraped. Access spider.stats during iteration for real-time statistics.

Note: SIGINT handling for pause/resume is not available in stream mode.

Source code in scrapling/spiders/spider.py

async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
    """Yield items one at a time as they are scraped.

    Suited to long-running spiders or to applications built on top of the spiders.
    Must be called from an async context. `spider.stats` can be read during
    iteration for real-time statistics.

    Note: SIGINT handling for pause/resume is not available in stream mode.
    """
    logger_token = set_logger(self.logger)
    try:
        self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)
        async for scraped in self._engine:
            yield scraped
    finally:
        # Runs on normal completion, on error, and when the generator is closed early.
        self._engine = None
        reset_logger(logger_token)
        if self.log_file:
            file_handlers = (h for h in self.logger.handlers if isinstance(h, logging.FileHandler))
            for file_handler in file_handlers:
                file_handler.close()

scrapling.spiders.Request ¶

Request(
    url,
    sid="",
    callback=None,
    priority=0,
    dont_filter=False,
    meta=None,
    _retry_count=0,
    **kwargs
)

Source code in scrapling/spiders/request.py

def __init__(
    self,
    url: str,
    sid: str = "",
    callback: Callable[[Response], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None,
    priority: int = 0,
    dont_filter: bool = False,
    meta: dict[str, Any] | None = None,
    _retry_count: int = 0,
    **kwargs: Any,
) -> None:
    """Create a request.

    :param url: Target URL.
    :param sid: ID of the session to use; an empty string means no explicit session.
    :param callback: Async-generator callback for the response, if any.
    :param priority: Priority value, used when requests are compared.
    :param dont_filter: Stored as-is on the request.
    :param meta: Metadata dict; a falsy value is replaced by a fresh empty dict.
    :param _retry_count: Internal retry counter.
    :param kwargs: Extra keyword arguments, kept in `_session_kwargs` for the session.
    """
    self.url: str = url
    self.sid: str = sid
    self.callback = callback
    self.priority: int = priority
    self.dont_filter: bool = dont_filter
    self.meta: dict[str, Any] = meta or {}
    self._retry_count: int = _retry_count
    self._session_kwargs = kwargs or {}
    # Fingerprint cache; stays None until a fingerprint has been generated.
    self._fp: Optional[bytes] = None

url `instance-attribute` ¶

url = url

sid `instance-attribute` ¶

sid = sid

callback `instance-attribute` ¶

callback = callback

priority `instance-attribute` ¶

priority = priority

dont_filter `instance-attribute` ¶

dont_filter = dont_filter

meta `instance-attribute` ¶

meta = meta if meta else {}

domain `cached` `property` ¶

domain

copy ¶

copy()

Create a copy of this request.

Source code in scrapling/spiders/request.py

def copy(self) -> "Request":
    """Return a new request carrying the same settings as this one.

    `meta` is shallow-copied; the session keyword arguments are passed through
    again as keyword arguments. The cached fingerprint is not carried over.
    """
    duplicate = Request(
        url=self.url,
        sid=self.sid,
        callback=self.callback,
        priority=self.priority,
        dont_filter=self.dont_filter,
        meta=self.meta.copy(),
        _retry_count=self._retry_count,
        **self._session_kwargs,
    )
    return duplicate

update_fingerprint ¶

update_fingerprint(
    include_kwargs=False,
    include_headers=False,
    keep_fragments=False,
)

Generate a unique fingerprint for deduplication.

Caches the result in self._fp after first computation.

Source code in scrapling/spiders/request.py

def update_fingerprint(
    self,
    include_kwargs: bool = False,
    include_headers: bool = False,
    keep_fragments: bool = False,
) -> bytes:
    """Generate a unique fingerprint for deduplication.

    The fingerprint is a SHA-1 digest over the session ID, the request body
    (taken from the `data` or `json` session kwargs), the HTTP method, and the
    canonicalized URL, optionally extended with the other session kwargs
    and/or the request headers.

    The result is cached in `self._fp` after the first computation; later calls
    return the cached value and ignore their arguments.

    :param include_kwargs: Also hash the session kwargs (except `data` and `json`).
    :param include_headers: Also hash the `headers` (or `extra_headers`) kwarg.
    :param keep_fragments: Passed to `canonicalize_url` for the URL part.
    :return: The fingerprint digest.
    """
    if self._fp is not None:
        return self._fp

    # Body: prefer the `data` kwarg; fall back to `json` only when `data` is empty/missing.
    post_data = self._session_kwargs.get("data", {})
    body = b""
    if post_data:
        if isinstance(post_data, dict | list | tuple):
            body = urlencode(post_data).encode()
        elif isinstance(post_data, str):
            body = post_data.encode()
        elif isinstance(post_data, BytesIO):
            body = post_data.getvalue()
        elif isinstance(post_data, bytes):
            body = post_data
        # Any other type leaves the body empty.
    else:
        post_data = self._session_kwargs.get("json", {})
        body = orjson.dumps(post_data) if post_data else b""

    data: Dict[str, str | Tuple] = {
        "sid": self.sid,
        "body": body.hex(),
        "method": self._session_kwargs.get("method", "GET"),
        "url": canonicalize_url(self.url, keep_fragments=keep_fragments),
    }

    if include_kwargs:
        filtered_kwargs = {
            key.lower(): _stable_value_repr(value)
            for key, value in self._session_kwargs.items()
            if key.lower() not in ("data", "json")
        }
        data["kwargs"] = tuple(sorted(filtered_kwargs.items()))

    if include_headers:
        headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {}
        processed_headers = {}
        # Some header normalization: lower-case the names and hex-encode both parts
        for key, value in headers.items():
            processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value).hex()
        # Sort the pairs (as the kwargs branch does) so the fingerprint does not
        # depend on the order in which the caller happened to list the headers.
        data["headers"] = tuple(sorted(processed_headers.items()))

    fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest()
    self._fp = fp
    return fp

__repr__ ¶

__repr__()

Source code in scrapling/spiders/request.py

def __repr__(self) -> str:
    """Return a debug representation with the URL, priority, and callback name."""
    callback_name = getattr(self.callback, "__name__", None)
    if not callback_name:
        # No callback, or a callable without a usable `__name__`.
        callback_name = "None"
    return f"<Request({self.url}) priority={self.priority} callback={callback_name}>"

__str__ ¶

__str__()

Source code in scrapling/spiders/request.py

def __str__(self) -> str:
    """Return the request's URL."""
    target_url = self.url
    return target_url

__lt__ ¶

__lt__(other)

Compare requests by priority

Source code in scrapling/spiders/request.py

def __lt__(self, other: object) -> bool:
    """Order requests by priority: true when this request's priority is lower.

    Returns `NotImplemented` when `other` is not a `Request`.
    """
    if isinstance(other, Request):
        return self.priority < other.priority
    return NotImplemented

__gt__ ¶

__gt__(other)

Compare requests by priority

Source code in scrapling/spiders/request.py

def __gt__(self, other: object) -> bool:
    """Order requests by priority: true when this request's priority is higher.

    Returns `NotImplemented` when `other` is not a `Request`.
    """
    if isinstance(other, Request):
        return self.priority > other.priority
    return NotImplemented

__eq__ ¶

__eq__(other)

Requests are equal if they have the same fingerprint.

Source code in scrapling/spiders/request.py

def __eq__(self, other: object) -> bool:
    """Compare two requests by fingerprint.

    Returns `NotImplemented` when `other` is not a `Request`.

    :raises RuntimeError: If either request has no fingerprint generated yet.
    """
    if isinstance(other, Request):
        if self._fp is None or other._fp is None:
            raise RuntimeError("Cannot compare requests before generating their fingerprints!")
        return self._fp == other._fp
    return NotImplemented

__getstate__ ¶

__getstate__()

Prepare state for pickling - store callback as name string for pickle compatibility.

Source code in scrapling/spiders/request.py

def __getstate__(self) -> dict[str, Any]:
    """Build the picklable state: the callback is replaced by its name.

    The callable itself is not pickled; only its `__name__` (or `None`) is kept
    under `_callback_name`, and `callback` is blanked out in the state.
    """
    return {
        **self.__dict__,
        # `getattr` with a default already yields None when there is no callback.
        "_callback_name": getattr(self.callback, "__name__", None),
        "callback": None,  # Don't pickle the actual callable
    }

__setstate__ ¶

__setstate__(state)

Restore state from pickle - callback restored later via _restore_callback().

Source code in scrapling/spiders/request.py

def __setstate__(self, state: dict[str, Any]) -> None:
    """Restore attributes from pickled state.

    The callback name is split out of `state` into `_callback_name`; the callable
    itself is restored later via `_restore_callback()`. Note that the
    `_callback_name` entry is removed from the passed-in `state` dict.
    """
    saved_name = state.pop("_callback_name", None)
    self._callback_name: str | None = saved_name
    self.__dict__.update(state)

Result Classes¶

scrapling.spiders.result.CrawlResult `dataclass` ¶

CrawlResult(stats, items, paused=False)

Complete result from a spider run.

stats `instance-attribute` ¶

stats

items `instance-attribute` ¶

items

paused `class-attribute` `instance-attribute` ¶

paused = False

completed `property` ¶

completed

True if the crawl completed normally (not paused).

__len__ ¶

__len__()

Source code in scrapling/spiders/result.py

def __len__(self) -> int:
    """Return the number of scraped items in this result."""
    item_count = len(self.items)
    return item_count

__iter__ ¶

__iter__()

Source code in scrapling/spiders/result.py

def __iter__(self) -> Iterator[dict[str, Any]]:
    """Iterate over the scraped items in this result."""
    item_iterator = iter(self.items)
    return item_iterator

scrapling.spiders.result.CrawlStats `dataclass` ¶

CrawlStats(
    requests_count=0,
    concurrent_requests=0,
    concurrent_requests_per_domain=0,
    failed_requests_count=0,
    offsite_requests_count=0,
    robots_disallowed_count=0,
    cache_hits=0,
    cache_misses=0,
    response_bytes=0,
    items_scraped=0,
    items_dropped=0,
    start_time=0.0,
    end_time=0.0,
    download_delay=0.0,
    blocked_requests_count=0,
    custom_stats=dict(),
    response_status_count=dict(),
    domains_response_bytes=dict(),
    sessions_requests_count=dict(),
    proxies=list(),
    log_levels_counter=dict(),
)

Statistics for a crawl run.

requests_count `class-attribute` `instance-attribute` ¶

requests_count = 0

concurrent_requests `class-attribute` `instance-attribute` ¶

concurrent_requests = 0

concurrent_requests_per_domain `class-attribute` `instance-attribute` ¶

concurrent_requests_per_domain = 0

failed_requests_count `class-attribute` `instance-attribute` ¶

failed_requests_count = 0

offsite_requests_count `class-attribute` `instance-attribute` ¶

offsite_requests_count = 0

robots_disallowed_count `class-attribute` `instance-attribute` ¶

robots_disallowed_count = 0

cache_hits `class-attribute` `instance-attribute` ¶

cache_hits = 0

cache_misses `class-attribute` `instance-attribute` ¶

cache_misses = 0

response_bytes `class-attribute` `instance-attribute` ¶

response_bytes = 0

items_scraped `class-attribute` `instance-attribute` ¶

items_scraped = 0

items_dropped `class-attribute` `instance-attribute` ¶

items_dropped = 0

start_time `class-attribute` `instance-attribute` ¶

start_time = 0.0

end_time `class-attribute` `instance-attribute` ¶

end_time = 0.0

download_delay `class-attribute` `instance-attribute` ¶

download_delay = 0.0

blocked_requests_count `class-attribute` `instance-attribute` ¶

blocked_requests_count = 0

custom_stats `class-attribute` `instance-attribute` ¶

custom_stats = field(default_factory=dict)

response_status_count `class-attribute` `instance-attribute` ¶

response_status_count = field(default_factory=dict)

domains_response_bytes `class-attribute` `instance-attribute` ¶

domains_response_bytes = field(default_factory=dict)

sessions_requests_count `class-attribute` `instance-attribute` ¶

sessions_requests_count = field(default_factory=dict)

proxies `class-attribute` `instance-attribute` ¶

proxies = field(default_factory=list)

log_levels_counter `class-attribute` `instance-attribute` ¶

log_levels_counter = field(default_factory=dict)

elapsed_seconds `property` ¶

elapsed_seconds

requests_per_second `property` ¶

requests_per_second

increment_status ¶

increment_status(status)

Source code in scrapling/spiders/result.py

def increment_status(self, status: int) -> None:
    """Count one more response with the given status, under the key ``status_<code>``.

    :param status: Response status code.
    """
    key = f"status_{status}"
    counts = self.response_status_count
    counts[key] = counts.get(key, 0) + 1

increment_response_bytes ¶

increment_response_bytes(domain, count)

Source code in scrapling/spiders/result.py

def increment_response_bytes(self, domain: str, count: int) -> None:
    """Add `count` bytes to both the overall total and the per-domain total.

    :param domain: Domain the bytes are attributed to.
    :param count: Number of bytes to add.
    """
    self.response_bytes += count
    per_domain = self.domains_response_bytes
    per_domain[domain] = per_domain.get(domain, 0) + count

increment_requests_count ¶

increment_requests_count(sid)

Source code in scrapling/spiders/result.py

def increment_requests_count(self, sid: str) -> None:
    """Count one more request, both overall and for the given session ID.

    :param sid: ID of the session the request was made with.
    """
    self.requests_count += 1
    per_session = self.sessions_requests_count
    per_session[sid] = per_session.get(sid, 0) + 1

to_dict ¶

to_dict()

Source code in scrapling/spiders/result.py

def to_dict(self) -> dict[str, Any]:
    """Return the statistics as a plain dictionary.

    Elapsed time, download delay, and requests-per-second are rounded to two
    decimals. The log level counter is exposed under the key ``log_count``.
    Dict and list values are returned as-is (not copied).
    """
    return dict(
        items_scraped=self.items_scraped,
        items_dropped=self.items_dropped,
        elapsed_seconds=round(self.elapsed_seconds, 2),
        download_delay=round(self.download_delay, 2),
        concurrent_requests=self.concurrent_requests,
        concurrent_requests_per_domain=self.concurrent_requests_per_domain,
        requests_count=self.requests_count,
        requests_per_second=round(self.requests_per_second, 2),
        sessions_requests_count=self.sessions_requests_count,
        failed_requests_count=self.failed_requests_count,
        offsite_requests_count=self.offsite_requests_count,
        robots_disallowed_count=self.robots_disallowed_count,
        cache_hits=self.cache_hits,
        cache_misses=self.cache_misses,
        blocked_requests_count=self.blocked_requests_count,
        response_status_count=self.response_status_count,
        response_bytes=self.response_bytes,
        domains_response_bytes=self.domains_response_bytes,
        proxies=self.proxies,
        custom_stats=self.custom_stats,
        log_count=self.log_levels_counter,
    )

scrapling.spiders.result.ItemList ¶

Bases: list


              flowchart TD
              scrapling.spiders.result.ItemList[ItemList]

              

              click scrapling.spiders.result.ItemList href "" "scrapling.spiders.result.ItemList"

A list of scraped items with export capabilities.

to_json ¶

to_json(path, *, indent=False)

Export items to a JSON file.

PARAMETER	DESCRIPTION
`path`	Path to the output file TYPE: `Union[str, Path]`
`indent`	Pretty-print with 2-space indentation (slightly slower) TYPE: `bool` DEFAULT: `False`

Source code in scrapling/spiders/result.py

def to_json(self, path: Union[str, Path], *, indent: bool = False):
    """Write all items to a single JSON file.

    Missing parent directories of the output file are created.

    :param path: Path to the output file
    :param indent: Pretty-print with 2-space indentation (slightly slower)
    """
    if indent:
        flags = orjson.OPT_SERIALIZE_NUMPY | orjson.OPT_INDENT_2
    else:
        flags = orjson.OPT_SERIALIZE_NUMPY

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = orjson.dumps(list(self), option=flags)
    target.write_bytes(payload)
    log.info("Saved %d items to %s", len(self), path)

to_jsonl ¶

to_jsonl(path)

Export items as JSON Lines (one JSON object per line).

PARAMETER	DESCRIPTION
`path`	Path to the output file TYPE: `Union[str, Path]`

Source code in scrapling/spiders/result.py

def to_jsonl(self, path: Union[str, Path]):
    """Write the items in JSON Lines format: one JSON document per line.

    Missing parent directories of the output file are created.

    :param path: Path to the output file
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as sink:
        for record in self:
            sink.write(orjson.dumps(record, option=orjson.OPT_SERIALIZE_NUMPY) + b"\n")
    log.info("Saved %d items to %s", len(self), path)

Session Management¶

scrapling.spiders.session.SessionManager ¶

SessionManager()

Manages pre-configured session instances.

Source code in scrapling/spiders/session.py

def __init__(self) -> None:
    """Create an empty manager: no sessions, no default session, not started."""
    # Registered sessions by ID, plus the subset of IDs that are started lazily.
    self._sessions: dict[str, Session] = {}
    self._lazy_sessions: Set[str] = set()
    self._default_session_id: str | None = None
    self._started: bool = False
    # Guards the on-demand start of lazy sessions.
    self._lazy_lock = Lock()

default_session_id `property` ¶

default_session_id

session_ids `property` ¶

session_ids

add ¶

add(session_id, session, *, default=False, lazy=False)

Register a session instance.

PARAMETER	DESCRIPTION
`session_id`	Name to reference this session in requests TYPE: `str`
`session`	Your pre-configured session instance TYPE: `Session`
`default`	If True, this becomes the default session TYPE: `bool` DEFAULT: `False`
`lazy`	If True, the session will be started only when a request uses its ID. TYPE: `bool` DEFAULT: `False`

Source code in scrapling/spiders/session.py

def add(self, session_id: str, session: Session, *, default: bool = False, lazy: bool = False) -> "SessionManager":
    """Register a session instance under the given ID.

    The first session registered becomes the default even without `default=True`.

    :param session_id: Name to reference this session in requests
    :param session: Your pre-configured session instance
    :param default: If True, this becomes the default session
    :param lazy: If True, the session will be started only when a request uses its ID.
    :return: The manager itself, so calls can be chained.
    :raises ValueError: If `session_id` is already registered.
    """
    if session_id in self._sessions:
        raise ValueError(f"Session '{session_id}' already registered")

    self._sessions[session_id] = session

    becomes_default = default or self._default_session_id is None
    if becomes_default:
        self._default_session_id = session_id

    if lazy:
        self._lazy_sessions.add(session_id)

    return self

remove ¶

remove(session_id)

Removes a session.

PARAMETER	DESCRIPTION
`session_id`	ID of session to remove TYPE: `str`

Source code in scrapling/spiders/session.py

def remove(self, session_id: str) -> None:
    """Remove a session, discarding it.

    Delegates to `pop()` and ignores the returned session.

    :param session_id: ID of session to remove
    """
    self.pop(session_id)

pop ¶

pop(session_id)

Removes and returns a session.

PARAMETER	DESCRIPTION
`session_id`	ID of session to remove TYPE: `str`

Source code in scrapling/spiders/session.py

def pop(self, session_id: str) -> Session:
    """Remove a session from the manager and return it.

    If the removed session was the default one, the first remaining session (in
    registration order) becomes the new default, or `None` when no sessions are left.

    :param session_id: ID of session to remove
    :return: The removed session instance.
    :raises KeyError: If no session is registered under `session_id`.
    """
    if session_id not in self._sessions:
        raise KeyError(f"Session '{session_id}' not found")

    session = self._sessions.pop(session_id)
    self._lazy_sessions.discard(session_id)

    # Decide on the ID alone. Testing the session object's truthiness (as before)
    # would leave the default pointing at a removed session if the object is falsy.
    if self._default_session_id == session_id:
        self._default_session_id = next(iter(self._sessions), None)

    return session

get ¶

get(session_id)

Source code in scrapling/spiders/session.py

def get(self, session_id: str) -> Session:
    """Return the session registered under `session_id`.

    :param session_id: ID of the session to look up
    :raises KeyError: If the ID is unknown; the message lists the available IDs.
    """
    if session_id in self._sessions:
        return self._sessions[session_id]

    available = ", ".join(self._sessions)
    raise KeyError(f"Session '{session_id}' not found. Available: {available}")

start `async` ¶

start()

Start all sessions that aren't already alive.

Source code in scrapling/spiders/session.py

async def start(self) -> None:
    """Start every non-lazy session that is not alive yet.

    Does nothing if the manager is already marked as started. Lazy sessions are
    skipped here.
    """
    if not self._started:
        for sid, session in self._sessions.items():
            if sid in self._lazy_sessions or session._is_alive:
                continue
            await session.__aenter__()

        self._started = True

close `async` ¶

close()

Close all registered sessions.

Source code in scrapling/spiders/session.py

async def close(self) -> None:
    """Close the registered sessions and mark the manager as not started.

    A lazy session that is not alive is skipped; every other session has its
    `__aexit__` called.
    """
    for sid, session in self._sessions.items():
        idle_lazy = sid in self._lazy_sessions and not session._is_alive
        if not idle_lazy:
            await session.__aexit__(None, None, None)

    self._started = False

fetch `async` ¶

fetch(request)

Source code in scrapling/spiders/session.py

async def fetch(self, request: Request) -> Response:
    """Fetch a request through its session and return the response.

    The session is chosen by `request.sid`, falling back to the default session.
    A lazy session that is not alive yet is started first. The response gets the
    originating request attached and the request's meta merged into its own.

    :param request: The request to fetch.
    :return: The response, with `request` and merged `meta` set.
    :raises KeyError: If the session ID is not registered (raised by `get()`).
    :raises TypeError: If a `FetcherSession` holds a client that is not async-capable.
    :raises RuntimeError: If the resolved session object is falsy.
    """
    sid = request.sid or self.default_session_id
    session = self.get(sid)
    if not session:
        raise RuntimeError("No session found with the request session id")

    # Start a lazy session on first use. The liveness check is repeated under the
    # lock so concurrent fetches don't enter the same session twice.
    if sid in self._lazy_sessions and not session._is_alive:
        async with self._lazy_lock:
            if not session._is_alive:
                await session.__aenter__()

    if not isinstance(session, FetcherSession):
        response = await session.fetch(url=request.url, **request._session_kwargs)
    else:
        client = session._client
        if not isinstance(client, _ASyncSessionLogic):
            # Sync session or other types - shouldn't happen in async context
            raise TypeError(f"Session type {type(client)} not supported for async fetch")

        # Work on a copy so popping `method` leaves the request's kwargs intact.
        session_kwargs = request._session_kwargs.copy()
        http_method = cast(SUPPORTED_HTTP_METHODS, session_kwargs.pop("method", "GET"))
        response = await client._make_request(
            method=http_method,
            url=request.url,
            **session_kwargs,
        )

    response.request = request
    # Merge request meta into response meta (response meta takes priority)
    response.meta = {**request.meta, **response.meta}
    return response

__aenter__ `async` ¶

__aenter__()

Source code in scrapling/spiders/session.py

async def __aenter__(self) -> "SessionManager":
    """Start the manager on entering ``async with`` and hand it back."""
    manager = self
    await manager.start()
    return manager

__aexit__ `async` ¶

__aexit__(*exc)

Source code in scrapling/spiders/session.py

async def __aexit__(self, *exc) -> None:
    """Close the manager's sessions when the ``async with`` block exits.

    The exception details passed in ``exc`` are ignored, and ``None`` is
    returned, so an exception raised inside the block is not suppressed.
    """
    await self.close()
    return None

__contains__ ¶

__contains__(session_id)

Check if a session ID is registered.

Source code in scrapling/spiders/session.py

def __contains__(self, session_id: str) -> bool:
    """Report whether ``session_id`` belongs to a registered session."""
    registered = self._sessions
    return session_id in registered

__len__ ¶

__len__()

Number of registered sessions.

Source code in scrapling/spiders/session.py

def __len__(self) -> int:
    """Return how many sessions are currently registered."""
    registered = self._sessions
    return len(registered)

Spider Classes¶

scrapling.spiders.Spider ¶

name class-attribute instance-attribute ¶

start_urls class-attribute instance-attribute ¶

allowed_domains class-attribute instance-attribute ¶

robots_txt_obey class-attribute instance-attribute ¶

development_mode class-attribute instance-attribute ¶

development_cache_dir class-attribute instance-attribute ¶

concurrent_requests class-attribute instance-attribute ¶

concurrent_requests_per_domain class-attribute instance-attribute ¶

download_delay class-attribute instance-attribute ¶

max_blocked_retries class-attribute instance-attribute ¶

fp_include_kwargs class-attribute instance-attribute ¶

fp_keep_fragments class-attribute instance-attribute ¶

fp_include_headers class-attribute instance-attribute ¶

logging_level class-attribute instance-attribute ¶

logging_format class-attribute instance-attribute ¶

logging_date_format class-attribute instance-attribute ¶

log_file class-attribute instance-attribute ¶

logger instance-attribute ¶

crawldir instance-attribute ¶

stats property ¶

start_requests async ¶

parse abstractmethod async ¶

on_start async ¶

on_close async ¶

on_error async ¶

on_scraped_item async ¶

is_blocked async ¶

retry_blocked_request async ¶

__repr__ ¶

configure_sessions ¶

pause ¶

start ¶

stream async ¶

scrapling.spiders.Request ¶

url instance-attribute ¶

sid instance-attribute ¶

callback instance-attribute ¶

priority instance-attribute ¶

dont_filter instance-attribute ¶

meta instance-attribute ¶

domain cached property ¶

copy ¶

update_fingerprint ¶

__repr__ ¶

__str__ ¶

__lt__ ¶

__gt__ ¶

__eq__ ¶

__getstate__ ¶

__setstate__ ¶

Result Classes¶

scrapling.spiders.result.CrawlResult dataclass ¶

stats instance-attribute ¶

items instance-attribute ¶

paused class-attribute instance-attribute ¶

completed property ¶

__len__ ¶

__iter__ ¶

scrapling.spiders.result.CrawlStats dataclass ¶

requests_count class-attribute instance-attribute ¶

concurrent_requests class-attribute instance-attribute ¶

concurrent_requests_per_domain class-attribute instance-attribute ¶

failed_requests_count class-attribute instance-attribute ¶

offsite_requests_count class-attribute instance-attribute ¶

robots_disallowed_count class-attribute instance-attribute ¶

cache_hits class-attribute instance-attribute ¶

cache_misses class-attribute instance-attribute ¶

response_bytes class-attribute instance-attribute ¶

items_scraped class-attribute instance-attribute ¶

items_dropped class-attribute instance-attribute ¶

start_time class-attribute instance-attribute ¶

end_time class-attribute instance-attribute ¶

download_delay class-attribute instance-attribute ¶

blocked_requests_count class-attribute instance-attribute ¶

custom_stats class-attribute instance-attribute ¶

response_status_count class-attribute instance-attribute ¶

domains_response_bytes class-attribute instance-attribute ¶

sessions_requests_count class-attribute instance-attribute ¶

name `class-attribute` `instance-attribute` ¶

start_urls `class-attribute` `instance-attribute` ¶

allowed_domains `class-attribute` `instance-attribute` ¶

robots_txt_obey `class-attribute` `instance-attribute` ¶

development_mode `class-attribute` `instance-attribute` ¶

development_cache_dir `class-attribute` `instance-attribute` ¶

concurrent_requests `class-attribute` `instance-attribute` ¶

concurrent_requests_per_domain `class-attribute` `instance-attribute` ¶

download_delay `class-attribute` `instance-attribute` ¶

max_blocked_retries `class-attribute` `instance-attribute` ¶

fp_include_kwargs `class-attribute` `instance-attribute` ¶

fp_keep_fragments `class-attribute` `instance-attribute` ¶

fp_include_headers `class-attribute` `instance-attribute` ¶

logging_level `class-attribute` `instance-attribute` ¶

logging_format `class-attribute` `instance-attribute` ¶

logging_date_format `class-attribute` `instance-attribute` ¶

log_file `class-attribute` `instance-attribute` ¶

logger `instance-attribute` ¶

crawldir `instance-attribute` ¶

stats `property` ¶

start_requests `async` ¶

parse `abstractmethod` `async` ¶

on_start `async` ¶

on_close `async` ¶

on_error `async` ¶

on_scraped_item `async` ¶

is_blocked `async` ¶

retry_blocked_request `async` ¶

__repr__ ¶

stream `async` ¶

url `instance-attribute` ¶

sid `instance-attribute` ¶

callback `instance-attribute` ¶

priority `instance-attribute` ¶

dont_filter `instance-attribute` ¶

meta `instance-attribute` ¶

domain `cached` `property` ¶

__repr__ ¶

__str__ ¶

__lt__ ¶

__gt__ ¶

__eq__ ¶

__getstate__ ¶

__setstate__ ¶

scrapling.spiders.result.CrawlResult `dataclass` ¶

stats `instance-attribute` ¶

items `instance-attribute` ¶

paused `class-attribute` `instance-attribute` ¶

completed `property` ¶

__len__ ¶

__iter__ ¶

scrapling.spiders.result.CrawlStats `dataclass` ¶

requests_count `class-attribute` `instance-attribute` ¶

concurrent_requests `class-attribute` `instance-attribute` ¶

concurrent_requests_per_domain `class-attribute` `instance-attribute` ¶

failed_requests_count `class-attribute` `instance-attribute` ¶

offsite_requests_count `class-attribute` `instance-attribute` ¶

robots_disallowed_count `class-attribute` `instance-attribute` ¶

cache_hits `class-attribute` `instance-attribute` ¶

cache_misses `class-attribute` `instance-attribute` ¶

response_bytes `class-attribute` `instance-attribute` ¶

items_scraped `class-attribute` `instance-attribute` ¶

items_dropped `class-attribute` `instance-attribute` ¶

start_time `class-attribute` `instance-attribute` ¶

end_time `class-attribute` `instance-attribute` ¶

download_delay `class-attribute` `instance-attribute` ¶

blocked_requests_count `class-attribute` `instance-attribute` ¶

custom_stats `class-attribute` `instance-attribute` ¶

response_status_count `class-attribute` `instance-attribute` ¶

domains_response_bytes `class-attribute` `instance-attribute` ¶

sessions_requests_count `class-attribute` `instance-attribute` ¶

proxies `class-attribute` `instance-attribute` ¶

log_levels_counter `class-attribute` `instance-attribute` ¶

elapsed_seconds `property` ¶

requests_per_second `property` ¶

default_session_id `property` ¶

session_ids `property` ¶

start `async` ¶

close `async` ¶

fetch `async` ¶