# _scanners URL: /api/lexers/_scanners/ Section: lexers -------------------------------------------------------------------------------- _scanners - Rosettes window.BENGAL_THEME_DEFAULTS = { appearance: 'light', palette: 'brown-bengal' }; window.Bengal = window.Bengal || {}; window.Bengal.enhanceBaseUrl = '/rosettes/assets/js/enhancements'; window.Bengal.watchDom = true; window.Bengal.debug = false; window.Bengal.enhanceUrls = { 'toc': '/rosettes/assets/js/enhancements/toc.632a9783.js', 'docs-nav': '/rosettes/assets/js/enhancements/docs-nav.57e4b129.js', 'tabs': '/rosettes/assets/js/enhancements/tabs.aac9e817.js', 'lightbox': '/rosettes/assets/js/enhancements/lightbox.1ca22aa1.js', 'interactive': '/rosettes/assets/js/enhancements/interactive.fc077855.js', 'mobile-nav': '/rosettes/assets/js/enhancements/mobile-nav.d991657f.js', 'action-bar': '/rosettes/assets/js/enhancements/action-bar.d62417f4.js', 'copy-link': '/rosettes/assets/js/enhancements/copy-link.7d9a5c29.js', 'data-table': '/rosettes/assets/js/enhancements/data-table.1f5bc1eb.js', 'lazy-loaders': '/rosettes/assets/js/enhancements/lazy-loaders.a5c38245.js', 'holo': '/rosettes/assets/js/enhancements/holo.ee13c841.js', 'link-previews': '/rosettes/assets/js/enhancements/link-previews.8d906535.js' }; (function () { try { var defaults = window.BENGAL_THEME_DEFAULTS || { appearance: 'system', palette: '' }; var defaultAppearance = defaults.appearance; if (defaultAppearance === 'system') { defaultAppearance = (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) ? 'dark' : 'light'; } var storedTheme = localStorage.getItem('bengal-theme'); var storedPalette = localStorage.getItem('bengal-palette'); var theme = storedTheme ? (storedTheme === 'system' ? defaultAppearance : storedTheme) : defaultAppearance; var palette = storedPalette ?? 
defaults.palette; document.documentElement.setAttribute('data-theme', theme); if (palette) { document.documentElement.setAttribute('data-palette', palette); } } catch (e) { document.documentElement.setAttribute('data-theme', 'light'); } })(); { "prerender": [ { "where": { "and": [ { "href_matches": "/docs/*" }, { "not": { "selector_matches": "[data-external], [target=_blank], .external" } } ] }, "eagerness": "conservative" } ], "prefetch": [ { "where": { "and": [ { "href_matches": "/*" }, { "not": { "selector_matches": "[data-external], [target=_blank], .external" } } ] }, "eagerness": "conservative" } ] } Skip to main content Magnifying Glass ESC Recent Clear Magnifying Glass No results for "" Start typing to search... ↑↓ Navigate ↵ Open ESC Close Powered by Lunr ⌾⌾⌾ DocumentationInfoAboutArrow ClockwiseGet StartedCodeHighlightingPaletteStylingStarburstExtendingFormattersNoteTutorialsBookmarkReferenceReleasesDevGitHubAPI Reference Magnifying Glass Search ⌘K Palette Appearance Chevron Down Mode Monitor System Sun Light Moon Dark Palette Snow Lynx Brown Bengal Silver Bengal Charcoal Bengal Blue Bengal List ⌾⌾⌾ Magnifying Glass Search X Close Documentation Caret Down Info About Arrow Clockwise Get Started Code Highlighting Palette Styling Starburst Extending Formatters Note Tutorials Bookmark Reference Releases Dev Caret Down GitHub API Reference Palette Appearance Chevron Down Mode Monitor System Sun Light Moon Dark Palette Snow Lynx Brown Bengal Silver Bengal Charcoal Bengal Blue Bengal Rosettes API Reference Caret Right Formatters html null terminal Caret Right Lexers _scanners _state_machine bash_sm c_sm clojure_sm cpp_sm css_sm csv_sm cuda_sm cue_sm dart_sm diff_sm dockerfile_sm elixir_sm gleam_sm go_sm graphql_sm groovy_sm haskell_sm hcl_sm html_sm ini_sm java_sm javascript_sm jinja_sm json_sm julia_sm kida_sm kotlin_sm lua_sm makefile_sm markdown_sm mojo_sm nginx_sm nim_sm perl_sm php_sm pkl_sm plaintext_sm powershell_sm protobuf_sm python_sm r_sm ruby_sm 
rust_sm scala_sm sql_sm stan_sm swift_sm toml_sm tree_sm triton_sm typescript_sm v_sm xml_sm yaml_sm zig_sm Caret Right Themes _mapping _palette _roles palettes _config _escape _formatter_registry _parallel _protocol _registry _types delegate rosettes Rosettes API Reference Lexers ᗢ Caret Down Link Copy URL External Open LLM text Copy Copy LLM text Share with AI Ask Claude Ask ChatGPT Ask Gemini Ask Copilot Module lexers._scanners Reusable scanner components for state machine lexers. These mixins provide common scanning patterns that can be composed into language-specific lexers, dramatically reducing code duplication. Design Philosophy: Most programming languages share common syntax patterns: C-style comments (// and /* */); C-style numbers (hex, octal, binary, floats with exponents); C-style strings (double/single quotes with escape sequences); multi-character operators (==, !=, +=, etc.). Rather than re-implementing these in every lexer, Rosettes provides composable mixins that handle common patterns. Language-specific lexers only need to define keywords and override edge cases. 
Architecture: Configuration Dataclasses: NumberConfig: Customize prefixes, suffixes, underscores StringConfig: Customize quote types, escape handling CommentConfig: Customize comment markers OperatorConfig: Define operator character sets Mixin Classes: WhitespaceMixin: Basic whitespace handling CStyleCommentsMixin: // and /* */ comments HashCommentsMixin: # comments (Python, Ruby, Bash) CStyleNumbersMixin: Full numeric literal support CStyleStringsMixin: Quote handling with escapes CStyleOperatorsMixin: Configurable operator scanning Standalone Functions: scan_identifier(): Fast identifier scanning scan_string(): String literal scanning scan_block_comment(): Block comment scanning Usage: class MyLexer( CStyleCommentsMixin, CStyleNumbersMixin, CStyleStringsMixin, StateMachineLexer, ): # Override configuration for language-specific behavior NUMBER_CONFIG = NumberConfig(integer_suffixes=("n",)) def tokenize(self, code, config=None, *, start=0, end=None): # Call mixin methods: self._try_comment(), self._try_number() ... Thread-Safety: All configuration dataclasses are frozen. Mixin methods use only local variables. Character sets are defined as module-level frozensets. See Also: rosettes.lexers.javascript_sm: Example of full mixin composition rosettes.lexers.python_sm: Reference implementation without mixins 10 Classes 11 Functions Classes NumberConfig 7 ▼ Configuration for number scanning. Configuration for number scanning. Attributes Name Type Description hex_prefix tuple[str, ...] Prefix for hex numbers (e.g., "0x"). octal_prefix tuple[str, ...] Prefix for octal numbers (e.g., "0o"). binary_prefix tuple[str, ...] Prefix for binary numbers (e.g., "0b"). allow_underscores bool Whether underscores are allowed in numbers. integer_suffixes tuple[str, ...] Valid suffixes for integers (e.g., ("n",) for BigInt). float_suffixes tuple[str, ...] Valid suffixes for floats (e.g., ("f32", "f64")). imaginary_suffix str | None Suffix for imaginary numbers (e.g., "j" for Python). 
StringConfig 7 ▼ Configuration for string scanning. Configuration for string scanning. Attributes Name Type Description single_quote bool Whether single-quoted strings are allowed. double_quote bool Whether double-quoted strings are allowed. backtick bool Whether backtick strings are allowed (template literals). triple_quote bool Whether triple-quoted strings are allowed. escape_char str The escape character (usually backslash). prefixes frozenset[str] Valid string prefixes (e.g., "frb" for Python). raw_string_marker str | None Marker for raw strings (e.g., "r" for Python). CommentConfig 5 ▼ Configuration for comment scanning. Configuration for comment scanning. Attributes Name Type Description line_comment str | None Line comment marker (e.g., "//", "#"). block_start str | None Block comment start (e.g., "/*"). block_end str | None Block comment end (e.g., "*/"). doc_line str | None Documentation comment line marker (e.g., "///"). doc_block_start str | None Documentation block start (e.g., "/**"). OperatorConfig 3 ▼ Configuration for operator scanning. Operators are scanned longest-first. Group by length for effi… Configuration for operator scanning. Operators are scanned longest-first. Group by length for efficiency. Attributes Name Type Description three_char frozenset[str] — two_char frozenset[str] — one_char frozenset[str] — WhitespaceMixin 1 ▼ Mixin for whitespace scanning. All languages use this. Mixin for whitespace scanning. All languages use this. Methods Internal Methods 1 ▼ _scan_whitespace 2 tuple[int, int] ▼ Scan whitespace, returning (new_pos, newline_count). def _scan_whitespace(self, code: str, pos: int) -> tuple[int, int] Parameters Name Type Description code — pos — Returns tuple[int, int] CStyleCommentsMixin 3 ▼ Mixin for C-style comments (// and /* */). Used by: JavaScript, TypeScript, C, C++, Java, Go, Rust… Mixin for C-style comments (// and /* */). Used by: JavaScript, TypeScript, C, C++, Java, Go, Rust, etc. 
Methods Internal Methods 3 ▼ _scan_line_comment 2 int ▼ Scan // comment to end of line. def _scan_line_comment(self, code: str, pos: int) -> int Parameters Name Type Description code — pos — Returns int _scan_block_comment 2 int ▼ Scan /* */ block comment. def _scan_block_comment(self, code: str, pos: int) -> int Parameters Name Type Description code — pos — Returns int _try_comment 4 tuple[Token | None, int] ▼ Try to scan a C-style comment. Returns (token, new_pos) or (None, pos) if not … def _try_comment(self, code: str, pos: int, line: int, col: int) -> tuple[Token | None, int] Try to scan a C-style comment. Returns (token, new_pos) or (None, pos) if not a comment. Parameters Name Type Description code — pos — line — col — Returns tuple[Token | None, int] HashCommentsMixin 2 ▼ Mixin for hash comments (#). Used by: Python, Ruby, Bash, Perl, YAML, etc. Mixin for hash comments (#). Used by: Python, Ruby, Bash, Perl, YAML, etc. Methods Internal Methods 2 ▼ _scan_hash_comment 2 int ▼ Scan # comment to end of line. def _scan_hash_comment(self, code: str, pos: int) -> int Parameters Name Type Description code — pos — Returns int _try_hash_comment 4 tuple[Token | None, int] ▼ Try to scan a hash comment. def _try_hash_comment(self, code: str, pos: int, line: int, col: int) -> tuple[Token | None, int] Parameters Name Type Description code — pos — line — col — Returns tuple[Token | None, int] CStyleNumbersMixin 2 ▼ Mixin for C-style numbers. Used by: Most languages (Python, JavaScript, C, C++, Java, Go, Rust, et… Mixin for C-style numbers. Used by: Most languages (Python, JavaScript, C, C++, Java, Go, Rust, etc.) Attributes Name Type Description NUMBER_CONFIG NumberConfig — Methods Internal Methods 1 ▼ _try_number 4 tuple[Token | None, int] ▼ Try to scan a number literal. 
def _try_number(self, code: str, pos: int, line: int, col: int) -> tuple[Token | None, int] Parameters Name Type Description code — pos — line — col — Returns tuple[Token | None, int] CStyleStringsMixin 2 ▼ Mixin for C-style strings (" and '). Used by: Most languages. Mixin for C-style strings (" and '). Used by: Most languages. Attributes Name Type Description STRING_CONFIG StringConfig — Methods Internal Methods 1 ▼ _try_string 4 tuple[Token | None, int,… ▼ Try to scan a string literal. Returns (token, new_pos, newline_count) or (None… def _try_string(self, code: str, pos: int, line: int, col: int) -> tuple[Token | None, int, int] Try to scan a string literal. Returns (token, new_pos, newline_count) or (None, pos, 0). Parameters Name Type Description code — pos — line — col — Returns tuple[Token | None, int, int] CStyleOperatorsMixin 2 ▼ Mixin for scanning operators longest-first. Configure via OPERATOR_CONFIG in subclass. Mixin for scanning operators longest-first. Configure via OPERATOR_CONFIG in subclass. Attributes Name Type Description OPERATOR_CONFIG OperatorConfig — Methods Internal Methods 1 ▼ _try_operator 4 tuple[Token | None, int] ▼ Try to scan an operator. def _try_operator(self, code: str, pos: int, line: int, col: int) -> tuple[Token | None, int] Parameters Name Type Description code — pos — line — col — Returns tuple[Token | None, int] Functions scan_whitespace 2 tuple[int, int] ▼ Scan whitespace, returning (new_pos, newline_count). def scan_whitespace(code: str, pos: int) -> tuple[int, int] Parameters Name Type Description code str pos int Returns tuple[int, int] scan_line_comment 2 int ▼ Scan to end of line (for line comments). def scan_line_comment(code: str, pos: int) -> int Parameters Name Type Description code str pos int Position after comment marker (e.g., after "//"). Returns int scan_block_comment 3 int ▼ Scan block comment until end marker. 
def scan_block_comment(code: str, pos: int, end_marker: str = '*/') -> int Parameters Name Type Description code str pos int Position after opening marker. end_marker str The closing marker (e.g., "*/"). Default: '*/' Returns int scan_identifier 2 int ▼ Scan an identifier. def scan_identifier(code: str, pos: int) -> int Parameters Name Type Description code str pos int Position at start of identifier. Returns int scan_string 3 tuple[int, int] ▼ Scan a string literal using C-optimized str.find(). def scan_string(code: str, pos: int, quote: str) -> tuple[int, int] Parameters Name Type Description code str pos int Position after opening quote. quote str The quote character. Returns tuple[int, int] scan_triple_string 3 tuple[int, int] ▼ Scan a triple-quoted string using C-optimized str.find(). def scan_triple_string(code: str, pos: int, quote: str) -> tuple[int, int] Parameters Name Type Description code str pos int Position after opening triple quote. quote str The quote character. Returns tuple[int, int] scan_c_style_number 3 tuple[TokenType, int] ▼ Scan a C-style number literal. **Handles:** - Hex: 0x1a2b - Octal: 0o755 - Bin… def scan_c_style_number(code: str, pos: int, config: NumberConfig | None = None) -> tuple[TokenType, int] Scan a C-style number literal. Handles: Hex: 0x1a2b Octal: 0o755 Binary: 0b1010 Float: 3.14, 1e10, 3.14e-10 Integer: 42 Parameters Name Type Description code str pos int Position at first digit or dot. config NumberConfig | None Number scanning configuration. Default: None Returns tuple[TokenType, int] _scan_digits 4 int ▼ Scan digits, optionally with underscores. def _scan_digits(code: str, pos: int, digit_set: frozenset[str], allow_underscores: bool) -> int Parameters Name Type Description code str pos int digit_set frozenset[str] allow_underscores bool Returns int _scan_exponent 3 int ▼ Scan optional exponent (e.g., e10, E-5). 
def _scan_exponent(code: str, pos: int, allow_underscores: bool) -> int Parameters Name Type Description code str pos int allow_underscores bool Returns int _scan_suffix 3 int ▼ Scan optional type suffix (e.g., u32, f64, n). def _scan_suffix(code: str, pos: int, suffixes: tuple[str, ...]) -> int Parameters Name Type Description code str pos int suffixes tuple[str, ...] Returns int scan_operators 3 tuple[str | None, int] ▼ Scan operators using longest-match. def scan_operators(code: str, pos: int, config: OperatorConfig) -> tuple[str | None, int] Parameters Name Type Description code str pos int config OperatorConfig Returns tuple[str | None, int] Next → _state_machine List © 2026 Rosettes built in ᓚᘏᗢ { "linkPreviews": { "enabled": true, "hoverDelay": 200, "hideDelay": 150, "showSection": true, "showReadingTime": true, "showWordCount": true, "showDate": true, "showTags": true, "maxTags": 3, "includeSelectors": [".prose"], "excludeSelectors": ["nav", ".toc", ".breadcrumb", ".pagination", ".card", "[class*='-card']", ".tab-nav", "[class*='-widget']", ".child-items", ".content-tiles"], "allowedHosts": [], "allowedSchemes": ["https"], "hostFailureThreshold": 3 } } window.BENGAL_LAZY_ASSETS = { tabulator: '/rosettes/assets/js/tabulator.min.js', dataTable: '/rosettes/assets/js/data-table.js', mermaidToolbar: '/rosettes/assets/js/mermaid-toolbar.9de5abba.js', mermaidTheme: '/rosettes/assets/js/mermaid-theme.344822c5.js', graphMinimap: '/rosettes/assets/js/graph-minimap.ff04e939.js', graphContextual: '/rosettes/assets/js/graph-contextual.355458ba.js' }; window.BENGAL_ICONS = { close: '/rosettes/assets/icons/close.911d4fe1.svg', enlarge: '/rosettes/assets/icons/enlarge.652035e5.svg', copy: '/rosettes/assets/icons/copy.3d56e945.svg', 'download-svg': '/rosettes/assets/icons/download.04f07e1b.svg', 'download-png': '/rosettes/assets/icons/image.c34dfd40.svg', 'zoom-in': '/rosettes/assets/icons/zoom-in.237b4a83.svg', 'zoom-out': '/rosettes/assets/icons/zoom-out.38857c77.svg', 
reset: '/rosettes/assets/icons/reset.d26dba29.svg' }; Arrow Up -------------------------------------------------------------------------------- Metadata: - Word Count: 2032 - Reading Time: 10 minutes