Custom Formatter - Rosettes

Build custom formatters for terminal output, LaTeX, or any other format.

The Formatter Protocol

Formatters implement the Formatter protocol defined in rosettes._protocol:

from collections.abc import Iterator
from typing import Protocol
from rosettes import Token, TokenType, FormatConfig

class Formatter(Protocol):
    """Structural interface implemented by Rosettes formatters.

    Declares a ``name`` property, two streaming methods (``format`` and
    ``format_fast``) and their string-returning counterparts
    (``format_string`` and ``format_string_fast``).
    """

    @property
    def name(self) -> str:
        """Canonical formatter name (e.g., 'html', 'terminal')."""
        ...

    def format(
        self,
        tokens: Iterator[Token],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        """Stream formatted output from full Token objects.

        Args:
            tokens: Iterator of ``Token`` objects to render.
            config: Optional ``FormatConfig``; ``None`` by default.

        Returns:
            An iterator of output string chunks.
        """
        ...

    def format_fast(
        self,
        tokens: Iterator[tuple[TokenType, str]],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        """Stream formatted output from (type, value) tuples.

        Used by the fast path when line numbers aren't needed.

        Args:
            tokens: Iterator of ``(TokenType, str)`` pairs to render.
            config: Optional ``FormatConfig``; ``None`` by default.

        Returns:
            An iterator of output string chunks.
        """
        ...

    def format_string(
        self,
        tokens: Iterator[Token],
        config: FormatConfig | None = None,
    ) -> str:
        """Format tokens and return as a single string.

        Takes the same arguments as ``format`` but returns one ``str``
        instead of an iterator of chunks.
        """
        ...

    def format_string_fast(
        self,
        tokens: Iterator[tuple[TokenType, str]],
        config: FormatConfig | None = None,
    ) -> str:
        """Fast format and return as a single string.

        Takes the same arguments as ``format_fast`` but returns one ``str``
        instead of an iterator of chunks.
        """
        ...

All four methods are required, along with the name property. The format_string* methods are typically implemented as:

def format_string(self, tokens, config=None):
    """Join the chunks streamed by ``format`` into a single string."""
    chunks = self.format(tokens, config)
    return "".join(chunks)

def format_string_fast(self, tokens, config=None):
    """Join the chunks streamed by ``format_fast`` into a single string."""
    chunks = self.format_fast(tokens, config)
    return "".join(chunks)

Example: Simple ANSI Formatter

A minimal formatter that outputs ANSI-colored terminal text:

from collections.abc import Iterator
from dataclasses import dataclass
from rosettes import tokenize, Token, TokenType, FormatConfig

# ANSI color codes
# Each value is an SGR escape sequence ("\033[" + code + "m"). Codes 92, 93
# and 95 select the bright green, yellow and magenta foreground colors.
# Only the token types listed here have an entry.
COLORS = {
    TokenType.KEYWORD: "\033[95m",       # Magenta
    TokenType.NAME_FUNCTION: "\033[92m", # Green
    TokenType.STRING: "\033[93m",        # Yellow
}
# SGR 0 resets all attributes; emitted after every colored token.
RESET = "\033[0m"


@dataclass(frozen=True, slots=True)
class SimpleAnsiFormatter:
    """ANSI formatter declared as a frozen, slotted dataclass.

    The class has no fields, so instances hold no mutable state. Token
    types found in ``COLORS`` are wrapped in their escape sequence and
    followed by ``RESET``; every other token is emitted unchanged.
    """

    @property
    def name(self) -> str:
        """Canonical formatter name."""
        return "simple-ansi"

    def format(
        self,
        tokens: Iterator[Token],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        """Yield one output chunk per ``Token``; ``config`` is not used."""
        for tok in tokens:
            prefix = COLORS.get(tok.type, "")
            yield f"{prefix}{tok.value}{RESET}" if prefix else tok.value

    def format_fast(
        self,
        tokens: Iterator[tuple[TokenType, str]],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        """Yield one output chunk per ``(type, value)`` pair; ``config`` is not used."""
        for token_type, text in tokens:
            prefix = COLORS.get(token_type, "")
            yield f"{prefix}{text}{RESET}" if prefix else text

    def format_string(self, tokens, config=None):
        """Return the output of ``format`` joined into one string."""
        pieces = self.format(tokens, config)
        return "".join(pieces)

    def format_string_fast(self, tokens, config=None):
        """Return the output of ``format_fast`` joined into one string."""
        pieces = self.format_fast(tokens, config)
        return "".join(pieces)


# Usage with highlight()
from rosettes import highlight

# Build the formatter once and hand the instance to highlight() through
# its `formatter` keyword argument.
formatter = SimpleAnsiFormatter()
output = highlight("x = 1", "python", formatter=formatter)

Rosettes includes a full-featured TerminalFormatter with semantic role mapping — see Terminal Formatter.

Example: Markdown Formatter

A formatter that outputs fenced code blocks:

from collections.abc import Iterator
from dataclasses import dataclass
from rosettes import Token, TokenType, FormatConfig


@dataclass(frozen=True, slots=True)
class MarkdownFormatter:
    """Wraps code in a fenced markdown block."""

    @property
    def name(self) -> str:
        return "markdown"

    def format(
        self,
        tokens: Iterator[Token],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        lang = config.data_language if config else ""
        yield f"```{lang}\n"
        for token in tokens:
            yield token.value
        yield "\n```"

    def format_fast(
        self,
        tokens: Iterator[tuple[TokenType, str]],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        lang = config.data_language if config else ""
        yield f"```{lang}\n"
        for _, value in tokens:
            yield value
        yield "\n```"

    def format_string(self, tokens, config=None):
        return "".join(self.format(tokens, config))

    def format_string_fast(self, tokens, config=None):
        return "".join(self.format_fast(tokens, config))


# Usage
from rosettes import highlight

output = highlight("x = 1", "python", formatter=MarkdownFormatter())
# Expected value of `output`. The "python" label on the opening fence comes
# from config.data_language, so it presumably relies on highlight() passing
# a config with that attribute set — TODO confirm.
# ```python
# x = 1
# ```

Example: JSON Token Dump

Export tokens as JSON for analysis:

import json
from collections.abc import Iterator
from dataclasses import dataclass
from rosettes import Token, TokenType, FormatConfig


@dataclass(frozen=True, slots=True)
class JsonFormatter:
    """Exports tokens as a JSON array."""

    @property
    def name(self) -> str:
        return "json"

    def format(
        self,
        tokens: Iterator[Token],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        # Collect tokens for JSON serialization
        token_list = [
            {
                "type": token.type.name,
                "value": token.value,
                "line": token.line,
                "column": token.column,
            }
            for token in tokens
        ]
        yield json.dumps(token_list, indent=2)

    def format_fast(
        self,
        tokens: Iterator[tuple[TokenType, str]],
        config: FormatConfig | None = None,
    ) -> Iterator[str]:
        # Fast path has no position info
        token_list = [{"type": tt.name, "value": value} for tt, value in tokens]
        yield json.dumps(token_list, indent=2)

    def format_string(self, tokens, config=None):
        return "".join(self.format(tokens, config))

    def format_string_fast(self, tokens, config=None):
        return "".join(self.format_fast(tokens, config))


# Usage
from rosettes import highlight

# `output` is the JSON text produced by the formatter.
output = highlight("x = 1", "python", formatter=JsonFormatter())

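Output (each object is shown on one line for brevity; json.dumps with indent=2 places every key on its own line):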

[
  {"type": "NAME", "value": "x", "line": 1, "column": 1},
  {"type": "WHITESPACE", "value": " ", "line": 1, "column": 2},
  {"type": "OPERATOR", "value": "=", "line": 1, "column": 3},
  {"type": "WHITESPACE", "value": " ", "line": 1, "column": 4},
  {"type": "NUMBER_INTEGER", "value": "1", "line": 1, "column": 5}
]

Using Custom Formatters

Pass custom formatter instances directly to highlight():

from rosettes import highlight

# MarkdownFormatter is the class defined in the "Markdown Formatter" example.
formatter = MarkdownFormatter()
output = highlight("def foo(): pass", "python", formatter=formatter)

Token Type Reference

See Token Types for the complete list of token types to handle in your formatter.

Next Steps

API Reference — tokenize() function details
Token Types — All available token types