# Custom Content Sources

URL: /docs/extending/custom-sources/
Section: extending

---

Content sources let Bengal fetch content from anywhere: local files, GitHub repositories, REST APIs, Notion databases, or custom backends. You can create custom sources by implementing the `ContentSource` abstract class.

## Built-in Sources

Bengal includes four content source types:

| Source | Type ID | Use Case |
|--------|---------|----------|
| `LocalSource` | `local`, `filesystem` | Local markdown files (default) |
| `GitHubSource` | `github` | GitHub repository content |
| `RESTSource` | `rest`, `api` | REST API endpoints |
| `NotionSource` | `notion` | Notion database pages |

## Using Built-in Sources

### Local Source (Default)

The default source for local markdown files:

```python
# collections.py
from bengal.collections import define_collection
from bengal.content_layer import local_loader

collections = {
    "docs": define_collection(
        schema=Doc,
        loader=local_loader("content/docs", exclude=["_drafts/*"]),
    ),
}
```

### GitHub Source

Fetch content from a GitHub repository:

```python
import os

from bengal.content_layer import github_loader

collections = {
    "api-docs": define_collection(
        schema=APIDoc,
        loader=github_loader(
            repo="myorg/api-docs",
            branch="main",
            path="docs/",
            token=os.environ.get("GITHUB_TOKEN"),
        ),
    ),
}
```

Requires: `pip install bengal[github]`

### REST Source

Fetch content from a REST API:

```python
from bengal.content_layer import rest_loader

collections = {
    "posts": define_collection(
        schema=BlogPost,
        loader=rest_loader(
            url="https://api.example.com/posts",
            headers={"Authorization": "Bearer ${API_TOKEN}"},
            content_field="body",
            frontmatter_fields={"title": "title", "date": "published_at"},
        ),
    ),
}
```

Requires: `pip install bengal[rest]`

### Notion Source

Fetch pages from a Notion database:

```python
import os

from bengal.content_layer import notion_loader

collections = {
    "wiki": define_collection(
        schema=WikiPage,
        loader=notion_loader(
            database_id="abc123...",
            token=os.environ.get("NOTION_TOKEN"),
        ),
    ),
}
```

Requires: `pip install bengal[notion]`

## Creating a Custom Source

Implement the `ContentSource` abstract class:

```python
from bengal.content_layer.source import ContentSource
from bengal.content_layer.entry import ContentEntry


class MyAPISource(ContentSource):
    """Fetch content from a custom API."""

    @property
    def source_type(self) -> str:
        return "my-api"

    async def fetch_all(self):
        """Fetch all content entries."""
        # Get items from your data source
        items = await self._fetch_items()

        for item in items:
            yield ContentEntry(
                id=item["id"],
                slug=item["slug"],
                content=item["body"],
                frontmatter={
                    "title": item["title"],
                    "date": item["created_at"],
                },
                source_type=self.source_type,
                source_name=self.name,
            )

    async def fetch_one(self, id: str):
        """Fetch a single entry by ID."""
        item = await self._fetch_item(id)
        if not item:
            return None

        return ContentEntry(
            id=item["id"],
            slug=item["slug"],
            content=item["body"],
            frontmatter={
                "title": item["title"],
                "date": item["created_at"],
            },
            source_type=self.source_type,
            source_name=self.name,
        )

    async def _fetch_items(self):
        """Your API call implementation."""
        import aiohttp

        async with aiohttp.ClientSession() as session:
            async with session.get(self.config["api_url"]) as resp:
                return await resp.json()

    async def _fetch_item(self, id: str):
        """Fetch single item."""
        import aiohttp

        async with aiohttp.ClientSession() as session:
            url = f"{self.config['api_url']}/{id}"
            async with session.get(url) as resp:
                if resp.status == 404:
                    return None
                return await resp.json()
```
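Both `fetch_all` and `fetch_one` above build the same `ContentEntry` by hand. The Error Handling section below calls a `_to_entry` helper for exactly this mapping; its body is not shown there, so here is a minimal sketch, assuming the same item shape as in the example above (the field names are illustrative, not part of Bengal's API):

```python
class MyAPISource(ContentSource):
    # ...

    def _to_entry(self, item: dict) -> ContentEntry:
        """Map one raw API item to a ContentEntry (sketch; adjust to your payload)."""
        return ContentEntry(
            id=item["id"],
            slug=item["slug"],
            content=item["body"],
            frontmatter={
                "title": item["title"],
                "date": item["created_at"],
            },
            source_type=self.source_type,
            source_name=self.name,
        )
```

With a helper like this, both fetch methods can yield or return `self._to_entry(item)` instead of repeating the field mapping, so the two code paths cannot drift apart.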
## ContentEntry Structure

Each source yields `ContentEntry` objects:

```python
from dataclasses import dataclass
from datetime import datetime
from typing import Any


@dataclass
class ContentEntry:
    id: str                         # Unique identifier within source
    slug: str                       # URL-friendly slug for routing
    content: str                    # Raw markdown content
    frontmatter: dict[str, Any]     # Parsed metadata dictionary
    source_type: str                # Source type (e.g., "github", "notion")
    source_name: str                # Source instance name
    source_url: str | None          # Original URL for attribution
    last_modified: datetime | None  # Last modification time
    checksum: str | None            # Content hash for caching
```

## Registering Custom Sources

### Option 1: Direct Registration

Register your source instance directly:

```python
from bengal.content_layer import ContentLayerManager

manager = ContentLayerManager()
manager.register_custom_source("my-content", MyAPISource(
    name="my-content",
    config={"api_url": "https://api.example.com/content"},
))
```

### Option 2: With Collections

Use your source as a collection loader:

```python
# collections.py
from bengal.collections import define_collection

my_source = MyAPISource(
    name="my-content",
    config={"api_url": "https://api.example.com/content"},
)

collections = {
    "external": define_collection(
        schema=ExternalContent,
        loader=my_source,
    ),
}
```

## Caching

Content sources support caching to avoid redundant fetches:

```python
class MyAPISource(ContentSource):
    # ...

    def get_cache_key(self) -> str:
        """Generate cache key for this source configuration."""
        # Default implementation hashes config
        # Override for custom cache key logic
        return super().get_cache_key()

    async def is_changed(self, cached_checksum: str | None) -> bool:
        """Check if source content has changed."""
        # Return True to force refetch
        # Return False if content is unchanged
        current = await self._get_current_checksum()
        return current != cached_checksum

    async def get_last_modified(self):
        """Return last modification time for cache invalidation."""
        # Return datetime or None
        return None
```

## Sync Wrappers

For convenience, `ContentSource` provides sync wrappers:

```python
# Async (preferred for performance)
async for entry in source.fetch_all():
    process(entry)

# Sync (convenience wrapper)
for entry in source.fetch_all_sync():
    process(entry)

# Single entry
entry = source.fetch_one_sync("my-id")
```

## Error Handling

Handle errors gracefully in your source:

```python
import logging

import aiohttp

logger = logging.getLogger(__name__)


class MyAPISource(ContentSource):
    # ...

    async def fetch_all(self):
        try:
            items = await self._fetch_items()
        except aiohttp.ClientError as e:
            logger.error(f"Failed to fetch from {self.config['api_url']}: {e}")
            return  # Yield nothing on error

        for item in items:
            try:
                yield self._to_entry(item)
            except KeyError as e:
                logger.warning(f"Skipping malformed item {item.get('id')}: {e}")
                continue
```

## Testing Custom Sources

```python
import pytest
from unittest.mock import AsyncMock, patch


@pytest.mark.asyncio
async def test_my_api_source():
    source = MyAPISource(
        name="test",
        config={"api_url": "https://api.example.com"},
    )

    with patch.object(source, "_fetch_items", new_callable=AsyncMock) as mock:
        mock.return_value = [
            {"id": "1", "slug": "test", "title": "Test", "body": "Content", "created_at": "2025-01-01"},
        ]

        entries = [entry async for entry in source.fetch_all()]

    assert len(entries) == 1
    assert entries[0].frontmatter["title"] == "Test"
```

## Related Content

- Collections for schema validation
- Build Pipeline for understanding the discovery phase

---

Metadata:

- Author: lbliii
- Word Count: 725
- Reading Time: 4 minutes