Skip to main content


A complete example of a `findkit.toml` file:

# Project-wide settings. These must stay above the first table header so
# that they belong to the root table.
id = "pd32hfasd"
name = "My Website Network"
description = "Description for"

# Run lighter partial crawl daily
schedule_partial_crawl = "daily"

# And full crawl every week to ensure everything is up to date
schedule_full_crawl = "weekly"

# One [[targets]] entry per crawled site. Every entry is its own table, so
# per-site keys such as `host` can be repeated without redefining a key.
# NOTE(review): the table headers in this file were missing and have been
# restored from context - confirm the names against the Findkit reference.
[[targets]]
host = "www.mysite.example"

# Findkit can automatically extract the text content from pages but it's often
# beneficial to be explicit.
content_selector = ".content"

# Do not index ad content even if it is inside .content
cleanup_selector = ".ad"

# Ex. "Mysite - About" capture only "About"
title_selector_regex = "^Mysite - (.*?)$"

# Default value, but could be customized
title_selector = "head title"

# If the site has a proper sitemap there's no need to use link walking as the
# crawler can find all pages using the sitemap
use_sitemap = true
walk_links = false
crawl_pdfs = true

# If using walk_links this can be used to define where the link walking starts
# NOTE(review): the array was left unclosed and its entries were lost; "/" is
# assumed as the starting path - adjust to the real entry points.
start_paths = ["/"]

# Add reasonable max limit to avoid over using your crawl quota
max_pages = 50_000

# Avoid indexing tag listing pages
deny_patterns = ["/tags"]

# Add `event` tag to pages under https://www.mysite.example/events/
# A tag rule is a sub-table of the target defined directly above it.
[[targets.tags]]
pathname_regex = '^\/events\/'
on_match = "event"

# Crawl additional domain to the same index
[[targets]]
host = "blog.example"

# Add `author` tag to pages under https://blog.example/authors/
[[targets.tags]]
pathname_regex = '^\/authors\/'
on_match = "author"

# Allow Findkit UI installations on these domains
# NOTE(review): placed in its own table so it is not read as part of the
# tag rule above - confirm the table name against the Findkit reference.
[search-endpoint]
origin_domains = ["www.mysite.example", "intra.mysite.example"]