{
  "project": "LLMs.txt Kit",
  "dataset": "AI crawler policy benchmark",
  "version": "2026-06-24",
  "lastChecked": "2026-06-24",
  "license": "Public reference dataset; verify source links before using for legal or policy decisions.",
  "notes": [
    "Robots.txt is an access preference, not authentication.",
    "User-agent strings can be spoofed; verify crawlers with official IP or reverse-DNS methods where available.",
    "Search/discovery crawlers and model-training controls should be handled separately."
  ],
  "sources": [
    {
      "operator": "OpenAI",
      "url": "https://developers.openai.com/api/docs/bots"
    },
    {
      "operator": "Google",
      "url": "https://developers.google.com/crawling/docs/crawlers-fetchers/google-common-crawlers"
    },
    {
      "operator": "Apple",
      "url": "https://support.apple.com/en-us/119829"
    },
    {
      "operator": "Perplexity",
      "url": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers"
    },
    {
      "operator": "Common Crawl",
      "url": "https://commoncrawl.org/ccbot"
    }
  ],
  "crawlers": [
    {
      "operator": "OpenAI",
      "token": "OAI-SearchBot",
      "category": "search-discovery",
      "documentedPurpose": "Automatic search crawl for ChatGPT search visibility and search opt-outs.",
      "robotsTxtStrategy": "Allow if ChatGPT search eligibility matters.",
      "robotsTxtApplies": true,
      "verifyMethod": "Use OpenAI crawler documentation and server logs; treat user-agent-only evidence as weak.",
      "sourceUrl": "https://developers.openai.com/api/docs/bots"
    },
    {
      "operator": "OpenAI",
      "token": "GPTBot",
      "category": "training-use",
      "documentedPurpose": "Crawler that may be used to improve foundation models, separate from search crawling.",
      "robotsTxtStrategy": "Allow only if model-training use is permitted by site policy.",
      "robotsTxtApplies": true,
      "verifyMethod": "Use OpenAI crawler documentation and server logs; treat user-agent-only evidence as weak.",
      "sourceUrl": "https://developers.openai.com/api/docs/bots"
    },
    {
      "operator": "OpenAI",
      "token": "ChatGPT-User",
      "category": "user-requested-fetch",
      "documentedPurpose": "User-initiated fetches from ChatGPT or Custom GPT actions, not automatic web crawling.",
      "robotsTxtStrategy": "Monitor in logs; do not treat as the same policy decision as GPTBot or OAI-SearchBot.",
      "robotsTxtApplies": false,
      "verifyMethod": "Separate from crawler traffic in logs and rate-limit if abused.",
      "sourceUrl": "https://developers.openai.com/api/docs/bots"
    },
    {
      "operator": "Google",
      "token": "Googlebot",
      "category": "search-discovery",
      "documentedPurpose": "Google Search crawling across Search features and related surfaces.",
      "robotsTxtStrategy": "Allow for Google Search visibility unless a page should not be indexed.",
      "robotsTxtApplies": true,
      "verifyMethod": "Use Google crawler verification guidance, DNS verification, and server logs.",
      "sourceUrl": "https://developers.google.com/crawling/docs/crawlers-fetchers/google-common-crawlers"
    },
    {
      "operator": "Google",
      "token": "Google-Extended",
      "category": "ai-use-control",
      "documentedPurpose": "Publisher control for some Gemini and Vertex AI grounding/training uses; not Google Search crawling, Search inclusion control, or a Search ranking signal.",
      "robotsTxtStrategy": "Disallow when policy allows Google Search but opts out of these AI uses.",
      "robotsTxtApplies": true,
      "verifyMethod": "Check robots.txt rules and compare Google Search crawling separately from AI-use controls.",
      "sourceUrl": "https://developers.google.com/crawling/docs/crawlers-fetchers/google-common-crawlers"
    },
    {
      "operator": "Apple",
      "token": "Applebot",
      "category": "search-discovery",
      "documentedPurpose": "Search technology for Apple experiences including Spotlight, Siri, and Safari.",
      "robotsTxtStrategy": "Allow for Apple ecosystem discovery; avoid blocking render-critical assets.",
      "robotsTxtApplies": true,
      "verifyMethod": "Use Applebot documentation and server logs.",
      "sourceUrl": "https://support.apple.com/en-us/119829"
    },
    {
      "operator": "Apple",
      "token": "Applebot-Extended",
      "category": "training-use-control",
      "documentedPurpose": "Usage control for Apple's generative foundation model training; it does not crawl webpages by itself.",
      "robotsTxtStrategy": "Disallow to opt out of Apple foundation-model training while keeping Applebot discovery.",
      "robotsTxtApplies": true,
      "verifyMethod": "Check robots.txt policy separately from Applebot search crawling.",
      "sourceUrl": "https://support.apple.com/en-us/119829"
    },
    {
      "operator": "Perplexity",
      "token": "PerplexityBot",
      "category": "search-answer-discovery",
      "documentedPurpose": "Perplexity's documented crawler for indexing and answer retrieval surfaces.",
      "robotsTxtStrategy": "Allow if Perplexity answer visibility matters; verify with official IP lists for WAF rules.",
      "robotsTxtApplies": true,
      "verifyMethod": "Use Perplexity crawler documentation, IP guidance, and logs.",
      "sourceUrl": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers"
    },
    {
      "operator": "Perplexity",
      "token": "Perplexity-User",
      "category": "user-requested-fetch",
      "documentedPurpose": "User-requested fetcher; documented separately from web crawling or training collection.",
      "robotsTxtStrategy": "Monitor separately from crawler rules because user-requested fetches may not behave like crawlers.",
      "robotsTxtApplies": false,
      "verifyMethod": "Separate user-requested traffic from crawler traffic in logs.",
      "sourceUrl": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers"
    },
    {
      "operator": "Common Crawl",
      "token": "CCBot",
      "category": "open-web-dataset",
      "documentedPurpose": "Common Crawl's open web crawl used for research and datasets.",
      "robotsTxtStrategy": "Allow for open data participation; disallow if policy forbids broad dataset reuse.",
      "robotsTxtApplies": true,
      "verifyMethod": "Use Common Crawl documentation and server logs.",
      "sourceUrl": "https://commoncrawl.org/ccbot"
    }
  ]
}
