{
  "meta": {
    "title": "robots-ai.txt Specification",
    "description": "The canonical specification for robots-ai.txt files - AI crawler-specific access directives.",
    "version": "1.7.1",
    "versionDate": "2026-06-10",
    "canonicalUrl": "https://www.ai-visibility.org.uk/specifications/robots-ai-txt/",
    "publisher": {
      "name": "365i",
      "url": "https://www.365i.co.uk/"
    },
    "license": {
      "name": "Creative Commons Attribution 4.0 International",
      "identifier": "CC-BY-4.0",
      "url": "https://creativecommons.org/licenses/by/4.0/"
    },
    "datePublished": "2026-01-12T00:00:00Z",
    "dateModified": "2026-06-10T00:00:00Z",
    "language": "en-GB",
    "basedOn": {
      "name": "Robots Exclusion Protocol",
      "url": "https://www.rfc-editor.org/rfc/rfc9309.html"
    }
  },
  "specification": {
    "filename": "robots-ai.txt",
    "location": "Website root directory",
    "urlPattern": "https://example.com/robots-ai.txt",
    "mimeType": "text/plain",
    "encoding": "UTF-8",
    "purpose": "Provide AI crawler-specific access directives using robots.txt syntax. Supplements standard robots.txt with detailed AI crawler rules and explanatory context.",
    "relationship": {
      "robotsTxt": {
        "authority": "robots.txt remains the authoritative file for all crawlers",
        "supplementary": "robots-ai.txt provides supplementary, AI-specific guidance",
        "precedence": "AI crawlers SHOULD respect robots.txt first, then check robots-ai.txt"
      }
    },
    "format": {
      "type": "Structured text",
      "description": "Follows robots.txt syntax conventions from RFC 9309",
      "syntax": {
        "userAgent": "User-agent: crawler-name",
        "allow": "Allow: /path/",
        "disallow": "Disallow: /path/",
        "crawlDelay": "Crawl-delay: seconds",
        "sitemap": "Sitemap: url",
        "comments": "# Comment text"
      }
    },
    "directives": {
      "required": [
        {
          "directive": "User-agent",
          "description": "Specifies which crawler(s) the following rules apply to",
          "syntax": "User-agent: crawler-name",
          "examples": ["User-agent: GPTBot", "User-agent: *"]
        }
      ],
      "standard": [
        {
          "directive": "Allow",
          "description": "Explicitly permits crawling of specified path",
          "syntax": "Allow: /path/"
        },
        {
          "directive": "Disallow",
          "description": "Prohibits crawling of specified path",
          "syntax": "Disallow: /path/"
        },
        {
          "directive": "Crawl-delay",
          "description": "Minimum seconds between requests (advisory; a widely used extension that is not defined in RFC 9309)",
          "syntax": "Crawl-delay: 10"
        },
        {
          "directive": "Sitemap",
          "description": "Location of XML sitemap",
          "syntax": "Sitemap: https://example.com/sitemap.xml"
        }
      ]
    },
    "knownAICrawlers": [
      {
        "userAgent": "GPTBot",
        "operator": "OpenAI",
        "purpose": "Training data collection for OpenAI models (user-initiated retrieval uses ChatGPT-User; search uses OAI-SearchBot)"
      },
      {
        "userAgent": "ChatGPT-User",
        "operator": "OpenAI",
        "purpose": "ChatGPT browsing feature"
      },
      {
        "userAgent": "OAI-SearchBot",
        "operator": "OpenAI",
        "purpose": "OpenAI search features"
      },
      {
        "userAgent": "ClaudeBot",
        "operator": "Anthropic",
        "purpose": "Training data collection for Claude models (user-initiated retrieval uses Claude-User; search indexing uses Claude-SearchBot)"
      },
      {
        "userAgent": "Claude-User",
        "operator": "Anthropic",
        "purpose": "Real-time retrieval when Claude users ask about a site"
      },
      {
        "userAgent": "Claude-SearchBot",
        "operator": "Anthropic",
        "purpose": "Search indexing for Claude search features"
      },
      {
        "userAgent": "Google-Extended",
        "operator": "Google",
        "purpose": "Control token for use of content in Gemini (formerly Bard) training and grounding; separate from Google Search"
      },
      {
        "userAgent": "Googlebot",
        "operator": "Google",
        "purpose": "General Google crawling including AI features"
      },
      {
        "userAgent": "PerplexityBot",
        "operator": "Perplexity AI",
        "purpose": "Perplexity AI search"
      },
      {
        "userAgent": "CCBot",
        "operator": "Common Crawl",
        "purpose": "Common Crawl dataset (used by many AI training sets)"
      },
      {
        "userAgent": "Bytespider",
        "operator": "ByteDance",
        "purpose": "ByteDance AI and search"
      },
      {
        "userAgent": "cohere-ai",
        "operator": "Cohere",
        "purpose": "Cohere AI systems"
      },
      {
        "userAgent": "Amazonbot",
        "operator": "Amazon",
        "purpose": "Amazon AI and Alexa"
      },
      {
        "userAgent": "Diffbot",
        "operator": "Diffbot",
        "purpose": "Structured data extraction"
      },
      {
        "userAgent": "FacebookBot",
        "operator": "Meta",
        "purpose": "Meta AI systems"
      },
      {
        "userAgent": "ImagesiftBot",
        "operator": "Imagesift",
        "purpose": "Image analysis AI"
      },
      {
        "userAgent": "Omgilibot",
        "operator": "Omgili",
        "purpose": "Discussion and forum indexing"
      },
      {
        "userAgent": "YouBot",
        "operator": "You.com",
        "purpose": "You.com AI search"
      },
      {
        "userAgent": "Applebot-Extended",
        "operator": "Apple",
        "purpose": "Apple AI training (separate from Siri/Spotlight)"
      }
    ],
    "commonPatterns": {
      "allowAll": {
        "description": "Permit all AI crawlers",
        "syntax": "User-agent: *\nAllow: /"
      },
      "blockAll": {
        "description": "Block all AI crawlers",
        "syntax": "User-agent: *\nDisallow: /"
      },
      "selectiveAllow": {
        "description": "Allow specific crawlers, block others",
        "syntax": "User-agent: GPTBot\nAllow: /\n\nUser-agent: *\nDisallow: /"
      },
      "pathBased": {
        "description": "Allow some paths, block others",
        "syntax": "User-agent: *\nAllow: /public/\nDisallow: /private/"
      }
    },
    "validation": {
      "rules": [
        "MUST contain at least one User-agent directive",
        "MUST follow robots.txt syntax from RFC 9309",
        "MUST be valid UTF-8 encoded plain text",
        "Directives MUST be one per line",
        "User-agent MUST precede its rules",
        "Blank lines SHOULD separate user-agent groups"
      ],
      "commonErrors": [
        "Missing User-agent directive",
        "Contradictory Allow/Disallow rules",
        "Incorrect path syntax",
        "Rules before User-agent declaration",
        "Invalid directive names"
      ]
    },
    "relationships": {
      "robots.txt": "MUST NOT contradict; robots.txt takes precedence",
      "ai.txt": "Complements with behavioural (not access) guidance",
      "sitemap.xml": "MAY be referenced via the Sitemap directive for crawler guidance"
    }
  },
  "example": {
    "url": "https://www.ai-visibility.org.uk/specifications/examples/robots-ai.txt",
    "business": "Horizon Strategic Consulting"
  },
  "versionHistory": [
    {
      "version": "1.7.1",
      "date": "2026-06-10",
      "changes": "Known AI crawlers registry updated to current user-agent tokens. The retired Anthropic agents Claude-Web and anthropic-ai are replaced by Claude-User (real-time retrieval) and Claude-SearchBot (search indexing), matching Anthropic's published crawler documentation. Registry correction only; no directive syntax or publisher behaviour changes."
    },
    {
      "version": "1.7.0",
      "date": "2026-05-11",
      "changes": "Phase 6 standardisation release. Added /specifications/roadmap/ (theme-pegged forward plan with Active/Next/Future/On hold status flags), /specifications/extensions/ (rules for experimental x- prefixed files and the promotion path), and /specifications/i18n-a11y/ (multi-language publication, locale-tagged identity fields, RTL handling, accessibility of llms.html). Added the Discovery: directive to the robots-ai.txt specification (publishers MAY advertise AI Discovery Files on the same host). Added a formal media-type stance to the HTTP behaviour page (existing IANA types, no bespoke registrations). Expanded the file integrity and signing section on the security and privacy page with four candidate mechanisms, cross-cutting concerns, and interim publisher / consumer guidance. The Discovery: directive is the only normative addition to publisher behaviour; all other additions are forward-looking documentation."
    },
    {
      "version": "1.6.0",
      "date": "2026-05-11",
      "changes": "Phase 5 standardisation release. Added /specifications/related-standards/ (positioning vs llmstxt.org, IETF AI Preferences, robots.txt, Schema.org, BCP 14, JSON Schema 2020-12, SemVer) and /specifications/implementations/ (public record of conformant implementations, IETF-style). Added an explicit llmstxt.org backward-compatibility statement to the llms.txt specification. Added a formal multi-domain and subdomain scoping rule to both the llms.txt and identity.json specifications (host-scoped files, cross-host identity asserted via sameAs). No normative requirements changed for existing publishers; the new scoping rules formalise behaviour the specification already implied."
    },
    {
      "version": "1.5.0",
      "date": "2026-05-11",
      "changes": "Phase 4 standardisation release. Added /specifications/processing-model/ (seven-stage algorithm for conformant consumers), /specifications/consumer-guidance/ (what AI systems should do with AI Discovery Files), /specifications/test-vectors/ (canonical test suite framing), and reference-implementation framing on the AI Visibility Checker. No normative requirements changed."
    },
    {
      "version": "1.4.0",
      "date": "2026-05-11",
      "changes": "Phase 3 standardisation release. Added /specifications/versioning/ (Semantic Versioning 2.0.0 commitments, deprecation timeline, lifecycle), /specifications/governance/ (proposal lifecycle, editorial process, working principles), /specifications/security-privacy/ (trust model, content-injection patterns, GDPR considerations, integrity primitives roadmap), and /specifications/http-behaviour/ (status codes, redirects, soft-404 detection, caching, rate limits). No normative requirements changed."
    },
    {
      "version": "1.3.0",
      "date": "2026-05-11",
      "changes": "Phase 2 standardisation release. Added formal conformance specification (Essential / Recommended / Complete classes). Published machine-readable registry at /specifications/registry.json, spec meta-schema, and validator-output schema. Introduced versioned JSON Schema URLs (/v1/) alongside unversioned 'latest' aliases. Added optional BCP 47 language declaration field across all applicable AI Discovery Files. No normative requirements changed."
    },
    {
      "version": "1.2.0",
      "date": "2026-05-10",
      "changes": "Phase 1 standardisation release. Added 'Status of This Document' block (Stable). Normalised normative requirement keywords to uppercase per RFC 2119 and RFC 8174. Added References section linking to /specifications/conventions/ and /licensing/. No normative requirements changed."
    },
    {
      "version": "1.1.1",
      "date": "2026-02-13",
      "changes": "Added AI Visibility Directory registration guidance. Minor documentation update."
    },
    {
      "version": "1.1.0",
      "date": "2026-01-14",
      "changes": "Added expanded optional directives (Request-rate, Visit-time) and Content Not Permitted guidance. Clarifies relationship with standard robots.txt."
    },
    {
      "version": "1.0.0",
      "date": "2026-01-12",
      "changes": "Initial publication. Establishes canonical structure for robots-ai.txt files with AI crawler user agent reference."
    }
  ]
}