readmeio · xavierandueza · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/src/commands/import.js b/src/commands/import.js
@@ -372,6 +372,9 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
     } else {
       styles.info(styles.dim(`Merged ${llms.sourceFiles.length} llms.txt files (root: ${llmsUrl}; aggregate ratio ${s.ratio.toFixed(2)}).`))
     }
+    if (llms.parsed.h3Fallback) {
+      styles.info(styles.dim(`H2 sections oversized — re-parsed using H3 (###) headings as section boundaries.`))
+    }
   }
 
   // Pre-extract changelog pages so AI / URL clustering / section-direct paths
@@ -397,6 +400,29 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
     )
   }
 
+  // Dedupe items across llms.parsed.sections by URL (first occurrence wins).
+  // Some llms.txt files cross-reference the same page under multiple headings —
+  // feeding duplicate URLs into the organize step causes the same page to appear
+  // in multiple sidebar sections regardless of which path (direct, icons, full
+  // reorg) runs downstream.
+  if (llms) {
+    const seenItemUrls = new Set()
+    let deduped = 0
+    for (const section of llms.parsed.sections) {
+      const before = section.items.length
+      section.items = section.items.filter((item) => {
+        const key = normalizePath(item.url)
+        if (seenItemUrls.has(key)) return false
+        seenItemUrls.add(key)
+        return true
+      })
+      deduped += before - section.items.length
+    }
+    if (deduped > 0) {
+      styles.info(styles.dim(`Deduped ${deduped} cross-section duplicate URL${deduped === 1 ? '' : 's'} from llms.txt sections.`))
+    }
+  }
+
   const dbgSuffix = `-${sourceUrl.hostname}`
   if (debugSnapshots) {
     debugSnapshots[`01-llms-parsed${dbgSuffix}.json`] = { llmsUrl, parsed: llms ? llms.parsed : null, skipped: skippedLlms }
@@ -475,14 +501,16 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
 
   let scraped
   let scrapeStart = Date.now()
+  const scrapeDiagnostics = {}
   if (mintlifyNav) {
     scraped = { title: mintlifyNav.title, categories: mintlifyNav.categories }
   } else if (archbeeNav) {
     scraped = { title: archbeeNav.title, categories: archbeeNav.categories }
   } else {
     styles.info(`Scraping sidebar nav from ${styles.bold(sourceUrl.toString())}${firecrawlKey ? ' ' + styles.dim('(via Firecrawl)') : ''}...`)
     scrapeStart = Date.now()
-    scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey))
+    scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey, scrapeDiagnostics))
+    if (!scraped) scrapeDiagnostics.reason ??= 'unknown'
   }
   if (debugSnapshots) {
     debugSnapshots[`02-scraped-raw${dbgSuffix}.json`] = scraped ? JSON.parse(JSON.stringify(scraped)) : null
@@ -496,6 +524,7 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
   // node because that's the one /docs page the scrape saw). Discard the
   // scrape and fall through to the llms.txt path, which uses URL-based
   // clustering when multiple files were merged.
+  let scrapeDiscardedForCoverage = false
   if (scraped && llms && knownUrls.length > 0) {
     const scrapedPages = scraped.categories.reduce((n, c) => n + countUrlPagesDeep(c.pages), 0)
     const coverage = scrapedPages / knownUrls.length
@@ -504,6 +533,7 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
         `Scrape covered ${styles.bold(Math.round(coverage * 100) + '%')} of llms.txt pages (need ≥75%) — discarding scrape and organizing from llms.txt.`,
       )
       scraped = null
+      scrapeDiscardedForCoverage = true
     }
   }
 
@@ -637,9 +667,31 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
   } else if (!llms && knownUrls.length === 0) {
     throw new Error(`No llms.txt or sitemap.xml and the sidebar scrape found no usable structure — can't import ${sourceUrl.toString()}.`)
   } else if (llms) {
-    styles.warning(`Couldn't extract a useful nav — falling back to llms.txt-based organization.`)
+    if (!scrapeDiscardedForCoverage) {
+      const d = scrapeDiagnostics || {}
+      if (d.reason === 'no-categories-round0') {
+        styles.warning(`Sidebar scrape found no nav structure on the round-0 fetch — falling back to llms.txt organization.`)
+      } else if (d.reason === 'below-threshold') {
+        styles.warning(
+          `Sidebar scrape below acceptance threshold (got ${d.categories} categor${d.categories === 1 ? 'y' : 'ies'}, ${d.matched} matched pages — need ${d.need}) — falling back to llms.txt organization.`,
+        )
+      } else {
+        styles.warning(`Sidebar scrape returned no usable nav — falling back to llms.txt organization.`)
+      }
+    }
   } else {
-    styles.warning(`Couldn't extract a useful nav — falling back to sitemap URL clustering.`)
+    if (!scrapeDiscardedForCoverage) {
+      const d = scrapeDiagnostics || {}
+      if (d.reason === 'no-categories-round0') {
+        styles.warning(`Sidebar scrape found no nav structure on the round-0 fetch — falling back to sitemap URL clustering.`)
+      } else if (d.reason === 'below-threshold') {
+        styles.warning(
+          `Sidebar scrape below acceptance threshold (got ${d.categories} categor${d.categories === 1 ? 'y' : 'ies'}, ${d.matched} matched pages — need ${d.need}) — falling back to sitemap URL clustering.`,
+        )
+      } else {
+        styles.warning(`Sidebar scrape returned no usable nav — falling back to sitemap URL clustering.`)
+      }
+    }
   }
   console.log()
 
@@ -1426,7 +1478,7 @@ function isMonotonicAlpha(titles) {
  * renders its sidebar server-side as <nav>/<aside> with <h*> section headers.
  * Returns null if coverage is too low to be useful.
  */
-async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
+async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey, diagnostics = {}) {
   // Index known pages by normalized pathname so we can match nav hrefs against them.
   const byPath = new Map()
   for (const p of knownPages) byPath.set(normalizePath(p.url), p)
@@ -1562,7 +1614,10 @@ async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
   const r0Start = Date.now()
   await visit(sourceUrl)
   const r0Ms = Date.now() - r0Start
-  if (categoryOrder.length === 0) return null
+  if (categoryOrder.length === 0) {
+    diagnostics.reason = 'no-categories-round0'
+    return null
+  }
 
   // Round 1 (parallel): visit pages so each branch has a chance to expose
   // its sub-items. Sidebars on most docs sites auto-expand the current
@@ -1596,9 +1651,21 @@ async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
   // have llms.txt to compare against.
   const categories = categoryOrder.filter((c) => c.pages.length > 0)
   if (isDiscovery) {
-    if (categories.length < 1 || matched.size < 5) return null
+    if (categories.length < 1 || matched.size < 5) {
+      diagnostics.reason = 'below-threshold'
+      diagnostics.categories = categories.length
+      diagnostics.matched = matched.size
+      diagnostics.need = '≥1 category, ≥5 matched pages'
+      return null
+    }
   } else {
-    if (categories.length < 2 || matched.size < 10) return null
+    if (categories.length < 2 || matched.size < 10) {
+      diagnostics.reason = 'below-threshold'
+      diagnostics.categories = categories.length
+      diagnostics.matched = matched.size
+      diagnostics.need = '≥2 categories, ≥10 matched pages'
+      return null
+    }
   }
   return { title: null, categories }
 }
@@ -2936,8 +3003,10 @@ function usableSections(sections) {
 }
 
 function sectionsLookUsable(sections) {
-  if (!sections || sections.length > 40) return false
-  return usableSections(sections).length >= 3
+  const usable = sections ? usableSections(sections) : [] // keep the pre-patch guard: a missing list is never usable
+  if (usable.length < 3) return false // need at least three usable sections (also rules out dividing by zero below)
+  const avg = usable.reduce((n, s) => n + s.items.length, 0) / usable.length // mean items per usable section
+  return avg >= 5 && avg <= 100 // reject sections that are on average too thin or too bloated
 }
 
 async function organizeWithClaude(parsed, model) {
@@ -3002,13 +3071,17 @@ async function organizeFromScratch(parsed, model) {
   // Rehydrate pages from the id references Claude returned.
   const expandedCategories = []
   const usedIds = new Set()
+  const usedUrls = new Set()
   for (const cat of raw.categories || []) {
     const pages = []
     for (const id of cat.pageIds || []) {
       const item = items[id]
       if (!item) continue // ignore out-of-range ids
       if (usedIds.has(id)) continue // ignore dupes
+      const normUrl = normalizePath(item.url)
+      if (usedUrls.has(normUrl)) continue // guard against same URL at different input indexes
       usedIds.add(id)
+      usedUrls.add(normUrl)
       pages.push({
         title: item.title,
         url: item.url,

diff --git a/src/utils/llms.js b/src/utils/llms.js
@@ -1,5 +1,10 @@
 const H1_RE = /^#\s+(.+)$/
 const H2_RE = /^##\s+(.+)$/
+const H3_RE = /^###\s+(.+)$/
+
+// Max items per section before we consider it "oversized" for usability purposes.
+// Mirrors the cap in usableSections() in import.js.
+const MAX_SECTION_ITEMS = 200
 // Any line whose meaningful content is a markdown link. Accepts:
 //   - Standard list rows:           `- [text](url)`, `* [text](url) — desc`
 //   - Bare link lines:              `[text](url)`
@@ -39,6 +44,24 @@ export function analyzeLlmsTxt(body, llmsUrl) {
 }
 
 export function parseLlmsTxt(body, llmsUrl) {
+  // Default parse: every H2 (##) heading opens a section.
+  const h2Result = parseSections(body, llmsUrl, H2_RE)
+
+  // Keep the H2 parse unless it is unusable as-is — i.e. it has no sections,
+  // or every section holds more than MAX_SECTION_ITEMS items — and the body
+  // contains H3 (###) headings that could serve as section boundaries instead.
+  const hasFittingSection = h2Result.sections.some((s) => s.items.length <= MAX_SECTION_ITEMS)
+  if (hasFittingSection || !/^###\s/m.test(body)) return h2Result
+
+  // Re-parse on H3, but only adopt the result when it really splits the file
+  // into more than one section; h3Fallback flags the switch for callers.
+  const h3Result = parseSections(body, llmsUrl, H3_RE)
+  if (h3Result.sections.length <= 1) return h2Result
+
+  return { ...h3Result, h3Fallback: true }
+}
+
+function parseSections(body, llmsUrl, sectionRe) {
   const lines = body.split(/\r?\n/)
   let title = null
   const sections = []
@@ -51,9 +74,9 @@ export function parseLlmsTxt(body, llmsUrl) {
       continue
     }
 
-    const h2 = line.match(H2_RE)
-    if (h2) {
-      current = { title: h2[1].trim(), items: [] }
+    const sectionMatch = line.match(sectionRe)
+    if (sectionMatch) {
+      current = { title: sectionMatch[1].trim(), items: [] }
       sections.push(current)
       continue
     }

diff --git a/src/utils/llms.test.js b/src/utils/llms.test.js
@@ -0,0 +1,160 @@
+import { test } from 'node:test'
+import assert from 'node:assert/strict'
+import { parseLlmsTxt, analyzeLlmsTxt } from './llms.js'
+
+const BASE = 'https://docs.example.com/llms.txt'
+
+// ---------------------------------------------------------------------------
+// parseLlmsTxt — standard H2 sections
+// ---------------------------------------------------------------------------
+
+test('parseLlmsTxt: parses H1 title and H2 sections', () => {
+  const body = [
+    '# My Docs',
+    '',
+    '## Getting Started',
+    '- [Overview](https://docs.example.com/overview)',
+    '- [Quickstart](https://docs.example.com/quickstart)',
+    '',
+    '## Reference',
+    '- [API](https://docs.example.com/api)',
+  ].join('\n')
+
+  const result = parseLlmsTxt(body, BASE)
+  assert.equal(result.title, 'My Docs')
+  assert.equal(result.sections.length, 2)
+  assert.equal(result.sections[0].title, 'Getting Started')
+  assert.equal(result.sections[0].items.length, 2)
+  assert.equal(result.sections[1].title, 'Reference')
+  assert.equal(result.h3Fallback, undefined)
+})
+
+test('parseLlmsTxt: items with no H2 fall into implicit Resources section', () => {
+  const body = [
+    '# My Docs',
+    '- [Overview](https://docs.example.com/overview)',
+  ].join('\n')
+
+  const result = parseLlmsTxt(body, BASE)
+  assert.equal(result.sections.length, 1)
+  assert.equal(result.sections[0].title, 'Resources')
+})
+
+// ---------------------------------------------------------------------------
+// parseLlmsTxt — H3 fallback
+// ---------------------------------------------------------------------------
+
+// Generates a Couchbase-style body: one H2 bucket, N H3 sub-sections each
+// with `itemsPerSection` pages. Once N × itemsPerSection exceeds
+// MAX_SECTION_ITEMS (200), the single H2 section triggers the H3 fallback.
+function makeCouchbaseBody(sections, itemsPerSection) {
+  const lines = ['# Couchbase', '', '> Official Couchbase docs.', '', '## Docs', '']
+  for (const title of sections) {
+    lines.push(`### ${title}`)
+    const slug = title.toLowerCase().replace(/[^a-z0-9]+/g, '-') // depends only on title, so compute once per section
+    for (let i = 0; i < itemsPerSection; i++) {
+      lines.push(`- [Page ${i}](https://docs.couchbase.com/${slug}/page-${i})`)
+    }
+    lines.push('')
+  }
+  return lines.join('\n')
+}
+
+const CB_SECTIONS = ['.NET Analytics SDK (1.0)', '.NET Entity Framework (1.0)', '.NET SDK (3.9)']
+
+test('parseLlmsTxt: falls back to H3 when single H2 section is oversized', () => {
+  // Build a body with one H2 containing 201 items so it trips the oversized threshold,
+  // then verify the H3 re-parse produces sensible sections.
+  const manyItems = Array.from({ length: 201 }, (_, i) =>
+    `- [Page ${i}](https://docs.example.com/page-${i})`
+  ).join('\n')
+
+  const body = [
+    '# Big Site',
+    '',
+    '## Docs',
+    manyItems,
+    '',
+    '### Section A',
+    '- [A1](https://docs.example.com/a1)',
+    '- [A2](https://docs.example.com/a2)',
+    '',
+    '### Section B',
+    '- [B1](https://docs.example.com/b1)',
+  ].join('\n')
+
+  const result = parseLlmsTxt(body, BASE)
+  assert.equal(result.h3Fallback, true)
+  assert.ok(result.sections.length > 1, `expected >1 section, got ${result.sections.length}`)
+
+  const titles = result.sections.map((s) => s.title)
+  assert.ok(titles.includes('Section A'))
+  assert.ok(titles.includes('Section B'))
+})
+
+test('parseLlmsTxt: Couchbase-style H3 structure produces one section per SDK', () => {
+  // 3 sections × 70 items = 210 items in the single H2 → oversized → H3 fallback
+  const body = makeCouchbaseBody(CB_SECTIONS, 70)
+  const result = parseLlmsTxt(body, 'https://docs.couchbase.com/llms.txt')
+
+  assert.equal(result.h3Fallback, true)
+  assert.equal(result.title, 'Couchbase')
+  assert.equal(result.sections.length, 3)
+  assert.equal(result.sections[0].title, '.NET Analytics SDK (1.0)')
+  assert.equal(result.sections[1].title, '.NET Entity Framework (1.0)')
+  assert.equal(result.sections[2].title, '.NET SDK (3.9)')
+  assert.equal(result.sections[0].items.length, 70)
+})
+
+test('parseLlmsTxt: does NOT use H3 fallback when H2 sections are usable', () => {
+  const body = [
+    '# Site',
+    '',
+    '## Section A',
+    '- [A1](https://docs.example.com/a1)',
+    '- [A2](https://docs.example.com/a2)',
+    '',
+    '## Section B',
+    '- [B1](https://docs.example.com/b1)',
+    '',
+    '### Sub B',
+    '- [B2](https://docs.example.com/b2)',
+  ].join('\n')
+
+  const result = parseLlmsTxt(body, BASE)
+  assert.equal(result.h3Fallback, undefined)
+  assert.equal(result.sections.length, 2)
+  assert.equal(result.sections[0].title, 'Section A')
+  assert.equal(result.sections[1].title, 'Section B')
+})
+
+test('parseLlmsTxt: does NOT use H3 fallback when no H3 headings exist', () => {
+  const manyItems = Array.from({ length: 201 }, (_, i) =>
+    `- [Page ${i}](https://docs.example.com/page-${i})`
+  ).join('\n')
+
+  const body = ['# Big Site', '', '## Docs', manyItems].join('\n')
+
+  const result = parseLlmsTxt(body, BASE)
+  assert.equal(result.h3Fallback, undefined)
+  assert.equal(result.sections.length, 1)
+  assert.equal(result.sections[0].title, 'Docs')
+})
+
+// ---------------------------------------------------------------------------
+// analyzeLlmsTxt
+// ---------------------------------------------------------------------------
+
+test('analyzeLlmsTxt: marks usable when it has link items', () => {
+  const body = '## Section\n- [Page](https://docs.example.com/page)'
+  const result = analyzeLlmsTxt(body, BASE)
+  assert.equal(result.usable, true)
+  assert.equal(result.reason, null)
+})
+
+test('analyzeLlmsTxt: marks not usable when no link items', () => {
+  const body = '# Just a title\n\nSome prose with no links.'
+  const result = analyzeLlmsTxt(body, BASE)
+  assert.equal(result.usable, false)
+  assert.ok(result.reason)
+})