Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 82 additions & 9 deletions src/commands/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,9 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
} else {
styles.info(styles.dim(`Merged ${llms.sourceFiles.length} llms.txt files (root: ${llmsUrl}; aggregate ratio ${s.ratio.toFixed(2)}).`))
}
if (llms.parsed.h3Fallback) {
styles.info(styles.dim(`H2 sections oversized — re-parsed using H3 (###) headings as section boundaries.`))
}
}

// Pre-extract changelog pages so AI / URL clustering / section-direct paths
Expand All @@ -397,6 +400,29 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
)
}

// Dedupe items across llms.parsed.sections by URL (first occurrence wins).
// Some llms.txt files cross-reference the same page under multiple headings —
// feeding duplicate URLs into the organize step causes the same page to appear
// in multiple sidebar sections regardless of which path (direct, icons, full
// reorg) runs downstream.
if (llms) {
const seenItemUrls = new Set()
let deduped = 0
for (const section of llms.parsed.sections) {
const before = section.items.length
section.items = section.items.filter((item) => {
const key = normalizePath(item.url)
if (seenItemUrls.has(key)) return false
seenItemUrls.add(key)
return true
})
deduped += before - section.items.length
}
if (deduped > 0) {
styles.info(styles.dim(`Deduped ${deduped} cross-section duplicate URL${deduped === 1 ? '' : 's'} from llms.txt sections.`))
}
}

const dbgSuffix = `-${sourceUrl.hostname}`
if (debugSnapshots) {
debugSnapshots[`01-llms-parsed${dbgSuffix}.json`] = { llmsUrl, parsed: llms ? llms.parsed : null, skipped: skippedLlms }
Expand Down Expand Up @@ -475,14 +501,16 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna

let scraped
let scrapeStart = Date.now()
const scrapeDiagnostics = {}
if (mintlifyNav) {
scraped = { title: mintlifyNav.title, categories: mintlifyNav.categories }
} else if (archbeeNav) {
scraped = { title: archbeeNav.title, categories: archbeeNav.categories }
} else {
styles.info(`Scraping sidebar nav from ${styles.bold(sourceUrl.toString())}${firecrawlKey ? ' ' + styles.dim('(via Firecrawl)') : ''}...`)
scrapeStart = Date.now()
scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey))
scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey, scrapeDiagnostics))
if (!scraped) scrapeDiagnostics.reason ??= 'unknown'
}
if (debugSnapshots) {
debugSnapshots[`02-scraped-raw${dbgSuffix}.json`] = scraped ? JSON.parse(JSON.stringify(scraped)) : null
Expand All @@ -496,6 +524,7 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
// node because that's the one /docs page the scrape saw). Discard the
// scrape and fall through to the llms.txt path, which uses URL-based
// clustering when multiple files were merged.
let scrapeDiscardedForCoverage = false
if (scraped && llms && knownUrls.length > 0) {
const scrapedPages = scraped.categories.reduce((n, c) => n + countUrlPagesDeep(c.pages), 0)
const coverage = scrapedPages / knownUrls.length
Expand All @@ -504,6 +533,7 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
`Scrape covered ${styles.bold(Math.round(coverage * 100) + '%')} of llms.txt pages (need ≥75%) — discarding scrape and organizing from llms.txt.`,
)
scraped = null
scrapeDiscardedForCoverage = true
}
}

Expand Down Expand Up @@ -637,9 +667,31 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna
} else if (!llms && knownUrls.length === 0) {
throw new Error(`No llms.txt or sitemap.xml and the sidebar scrape found no usable structure — can't import ${sourceUrl.toString()}.`)
} else if (llms) {
styles.warning(`Couldn't extract a useful nav — falling back to llms.txt-based organization.`)
if (!scrapeDiscardedForCoverage) {
const d = scrapeDiagnostics || {}
if (d.reason === 'no-categories-round0') {
styles.warning(`Sidebar scrape found no nav structure on the round-0 fetch — falling back to llms.txt organization.`)
} else if (d.reason === 'below-threshold') {
styles.warning(
`Sidebar scrape below acceptance threshold (got ${d.categories} categor${d.categories === 1 ? 'y' : 'ies'}, ${d.matched} matched pages — need ${d.need}) — falling back to llms.txt organization.`,
)
} else {
styles.warning(`Sidebar scrape returned no usable nav — falling back to llms.txt organization.`)
}
}
} else {
styles.warning(`Couldn't extract a useful nav — falling back to sitemap URL clustering.`)
if (!scrapeDiscardedForCoverage) {
const d = scrapeDiagnostics || {}
if (d.reason === 'no-categories-round0') {
styles.warning(`Sidebar scrape found no nav structure on the round-0 fetch — falling back to sitemap URL clustering.`)
} else if (d.reason === 'below-threshold') {
styles.warning(
`Sidebar scrape below acceptance threshold (got ${d.categories} categor${d.categories === 1 ? 'y' : 'ies'}, ${d.matched} matched pages — need ${d.need}) — falling back to sitemap URL clustering.`,
)
} else {
styles.warning(`Sidebar scrape returned no usable nav — falling back to sitemap URL clustering.`)
}
}
}
console.log()

Expand Down Expand Up @@ -1426,7 +1478,7 @@ function isMonotonicAlpha(titles) {
* renders its sidebar server-side as <nav>/<aside> with <h*> section headers.
* Returns null if coverage is too low to be useful.
*/
async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey, diagnostics = {}) {
// Index known pages by normalized pathname so we can match nav hrefs against them.
const byPath = new Map()
for (const p of knownPages) byPath.set(normalizePath(p.url), p)
Expand Down Expand Up @@ -1562,7 +1614,10 @@ async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
const r0Start = Date.now()
await visit(sourceUrl)
const r0Ms = Date.now() - r0Start
if (categoryOrder.length === 0) return null
if (categoryOrder.length === 0) {
diagnostics.reason = 'no-categories-round0'
return null
}

// Round 1 (parallel): visit pages so each branch has a chance to expose
// its sub-items. Sidebars on most docs sites auto-expand the current
Expand Down Expand Up @@ -1596,9 +1651,21 @@ async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
// have llms.txt to compare against.
const categories = categoryOrder.filter((c) => c.pages.length > 0)
if (isDiscovery) {
if (categories.length < 1 || matched.size < 5) return null
if (categories.length < 1 || matched.size < 5) {
diagnostics.reason = 'below-threshold'
diagnostics.categories = categories.length
diagnostics.matched = matched.size
diagnostics.need = '≥1 category, ≥5 matched pages'
return null
}
} else {
if (categories.length < 2 || matched.size < 10) return null
if (categories.length < 2 || matched.size < 10) {
diagnostics.reason = 'below-threshold'
diagnostics.categories = categories.length
diagnostics.matched = matched.size
diagnostics.need = '≥2 categories, ≥10 matched pages'
return null
}
}
return { title: null, categories }
}
Expand Down Expand Up @@ -2936,8 +3003,10 @@ function usableSections(sections) {
}

function sectionsLookUsable(sections) {
if (!sections || sections.length > 40) return false
return usableSections(sections).length >= 3
const usable = usableSections(sections)
if (usable.length < 3) return false
const avg = usable.reduce((n, s) => n + s.items.length, 0) / usable.length
return avg >= 5 && avg <= 100
}

async function organizeWithClaude(parsed, model) {
Expand Down Expand Up @@ -3002,13 +3071,17 @@ async function organizeFromScratch(parsed, model) {
// Rehydrate pages from the id references Claude returned.
const expandedCategories = []
const usedIds = new Set()
const usedUrls = new Set()
for (const cat of raw.categories || []) {
const pages = []
for (const id of cat.pageIds || []) {
const item = items[id]
if (!item) continue // ignore out-of-range ids
if (usedIds.has(id)) continue // ignore dupes
const normUrl = normalizePath(item.url)
if (usedUrls.has(normUrl)) continue // guard against same URL at different input indexes
usedIds.add(id)
usedUrls.add(normUrl)
pages.push({
title: item.title,
url: item.url,
Expand Down
29 changes: 26 additions & 3 deletions src/utils/llms.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
const H1_RE = /^#\s+(.+)$/
const H2_RE = /^##\s+(.+)$/
const H3_RE = /^###\s+(.+)$/

// Max items per section before we consider it "oversized" for usability purposes.
// Mirrors the cap in usableSections() in import.js.
const MAX_SECTION_ITEMS = 200
// Any line whose meaningful content is a markdown link. Accepts:
// - Standard list rows: `- [text](url)`, `* [text](url) — desc`
// - Bare link lines: `[text](url)`
Expand Down Expand Up @@ -39,6 +44,24 @@ export function analyzeLlmsTxt(body, llmsUrl) {
}

/**
 * Parse an llms.txt body into sections, preferring H2 (`##`) headings as
 * section boundaries and falling back to H3 (`###`) headings when the H2
 * split is not workable.
 *
 * The H3 re-parse is attempted only when all of the following hold:
 *   1. no H2 section has MAX_SECTION_ITEMS items or fewer (this includes the
 *      case where the H2 parse produced no sections at all),
 *   2. the body contains at least one line starting with `###` + whitespace,
 *   3. the H3 parse yields more than one section.
 * Otherwise the H2 result is returned untouched.
 *
 * @param {string} body - Raw llms.txt text.
 * @param {*} llmsUrl - Passed through unchanged to parseSections.
 * @returns {object} The parseSections result; when the H3 fallback was used,
 *   a shallow copy of the H3 result with `h3Fallback: true` added.
 */
export function parseLlmsTxt(body, llmsUrl) {
  const byH2 = parseSections(body, llmsUrl, H2_RE)

  // At least one H2 section is within the size cap: keep the H2 split.
  const hasReasonableH2 = byH2.sections.some(
    (section) => section.items.length <= MAX_SECTION_ITEMS,
  )
  if (hasReasonableH2) return byH2

  // Nothing to fall back to when the file has no H3 headings.
  const hasH3Headings = /^###\s/m.test(body)
  if (!hasH3Headings) return byH2

  // Only accept the H3 split when it actually breaks the content apart.
  const byH3 = parseSections(body, llmsUrl, H3_RE)
  if (byH3.sections.length <= 1) return byH2

  return { ...byH3, h3Fallback: true }
}

function parseSections(body, llmsUrl, sectionRe) {
const lines = body.split(/\r?\n/)
let title = null
const sections = []
Expand All @@ -51,9 +74,9 @@ export function parseLlmsTxt(body, llmsUrl) {
continue
}

const h2 = line.match(H2_RE)
if (h2) {
current = { title: h2[1].trim(), items: [] }
const sectionMatch = line.match(sectionRe)
if (sectionMatch) {
current = { title: sectionMatch[1].trim(), items: [] }
sections.push(current)
continue
}
Expand Down
160 changes: 160 additions & 0 deletions src/utils/llms.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import { test } from 'node:test'
import assert from 'node:assert/strict'
import { parseLlmsTxt, analyzeLlmsTxt } from './llms.js'

const BASE = 'https://docs.example.com/llms.txt'

// ---------------------------------------------------------------------------
// parseLlmsTxt — standard H2 sections
// ---------------------------------------------------------------------------

test('parseLlmsTxt: parses H1 title and H2 sections', () => {
const body = [
'# My Docs',
'',
'## Getting Started',
'- [Overview](https://docs.example.com/overview)',
'- [Quickstart](https://docs.example.com/quickstart)',
'',
'## Reference',
'- [API](https://docs.example.com/api)',
].join('\n')

const result = parseLlmsTxt(body, BASE)
assert.equal(result.title, 'My Docs')
assert.equal(result.sections.length, 2)
assert.equal(result.sections[0].title, 'Getting Started')
assert.equal(result.sections[0].items.length, 2)
assert.equal(result.sections[1].title, 'Reference')
assert.equal(result.h3Fallback, undefined)
})

test('parseLlmsTxt: items with no H2 fall into implicit Resources section', () => {
const body = [
'# My Docs',
'- [Overview](https://docs.example.com/overview)',
].join('\n')

const result = parseLlmsTxt(body, BASE)
assert.equal(result.sections.length, 1)
assert.equal(result.sections[0].title, 'Resources')
})

// ---------------------------------------------------------------------------
// parseLlmsTxt — H3 fallback
// ---------------------------------------------------------------------------

// Generates a Couchbase-style body: one H2 bucket, N H3 sub-sections each
// with `itemsPerSection` pages. The single H2 section then holds
// N × itemsPerSection items; once that total exceeds MAX_SECTION_ITEMS (200),
// e.g. 3 sections × 67 or more pages, parseLlmsTxt triggers the H3 fallback.
/**
 * Build a synthetic llms.txt body with a fixed preamble (H1 title, blockquote
 * summary, a single `## Docs` H2) followed by one `###` heading per entry in
 * `sections`, each listing `itemsPerSection` page links.
 *
 * @param {Iterable<string>} sections - Heading titles, one H3 block per title.
 * @param {number} itemsPerSection - Number of link rows emitted under each heading.
 * @returns {string} The body, lines joined with `\n`; every H3 block is
 *   followed by a blank line.
 */
function makeCouchbaseBody(sections, itemsPerSection) {
  const lines = ['# Couchbase', '', '> Official Couchbase docs.', '', '## Docs', '']
  for (const title of sections) {
    // The slug depends only on the heading, so derive it once per section
    // rather than once per page line.
    const slug = title.toLowerCase().replace(/[^a-z0-9]+/g, '-')
    lines.push(`### ${title}`)
    for (let i = 0; i < itemsPerSection; i++) {
      lines.push(`- [Page ${i}](https://docs.couchbase.com/${slug}/page-${i})`)
    }
    lines.push('')
  }
  return lines.join('\n')
}

const CB_SECTIONS = ['.NET Analytics SDK (1.0)', '.NET Entity Framework (1.0)', '.NET SDK (3.9)']

test('parseLlmsTxt: falls back to H3 when single H2 section is oversized', () => {
// Build a body with one H2 containing 201 items so it trips the oversized threshold,
// then verify the H3 re-parse produces sensible sections.
const manyItems = Array.from({ length: 201 }, (_, i) =>
`- [Page ${i}](https://docs.example.com/page-${i})`
).join('\n')

const body = [
'# Big Site',
'',
'## Docs',
manyItems,
'',
'### Section A',
'- [A1](https://docs.example.com/a1)',
'- [A2](https://docs.example.com/a2)',
'',
'### Section B',
'- [B1](https://docs.example.com/b1)',
].join('\n')

const result = parseLlmsTxt(body, BASE)
assert.equal(result.h3Fallback, true)
assert.ok(result.sections.length > 1, `expected >1 section, got ${result.sections.length}`)

const titles = result.sections.map((s) => s.title)
assert.ok(titles.includes('Section A'))
assert.ok(titles.includes('Section B'))
})

test('parseLlmsTxt: Couchbase-style H3 structure produces one section per SDK', () => {
// 3 sections × 70 items = 210 items in the single H2 → oversized → H3 fallback
const body = makeCouchbaseBody(CB_SECTIONS, 70)
const result = parseLlmsTxt(body, 'https://docs.couchbase.com/llms.txt')

assert.equal(result.h3Fallback, true)
assert.equal(result.title, 'Couchbase')
assert.equal(result.sections.length, 3)
assert.equal(result.sections[0].title, '.NET Analytics SDK (1.0)')
assert.equal(result.sections[1].title, '.NET Entity Framework (1.0)')
assert.equal(result.sections[2].title, '.NET SDK (3.9)')
assert.equal(result.sections[0].items.length, 70)
})

test('parseLlmsTxt: does NOT use H3 fallback when H2 sections are usable', () => {
const body = [
'# Site',
'',
'## Section A',
'- [A1](https://docs.example.com/a1)',
'- [A2](https://docs.example.com/a2)',
'',
'## Section B',
'- [B1](https://docs.example.com/b1)',
'',
'### Sub B',
'- [B2](https://docs.example.com/b2)',
].join('\n')

const result = parseLlmsTxt(body, BASE)
assert.equal(result.h3Fallback, undefined)
assert.equal(result.sections.length, 2)
assert.equal(result.sections[0].title, 'Section A')
assert.equal(result.sections[1].title, 'Section B')
})

test('parseLlmsTxt: does NOT use H3 fallback when no H3 headings exist', () => {
const manyItems = Array.from({ length: 201 }, (_, i) =>
`- [Page ${i}](https://docs.example.com/page-${i})`
).join('\n')

const body = ['# Big Site', '', '## Docs', manyItems].join('\n')

const result = parseLlmsTxt(body, BASE)
assert.equal(result.h3Fallback, undefined)
assert.equal(result.sections.length, 1)
assert.equal(result.sections[0].title, 'Docs')
})

// ---------------------------------------------------------------------------
// analyzeLlmsTxt
// ---------------------------------------------------------------------------

test('analyzeLlmsTxt: marks usable when it has link items', () => {
const body = '## Section\n- [Page](https://docs.example.com/page)'
const result = analyzeLlmsTxt(body, BASE)
assert.equal(result.usable, true)
assert.equal(result.reason, null)
})

test('analyzeLlmsTxt: marks not usable when no link items', () => {
const body = '# Just a title\n\nSome prose with no links.'
const result = analyzeLlmsTxt(body, BASE)
assert.equal(result.usable, false)
assert.ok(result.reason)
})