From 3c944e1fa20261f203625405d98802580c4e03aa Mon Sep 17 00:00:00 2001 From: Xavier Andueza Date: Fri, 26 Jun 2026 16:22:00 +1000 Subject: [PATCH 1/2] fix: on input de-duplicate urls, on output from claude reorganize dedupe urls --- src/commands/import.js | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/commands/import.js b/src/commands/import.js index e8c5411..aa46167 100644 --- a/src/commands/import.js +++ b/src/commands/import.js @@ -397,6 +397,29 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna ) } + // Dedupe items across llms.parsed.sections by URL (first occurrence wins). + // Some llms.txt files cross-reference the same page under multiple headings — + // feeding duplicate URLs into the organize step causes the same page to appear + // in multiple sidebar sections regardless of which path (direct, icons, full + // reorg) runs downstream. + if (llms) { + const seenItemUrls = new Set() + let deduped = 0 + for (const section of llms.parsed.sections) { + const before = section.items.length + section.items = section.items.filter((item) => { + const key = normalizePath(item.url) + if (seenItemUrls.has(key)) return false + seenItemUrls.add(key) + return true + }) + deduped += before - section.items.length + } + if (deduped > 0) { + styles.info(styles.dim(`Deduped ${deduped} cross-section duplicate URL${deduped === 1 ? '' : 's'} from llms.txt sections.`)) + } + } + const dbgSuffix = `-${sourceUrl.hostname}` if (debugSnapshots) { debugSnapshots[`01-llms-parsed${dbgSuffix}.json`] = { llmsUrl, parsed: llms ? llms.parsed : null, skipped: skippedLlms } @@ -2936,8 +2959,10 @@ function usableSections(sections) { } function sectionsLookUsable(sections) { - if (!sections || sections.length > 40) return false - return usableSections(sections).length >= 3 + const usable = usableSections(sections) + if (usable.length < 3) return false + const avg = usable.reduce((n, s) => n + s.items.length, 0) / usable.length + return avg >= 5 && avg <= 100 } async function organizeWithClaude(parsed, model) { @@ -3002,13 +3027,17 @@ async function organizeFromScratch(parsed, model) { // Rehydrate pages from the id references Claude returned. const expandedCategories = [] const usedIds = new Set() + const usedUrls = new Set() for (const cat of raw.categories || []) { const pages = [] for (const id of cat.pageIds || []) { const item = items[id] if (!item) continue // ignore out-of-range ids if (usedIds.has(id)) continue // ignore dupes + const normUrl = normalizePath(item.url) + if (usedUrls.has(normUrl)) continue // guard against same URL at different input indexes usedIds.add(id) + usedUrls.add(normUrl) pages.push({ title: item.title, url: item.url, From 2f49ed2303e145f46f09326a6298b49bf8c2a805 Mon Sep 17 00:00:00 2001 From: Xavier Andueza Date: Fri, 26 Jun 2026 17:10:19 +1000 Subject: [PATCH 2/2] feat: better logging when we can't auto-format, fallback to h3 for weird llms.txt --- src/commands/import.js | 58 +++++++++++++-- src/utils/llms.js | 29 +++++++- src/utils/llms.test.js | 160 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 10 deletions(-) create mode 100644 src/utils/llms.test.js diff --git a/src/commands/import.js b/src/commands/import.js index aa46167..488ccb3 100644 --- a/src/commands/import.js +++ b/src/commands/import.js @@ -372,6 +372,9 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna } else { styles.info(styles.dim(`Merged ${llms.sourceFiles.length} llms.txt files (root: ${llmsUrl}; aggregate ratio ${s.ratio.toFixed(2)}).`)) } + if (llms.parsed.h3Fallback) { + styles.info(styles.dim(`H2 sections oversized — re-parsed using H3 (###) headings as section boundaries.`)) + } } // Pre-extract changelog pages so AI / URL clustering / section-direct paths @@ -498,6 +501,7 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna let scraped let scrapeStart = Date.now() + const scrapeDiagnostics = {} if (mintlifyNav) { scraped = { title: mintlifyNav.title, categories: mintlifyNav.categories } } else if (archbeeNav) { @@ -505,7 +509,8 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna } else { styles.info(`Scraping sidebar nav from ${styles.bold(sourceUrl.toString())}${firecrawlKey ? ' ' + styles.dim('(via Firecrawl)') : ''}...`) scrapeStart = Date.now() - scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey)) + scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey, scrapeDiagnostics)) + if (!scraped) scrapeDiagnostics.reason ??= 'unknown' } if (debugSnapshots) { debugSnapshots[`02-scraped-raw${dbgSuffix}.json`] = scraped ? JSON.parse(JSON.stringify(scraped)) : null @@ -519,6 +524,7 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna // node because that's the one /docs page the scrape saw). Discard the // scrape and fall through to the llms.txt path, which uses URL-based // clustering when multiple files were merged. + let scrapeDiscardedForCoverage = false if (scraped && llms && knownUrls.length > 0) { const scrapedPages = scraped.categories.reduce((n, c) => n + countUrlPagesDeep(c.pages), 0) const coverage = scrapedPages / knownUrls.length @@ -527,6 +533,7 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna `Scrape covered ${styles.bold(Math.round(coverage * 100) + '%')} of llms.txt pages (need ≥75%) — discarding scrape and organizing from llms.txt.`, ) scraped = null + scrapeDiscardedForCoverage = true } } @@ -660,9 +667,31 @@ async function produceOrganizedForSource(sourceUrl, options, timePhase, debugSna } else if (!llms && knownUrls.length === 0) { throw new Error(`No llms.txt or sitemap.xml and the sidebar scrape found no usable structure — can't import ${sourceUrl.toString()}.`) } else if (llms) { - styles.warning(`Couldn't extract a useful nav — falling back to llms.txt-based organization.`) + if (!scrapeDiscardedForCoverage) { + const d = scrapeDiagnostics || {} + if (d.reason === 'no-categories-round0') { + styles.warning(`Sidebar scrape found no nav structure on the round-0 fetch — falling back to llms.txt organization.`) + } else if (d.reason === 'below-threshold') { + styles.warning( + `Sidebar scrape below acceptance threshold (got ${d.categories} categor${d.categories === 1 ? 'y' : 'ies'}, ${d.matched} matched pages — need ${d.need}) — falling back to llms.txt organization.`, + ) + } else { + styles.warning(`Sidebar scrape returned no usable nav — falling back to llms.txt organization.`) + } + } } else { - styles.warning(`Couldn't extract a useful nav — falling back to sitemap URL clustering.`) + if (!scrapeDiscardedForCoverage) { + const d = scrapeDiagnostics || {} + if (d.reason === 'no-categories-round0') { + styles.warning(`Sidebar scrape found no nav structure on the round-0 fetch — falling back to sitemap URL clustering.`) + } else if (d.reason === 'below-threshold') { + styles.warning( + `Sidebar scrape below acceptance threshold (got ${d.categories} categor${d.categories === 1 ? 'y' : 'ies'}, ${d.matched} matched pages — need ${d.need}) — falling back to sitemap URL clustering.`, + ) + } else { + styles.warning(`Sidebar scrape returned no usable nav — falling back to sitemap URL clustering.`) + } + } } console.log() @@ -1449,7 +1478,7 @@ function isMonotonicAlpha(titles) { * renders its sidebar server-side as