From 65a01b0eb4f086b75b9c67e862e94f040569cb96 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat Date: Mon, 29 Jun 2026 20:00:10 +0800 Subject: [PATCH 01/34] docs: finalize P1 provider plans + helper refactor + roadmap status --- docs/provider-roadmap.md | 59 +- .../plans/2026-06-29-p1-providers-index.md | 206 +++++++ .../plans/2026-06-29-provider-europeana.md | 555 +++++++++++++++++ .../plans/2026-06-29-provider-freesound.md | 361 +++++++++++ .../2026-06-29-provider-helpers-refactor.md | 438 ++++++++++++++ .../2026-06-29-provider-internet-archive.md | 417 +++++++++++++ .../plans/2026-06-29-provider-jamendo.md | 445 ++++++++++++++ .../plans/2026-06-29-provider-polyhaven.md | 460 ++++++++++++++ .../plans/2026-06-29-provider-rijksmuseum.md | 572 ++++++++++++++++++ 9 files changed, 3490 insertions(+), 23 deletions(-) create mode 100644 docs/superpowers/plans/2026-06-29-p1-providers-index.md create mode 100644 docs/superpowers/plans/2026-06-29-provider-europeana.md create mode 100644 docs/superpowers/plans/2026-06-29-provider-freesound.md create mode 100644 docs/superpowers/plans/2026-06-29-provider-helpers-refactor.md create mode 100644 docs/superpowers/plans/2026-06-29-provider-internet-archive.md create mode 100644 docs/superpowers/plans/2026-06-29-provider-jamendo.md create mode 100644 docs/superpowers/plans/2026-06-29-provider-polyhaven.md create mode 100644 docs/superpowers/plans/2026-06-29-provider-rijksmuseum.md diff --git a/docs/provider-roadmap.md b/docs/provider-roadmap.md index 8ec6020..bd02183 100644 --- a/docs/provider-roadmap.md +++ b/docs/provider-roadmap.md @@ -1,24 +1,34 @@ # refkit provider roadmap -Status as of 2026-06-23. Grounded in a web-verified landscape scan (104 +Status as of 2026-06-29. Grounded in a web-verified landscape scan (104 candidate sources → 101 unique → 16 depth-verified). This is the contract for expanding refkit's provider coverage; execute against it, not against memory. 
-## Current inventory (7 providers) +> **Progress: Phases 1–4 are DONE in refkit.** §1 (CC version axis) shipped in +> commit `75c557e`; §2 P0 providers (flickr, wikimedia-commons, met, artic, +> smithsonian) and §3/§4 cheap modality wins (openverse-audio, pexels-video, +> pixabay-video) are all built and tested. The remaining work is the §3 **P1 +> backlog** (rijksmuseum, europeana, freesound, jamendo, internet-archive, +> poly-haven/ambientcg). One open caveat: §1 item 7 lives in the **Slate** repo +> (not this worktree) and is not verified here. -| Modality | Providers | Verdict | +## Current inventory (12 provider packages, ~15 provider ids) + +| Modality | Providers | Status | |---|---|---| -| image | openverse, unsplash, pexels, pixabay | mainstream stock + the main CC aggregator — solid, **but two glaring omissions: Flickr, Wikimedia Commons** | -| text | gutendex (Project Gutenberg), poetrydb | thin — only PD books + a niche poetry DB | +| image | openverse, unsplash, pexels, pixabay, **flickr**, **wikimedia-commons**, **met**, **artic**, **smithsonian** | ✅ Flickr + Wikimedia gaps closed; GLAM CC0 cluster (met/artic/smithsonian) added | +| text | gutendex (Project Gutenberg), poetrydb | unchanged — still thin (PD books + niche poetry DB) | +| audio | **openverse-audio** | ✅ cheap leg added (§4) | +| video | **pexels-video**, **pixabay-video** | ✅ cheap legs added (§4) | | grey/discovery | brave | represents the web-search category; do **not** bulk-add more (every web source is `license:unknown`) | -| video / audio / icon·vector / 3d·texture | — | **no leg at all** | +| icon·vector / 3d·texture | — | still no leg (P1 backlog: poly-haven/ambientcg) | The moat is per-item license normalization, so the highest-value additions are mainstream sources that return **structured per-item license** (Flickr, Wikimedia, the GLAM museum APIs), not more commodity stock or more grey web search. 
-## §1 — Prerequisite: CC version axis (Phase 1, atomic, blocks everything) +## §1 — Prerequisite: CC version axis (Phase 1) — ✅ DONE (`75c557e`) The current `LicenseId` enum only models `CC-BY-4.0` / `CC-BY-SA-4.0`. Every CC-BY/BY-SA at version 1.0–3.0 collapses to `unknown` → `needs-review`. The @@ -49,7 +59,7 @@ which today throws away CC-BY-2.0/3.0 results as `unknown`. **Files (atomic — a partial rename leaves the build red, so it is one phase):** -refkit: +refkit (all done in `75c557e`): 1. `packages/core/src/license.ts` — `LicenseId` union + `LICENSE_FACTS` keys. 2. `packages/core/src/rights.ts` — `licenseIdSchema` enum + add `licenseVersion?` to interface & schema. @@ -62,7 +72,7 @@ refkit: "older CC-BY → CC-BY, allowed-with-attribution, version preserved". This is the proof the fix works. -Slate (consumes refkit via link — same atomic change): +Slate (consumes refkit via link — same atomic change) — ⚠️ NOT verified in this worktree: 7. `packages/core/src/retrieval/__tests__/reference-to-asset.test.ts` — test data `'CC-BY-4.0'` → `'CC-BY'` (+ `licenseVersion: '4.0'`), and the `metadata.license`/attribution assertions. @@ -73,9 +83,11 @@ suite green in the Slate worktree. Optional follow-up (not Phase 1): a `licenseDeedUrl(license, version?)` helper so attribution links the exact CC deed instead of only the source page. -## §2 — P0 providers (mainstream + per-item clean license + i2i-usable) +## §2 — P0 providers (mainstream + per-item clean license + i2i-usable) — ✅ DONE -Each is an independent `@refkit/provider-*` satellite. Build after Phase 1. +All five shipped as independent `@refkit/provider-*` satellites +(`provider-flickr`, `provider-wikimedia-commons`, `provider-met`, +`provider-artic`, `provider-smithsonian`). 
| Provider | Modality | Effort | Auth | License field (verified) | Mapping | |---|---|---|---|---|---| @@ -96,13 +108,13 @@ Notes: ## §3 — P1 providers, modality gaps & cheap wins -**Cheapest wins first — reuse an existing integration's key + license mapping:** -- **openverse audio** — the openverse API already serves audio under the same - key/shape; near-free audio leg. -- **pexels-video / pixabay-video** — same keys, same license as the image - providers we already ship; a different endpoint adds the video leg cheaply. +**Cheapest wins — ✅ DONE (all three built):** +- **openverse-audio** ✅ — same key/shape as openverse image; `openverseAudio()` + in `provider-openverse`. +- **pexels-video** ✅ / **pixabay-video** ✅ — `pexelsVideo()` etc., same keys as + the image providers, video endpoint. -**Other P1:** +**Other P1 — ⬜ REMAINING (this is the actual next work):** | Provider | Leg | Caveat (verified) | |---|---|---| @@ -129,11 +141,12 @@ Notes: ## §5 — Sequencing -1. **Phase 1** — §1 CC version axis (atomic, refkit + Slate test). ← do first. -2. **Phase 2** — flickr + wikimedia-commons (the two mainstream image gaps). -3. **Phase 3** — met + artic + smithsonian (GLAM CC0 cluster; Met/Artic are S). -4. **Phase 4** — cheap modality wins: openverse-audio, pexels-video, pixabay-video. -5. **Phase 5+** — P1 backlog as demand dictates. +1. ✅ **Phase 1** — §1 CC version axis (refkit done in `75c557e`; Slate test + unverified here). +2. ✅ **Phase 2** — flickr + wikimedia-commons (the two mainstream image gaps). +3. ✅ **Phase 3** — met + artic + smithsonian (GLAM CC0 cluster). +4. ✅ **Phase 4** — cheap modality wins: openverse-audio, pexels-video, pixabay-video. +5. ⬜ **Phase 5+** — P1 backlog as demand dictates. ← **only remaining work**. -Phases 2–4 are independent per-package satellites → parallelizable via +Phase 5+ items are independent per-package satellites → parallelizable via worktree-isolated subagents (one provider per agent). 
diff --git a/docs/superpowers/plans/2026-06-29-p1-providers-index.md b/docs/superpowers/plans/2026-06-29-p1-providers-index.md new file mode 100644 index 0000000..91595e8 --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-p1-providers-index.md @@ -0,0 +1,206 @@ +# Phase 5+ P1 Providers — Index & Shared Skeleton + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement each per-provider plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add the remaining roadmap §3 P1 provider satellites to refkit, each as an independent `@refkit/provider-*` package that returns license-normalized `Reference`s. + +**Architecture:** Every provider is a thin satellite depending only on `@refkit/core`. It exposes one factory function returning `defineProvider({ id, modalities, queryFeatures, capabilities, search })`. `search` fetches the source API via `ctx.fetch`, maps each item's source-declared license to a `LicenseId` (+ optional `licenseVersion`), and emits `Reference`s through a `toReference` mapper. Permissions are never stored — they derive from `license` via core's `factsFor()`/`evaluateUse()`. This index defines the boilerplate skeleton and cross-cutting decisions shared by all six per-provider plans; read it first. + +**Tech Stack:** TypeScript (ESM, `"type": "module"`), tsup (build), vitest (test), zod (via core), pnpm workspaces, changesets. 
+ +--- + +## Provider set & sequencing + +Build in this order (cheapest/cleanest first; each is an independent package → parallelizable via worktree-isolated subagents): + +| # | Plan file | Package | Modality | Auth | License source | Effort | +|---|---|---|---|---|---|---| +| 1 | `2026-06-29-provider-rijksmuseum.md` | `@refkit/provider-rijksmuseum` | image (art) | keyless (modern `data.rijksmuseum.nl`, N+1) | per-item CC0/PD rights URI | M | +| 2 | `2026-06-29-provider-polyhaven.md` | `@refkit/provider-polyhaven` | image (texture/HDRI) | keyless | whole-source CC0 (hardcoded ToS) | S | +| 3 | `2026-06-29-provider-freesound.md` | `@refkit/provider-freesound` | audio (SFX) | BYOK (token) | per-item CC name string | M | +| 4 | `2026-06-29-provider-jamendo.md` | `@refkit/provider-jamendo` | audio (music) | BYOK (client_id) | per-item `license_ccurl` | M | +| 5 | `2026-06-29-provider-europeana.md` | `@refkit/provider-europeana` | image/mixed | BYOK (free key) | per-item `edm:rights` vocab | L | +| 6 | `2026-06-29-provider-internet-archive.md` | `@refkit/provider-internet-archive` | video / text | keyless | dirty per-item `licenseurl` | M–L | + +ambientcg is folded into the poly-haven plan as a sibling factory (`ambientcg()`) — same whole-source-CC0 shape, different endpoint. + +> **Parallelization caveat:** the per-package work (new `packages/provider-<id>/` dir + `src/` + tests, Tasks 1–N of each plan) is fully independent and parallelizable via worktree-isolated subagents. But the **Shared Task S9 central-wiring steps touch five shared files** — `packages/mcp/src/cli.ts`, `packages/mcp/src/__tests__/mcp.test.ts`, root `README.md`, root `vitest.config.ts`, and `packages/mcp/package.json` (S9.5). If you build providers in parallel, **defer S9 and run all the central wiring in a single serialized integration pass** at the end (one commit per provider is fine, but edit the shared files sequentially), or you will get merge conflicts. 
S9.1–S9.6 are append-only edits, so a serial pass is quick. + +--- + +## Cross-cutting design decisions (lock these before coding) + +These resolve the roadmap §3 caveats. They follow refkit's existing conventions (see `provider-met`, `provider-flickr`, `evaluate-use.ts`): **conservative, per-item, strict-deny; anything not clearly granted → `unknown` → `needs-review`. Never fabricate a license.** + +- **D1 — Modality ceiling (affects poly-haven):** `core/src/modality.ts` defines exactly `image | video | audio | text`; `referenceSchema` enforces it. There is **no `3d`/`texture` modality**. Decision: map textures and HDRIs as `modality: 'image'` (they are image files); **skip 3D model formats (.blend/.fbx/.gltf) for v1** (YAGNI — no core change). Document the skip in the README. Adding a `3d` modality is explicitly out of scope for Phase 5. +- **D2 — Whole-source CC0 hardcode (poly-haven, ambientcg):** no per-item license field exists; hardcode `license: 'CC0-1.0'`, `rights.raw.sourceTerms = <the source's CC0 terms statement>`. Mirror `provider-met`'s hardcoded-CC0 shape exactly. +- **D3 — Dirty license (internet-archive):** map an item only when it carries a parseable `licenseurl`/`rights`; **every item without one → `license: 'unknown'`** (core turns it into `needs-review`). Do not drop them silently and do not guess PD. +- **D4 — License name-string mapping (freesound):** freesound returns a CC name/short string (e.g. `"Attribution"`, `"Creative Commons 0"`), not a URL or version. Map the name → family `LicenseId`; **omit `licenseVersion`** (no reliable version). Unrecognized name → `unknown`. +- **D5 — Partial enum fit (jamendo):** jamendo `license_ccurl` → match the URL to a family: CC-BY → `CC-BY`, CC-BY-SA → `CC-BY-SA` (capture version from the URL when present); CC-BY-NC*/CC-BY-ND* → `proprietary`; anything unrecognized → `unknown`. +- **D6 — Hotlinked media (europeana):** media bytes are third-party-hosted; set `rights.rehostPolicy: 'hotlink-required'` (not `cache-allowed`). 
Map `edm:rights` controlled-vocab URIs per D5-style URL matching: CC deeds → CC family (via `mapCcDeedUrl`); **rightsstatements.org → faithful per-token mapping** (via the shared `mapRightsUrl`): In-Copyright `InC*` → `proprietary`; `NoC-US` → `PD` + `jurisdiction:'US'` (jurisdiction-scoped); `NoC-NC` → `proprietary`; opaque/undetermined (`NoC-OKLR`/`NoC-CR`/`CNE`/`UND`/`NKC`) → `unknown`. rightsstatements are rights-STATUS statements, not grants — map them to the closest TRUE representation rather than discarding the source's signal as blanket `unknown` (that would be lossy, not "faithful"); but never *guess* PD where the source said nothing. Same `mapRightsUrl` used by internet-archive's `licenseurl`. +- **D7 — License version from a CC URL:** when a CC deed URL is available, extract the version with `/\/licenses\/by(?:-sa)?\/(\d\.\d)\//` and set `licenseVersion` only for `CC-BY`/`CC-BY-SA` families. (This is the same *version-guard* convention as `provider-openverse`/`provider-flickr` — `licenseVersion` lives only on the BY/BY-SA families — but the *extraction mechanism* differs by source: openverse reads a structured `license_version` field and flickr maps a numeric license id, neither parses the version out of a URL by regex.) +- **D8 — `preview.url`/`thumbnail.url` must be an image resource, never a web page.** Some sources expose a *viewer/landing page* URL alongside (or instead of) the real image — Rijksmuseum Linked-Art `access_point`s, Europeana `edmIsShownAt`. We do **not** know from a URL string alone whether it's an image, and a network probe is out of scope (`core` never fetches bytes; an extra request per item is too costly). 
So: **(1) read the type the API gives** — a MIME field (`format`, `ebucoreHasMimeType`) or the media-vs-page semantic distinction (`edmIsShownBy` vs `edmIsShownAt`); **(2) cheap URL-string heuristic fallback** (image extension / `iiif` / IIIF request path / known image CDN / `/thumbnail/`); **(3) degrade** — if no image-like URL qualifies, omit `preview` (fall back to a known-image thumbnail if any), and for an image-only provider drop the item rather than surface a page. Never put a page URL in `preview.url`. Do **not** add an npm image-detection dependency (`file-type`/`image-type` need the bytes; `is-image-url` is just an extension check) — a one-line heuristic + reading the response type covers it with zero deps. Applies to: rijksmuseum, europeana (and any future image provider whose API mixes media and page URLs). + +Each per-provider plan's first task is a 1-line checkbox confirming which decisions apply. + +--- + +## Shared Task S0 — Provider satellite skeleton (every plan starts here) + +Each per-provider plan references this task with its own substitution row, then adds only the `src/index.ts` mapper/search and the test. Substitute throughout: + +- `<id>` — provider id / dir suffix, e.g. `rijksmuseum` (used in `provider-<id>`, `referenceId('<id>', …)`, the provider `id`). +- `<Fn>` — exported factory name, e.g. `rijksmuseum`. +- `<Title>` — human name, e.g. `Rijksmuseum`. +- `<modality>` — `image` | `audio` | `video` | `text`. +- `<auth>` — `keyless` | `API key`. +- `<licenseCol>` — README license column, e.g. `per-item CC / PD`. + +- [ ] **S0.1: Create the package directory and `package.json`** + +Create `packages/provider-<id>/package.json` (copy of `provider-met`'s, renamed). 
Keywords should reflect the source: + +```json +{ + "name": "@refkit/provider-<id>", + "version": "0.1.0", + "description": "<Title> provider satellite for refkit.", + "type": "module", + "license": "Apache-2.0", + "keywords": ["refkit", "reference-retrieval", "license", "attribution", "refkit-provider", "<id>"], + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { ".": "./src/index.ts" }, + "scripts": { + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest watch", + "build": "tsup", + "prepublishOnly": "tsup" + }, + "dependencies": { "@refkit/core": "workspace:*" }, + "files": ["dist", "LICENSE"], + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { ".": { "types": "./dist/index.d.ts", "import": "./dist/index.js" } } + } +} +``` + +- [ ] **S0.2: Create `tsconfig.json`, `tsup.config.ts`, `vitest.config.ts`** + +`packages/provider-<id>/tsconfig.json`: +```json +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "outDir": "out", "rootDir": "src", "types": ["node"] }, + "include": ["src/**/*"] +} +``` + +`packages/provider-<id>/tsup.config.ts`: +```ts +import { defineConfig } from 'tsup' + +export default defineConfig({ + entry: ['src/index.ts'], + format: ['esm'], + dts: true, + clean: true, + outDir: 'dist', + sourcemap: true, +}) +``` + +`packages/provider-<id>/vitest.config.ts`: +```ts +import { defineConfig } from 'vitest/config' +export default defineConfig({ test: { name: 'provider-<id>', environment: 'node', include: ['src/**/*.{test,spec}.ts'] } }) +``` + +- [ ] **S0.3: Copy `LICENSE` and write `README.md`** + +```bash +cp packages/provider-met/LICENSE packages/provider-<id>/LICENSE +``` + +`packages/provider-<id>/README.md` (follow `provider-met`'s shape): +```markdown +# @refkit/provider-<id> + +Search **<Title>** as license-tagged <modality> references — a provider satellite for **refkit** (use with 
[`@refkit/core`](https://www.npmjs.com/package/@refkit/core)). + +- **Source:** <Title> +- **Auth:** <auth> +- **Modality:** <modality> +- **License:** <licenseCol> + +## Usage + +​```ts +import { createRefkit } from '@refkit/core' +import { <Fn> } from '@refkit/provider-<id>' + +const refkit = createRefkit({ providers: [<Fn>(/* config */)] }) +const refs = await refkit.search({ query: 'cat', modalities: ['<modality>'] }) +​``` + +Gate by intended use with `refkit.evaluateUse(ref, 'commercial-product')`. See [`@refkit/core`](https://www.npmjs.com/package/@refkit/core) for the full API. +``` + +- [ ] **S0.4: Install workspace deps** + +Run: `pnpm install` +Expected: lockfile updates; `@refkit/provider-<id>` resolves `@refkit/core` via `workspace:*`. (No commit yet — bundle with the first real change.) + +## Shared Task S9 — Central wiring (every plan ends here) + +After `src/index.ts` + test are green, register the provider: + +- [ ] **S9.1: Add the leaf vitest project** — in root `vitest.config.ts`, append `'./packages/provider-<id>/vitest.config.ts',` to the `projects` array. + +- [ ] **S9.2: Add to the README provider table** — in `README.md` (the `| @refkit/provider-… |` table around line 156-167), add a row: + `| `@refkit/provider-<id>` | <Title> | <modality> | <auth> | <licenseCol> |` + +- [ ] **S9.3: Wire the zero-config CLI** — in `packages/mcp/src/cli.ts`: + - add `import { <Fn> } from '@refkit/provider-<id>'` + - **keyless** providers: add `<Fn>()` to the base `providers` array. + - **BYOK** providers: add `if (env.<ENVVAR>) providers.push(<Fn>({ ...: env.<ENVVAR> }))` after the existing BYOK block. Pick a clear `<ENVVAR>` (e.g. `FREESOUND_TOKEN`, `JAMENDO_CLIENT_ID`, `EUROPEANA_KEY`). rijksmuseum, poly-haven/ambientcg and internet-archive are keyless. 
+ +- [ ] **S9.4: Extend the CLI wiring test** — in `packages/mcp/src/__tests__/mcp.test.ts` (`describe('defaultProviders'…)`, ~line 227): + - keyless: add `'<id>'` to the id list asserted by `'includes every keyless provider by default'`. + - BYOK: add an assertion mirroring the unsplash gate — id absent without env, present with `{ <ENVVAR>: 'k' }`. + +- [ ] **S9.5: Add the provider as a dependency of `mcp`** — `packages/mcp/package.json` already lists the provider packages that `mcp` consumes; add `"@refkit/provider-<id>": "workspace:*"` there (required: `cli.ts` imports the package in S9.3). + +- [ ] **S9.6: Write a changeset** — create `.changeset/provider-<id>.md`: +```markdown +--- +"@refkit/provider-<id>": minor +"@refkit/mcp": minor +--- + +Add @refkit/provider-<id>: <Title> as license-normalized <modality> references. +``` + +- [ ] **S9.7: Verify the whole repo green** + +Run: `pnpm install && pnpm -r typecheck && pnpm test:run` +Expected: typecheck clean; all vitest projects (including `provider-<id>`) pass. + +- [ ] **S9.8: Commit** +```bash +git add -A +git commit -m "feat(provider-<id>): <Title> satellite (P1)" +``` + +--- + +## Self-Review (run after all per-provider plans are written) + +1. **Spec coverage:** all six §3 P1 rows have a plan; §3 cheap wins (openverse-audio, pexels/pixabay-video) already shipped — no plan needed. +2. **Decision coverage:** each plan's Task 1 states which of D1–D8 apply. +3. **Type consistency:** every plan emits a valid `Reference` (required: `id, modality, source{providerId,sourceUrl}, canonicalUrl, rights, verifiedAt, relevance`) and a valid `RightsRecord` (required: `license, rehostPolicy, raw{sourceTerms,sourceUrl}`); `licenseVersion` only for CC-BY/CC-BY-SA. +4. **Skip list intact:** none of these are on §4 (no Getty/Shutterstock/Kaboompics/web-search). 
diff --git a/docs/superpowers/plans/2026-06-29-provider-europeana.md b/docs/superpowers/plans/2026-06-29-provider-europeana.md new file mode 100644 index 0000000..15191a0 --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-provider-europeana.md @@ -0,0 +1,555 @@ +# Europeana Provider Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. This plan extends the **shared skeleton** in `2026-06-29-p1-providers-index.md` — read that first; execute **Shared Task S0** before Task 2 and **Shared Task S9** as the final task. Do not re-paste S0/S9 boilerplate here. Write tests first (TDD): each code task is failing-test → run (FAIL) → implement → run (PASS) → commit. + +**Goal:** Add `@refkit/provider-europeana` — a thin satellite that searches the **Europeana Search API** and returns license-normalized image `Reference`s. Each item's `edm:rights` controlled-vocabulary URI is mapped to a core `LicenseId` (+ CC version where applicable); permissions are never stored (derived by core's `factsFor()`/`evaluateUse()`). + +**Architecture:** One factory `europeana(config: EuropeanaConfig)` returning `defineProvider({ id: 'europeana', modalities: ['image'], queryFeatures, capabilities, search })`. `search` calls the Search API via `ctx.fetch`, maps each `items[]` element through `toReference`. Four traits make Europeana different from the Met/Flickr templates: + +- **Hotlinked media (D6):** media bytes (`edmIsShownBy`) are hosted by third-party data providers, NOT by Europeana. Set `rights.rehostPolicy: 'hotlink-required'` (NOT Flickr's `'cache-allowed'`). +- **`preview` must be an image, never a web page:** EDM distinguishes `edmIsShownBy` (the media resource) from `edmIsShownAt` (a landing **page**). 
`toReference` puts **only `edmIsShownBy`** into `preview` (gated by `ebucoreHasMimeType`/URL heuristic) and **never `edmIsShownAt`**; `thumbnail` comes from `edmPreview` (Europeana's own thumbnail image service). An item with neither a usable image nor a thumbnail is dropped. `mediaType` is read from `ebucoreHasMimeType`, inferred from the URL extension, or defaulted — not hardcoded. +- **Image-only v1 scope (D1):** Europeana returns mixed media (`type` ∈ `IMAGE | SOUND | VIDEO | TEXT | 3D`). v1 maps only `type === 'IMAGE'` → `modality: 'image'`; non-image items are dropped. SOUND/VIDEO/TEXT support is a documented follow-up (would map to `audio`/`video`/`text` modalities). The search request is constrained server-side with `qf=TYPE:IMAGE` and `media=true`, and `toReference` defensively re-checks `type` (a belt-and-suspenders guard against the filter being relaxed). +- **Array-typed fields:** nearly every metadata field is a JSON array (`title`, `dataProvider`, `provider`, `edmPreview`, `edmIsShownBy`, `edmIsShownAt`, `rights`); only `id`, `type`, `guid` are scalars. Use a safe `first()` helper that returns the first element or `undefined`. + +**Tech Stack:** TypeScript (ESM, `"type": "module"`), tsup, vitest, zod (via core), pnpm workspaces, changesets. Depends only on `@refkit/core`. + +--- + +## Task 1: Decisions & scaffold + +- [ ] **1.1: Confirm the decisions that apply** (from the index's D1–D8): + - **D1 — Modality ceiling / image-only scope:** v1 maps only `type === 'IMAGE'` → `modality: 'image'`; SOUND/VIDEO/TEXT/3D are dropped and noted as a follow-up in the README. + - **D5-style — Map a rights-vocab URI to a license family** (the index files europeana's rights mapping under **D6**, "per D5-style URL matching"; D5 proper is jamendo's): `creativecommons.org/...` deeds → CC family; `creativecommons.org/publicdomain/...` → CC0/PD; CC NC/ND → `proprietary`. 
**rightsstatements.org is mapped faithfully per token** (not blanket-unknown): InC* → `proprietary` (copyrighted, no grant); `NoC-US` → `PD` + `jurisdiction:'US'`; `NoC-NC` → `proprietary` (non-commercial); `NoC-OKLR`/`NoC-CR`/`CNE`/`UND`/`NKC` → `unknown`. CC NC/ND and in-copyright statements are NEVER mapped to a permissive license; a versioned jurisdiction-scoped PD carries its `jurisdiction`. + - **D6 — Hotlinked media:** `rights.rehostPolicy: 'hotlink-required'`. + - **D7 — License version from a CC URL:** extract version via `/\/licenses\/by(?:-sa)?\/(\d\.\d)\//`; set `licenseVersion` only for `CC-BY`/`CC-BY-SA`. + - **D8 — `preview` must be an image, not a page:** source `preview` from `edmIsShownBy` only (gated by `ebucoreHasMimeType`/URL heuristic), never `edmIsShownAt`; `thumbnail` from `edmPreview`; drop items with neither; `mediaType` read/inferred, not hardcoded. + +- [ ] **1.2: Execute Shared Task S0** (see `2026-06-29-p1-providers-index.md` → "Shared Task S0 — Provider satellite skeleton") with this substitution row: + + | placeholder | value | + |---|---| + | `<id>` | `europeana` | + | `<Fn>` | `europeana` | + | `<Title>` | `Europeana` | + | `<modality>` | `image` | + | `<auth>` | `API key` | + | `<licenseCol>` | `per-item CC / PD / rights-statement` | + + In the README (S0.3), under the bullet list, add a one-line scope note: *"v1 returns images only (`type=IMAGE`); audio/video/text records are a planned follow-up. Media is hotlinked from third-party data providers — cache/rehost is not permitted (`rehostPolicy: 'hotlink-required'`)."* + + Do NOT commit at the end of S0 — bundle the package skeleton with the first real change in Task 2. + +--- + +## Task 2: Rights mapper — `mapEuropeanaRights` (TDD) + +- [ ] **2.1: Write the failing test** `packages/provider-europeana/src/__tests__/europeana.test.ts`. 
This first slice tests only the exported pure mapper: + +```ts +import { describe, expect, it } from 'vitest' +import { mapEuropeanaRights } from '../index' + +describe('mapEuropeanaRights', () => { + it('maps CC0 and Public Domain Mark to open licenses (no version)', () => { + expect(mapEuropeanaRights('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapEuropeanaRights('http://creativecommons.org/publicdomain/mark/1.0/')).toEqual({ license: 'PD' }) + }) + + it('maps CC-BY / CC-BY-SA and captures the version', () => { + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapEuropeanaRights('https://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + }) + + it('maps NC / ND variants to proprietary (not an open grant)', () => { + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by-nc/4.0/')).toEqual({ license: 'proprietary' }) + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by-nc-sa/4.0/')).toEqual({ license: 'proprietary' }) + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by-nd/4.0/')).toEqual({ license: 'proprietary' }) + }) + + it('maps rightsstatements.org faithfully: InC→proprietary, NoC-US→PD+US, NoC-NC→proprietary', () => { + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/NoC-US/1.0/')).toEqual({ license: 'PD', jurisdiction: 'US' }) + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/NoC-NC/1.0/')).toEqual({ license: 'proprietary' }) + }) + + it('maps opaque/undetermined rightsstatements + empty/unrecognized to unknown', () => { + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/NoC-OKLR/1.0/')).toEqual({ license: 'unknown' }) + 
expect(mapEuropeanaRights('http://rightsstatements.org/vocab/CNE/1.0/')).toEqual({ license: 'unknown' }) + expect(mapEuropeanaRights('')).toEqual({ license: 'unknown' }) + expect(mapEuropeanaRights('http://example.org/some-other-license')).toEqual({ license: 'unknown' }) + }) +}) +``` + +- [ ] **2.2: Run (expect FAIL — `mapEuropeanaRights` not exported yet)** + +```bash +pnpm --filter @refkit/provider-europeana test +``` + +Expected: FAIL (import/resolve error or assertion failures). + +- [ ] **2.3: Implement the mapper** in `packages/provider-europeana/src/index.ts`. Order matters: check `publicdomain/zero` and `publicdomain/mark` first, then NC/ND (→ proprietary) before plain BY/BY-SA, because `by-nc-sa` contains the substring `by-sa`. + +```ts +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +const BASE = 'https://api.europeana.eu/record/v2/search.json' + +/** Map a Europeana `edm:rights` controlled-vocabulary URI to a core license id (+ CC version, + * + jurisdiction for jurisdiction-scoped PD). Conservative (D5): only clearly-open CC deeds and + * PD/CC0 become open grants; CC NC/ND → proprietary; rightsstatements.org is mapped faithfully + * per token (see below); anything unrecognized/empty → unknown. */ +// rightsstatements.org is a rights-STATUS vocabulary (not license grants). Map each token +// FAITHFULLY (index D5-style): InC* → proprietary (copyrighted, no grant); NoC-US → PD scoped +// to the US via the jurisdiction field; NoC-NC → proprietary (non-commercial → commercial out); +// opaque/undetermined (NoC-OKLR/CR, CNE, UND, NKC) → unknown. (This mirrors core `mapRightsUrl`; +// the helper-refactor Task 4 replaces this inlined copy with that import.) 
+const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = { + 'inc': { license: 'proprietary' }, 'inc-ow-eu': { license: 'proprietary' }, 'inc-edu': { license: 'proprietary' }, + 'inc-nc': { license: 'proprietary' }, 'inc-ruu': { license: 'proprietary' }, + 'noc-us': { license: 'PD', jurisdiction: 'US' }, + 'noc-nc': { license: 'proprietary' }, + 'noc-oklr': { license: 'unknown' }, 'noc-cr': { license: 'unknown' }, + 'cne': { license: 'unknown' }, 'und': { license: 'unknown' }, 'nkc': { license: 'unknown' }, +} + +export function mapEuropeanaRights(uri: string): { license: LicenseId; version?: string; jurisdiction?: string } { + const u = (uri || '').toLowerCase() + if (!u) return { license: 'unknown' } + // rightsstatements.org — faithful per-token mapping (not blanket unknown). + const rs = u.match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) + if (rs) return RIGHTS_STATEMENT[rs[1]] ?? { license: 'unknown' } + // Public domain dedications / marks (no version surfaced). + if (u.includes('creativecommons.org/publicdomain/zero')) return { license: 'CC0-1.0' } + if (u.includes('creativecommons.org/publicdomain/mark')) return { license: 'PD' } + // Non-commercial / no-derivatives variants are NOT open grants → proprietary. + // Checked before plain by/by-sa because "by-nc-sa" contains "by-sa". + if (/creativecommons\.org\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } + // Open CC deeds: capture the version (D7) for the attribution families only. + const bySa = u.match(/creativecommons\.org\/licenses\/by-sa\/(\d\.\d)/) + if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } + const by = u.match(/creativecommons\.org\/licenses\/by\/(\d\.\d)/) + if (by) return { license: 'CC-BY', version: by[1] } + return { license: 'unknown' } +} +``` + +- [ ] **2.4: Run (expect PASS)** + +```bash +pnpm --filter @refkit/provider-europeana test +``` + +Expected: PASS (the `mapEuropeanaRights` describe block). 
+ +- [ ] **2.5: Commit** + +```bash +git add -A && git commit -m "feat(provider-europeana): scaffold + edm:rights mapper" +``` + +--- + +## Task 3: `toReference` mapper (TDD) + +- [ ] **3.1: Add failing tests** to `europeana.test.ts`. These exercise the full item → `Reference` mapping with realistic, **array-typed** fixtures and assert downstream `evaluateUse` verdicts + the hotlink rehost policy. Append: + +```ts +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { europeana } from '../index' + +// Realistic Europeana Search API item shapes. Note every metadata field is an +// array; id/type/guid are scalars. id is "/datasetId/recordId" with a leading slash. +const ITEM_CC0 = { + id: '/2048128/europeana_fashion_12345', + type: 'IMAGE', + title: ['A Painted Fan'], + dataProvider: ['Rijksmuseum'], + provider: ['Europeana Fashion'], + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/cc0thumb.jpg'], + edmIsShownBy: ['https://images.example.org/cc0-full.jpg'], + edmIsShownAt: ['https://www.rijksmuseum.nl/item/cc0'], + rights: ['http://creativecommons.org/publicdomain/zero/1.0/'], +} +const ITEM_BY_SA = { + id: '/9876543/abc_xyz', + type: 'IMAGE', + title: ['A Photographed Statue'], + dataProvider: ['Some Museum'], + provider: ['Some Aggregator'], + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/bysathumb.jpg'], + edmIsShownBy: ['https://images.example.org/bysa-full.jpg'], + edmIsShownAt: ['https://museum.example.org/item/bysa'], + rights: ['https://creativecommons.org/licenses/by-sa/3.0/'], +} +const ITEM_INC = { + id: '/111/in_copyright', + type: 'IMAGE', + title: ['A Modern Photo'], + dataProvider: ['Living Archive'], + provider: ['Aggregator'], + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/incthumb.jpg'], + edmIsShownBy: ['https://images.example.org/inc-full.jpg'], + edmIsShownAt: ['https://archive.example.org/item/inc'], + rights: ['http://rightsstatements.org/vocab/InC/1.0/'], +} + +const okCtx = (items: 
unknown[]): ProviderContext => ({ + fetch: (async () => + new Response(JSON.stringify({ success: true, itemsCount: items.length, totalResults: items.length, items }), { status: 200 }) + ) as typeof fetch, +}) + +describe('europeana toReference', () => { + it('maps a CC0 image to an allowed reference with hotlink rehost policy', async () => { + const refs = await europeana({ apiKey: 'k' }).search({ text: 'fan', modalities: ['image'], limit: 5 }, okCtx([ITEM_CC0])) + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('image') + expect(r.title).toBe('A Painted Fan') + expect(r.rights.license).toBe('CC0-1.0') + expect(r.rights.rehostPolicy).toBe('hotlink-required') + expect(r.canonicalUrl).toBe('https://www.europeana.eu/item/2048128/europeana_fashion_12345') + expect(r.preview?.url).toBe('https://images.example.org/cc0-full.jpg') // from edmIsShownBy + expect(r.thumbnail?.url).toBe('https://api.europeana.eu/thumbnail/v3/200/cc0thumb.jpg') // from edmPreview + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('preserves the CC-BY-SA version and gates to allowed-with-attribution', async () => { + const refs = await europeana({ apiKey: 'k' }).search({ text: 'statue', modalities: ['image'] }, okCtx([ITEM_BY_SA])) + const r = refs[0] + expect(r.rights.license).toBe('CC-BY-SA') + expect(r.rights.licenseVersion).toBe('3.0') + expect(r.rights.rehostPolicy).toBe('hotlink-required') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed-with-attribution') + }) + + it('maps an in-copyright (InC) rights statement to proprietary → denied', async () => { + const refs = await europeana({ apiKey: 'k' }).search({ text: 'photo', modalities: ['image'] }, okCtx([ITEM_INC])) + const r = refs[0] + expect(r.rights.license).toBe('proprietary') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('denied') + }) + + it('maps NoC-US to PD scoped to the US (allowed by default; jurisdiction-aware 
callers gate)', async () => { + const nocUs = { ...ITEM_CC0, id: '/x/noc_us', rights: ['http://rightsstatements.org/vocab/NoC-US/1.0/'] } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([nocUs])) + const r = refs[0] + expect(r.rights.license).toBe('PD') + expect(r.rights.jurisdiction).toBe('US') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + // a caller whose jurisdiction differs from the source's is deferred to review: + expect(evaluateUse(r.rights, 'commercial-product', { userJurisdiction: 'DE' }).decision).toBe('needs-review') + }) + + it('drops non-IMAGE items and items with no usable media at all', async () => { + const sound = { ...ITEM_CC0, id: '/x/sound', type: 'SOUND' } + const noMedia = { ...ITEM_CC0, id: '/x/nomedia', edmIsShownBy: [], edmIsShownAt: [], edmPreview: [] } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([sound, noMedia, ITEM_CC0])) + expect(refs).toHaveLength(1) + expect(refs[0].canonicalUrl).toBe('https://www.europeana.eu/item/2048128/europeana_fashion_12345') + }) + + it('never uses edmIsShownAt (a landing page) as preview; keeps the item via its thumbnail', async () => { + // No media resource, only a landing PAGE + a Europeana thumbnail image. 
+ const pageOnly = { + ...ITEM_CC0, + id: '/x/page_only', + edmIsShownBy: [], + edmIsShownAt: ['https://www.rijksmuseum.nl/en/collection/SK-A-1'], // a web page, NOT an image + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/pagethumb.jpg'], + } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([pageOnly])) + expect(refs).toHaveLength(1) + expect(refs[0].preview).toBeUndefined() // the landing page is never surfaced as media + expect(refs[0].thumbnail?.url).toBe('https://api.europeana.eu/thumbnail/v3/200/pagethumb.jpg') + }) + + it('reads ebucoreHasMimeType for the preview media type when the URL has no extension', async () => { + const png = { + ...ITEM_CC0, + id: '/x/png', + edmIsShownBy: ['https://images.example.org/no-extension'], + ebucoreHasMimeType: ['image/png'], + } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([png])) + expect(refs[0].preview?.url).toBe('https://images.example.org/no-extension') + expect(refs[0].preview?.mediaType).toBe('image/png') + }) +}) +``` + +- [ ] **3.2: Run (expect FAIL — `europeana` factory / `toReference` not implemented)** + +```bash +pnpm --filter @refkit/provider-europeana test +``` + +Expected: FAIL. + +- [ ] **3.3: Implement `EuropeanaConfig`, the `first()` helper, and `toReference`** in `src/index.ts` (append after the mapper): + +```ts +export interface EuropeanaConfig { + /** Free BYOK Europeana API key (sent as the `wskey` query param). */ + apiKey: string +} + +interface EuropeanaItem { + id: string + type?: string + title?: string[] + dataProvider?: string[] + provider?: string[] + edmPreview?: string[] + edmIsShownBy?: string[] + edmIsShownAt?: string[] + /** MIME type of the media resource when the record declares it. 
*/ + ebucoreHasMimeType?: string[] + rights?: string[] +} +interface EuropeanaResponse { success?: boolean; items?: EuropeanaItem[] } + +/** First element of an array-typed Europeana field, or undefined. */ +function first<T>(arr: T[] | undefined): T | undefined { + return Array.isArray(arr) && arr.length > 0 ? arr[0] : undefined +} + +// edmIsShownBy is the MEDIA resource; edmIsShownAt is a LANDING PAGE (a web page, not +// an image) — it must never become preview.url. The record usually tells us the media +// type (ebucoreHasMimeType); otherwise fall back to a URL-string heuristic (no network — +// `core` never fetches bytes, and a probe would add a request per item). +const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i + +/** URL-string heuristic only (no network): does this look like an image resource? */ +function isLikelyImageUrl(url: string): boolean { + return IMAGE_EXT.test(url) || /iiif/i.test(url) || /\/thumbnail\//i.test(url) +} + +/** Best image mediaType: the declared MIME if it's an image, else inferred from the + * URL extension, else a safe default. */ +function imageMediaType(mime: string | undefined, url: string): string { + if (mime && mime.startsWith('image/')) return mime + const m = url.match(IMAGE_EXT) + if (m) { const e = m[1].toLowerCase(); return e === 'jpg' ? 'image/jpeg' : `image/${e === 'tif' ? 'tiff' : e}` } + return 'image/jpeg' +} + +function toReference(it: EuropeanaItem): Reference | null { + // v1 image-only scope (D1): defensively re-check type even though the search is + // server-filtered with qf=TYPE:IMAGE. + if (it.type && it.type !== 'IMAGE') return null + if (!it.id) return null + + // id is "/datasetId/recordId" (leading slash) → canonical Europeana item page. + const canonicalUrl = `https://www.europeana.eu/item${it.id}` + + // preview = the actual IMAGE media (edmIsShownBy) ONLY — NEVER edmIsShownAt, which is + // a landing web page. 
Trust edmIsShownBy when the record's MIME says image/*, or the + // URL looks like an image, or no MIME contradicts it (type is already IMAGE). thumbnail + // = edmPreview (Europeana's own thumbnail image service — reliable). Drop the item only + // when there is neither a usable preview nor a thumbnail (nothing visual to surface). + const shownBy = first(it.edmIsShownBy) + const mime = first(it.ebucoreHasMimeType) + const thumbUrl = first(it.edmPreview) + const previewUrl = shownBy && (mime?.startsWith('image/') || isLikelyImageUrl(shownBy) || !mime) + ? shownBy + : undefined + if (!previewUrl && !thumbUrl) return null + + const rightsUri = first(it.rights) ?? '' + const { license, version, jurisdiction } = mapEuropeanaRights(rightsUri) + + const rights: RightsRecord = { + license, + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + // jurisdiction-scoped PD (e.g. NoC-US → PD in the US); metadata for evaluateUse. + ...(jurisdiction ? { jurisdiction } : {}), + author: first(it.dataProvider) ?? first(it.provider) ?? undefined, + // D6: media is hotlinked from data providers — caching/rehosting not permitted. + rehostPolicy: 'hotlink-required', + raw: { sourceTerms: rightsUri || 'https://www.europeana.eu/rights', sourceUrl: canonicalUrl }, + } + return { + id: referenceId('europeana', canonicalUrl), + modality: 'image', + title: first(it.title) || undefined, + source: { providerId: 'europeana', sourceUrl: canonicalUrl }, + canonicalUrl, + rights, + verifiedAt: new Date().toISOString(), + ...(thumbUrl ? { thumbnail: { url: thumbUrl } } : {}), + ...(previewUrl ? { preview: { url: previewUrl, mediaType: imageMediaType(mime, previewUrl) } } : {}), + relevance: 0, + raw: it, + } +} +``` + +- [ ] **3.4: Run (expect PASS for the mapper + the `toReference` blocks; `europeana()` exists now)** + +```bash +pnpm --filter @refkit/provider-europeana test +``` + +Expected: PASS for Tasks 2 & 3 tests. 
(These tests import and call `europeana()`, which Task 3.3 does not define: implement Task 4.3's factory + `search` in the same pass, or add a minimal stub now and fill it in during Task 4. If you implement it fully here, step 4.2 will already pass instead of failing.) + +- [ ] **3.5: Commit** + +```bash +git add -A && git commit -m "feat(provider-europeana): toReference mapper (image-only, hotlink rehost)" +``` + +--- + +## Task 4: `search` + provider factory (TDD) + +- [ ] **4.1: Add a failing search-param-forwarding test** to `europeana.test.ts`: + +```ts +describe('europeana search request', () => { + it('sets wskey, query, rows, and the image/media filters', async () => { + let url = '' + const ctx: ProviderContext = { + fetch: (async (input: Parameters<typeof fetch>[0]) => { + url = String(input) + return new Response(JSON.stringify({ success: true, items: [] }), { status: 200 }) + }) as typeof fetch, + } + await europeana({ apiKey: 'my-key' }).search({ text: 'tulips', modalities: ['image'], limit: 7 }, ctx) + const u = new URL(url) + expect(u.searchParams.get('wskey')).toBe('my-key') + expect(u.searchParams.get('query')).toBe('tulips') + expect(u.searchParams.get('rows')).toBe('7') + expect(u.searchParams.get('media')).toBe('true') + expect(u.searchParams.get('qf')).toBe('TYPE:IMAGE') + }) + + it('returns [] when the API yields no items', async () => { + const ctx: ProviderContext = { + fetch: (async () => new Response(JSON.stringify({ success: true, items: [] }), { status: 200 })) as typeof fetch, + } + expect(await europeana({ apiKey: 'k' }).search({ text: 'zzz', modalities: ['image'] }, ctx)).toEqual([]) + }) + + it('throws on a non-ok HTTP status', async () => { + const ctx: ProviderContext = { + fetch: (async () => new Response('forbidden', { status: 401 })) as typeof fetch, + } + await expect(europeana({ apiKey: 'bad' }).search({ text: 'x', modalities: ['image'] }, ctx)).rejects.toThrow(/europeana search failed: 401/) + }) +}) +``` + +- [ ] **4.2: Run (expect FAIL — `search` not yet wired / asserts unmet)** + +```bash +pnpm --filter @refkit/provider-europeana
test +``` + +Expected: FAIL. + +- [ ] **4.3: Implement the `europeana` factory + `search`** in `src/index.ts` (append): + +```ts +export function europeana(config: EuropeanaConfig) { + return defineProvider({ + id: 'europeana', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(BASE) + url.searchParams.set('wskey', config.apiKey) + url.searchParams.set('query', q.text) + url.searchParams.set('rows', String(q.limit ?? 20)) + url.searchParams.set('media', 'true') // only items that actually carry media + url.searchParams.set('qf', 'TYPE:IMAGE') // v1 image-only scope (D1) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`europeana search failed: ${res.status}`) + const json = (await res.json()) as EuropeanaResponse + if (!json.items || json.items.length === 0) return [] + return json.items + .map(toReference) + .filter((r): r is Reference => r !== null) + }, + }) +} +``` + +- [ ] **4.4: Run (expect PASS — all describe blocks)** + +```bash +pnpm --filter @refkit/provider-europeana test +``` + +Expected: PASS (mapper + toReference + search). Also typecheck the package: + +```bash +pnpm --filter @refkit/provider-europeana typecheck +``` + +Expected: clean. + +- [ ] **4.5: Commit** + +```bash +git add -A && git commit -m "feat(provider-europeana): search + factory wiring" +``` + +--- + +## Final Task: Central wiring + +- [ ] **F.1: Execute Shared Task S9** (see `2026-06-29-p1-providers-index.md` → "Shared Task S9 — Central wiring") with these substitutions: + + | placeholder | value | + |---|---| + | `<id>` | `europeana` | + | `<Fn>` | `europeana` | + | `<Title>` | `Europeana` | + | `<modality>` | `image` | + | `<auth>` | `API key` | + | `<licenseCol>` | `per-item CC / PD / rights-statement` | + | `<ENVVAR>` | `EUROPEANA_KEY` | + + Europeana is **BYOK** (not keyless). 
Concretely: + + - **S9.3 (`packages/mcp/src/cli.ts`):** add `import { europeana } from '@refkit/provider-europeana'`, and after the existing BYOK block add: + ```ts + if (env.EUROPEANA_KEY) providers.push(europeana({ apiKey: env.EUROPEANA_KEY })) + ``` + - **S9.4 (`packages/mcp/src/__tests__/mcp.test.ts`, `describe('defaultProviders'…)`):** mirror the unsplash BYOK gate: + ```ts + it('adds europeana only when EUROPEANA_KEY is present', () => { + expect(defaultProviders({}).map(p => p.id)).not.toContain('europeana') + expect(defaultProviders({ EUROPEANA_KEY: 'k' }).map(p => p.id)).toContain('europeana') + }) + ``` + - **S9.5:** add `"@refkit/provider-europeana": "workspace:*"` to `packages/mcp/package.json`. + - **S9.1 / S9.2 / S9.6 / S9.7 / S9.8:** vitest project list, root README table row, changeset, full-repo green check, final commit — per S9. + +- [ ] **F.2: Final verification** (S9.7) + +```bash +pnpm install && pnpm -r typecheck && pnpm test:run +``` + +Expected: typecheck clean; every vitest project (including `provider-europeana`) passes. + +--- + +## Self-Review + +1. **Decisions applied:** D1 (image-only v1), D5-style (rights-vocab URI → family), D6 (hotlink rehost + the rights-mapping the index files under D6), D7 (CC version from URL), D8 (preview from edmIsShownBy only, never edmIsShownAt) — all confirmed in Task 1.1. +2. **Reference validity:** every emitted `Reference` has `id, modality, source{providerId,sourceUrl}, canonicalUrl, rights, verifiedAt, relevance`; every `RightsRecord` has `license, rehostPolicy: 'hotlink-required', raw{sourceTerms,sourceUrl}`; `licenseVersion` only for CC-BY/CC-BY-SA. +3. **Faithful, conservative mapping:** CC NC/ND → `proprietary`; rightsstatements InC* → `proprietary`, NoC-US → `PD`+`jurisdiction:'US'`, NoC-NC → `proprietary`, opaque/undetermined (NoC-OKLR/CR, CNE, UND, NKC) + empty/unrecognized → `unknown` (→ `needs-review`). No fabricated open license; no permissive mapping of a restricted/in-copyright statement. 
+4. **Array safety:** every metadata field read through `first()`; `id`/`type` read as scalars. +5. **Follow-up flagged:** SOUND/VIDEO/TEXT modalities are out of scope for v1 and noted in the README. +6. **No web page as media:** `preview` is sourced only from `edmIsShownBy` (never the `edmIsShownAt` landing page); `thumbnail` from `edmPreview`; `mediaType` from `ebucoreHasMimeType`/extension; items with no usable image or thumbnail are dropped. Tested by the edmIsShownAt-page and MIME cases. +``` diff --git a/docs/superpowers/plans/2026-06-29-provider-freesound.md b/docs/superpowers/plans/2026-06-29-provider-freesound.md new file mode 100644 index 0000000..6e78a9e --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-provider-freesound.md @@ -0,0 +1,361 @@ +# Freesound Provider Implementation Plan + +> **For agentic workers:** Implement this plan task-by-task using superpowers:subagent-driven-development (recommended) or superpowers:executing-plans. Steps use checkbox (`- [ ]`) syntax. This plan is a satellite under the shared skeleton in [`2026-06-29-p1-providers-index.md`](./2026-06-29-p1-providers-index.md) — read that index first. It defines **Shared Task S0** (package scaffold) and **Shared Task S9** (central wiring); this plan references them with substitution rows rather than repeating the boilerplate. Closest code template: `packages/provider-openverse/src/index.ts` (its `openverseAudio`/`toAudioReference` AUDIO leg) plus `packages/provider-flickr/src/index.ts` (BYOK config + exported license mapper). + +**Goal:** Add `@refkit/provider-freesound` — search [Freesound](https://freesound.org) for CC/CC0 sound effects and clips, emitted as license-normalized `audio` `Reference`s. BYOK (Freesound API token). + +**Architecture:** A thin satellite depending only on `@refkit/core`. The `freesound(config)` factory returns `defineProvider({ id: 'freesound', modalities: ['audio'], queryFeatures: ['keyword'], capabilities, search })`. 
`search` hits the Freesound APIv2 text-search endpoint via `ctx.fetch`, requests an explicit `fields=` set, maps each result's source-declared `license` to a `LicenseId` via `mapFreesoundLicense`, and builds an `audio` `Reference` via `toAudioReference` (preview = the hq-mp3 preview URL, `mediaType: 'audio/mpeg'`). Permissions are never stored — they derive from `license` via core's `factsFor()`/`evaluateUse()`. + +**Tech Stack:** TypeScript (ESM, `"type": "module"`), tsup (build), vitest (test), zod (via core), pnpm workspaces, changesets. + +--- + +## API reference (web-verified 2026-06-29) + +- **Endpoint:** `GET https://freesound.org/apiv2/search/text/?query=<q>&token=<API_KEY>&fields=<csv>` +- **Auth:** token. Two equivalent forms documented — query param `&token=YOUR_API_KEY` **or** header `Authorization: Token YOUR_API_KEY`. We use the **query-token** form for simplicity (matches other BYOK providers' "key in the request" convention). Config holds `apiKey`. +- **Response shape:** `{ count, next, previous, results: FreesoundResult[] }`. Request fields explicitly via `fields=id,name,license,username,previews,url,duration,filesize,tags` (default fields are sparse — `previews` and `license` must be requested). +- **`license` field — handle BOTH forms.** Official docs describe a **plain CC name string** (`"Attribution"`, `"Attribution NonCommercial"`, `"Creative Commons 0"`). In practice Freesound has also returned **CC deed URLs** (e.g. `http://creativecommons.org/licenses/by/4.0/`, `http://creativecommons.org/publicdomain/zero/1.0/`) and other name variants (`"Attribution Noncommercial"` casing, `"Sampling+"`, `"Attribution Sampling+"`). `mapFreesoundLicense` must accept either a deed URL or a name string. **D4** (name string → family `LicenseId`, omit `licenseVersion`) is the primary path; **D7** (extract CC version from a deed URL for CC-BY/CC-BY-SA) applies only when a URL form is seen. Unrecognized → `unknown`. 
+- **`previews` object keys:** `preview-hq-mp3`, `preview-lq-mp3`, `preview-hq-ogg`, `preview-lq-ogg`. We use `preview-hq-mp3` (`audio/mpeg`). + +--- + +## Task 1: Decisions & scaffold + +- [ ] **1.1: Confirm applicable cross-cutting decisions.** This provider applies **D4** (Freesound returns a CC name/short string with no reliable version → map name → family `LicenseId`, **omit `licenseVersion`**; unrecognized name → `unknown`) and, defensively, **D7** (if the `license` value is a CC deed *URL* instead, extract the version via `/\/licenses\/by(?:-sa)?\/(\d\.\d)\//` and set `licenseVersion` for `CC-BY`/`CC-BY-SA` only). Conservative/strict-deny throughout: noncommercial/sampling/unrecognized → `proprietary` or `unknown`, never fabricated as free. + +- [ ] **1.2: Execute Shared Task S0** (see index) with this substitution row: + + | placeholder | value | + |---|---| + | `<id>` | `freesound` | + | `<Fn>` | `freesound` | + | `<Title>` | `Freesound` | + | `<modality>` | `audio` | + | `<auth>` | `API key` | + | `<licenseCol>` | `per-item CC / CC0` | + + This produces `packages/provider-freesound/` with `package.json`, `tsconfig.json`, `tsup.config.ts`, `vitest.config.ts`, `LICENSE`, `README.md`, and a `pnpm install`. Do not commit yet — bundle with Task 2's first green commit. + +--- + +## Task 2: License mapper (`mapFreesoundLicense`) — TDD + +- [ ] **2.1: Write the failing test.** Create `packages/provider-freesound/src/__tests__/freesound.test.ts` with a `describe('mapFreesoundLicense')` block. Mirror the exported-mapper test style from `provider-flickr`'s `mapFlickrLicense` tests. Run it — it MUST FAIL (module/symbol does not exist yet). 
+ + ```ts + import { describe, expect, it } from 'vitest' + import { mapFreesoundLicense } from '../index' + + describe('mapFreesoundLicense', () => { + it('maps CC name strings (D4 — no version)', () => { + expect(mapFreesoundLicense('Attribution')).toEqual({ license: 'CC-BY' }) + expect(mapFreesoundLicense('Attribution NonCommercial')).toEqual({ license: 'proprietary' }) + expect(mapFreesoundLicense('Attribution Noncommercial')).toEqual({ license: 'proprietary' }) + expect(mapFreesoundLicense('Creative Commons 0')).toEqual({ license: 'CC0-1.0' }) + expect(mapFreesoundLicense('Sampling+')).toEqual({ license: 'proprietary' }) + expect(mapFreesoundLicense('Attribution Sampling+')).toEqual({ license: 'proprietary' }) + }) + + it('maps CC deed URLs and extracts version for BY/BY-SA (D7)', () => { + expect(mapFreesoundLicense('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapFreesoundLicense('http://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + expect(mapFreesoundLicense('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapFreesoundLicense('http://creativecommons.org/licenses/by-nc/3.0/')).toEqual({ license: 'proprietary' }) + }) + + it('returns unknown for anything unrecognized', () => { + expect(mapFreesoundLicense('Weird Custom License')).toEqual({ license: 'unknown' }) + expect(mapFreesoundLicense('')).toEqual({ license: 'unknown' }) + }) + }) + ``` + + Run: `pnpm --filter @refkit/provider-freesound test` + Expected: **FAIL** (`mapFreesoundLicense` is not exported / file `src/index.ts` has no such symbol). + +- [ ] **2.2: Implement `mapFreesoundLicense` in `src/index.ts`.** Start the module with the core imports and the mapper. The mapper handles the URL form first (D7), then the name-string form (D4), then falls through to `unknown`. 
+ + ```ts + import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, + } from '@refkit/core' + + // Freesound's `license` is usually a CC NAME string ("Attribution", "Creative + // Commons 0") but has historically also been a CC DEED URL. Handle both. + // D4: name → family LicenseId, no version. D7: URL → family (+ version for BY/BY-SA). + // Conservative: noncommercial / sampling / unrecognized → proprietary or unknown. + const FREESOUND_NAME_LICENSE: Record<string, { license: LicenseId }> = { + 'attribution': { license: 'CC-BY' }, + 'attribution noncommercial': { license: 'proprietary' }, // NC → not commercially usable + 'creative commons 0': { license: 'CC0-1.0' }, + 'sampling+': { license: 'proprietary' }, // bespoke CC sampling licence, not a clean free grant + 'attribution sampling+': { license: 'proprietary' }, + } + + /** Map a Freesound `license` value (CC name string OR CC deed URL) to our + * license + optional CC version. Unrecognized → `unknown` (strict-deny). */ + export function mapFreesoundLicense(value: string): { license: LicenseId; version?: string } { + const v = (value ?? '').trim() + if (!v) return { license: 'unknown' } + + // D7 — deed URL form + if (/^https?:\/\//i.test(v)) { + if (/\/publicdomain\/zero\//i.test(v)) return { license: 'CC0-1.0' } + const m = v.match(/\/licenses\/(by(?:-sa)?|by-nc[a-z-]*|by-nd[a-z-]*)\/(\d\.\d)\//i) + if (m) { + const fam = m[1].toLowerCase() + const version = m[2] + if (fam === 'by') return { license: 'CC-BY', version } + if (fam === 'by-sa') return { license: 'CC-BY-SA', version } + return { license: 'proprietary' } // any NC/ND variant + } + return { license: 'unknown' } + } + + // D4 — name string form (case-insensitive) + return FREESOUND_NAME_LICENSE[v.toLowerCase()] ?? 
{ license: 'unknown' } + } + ``` + + Run: `pnpm --filter @refkit/provider-freesound test` + Expected: **PASS** (the `mapFreesoundLicense` describe block is green). + +- [ ] **2.3: Commit.** `git add -A && git commit -m "feat(provider-freesound): scaffold + license mapper"` (this folds in the Task 1 scaffold). + +--- + +## Task 3: `toAudioReference` + `FreesoundConfig` + `search` — TDD + +- [ ] **3.1: Write the failing test.** Extend `freesound.test.ts` with a `describe('freesound provider')` block that mocks `ctx.fetch` with a realistic Freesound text-search JSON body (mirror `provider-met`'s `ctxRouting` style and the `evaluateUse` import). The mock must return a `results[]` array with each license case. Run it — MUST FAIL (`freesound` factory not exported yet, `toAudioReference`/`search` not implemented). + + ```ts + import { evaluateUse, type ProviderContext } from '@refkit/core' + import { freesound } from '../index' + + const ctxJson = (body: unknown, capture?: (url: string) => void): ProviderContext => ({ + fetch: (async (input: string) => { + capture?.(String(input)) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, + }) + + const RESULTS = { + count: 4, next: null, previous: null, + results: [ + { id: 1, name: 'Door creak', license: 'Attribution', username: 'alice', + url: 'https://freesound.org/people/alice/sounds/1/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/1/1_hq.mp3', 'preview-lq-mp3': 'https://cdn.freesound.org/previews/1/1_lq.mp3' }, + duration: 2.5, filesize: 41000, tags: ['door', 'creak'] }, + { id: 2, name: 'Loop NC', license: 'Attribution NonCommercial', username: 'bob', + url: 'https://freesound.org/people/bob/sounds/2/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/2/2_hq.mp3' }, duration: 5, filesize: 80000, tags: [] }, + { id: 3, name: 'Public bell', license: 'Creative Commons 0', username: 'carol', + url: 'https://freesound.org/people/carol/sounds/3/', + 
previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/3/3_hq.mp3' }, duration: 1, filesize: 16000, tags: [] }, + { id: 4, name: 'Mystery', license: 'Weird Custom License', username: 'dave', + url: 'https://freesound.org/people/dave/sounds/4/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/4/4_hq.mp3' }, duration: 3, filesize: 48000, tags: [] }, + ], + } + + describe('freesound provider', () => { + it('maps each license family to audio references', async () => { + const refs = await freesound({ apiKey: 'k' }).search({ text: 'door', modalities: ['audio'], limit: 10 }, ctxJson(RESULTS)) + expect(refs).toHaveLength(4) + const byId = Object.fromEntries(refs.map(r => [r.canonicalUrl, r])) + + const cc = byId['https://freesound.org/people/alice/sounds/1/'] + expect(cc.modality).toBe('audio') + expect(cc.rights.license).toBe('CC-BY') + expect(cc.rights.author).toBe('alice') + expect(cc.preview?.url).toBe('https://cdn.freesound.org/previews/1/1_hq.mp3') + expect(cc.preview?.mediaType).toBe('audio/mpeg') + + const nc = byId['https://freesound.org/people/bob/sounds/2/'] + expect(nc.rights.license).toBe('proprietary') + expect(evaluateUse(nc.rights, 'commercial-product').decision).toBe('denied') + + const cc0 = byId['https://freesound.org/people/carol/sounds/3/'] + expect(cc0.rights.license).toBe('CC0-1.0') + expect(evaluateUse(cc0.rights, 'commercial-product').decision).toBe('allowed') + + const unk = byId['https://freesound.org/people/dave/sounds/4/'] + expect(unk.rights.license).toBe('unknown') + expect(evaluateUse(unk.rights, 'commercial-product').decision).toBe('needs-review') + }) + + it('forwards query, token, and fields; respects limit', async () => { + let url = '' + await freesound({ apiKey: 'secret' }).search( + { text: 'rain', modalities: ['audio'], limit: 7, providerOptions: { sort: 'rating_desc', filter: 'duration:[1 TO 10]' } }, + ctxJson(RESULTS, u => { url = u }), + ) + const u = new URL(url) + 
expect(u.pathname).toBe('/apiv2/search/text/') + expect(u.searchParams.get('query')).toBe('rain') + expect(u.searchParams.get('token')).toBe('secret') + expect(u.searchParams.get('fields')).toContain('previews') + expect(u.searchParams.get('fields')).toContain('license') + expect(u.searchParams.get('page_size')).toBe('7') + expect(u.searchParams.get('sort')).toBe('rating_desc') + expect(u.searchParams.get('filter')).toBe('duration:[1 TO 10]') + }) + }) + ``` + + Run: `pnpm --filter @refkit/provider-freesound test` + Expected: **FAIL** (`freesound` factory undefined; only the mapper block passes). + +- [ ] **3.2: Implement `FreesoundConfig`, `FreesoundSearchOptions`, `toAudioReference`, and the `freesound` factory** in `src/index.ts` (append below the mapper). Model `toAudioReference` on openverse's `toAudioReference` (audio modality, preview `{url, mediaType}`, no `visual`). Audio has no thumbnail image — omit `thumbnail` (Freesound has no waveform field in the basic search response). + + ```ts + export interface FreesoundConfig { + /** Freesound APIv2 token (https://freesound.org/apiv2/apply). Passed as the + * `token` query param. The `Authorization: Token <key>` header is the documented + * equivalent if a future need arises. */ + apiKey: string + } + + export interface FreesoundSearchOptions { + /** Freesound `sort` (e.g. 'score', 'rating_desc', 'downloads_desc', 'created_desc'). */ + sort?: string + /** Freesound `filter` query (field-scoped Solr-style filter, e.g. 'duration:[1 TO 10]'). */ + filter?: string + page?: number + pageSize?: number + } + + const BASE = 'https://freesound.org/apiv2/search/text/' + // Fields must be requested explicitly — default search responses omit previews/license. 
+ const FIELDS = 'id,name,license,username,previews,url,duration,filesize,tags' + + interface FreesoundResult { + id: number + name: string + license: string + username?: string + url: string + previews?: Record<string, string> + duration?: number + filesize?: number + tags?: string[] + } + interface FreesoundResponse { count: number; results: FreesoundResult[] } + + function toAudioReference(r: FreesoundResult): Reference { + const { license, version } = mapFreesoundLicense(r.license) + const canonicalUrl = r.url + const rights: RightsRecord = { + license, + // version only ever set when the license arrived as a CC deed URL (D7); D4 omits it. + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + author: r.username || undefined, + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: 'https://freesound.org/help/tos_api/', sourceUrl: canonicalUrl }, + } + const previewUrl = r.previews?.['preview-hq-mp3'] ?? r.previews?.['preview-lq-mp3'] + return { + id: referenceId('freesound', canonicalUrl), + modality: 'audio', + title: r.name || undefined, + source: { providerId: 'freesound', sourceUrl: canonicalUrl }, + canonicalUrl, + rights, + verifiedAt: new Date().toISOString(), + ...(previewUrl ? 
{ preview: { url: previewUrl, mediaType: 'audio/mpeg' } } : {}), + relevance: 0, // mergeReferences assigns the final RRF relevance + raw: r, + } + } + + function setIfString(url: URL, key: string, value: unknown) { + if (typeof value !== 'string' || !value) return + url.searchParams.set(key, value) + } + + function setIfPositiveInt(url: URL, key: string, value: unknown) { + if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return + url.searchParams.set(key, String(value)) + } + + export function freesound(config: FreesoundConfig) { + return defineProvider({ + id: 'freesound', + modalities: ['audio'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const opts = q.providerOptions as FreesoundSearchOptions | undefined + const url = new URL(BASE) + url.searchParams.set('query', q.text) + url.searchParams.set('token', config.apiKey) + url.searchParams.set('fields', FIELDS) + url.searchParams.set('page_size', String(opts?.pageSize ?? q.limit ?? 20)) + setIfString(url, 'sort', opts?.sort) + setIfString(url, 'filter', opts?.filter) + setIfPositiveInt(url, 'page', opts?.page) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`freesound search failed: ${res.status}`) + const json = (await res.json()) as FreesoundResponse + if (!json.results) return [] + return json.results.map(toAudioReference) + }, + }) + } + ``` + + Run: `pnpm --filter @refkit/provider-freesound test` + Expected: **PASS** (both describe blocks green). + +- [ ] **3.3: Typecheck the package.** + Run: `pnpm --filter @refkit/provider-freesound typecheck` + Expected: clean. 
+ +- [ ] **3.4: Commit.** `git add -A && git commit -m "feat(provider-freesound): audio search + reference mapper"` + +--- + +## Final Task: Central wiring + +- [ ] **F.1: Execute Shared Task S9** (see index) with this substitution row: + + | placeholder | value | + |---|---| + | `<id>` | `freesound` | + | `<Fn>` | `freesound` | + | `<Title>` | `Freesound` | + | `<modality>` | `audio` | + | `<auth>` | `API key` | + | `<licenseCol>` | `per-item CC / CC0` | + | `<ENVVAR>` | `FREESOUND_TOKEN` | + + Freesound is **BYOK**, so: + - **S9.3 (cli.ts):** add `import { freesound } from '@refkit/provider-freesound'` and, in the BYOK block of `defaultProviders` in `packages/mcp/src/cli.ts`, append: + ```ts + if (env.FREESOUND_TOKEN) providers.push(freesound({ apiKey: env.FREESOUND_TOKEN })) + ``` + - **S9.4 (mcp.test.ts):** in `describe('defaultProviders'…)`, add a BYOK-gating assertion mirroring the unsplash one: + ```ts + expect(defaultProviders({}).map(p => p.id)).not.toContain('freesound') + expect(defaultProviders({ FREESOUND_TOKEN: 'k' }).map(p => p.id)).toContain('freesound') + ``` + - **S9.5:** add `"@refkit/provider-freesound": "workspace:*"` to `packages/mcp/package.json` deps. + - **S9.1 / S9.2 / S9.6:** root `vitest.config.ts` projects array, root `README.md` provider table row, and `.changeset/provider-freesound.md` per the index template. + +- [ ] **F.2: Verify the whole repo green (S9.7).** + Run: `pnpm install && pnpm -r typecheck && pnpm test:run` + Expected: typecheck clean; all vitest projects (including `provider-freesound`) pass. + +- [ ] **F.3: Commit (S9.8).** `git add -A && git commit -m "feat(provider-freesound): Freesound satellite (P1)"` + +--- + +## Self-Review + +1. **Decision coverage:** Task 1.1 states D4 (primary, name string, no version) and D7 (defensive, deed URL → version for BY/BY-SA) apply. +2. 
**Type consistency:** `toAudioReference` emits a valid `Reference` (`id, modality:'audio', source, canonicalUrl, rights, verifiedAt, relevance`) and a valid `RightsRecord` (`license, rehostPolicy:'cache-allowed', raw{sourceTerms,sourceUrl}`); `licenseVersion` only set for CC-BY/CC-BY-SA (and only when a deed URL supplied one). +3. **Strict-deny intact:** NC/sampling → `proprietary` → `denied` for commercial; unrecognized/empty → `unknown` → `needs-review`. No fabricated free licenses. +4. **API form documented:** query-token auth chosen; header form noted as equivalent. `fields=` requested explicitly so `previews`/`license` are present. diff --git a/docs/superpowers/plans/2026-06-29-provider-helpers-refactor.md b/docs/superpowers/plans/2026-06-29-provider-helpers-refactor.md new file mode 100644 index 0000000..66608b0 --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-provider-helpers-refactor.md @@ -0,0 +1,438 @@ +# Provider Helpers in Core — Refactor Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax. This is a **behavior-preserving refactor** — the safety net is: every provider's existing test suite must stay green after its retrofit. Never change a provider's request output or mapping behavior; only its source of helper functions. + +**Goal:** Eliminate the duplicated per-provider URL/license/image helpers by centralizing the shared, stable ones in `@refkit/core`, then retrofit **all** providers that use them to import them. Of the 12 existing provider packages, **11 are retrofitted** — `provider-poetrydb` is excluded because it builds path-based URLs and defines no `setIf*`/searchParams query setters. Plus the 6 Phase-5 plans. `setIfString` alone is currently copy-pasted into 11 providers; the CC-deed-URL→license regex into ~5 of the new plans. 
+ +**Architecture:** A new pure-function module `packages/core/src/provider-helpers.ts`, re-exported from `@refkit/core`'s public API. It joins `defineProvider`/`referenceId` (provider-authoring helpers that already live in core), so providers gain these utilities with **zero new dependency** — they already depend only on `@refkit/core`. A separate `@refkit/provider-utils` package was rejected: providers must depend on core regardless (for `defineProvider`, `referenceId`, `Reference`, `RightsRecord`, `LicenseId`, `NormalizedQuery`, `ProviderContext`), and the shared surface is small (~120 LOC), so a new package would only add a publish/versioning layer (see [[provider-roadmap]] discussion). + +**Tech Stack:** TypeScript (ESM), vitest, zod (already in core). No new dependencies. + +**Scope decision:** centralize ONLY the low-divergence, broadly-shared helpers. Genuinely one-off or behavior-divergent helpers **stay local** (enumerated in Task 3). When unsure, leave it local — over-centralizing helpers that legitimately differ per source is worse than a little duplication. + +--- + +## Divergences found (must be respected — do not flatten blindly) + +A survey of the 11 retrofitted providers (all 12 except path-based `provider-poetrydb`) found these real differences the refactor must preserve: + +- **`setIfString` empty-string handling — FOUR providers skip-empty-divergent, not one.** `flickr`, `unsplash`, `pexels`, AND `pixabay` all use `if (typeof value !== 'string') return` (no `|| !value`), so they emit `key=` for `''`; the rest skip empty. The canonical helper **skips empty** (the majority, safer behavior). For all four, an empty `''` providerOption value previously produced an empty query param and now produces none — a deliberate, safe improvement (empty escape-hatch params are meaningless), but it IS a behavior change. 
Each of the four must run its tests after the swap; if any genuinely depends on emitting an empty param (none is expected to), keep that provider's local variant. +- **Boolean encoding:** most use `String(value)` → `"true"/"false"`; flickr's `setBooleanFlag` uses `"1"/"0"`. Canonical `setIfBoolean` = `String(value)`. **flickr keeps `setBooleanFlag` local.** +- **String-list separator + empty edges:** most join with `,`; jamendo (Phase-5) joins tags with a space. Canonical `setIfStringList` takes an optional `separator` (default `,`). It also **skips empty arrays and empty-string elements**, whereas some local variants (artic/flickr `setStringList`) would emit `key=''` for `[]` or accept `['']`. This edge is behavior-divergent for those rows (flagged in the table); real call sites never pass `[]`/`['']`, so it is safe, but note it. +- **Int helpers — reject vs CLAMP is a real behavior split, not just a signature rename.** Four names (`setIfInt`, `setIfPositiveInt`, `setIfNonNegativeInt`, `setIfNumber`) and two `max` behaviors: `flickr`'s `setIfInt` **rejects** when `value > max`; but `pexels`/`unsplash`/`brave`/`smithsonian`/`wikimedia-commons`/`pixabay` **CLAMP** via `Math.min(value, max)`. The canonical helpers default to **reject** and take a `clamp: true` option to clamp instead. Retrofit must pass `{ max, clamp: true }` at the six clamping call sites to preserve behavior; rejecting call sites pass `{ max }`. Also unify the `max` convention onto the `{min?,max?}` bag (call sites passing a positional `max` must be updated). **Without `clamp: true` this swap would silently turn a too-large `per_page` from "clamped to the cap" into "dropped → API default" — and no existing test catches it because all fixtures are within range.** + +--- + +## Task 1: Create `packages/core/src/provider-helpers.ts` (TDD red) + +- [ ] **1.1: Write `packages/core/src/__tests__/provider-helpers.test.ts` first.** Cover each canonical helper, including the divergence edge cases above. 
+ +```ts +import { describe, expect, it } from 'vitest' +import { + setIfString, setIfBoolean, setIfStringList, + setIfInt, setIfPositiveInt, setIfNonNegativeInt, setIfNumber, + first, mapCcDeedUrl, mapRightsUrl, isLikelyImageUrl, imageMediaType, +} from '../provider-helpers' + +const params = (fn: (u: URL) => void) => { const u = new URL('https://x.test/'); fn(u); return u.searchParams } + +describe('setIfString', () => { + it('sets a non-empty string; skips non-strings and empty', () => { + expect(params(u => setIfString(u, 'a', 'x')).get('a')).toBe('x') + expect(params(u => setIfString(u, 'a', '')).get('a')).toBeNull() + expect(params(u => setIfString(u, 'a', 5)).get('a')).toBeNull() + }) + it('honors an allowlist', () => { + expect(params(u => setIfString(u, 'a', 'no', ['yes'])).get('a')).toBeNull() + expect(params(u => setIfString(u, 'a', 'yes', ['yes'])).get('a')).toBe('yes') + }) +}) + +describe('setIfBoolean', () => { + it('encodes as true/false strings; skips non-booleans', () => { + expect(params(u => setIfBoolean(u, 'b', true)).get('b')).toBe('true') + expect(params(u => setIfBoolean(u, 'b', false)).get('b')).toBe('false') + expect(params(u => setIfBoolean(u, 'b', 'true')).get('b')).toBeNull() + }) +}) + +describe('setIfStringList', () => { + it('joins arrays (default comma), accepts a string, supports a custom separator + allowlist', () => { + expect(params(u => setIfStringList(u, 't', ['a', 'b'])).get('t')).toBe('a,b') + expect(params(u => setIfStringList(u, 't', 'solo')).get('t')).toBe('solo') + expect(params(u => setIfStringList(u, 't', ['a', 'b'], { separator: ' ' })).get('t')).toBe('a b') + expect(params(u => setIfStringList(u, 't', ['a', 'x'], { allowed: ['a', 'b'] })).get('t')).toBeNull() + expect(params(u => setIfStringList(u, 't', [])).get('t')).toBeNull() + }) +}) + +describe('int/number setters', () => { + it('setIfInt respects min/max and integer-ness', () => { + expect(params(u => setIfInt(u, 'n', 5)).get('n')).toBe('5') + expect(params(u 
=> setIfInt(u, 'n', 5.5)).get('n')).toBeNull() + expect(params(u => setIfInt(u, 'n', 0, { min: 1 })).get('n')).toBeNull() + expect(params(u => setIfInt(u, 'n', 999, { max: 100 })).get('n')).toBeNull() + }) + it('setIfPositiveInt defaults to min 1; setIfNonNegativeInt to min 0', () => { + expect(params(u => setIfPositiveInt(u, 'p', 0)).get('p')).toBeNull() + expect(params(u => setIfPositiveInt(u, 'p', 1)).get('p')).toBe('1') + expect(params(u => setIfPositiveInt(u, 'p', 999, { max: 500 })).get('p')).toBeNull() + expect(params(u => setIfNonNegativeInt(u, 'q', 0)).get('q')).toBe('0') + expect(params(u => setIfNonNegativeInt(u, 'q', -1)).get('q')).toBeNull() + }) + it('clamp:true clamps to max instead of rejecting (preserves provider Math.min behavior)', () => { + expect(params(u => setIfInt(u, 'n', 999, { max: 100, clamp: true })).get('n')).toBe('100') + expect(params(u => setIfPositiveInt(u, 'p', 999, { max: 500, clamp: true })).get('p')).toBe('500') + expect(params(u => setIfPositiveInt(u, 'p', 0, { max: 500, clamp: true })).get('p')).toBeNull() // min floor still rejects + expect(params(u => setIfNonNegativeInt(u, 'q', 999, { max: 200, clamp: true })).get('q')).toBe('200') + }) + it('setIfNumber allows non-integers', () => { + expect(params(u => setIfNumber(u, 'f', 1.5, { min: 0, max: 10 })).get('f')).toBe('1.5') + expect(params(u => setIfNumber(u, 'f', 20, { max: 10 })).get('f')).toBeNull() + }) +}) + +describe('first', () => { + it('returns the first element or undefined', () => { + expect(first(['a', 'b'])).toBe('a') + expect(first([])).toBeUndefined() + expect(first(undefined)).toBeUndefined() + }) +}) + +describe('mapCcDeedUrl', () => { + it('maps PD/CC0, BY/BY-SA (+version), NC/ND → proprietary, else unknown', () => { + expect(mapCcDeedUrl('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapCcDeedUrl('https://creativecommons.org/publicdomain/mark/1.0/')).toEqual({ license: 'PD' }) + 
expect(mapCcDeedUrl('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapCcDeedUrl('http://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + expect(mapCcDeedUrl('http://creativecommons.org/licenses/by-nc-nd/3.0/')).toEqual({ license: 'proprietary' }) + expect(mapCcDeedUrl('http://creativecommons.org/licenses/by-nd/4.0/')).toEqual({ license: 'proprietary' }) + // mapCcDeedUrl is CC-only — a rightsstatements URL has no CC pattern → unknown here + // (the faithful rightsstatements mapping lives in mapRightsUrl, tested below). + expect(mapCcDeedUrl('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'unknown' }) + expect(mapCcDeedUrl(undefined)).toEqual({ license: 'unknown' }) + expect(mapCcDeedUrl('https://example.org/x')).toEqual({ license: 'unknown' }) + }) +}) + +describe('mapRightsUrl (CC deeds + faithful rightsstatements.org)', () => { + it('delegates CC deeds to mapCcDeedUrl', () => { + expect(mapRightsUrl('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapRightsUrl('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + }) + it('maps rightsstatements faithfully: InC→proprietary, NoC-US→PD+US, NoC-NC→proprietary', () => { + expect(mapRightsUrl('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/InC-OW-EU/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-US/1.0/')).toEqual({ license: 'PD', jurisdiction: 'US' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-NC/1.0/')).toEqual({ license: 'proprietary' }) + }) + it('maps opaque/undetermined rightsstatements → unknown', () => { + expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-OKLR/1.0/')).toEqual({ license: 'unknown' }) + 
expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-CR/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/CNE/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/UND/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/NKC/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl(undefined)).toEqual({ license: 'unknown' }) + }) +}) + +describe('image helpers', () => { + it('isLikelyImageUrl: extension / iiif / thumbnail / image CDN', () => { + expect(isLikelyImageUrl('https://x/y.jpg')).toBe(true) + expect(isLikelyImageUrl('https://iiif.x/a/full/full/0/default.jpg')).toBe(true) + expect(isLikelyImageUrl('https://api.europeana.eu/thumbnail/v3/200/a.jpg')).toBe(true) + expect(isLikelyImageUrl('https://lh3.googleusercontent.com/abc=s0')).toBe(true) + expect(isLikelyImageUrl('https://www.rijksmuseum.nl/en/collection/SK-A-1')).toBe(false) + }) + it('imageMediaType: MIME wins, else extension, else default', () => { + expect(imageMediaType('image/png', 'https://x/y')).toBe('image/png') + expect(imageMediaType(undefined, 'https://x/y.png')).toBe('image/png') + expect(imageMediaType(undefined, 'https://x/y.jpg')).toBe('image/jpeg') + expect(imageMediaType('application/octet-stream', 'https://x/y')).toBe('image/jpeg') + }) +}) +``` + +- [ ] **1.2: Run — expect FAIL** (module does not exist). + +```bash +pnpm --filter @refkit/core test -- provider-helpers +``` +Expected: FAIL — `Cannot find module '../provider-helpers'`. + +--- + +## Task 2: Implement `provider-helpers.ts` + export (TDD green) + +- [ ] **2.1: Write `packages/core/src/provider-helpers.ts`.** + +```ts +import type { LicenseId } from './license' + +// — URL query-param setters (shared by every provider's search()) — + +/** Set `key=value` when value is a non-empty string (optionally within an allowlist). 
*/ +export function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]): void { + if (typeof value !== 'string' || !value) return + if (allowed && !allowed.includes(value)) return + url.searchParams.set(key, value) +} + +/** Set `key=true|false` when value is a boolean. */ +export function setIfBoolean(url: URL, key: string, value: unknown): void { + if (typeof value !== 'boolean') return + url.searchParams.set(key, String(value)) +} + +/** Set `key` to a joined list from a string or string[] (default separator ","). + * Optional allowlist rejects the whole value if any element is not allowed. */ +export function setIfStringList( + url: URL, key: string, value: unknown, + opts?: { separator?: string; allowed?: readonly string[] }, +): void { + const sep = opts?.separator ?? ',' + const allowed = opts?.allowed + const ok = (v: string) => !allowed || allowed.includes(v) + if (typeof value === 'string' && value && ok(value)) { url.searchParams.set(key, value); return } + if (Array.isArray(value) && value.length > 0 && value.every(v => typeof v === 'string' && v && ok(v))) { + url.searchParams.set(key, value.join(sep)) + } +} + +/** Set `key` when value is an integer. `min` is a reject floor (value < min → skip). + * For `max`: the default REJECTS when value > max; with `clamp: true` it instead sets + * `max` — preserving the `Math.min(value, max)` clamp several providers rely on. */ +export function setIfInt(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number; clamp?: boolean }): void { + if (typeof value !== 'number' || !Number.isInteger(value)) return + if (opts?.min !== undefined && value < opts.min) return + if (opts?.max !== undefined && value > opts.max) { + if (opts.clamp) { url.searchParams.set(key, String(opts.max)); return } + return + } + url.searchParams.set(key, String(value)) +} + +/** Integer ≥ (opts.min ?? 1). Pass `clamp: true` to clamp to `max` instead of rejecting. 
*/ +export function setIfPositiveInt(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number; clamp?: boolean }): void { + setIfInt(url, key, value, { min: opts?.min ?? 1, max: opts?.max, clamp: opts?.clamp }) +} + +/** Integer ≥ (opts.min ?? 0). Pass `clamp: true` to clamp to `max` instead of rejecting. */ +export function setIfNonNegativeInt(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number; clamp?: boolean }): void { + setIfInt(url, key, value, { min: opts?.min ?? 0, max: opts?.max, clamp: opts?.clamp }) +} + +/** Set `key` when value is a finite number (non-integers allowed) within [min, max]. */ +export function setIfNumber(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number }): void { + if (typeof value !== 'number' || !Number.isFinite(value)) return + if (opts?.min !== undefined && value < opts.min) return + if (opts?.max !== undefined && value > opts.max) return + url.searchParams.set(key, String(value)) +} + +// — array helper — + +/** First element of an array-typed field, or undefined. */ +export function first<T>(arr: T[] | undefined | null): T | undefined { + return Array.isArray(arr) && arr.length > 0 ? arr[0] : undefined +} + +// — license: CC deed URL → LicenseId (the moat; shared by URL-based sources) — + +/** Map a Creative Commons deed URL to a core LicenseId (+ CC version for the BY/BY-SA + * families). Conservative: NC/ND variants → proprietary; PD mark / CC0 → PD / CC0-1.0; + * absent/unrecognized → unknown. **CC deeds only** — rightsstatements.org is handled by + * `mapRightsUrl`. Match is on the path so http/https both work. 
*/ +export function mapCcDeedUrl(url: string | undefined | null): { license: LicenseId; version?: string } { + if (!url) return { license: 'unknown' } + const u = url.toLowerCase() + if (u.includes('creativecommons.org/publicdomain/zero')) return { license: 'CC0-1.0' } + if (u.includes('creativecommons.org/publicdomain/mark')) return { license: 'PD' } + // NC / ND are NOT open grants — check them first: the versionless `by\b` fallback below would otherwise match "by-nc…"/"by-nd…" (`\b` sits between "by" and "-"). + if (/creativecommons\.org\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } + const sa = u.match(/creativecommons\.org\/licenses\/by-sa\/(\d(?:\.\d)?)/) + if (sa) return { license: 'CC-BY-SA', version: sa[1] } + const by = u.match(/creativecommons\.org\/licenses\/by\/(\d(?:\.\d)?)/) + if (by) return { license: 'CC-BY', version: by[1] } + if (/creativecommons\.org\/licenses\/by-sa\b/.test(u)) return { license: 'CC-BY-SA' } + if (/creativecommons\.org\/licenses\/by\b/.test(u)) return { license: 'CC-BY' } + return { license: 'unknown' } +} + +// rightsstatements.org is a controlled vocabulary of rights STATUS statements (not license +// grants). Map each token FAITHFULLY to the closest true refkit representation rather than +// collapsing all to unknown — discarding a signal the source did give us is not "honest": +// • In-Copyright (InC*) → proprietary — we KNOW it's copyrighted with no grant (commercial +// denied), which is more faithful than "needs-review". +// • NoC-US → PD scoped to the US via the jurisdiction field (RightsRecord.jurisdiction +// exists for exactly this; a jurisdiction-aware caller is gated, default stays lenient). +// • NoC-NC → proprietary — no copyright BUT non-commercial only, so commercial is definitely +// out (closest honest gate; loses the "non-commercial derivatives OK" nuance, which no +// LicenseId can express — acceptable approximation). +// • Opaque/undetermined (NoC-OKLR, NoC-CR, CNE, UND, NKC) → unknown (genuinely needs review).
+const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = { + 'inc': { license: 'proprietary' }, 'inc-ow-eu': { license: 'proprietary' }, 'inc-edu': { license: 'proprietary' }, + 'inc-nc': { license: 'proprietary' }, 'inc-ruu': { license: 'proprietary' }, + 'noc-us': { license: 'PD', jurisdiction: 'US' }, + 'noc-nc': { license: 'proprietary' }, + 'noc-oklr': { license: 'unknown' }, 'noc-cr': { license: 'unknown' }, + 'cne': { license: 'unknown' }, 'und': { license: 'unknown' }, 'nkc': { license: 'unknown' }, +} + +/** Map any rights URI — a CC deed OR a rightsstatements.org statement — to a faithful + * LicenseId (+ CC version / source jurisdiction). For sources whose rights field can be + * either (europeana `edm:rights`, internet-archive `licenseurl`). CC-only sources should + * call `mapCcDeedUrl` directly. Unknown rightsstatements tokens → unknown. */ +export function mapRightsUrl(url: string | undefined | null): { license: LicenseId; version?: string; jurisdiction?: string } { + if (!url) return { license: 'unknown' } + const rs = url.toLowerCase().match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) + if (rs) return RIGHTS_STATEMENT[rs[1]] ?? { license: 'unknown' } + return mapCcDeedUrl(url) +} + +// — image-URL heuristics (decision D8): preview.url must be an image, never a web page — + +export const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i + +/** URL-string heuristic only (no network): does this look like an image resource? */ +export function isLikelyImageUrl(url: string): boolean { + return IMAGE_EXT.test(url) + || /iiif/i.test(url) + || /\/full\/[^/]+\/\d+\/default/i.test(url) // IIIF Image API request path + || /\/thumbnail\//i.test(url) + || /googleusercontent\.com/.test(url) // Rijksmuseum/Met image CDN +} + +/** Best image mediaType: declared MIME if image/*, else inferred from extension, else default. 
*/ +export function imageMediaType(mime: string | undefined, url: string): string { + if (mime && mime.startsWith('image/')) return mime + const m = url.match(IMAGE_EXT) + if (m) { const e = m[1].toLowerCase(); return e === 'jpg' ? 'image/jpeg' : `image/${e === 'tif' ? 'tiff' : e}` } + return 'image/jpeg' +} +``` + +- [ ] **2.2: Export from `packages/core/src/index.ts`.** Add after the `defineProvider` export block: + +```ts +export { + setIfString, setIfBoolean, setIfStringList, + setIfInt, setIfPositiveInt, setIfNonNegativeInt, setIfNumber, + first, mapCcDeedUrl, mapRightsUrl, isLikelyImageUrl, imageMediaType, IMAGE_EXT, +} from './provider-helpers' +``` + +- [ ] **2.3: Run — expect PASS.** + +```bash +pnpm --filter @refkit/core test -- provider-helpers +pnpm --filter @refkit/core typecheck +``` +Expected: PASS + clean. The rest of core's suite is untouched (purely additive). + +- [ ] **2.4: Commit.** + +```bash +git add packages/core/src/provider-helpers.ts packages/core/src/__tests__/provider-helpers.test.ts packages/core/src/index.ts +git commit -m "feat(core): shared provider helpers (setIf*, first, mapCcDeedUrl, image heuristics)" +``` + +--- + +## Task 3: Retrofit each provider (behavior-preserving) — Shared Recipe `R` + +Apply recipe **R** to each provider in the table below, **one provider per task, one commit each**, running that provider's tests after. The providers are independent (each edits only its own `src/index.ts`), so these tasks are parallelizable. + +**Recipe R (per provider):** +1. Add the needed names to the existing `import { … } from '@refkit/core'`. +2. Delete the now-redundant local helper functions. +3. Update call sites whose signature changed — specifically any positional `max` → `{ max }` opts bag, and any `setIfStringList`/`setStringList` needing a non-default separator → pass `{ separator }`. +4. Run `pnpm --filter <pkg> test` and `pnpm --filter <pkg> typecheck`. 
**Both must stay green** — this is the proof the refactor preserved behavior. `noUnusedLocals` will flag any helper you imported but didn't use, or forgot to delete. +5. Commit: `refactor(<pkg>): use shared core provider helpers`. + +**Per-provider mapping** (✅ = replace with core import; **keep local** = do NOT centralize): + +| Provider | Replace with core import | Call-site changes | Keep local (do not touch) | +|---|---|---|---| +| **provider-met** | `setIfBoolean`, `setIfInt`, `setIfString` | none (met's `setIfInt` had no opts; rejects on bounds — matches canonical default) | — | +| **provider-artic** | `setIfString`, `setIfNonNegativeInt`, `setIfStringList`(rename from `setStringList`, default `,`) | rename `setStringList`→`setIfStringList` call sites; ⚠ canonical skips `[]`/`['']` (artic's emitted them — verify no call site passes those) | **`articFields`** (parses comma/array + injects required fields — provider-specific) | +| **provider-openverse** | `setIfString`, `setIfStringList`, `setIfBoolean`, `setIfPositiveInt`, `setIfNumber` | `setIfNumber(...,{min,max})` already opts-bag → compatible | `hasStringList` (internal field-search detection — openverse-only) | +| **provider-unsplash** | `setIfString` (⚠ does NOT skip empty), `setIfPositiveInt` | positional `max?` → **`{ max, clamp: true }`** (unsplash CLAMPs via Math.min) | `setCollections` | +| **provider-pexels** | `setIfString` (⚠ does NOT skip empty), `setIfPositiveInt` | positional `max?` → **`{ max, clamp: true }`** (pexels CLAMPs) | `pickVideoFile` | +| **provider-pixabay** | `setIfString` (⚠ does NOT skip empty), `setIfStringList`, `setIfNonNegativeInt`, `setIfPositiveInt`, `setIfBoolean` | `setIfPositiveInt`: signature compatible but **body CLAMPs** → pass **`{ max, clamp: true }`** (NOT "already compatible"); `setIfStringList` allowlist → `{ allowed }` | — | +| **provider-gutendex** | `setIfInt`, `setIfPositiveInt`, `setIfString`, `setIfStringList` | `setIfStringList` allowlist → `{ allowed }`; 
gutendex's int helpers reject on bounds → no `clamp` needed | — | +| **provider-smithsonian** | `setIfString`, `setIfNonNegativeInt` | positional `max?` → **`{ max, clamp: true }`** (smithsonian CLAMPs) | — | +| **provider-brave** | `setIfString`, `setIfPositiveInt`, `setIfBoolean` | positional `max?` → **`{ max, clamp: true }`** (brave CLAMPs) | — | +| **provider-flickr** | `setIfString` (⚠ does NOT skip empty), `setIfInt`, `setStringList`→`setIfStringList` | `setIfInt` opts-bag compatible and **rejects** on max (no `clamp`) — matches canonical default; `setStringList`→`setIfStringList` | **`setBooleanFlag` (1/0), `setIfSafeSearch`, `setTags`, `setStringOrNumber`** | +| **provider-wikimedia-commons** | `setIfString`, `setIfNonNegativeInt`, `setIfPositiveInt`, `setIfBoolean` | `setIfPositiveInt` positional `max?` → **`{ max, clamp: true }`** (wikimedia CLAMPs); `setIfNonNegativeInt` has no max → `{}` | `setPipeList`, `pickTitle` | + +- [ ] **3.x (one checkbox per provider above):** apply Recipe R; tests + typecheck green; commit. + +> **⚠ empty-string check (Task 3 — applies to flickr, unsplash, pexels, AND pixabay):** all four use `if (typeof value !== 'string') return` (they emit `key=` for `''`); the core `setIfString` skips `''`. After swapping each, run its tests. If any test asserts an empty-string param is emitted, keep that provider's local `setIfString` (rename it, e.g. `setIfStringAllowEmpty`) for the affected call sites and document the keep in a code comment. (Expected: none of the four intentionally emits empty params, so the swap is a clean, safe improvement — but verify per provider, don't assume.) +> +> **⚠ clamp check (Task 3 — applies to unsplash, pexels, brave, smithsonian, wikimedia-commons, pixabay):** all six CLAMP via `Math.min(value, max)`; the core int helpers reject by default. You MUST pass `{ max, clamp: true }` at these call sites. 
A unit test won't catch a missed `clamp` (all fixtures are in-range), so this is a manual review point — diff each retrofitted `search()` and confirm every former positional-`max` call now carries `clamp: true`. + +--- + +## Task 4: Point the 6 Phase-5 plans at the shared helpers + +The Phase-5 provider plans (`2026-06-29-provider-*.md`) currently inline these helpers. They are plans (not yet code), so just update them to import from core when implemented: + +- [ ] **4.1:** Replace each plan's inlined license/rights mapper with the right core helper: + - **CC-only sources** (`jamendo` `mapJamendoLicense`, `rijksmuseum` `mapRijksRights`, `freesound`'s URL branch) → core **`mapCcDeedUrl`**. (freesound's CC **name-string** table stays local and falls back to `mapCcDeedUrl` for URL-form values. rijksmuseum's local was named `mapRijksRights` specifically to avoid clashing with the core `mapRightsUrl`.) + - **Mixed sources whose rights field can be a CC deed OR a rightsstatements.org statement** (`europeana` `mapEuropeanaRights`, `internet-archive` `mapIaLicense`) → core **`mapRightsUrl`**, which faithfully maps rightsstatements (see below) and delegates CC deeds to `mapCcDeedUrl`. These two must read `jurisdiction` off the result and set `rights.jurisdiction` (for NoC-US). + + Update each plan's tests to import the helper accordingly. + + **⚠ Behavior notes — confirm before swapping:** + 1. **`rightsstatements.org` → faithful mapping (decision: faithful, not blanket-unknown).** `mapRightsUrl` maps rightsstatements statements to their closest TRUE representation: **In-Copyright (InC*) → `proprietary`** (copyrighted, no grant → denied, more faithful than needs-review); **NoC-US → `PD` + `jurisdiction:'US'`** (jurisdiction-scoped PD; default gate stays lenient, jurisdiction-aware callers are gated); **NoC-NC → `proprietary`** (non-commercial → commercial denied); **NoC-OKLR / NoC-CR / CNE / UND / NKC → `unknown`** (opaque/undetermined). 
This is MORE faithful than the providers' earlier "all rightsstatements → unknown" inlined behavior and REDUCES needs-review noise — update europeana/IA tests to the new verdicts (InC items now `denied`, not `needs-review`; NoC-US now `PD`+`US`). IA's D3 "never guess PD" still holds: it governs items with NO licenseurl (→ unknown); NoC-US→PD is not a guess, it is the source's explicit declaration. + 2. **Versionless `by`/`by-sa` deed URLs.** `mapCcDeedUrl` has a fallback that maps a versionless `.../licenses/by/` (no `/X.Y/`) to `CC-BY` (no version); jamendo/europeana/freesound's inlined regexes return `unknown` for those. After the swap a versionless CC-BY deed URL would map `unknown`→`CC-BY`. This is more correct (the family permission is version-invariant) but IS a behavior change — note it in each plan and confirm no test asserts the old `unknown`. +- [ ] **4.2:** In the `rijksmuseum` plan import **only `isLikelyImageUrl`** (plus `IMAGE_EXT` if its `findImage` references it) from core — rijksmuseum hardcodes its `preview.mediaType` and does **not** use `imageMediaType`, so importing it would be an unused import (`noUnusedLocals` fails). In the `europeana` plan import both `isLikelyImageUrl` and `imageMediaType`. Keep the provider-specific selectors local (`findImage`/`collectDigitalObjects` for rijksmuseum; the `edmIsShownBy` vs `edmIsShownAt` choice for europeana). +- [ ] **4.3:** In all 6 plans, import `setIf*`/`first` from core instead of inlining (per the same mapping as Task 3). +- [ ] **4.4:** Update the index `2026-06-29-p1-providers-index.md` Shared Task S0 note to say: "import shared helpers (`setIf*`, `first`, `mapCcDeedUrl`, `mapRightsUrl`, `isLikelyImageUrl`, `imageMediaType`) from `@refkit/core` — do not re-inline them (see `2026-06-29-provider-helpers-refactor.md`)." 
+ +--- + +## Task 5: Whole-repo verification + changeset + +- [ ] **5.1: Verify the entire monorepo is green** (this is the behavior-preservation proof across all retrofits): + +```bash +pnpm install && pnpm -r typecheck && pnpm test:run +``` +Expected: every package's typecheck clean; every vitest project green. No test should need editing — if a provider test breaks, the retrofit changed behavior; fix the retrofit, not the test (except the documented empty-string case for flickr/unsplash/pexels/pixabay, if it materializes). + +- [ ] **5.2: Changeset.** Core gains exports (minor); every retrofitted provider is an internal refactor (patch). + +```markdown +--- +"@refkit/core": minor +"@refkit/provider-met": patch +"@refkit/provider-artic": patch +"@refkit/provider-openverse": patch +"@refkit/provider-unsplash": patch +"@refkit/provider-pexels": patch +"@refkit/provider-pixabay": patch +"@refkit/provider-gutendex": patch +"@refkit/provider-smithsonian": patch +"@refkit/provider-brave": patch +"@refkit/provider-flickr": patch +"@refkit/provider-wikimedia-commons": patch +--- + +Add shared provider helpers to @refkit/core (setIf* URL setters, first, mapCcDeedUrl, image-URL heuristics) and refactor all providers to use them instead of per-package copies. +``` + +- [ ] **5.3: Final commit** (if not already per-task): `refactor: centralize provider helpers in core`. + +--- + +## Self-Review + +1. **No new package** — helpers live in core (providers already depend on it); a separate package was rejected on dependency + size grounds. +2. **Behavior preserved** — every provider's existing tests are the gate; the only intentional behavior question (the empty-string `setIfString` swap in flickr/unsplash/pexels/pixabay) is explicitly flagged with a fallback. +3. **Divergences respected** — boolean `1/0` (flickr), string-list separator (jamendo space), int `{min,max}` bag, and all one-off helpers are kept local, not flattened. +4. 
**DRY win quantified** — removes 11 copies of `setIfString` and ~5 copies of the CC-deed-URL mapper; centralizes the D8 image heuristic used by 2 providers. +5. **Phase-5 plans aligned** — Task 4 points the 6 unbuilt providers at the shared helpers so they never re-introduce the duplication. diff --git a/docs/superpowers/plans/2026-06-29-provider-internet-archive.md b/docs/superpowers/plans/2026-06-29-provider-internet-archive.md new file mode 100644 index 0000000..73e33f0 --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-provider-internet-archive.md @@ -0,0 +1,417 @@ +# Internet Archive Provider Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. Read `2026-06-29-p1-providers-index.md` FIRST — it defines Shared Task **S0** (package skeleton) and Shared Task **S9** (central wiring), plus decisions **D1, D3, D5, D7** which this plan applies. Do not re-paste S0/S9 boilerplate; reference them. + +**Goal:** Add `@refkit/provider-internet-archive` — a keyless satellite that searches the Internet Archive Advanced Search API and returns license-normalized `Reference`s for `video` (mediatype `movies`) and `text` (mediatype `texts`) items. + +**Architecture:** One factory `internetArchive(config?)` returning `defineProvider({ id: 'internet-archive', modalities: ['video','text'], queryFeatures: ['keyword'], capabilities: { controls: [] }, search })`. `search` builds an `advancedsearch.php` URL, reads `response.docs[]`, maps each doc through `toReference`, and filters out docs whose `mediatype` does not map to a supported modality. 
Three realities drive the design: + +- **Dirty license (D3):** `licenseurl` is ABSENT on the large majority of items (verified live: a `mediatype:movies` page returned 0/20 docs with a `licenseurl`; only curated CC collections carry one — roughly 7% overall). Every doc **without** a parseable `licenseurl` → `license: 'unknown'` (core turns that into a `needs-review` verdict). **Never** guess PD, and **never** silently drop the item. (When a `licenseurl` IS present, it is mapped faithfully — including rightsstatements.org statements via the same per-token table as europeana/core `mapRightsUrl`: e.g. `NoC-US` → `PD` + `jurisdiction:'US'`, `InC` → `proprietary`. That is the source's declaration, not a guess, so it does not violate the "never guess PD" rule, which is about the *absent* case.) +- **mediatype → modality (D1):** map `mediatype: 'movies'` → `'video'` and `mediatype: 'texts'` → `'text'`. **v1 scope is exactly these two.** Other mediatypes (`audio`, `image`, `collection`, `software`, `web`, `data`, `etree`) map to `null` and are filtered out of results. Document `audio`/`image`/etc. as a follow-up in the README. Note: `search` returns **both** legs and does not narrow to the caller's `q.modalities` (a `['video']`-only query still yields `texts` items) — this matches the existing single-modality providers, which rely on core routing rather than in-provider modality filtering. If the core merge/client layer does not narrow downstream, add a `q.modalities`-based filter in `search`. +- **CC URL mapping (D5/D7):** when a `licenseurl` is present, regex-map it to a family and (for CC-BY/CC-BY-SA) capture the version. + +`text` references may omit the optional `TextMeta` (`reference.ts`: `text?` is optional) — IA search returns no excerpt, so omit it. Canonical/page URL is the details page; thumbnail is the services image endpoint; `preview` is omitted (search exposes no clean direct media stream). 
+ +**Tech Stack:** TypeScript (ESM), tsup, vitest, zod (via `@refkit/core`), pnpm workspaces, changesets. Mirror `provider-met` (template + `ctx.fetch` mocking) and `provider-flickr` (license/URL mapper pattern). + +--- + +## Task 1: Decisions & scaffold + +- [ ] **1.1: Confirm decisions** — this provider applies **D1** (mediatype→modality, v1 scope = movies/texts only), **D3** (dirty license → `unknown` fallback, never drop, never guess PD), **D5** (CC URL → family), **D7** (CC version from URL for CC-BY / CC-BY-SA). + +- [ ] **1.2: Execute Shared Task S0** (see the index) with these substitutions: + + | token | value | + |---|---| + | `<id>` | `internet-archive` | + | `<Fn>` | `internetArchive` | + | `<Title>` | `Internet Archive` | + | `<modality>` | `video / text` | + | `<auth>` | `keyless` | + | `<licenseCol>` | `per-item CC (dirty) → unknown fallback` | + + Notes: + - In S0.1, the package directory is `packages/provider-internet-archive`, `package.json` `name` is `@refkit/provider-internet-archive`, and the keywords list ends with `"internet-archive"`. The provider `id` and `referenceId(...)` namespace are also the hyphenated `internet-archive`. + - The **factory export is camelCase `internetArchive`** (not hyphenated) — only the id/keywords/dir use `internet-archive`. + - README "Modality" line: `video · text`. Add a short note that v1 covers `movies` (→ video) and `texts` (→ text) only, and that other mediatypes (audio, image, …) are a documented follow-up. + - S0.2/S0.3/S0.4 unchanged. 
+ +--- + +## Task 2: `mapIaLicense` + `mediatypeToModality` (TDD) + +- [ ] **2.1: Write the failing unit test** `packages/provider-internet-archive/src/__tests__/internet-archive.test.ts` covering only the two pure mappers first: + +```ts +import { describe, expect, it } from 'vitest' +import { mapIaLicense, mediatypeToModality } from '../index' + +describe('mapIaLicense', () => { + it('maps CC0 / PD mark / PD dedication URLs', () => { + expect(mapIaLicense('https://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapIaLicense('http://creativecommons.org/publicdomain/mark/1.0/')).toEqual({ license: 'PD' }) + }) + + it('maps CC-BY and CC-BY-SA with version (D7)', () => { + expect(mapIaLicense('https://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapIaLicense('http://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + }) + + it('maps NC/ND variants to proprietary (D5)', () => { + expect(mapIaLicense('https://creativecommons.org/licenses/by-nc/4.0/').license).toBe('proprietary') + expect(mapIaLicense('https://creativecommons.org/licenses/by-nd/4.0/').license).toBe('proprietary') + expect(mapIaLicense('https://creativecommons.org/licenses/by-nc-sa/4.0/').license).toBe('proprietary') + }) + + it('falls back to unknown for absent / unrecognized URLs (D3)', () => { + expect(mapIaLicense(undefined)).toEqual({ license: 'unknown' }) + expect(mapIaLicense('')).toEqual({ license: 'unknown' }) + expect(mapIaLicense('https://example.com/some-license')).toEqual({ license: 'unknown' }) + }) + + it('maps rightsstatements.org faithfully (InC→proprietary, NoC-US→PD+US, opaque→unknown)', () => { + expect(mapIaLicense('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapIaLicense('http://rightsstatements.org/vocab/NoC-US/1.0/')).toEqual({ license: 'PD', jurisdiction: 'US' }) + 
expect(mapIaLicense('http://rightsstatements.org/vocab/NoC-NC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapIaLicense('http://rightsstatements.org/vocab/CNE/1.0/')).toEqual({ license: 'unknown' }) + }) +}) + +describe('mediatypeToModality (D1)', () => { + it('maps movies→video and texts→text', () => { + expect(mediatypeToModality('movies')).toBe('video') + expect(mediatypeToModality('texts')).toBe('text') + }) + it('returns null for unsupported mediatypes (filtered out of v1)', () => { + expect(mediatypeToModality('audio')).toBeNull() + expect(mediatypeToModality('image')).toBeNull() + expect(mediatypeToModality('collection')).toBeNull() + expect(mediatypeToModality('software')).toBeNull() + }) +}) +``` + +- [ ] **2.2: Run — expect FAIL** (module/exports do not exist yet): + + ```bash + pnpm --filter @refkit/provider-internet-archive test + ``` + Expected: FAIL (cannot resolve `../index` / no such exports). + +- [ ] **2.3: Implement the mappers** in `packages/provider-internet-archive/src/index.ts`: + +```ts +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, type Modality, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +const BASE = 'https://archive.org/advancedsearch.php' + +export interface InternetArchiveConfig { + /** Max docs requested per search (advancedsearch `rows`). Default falls back to + * the query limit, then 20. Bounded to 100. */ + maxRows?: number +} + +// rightsstatements.org is a rights-STATUS vocabulary (not license grants). Mapped faithfully +// per token (mirrors core `mapRightsUrl`; helper-refactor Task 4 dedups this): InC* → +// proprietary; NoC-US → PD scoped to the US; NoC-NC → proprietary; opaque/undetermined → unknown. 
+const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = { + 'inc': { license: 'proprietary' }, 'inc-ow-eu': { license: 'proprietary' }, 'inc-edu': { license: 'proprietary' }, + 'inc-nc': { license: 'proprietary' }, 'inc-ruu': { license: 'proprietary' }, + 'noc-us': { license: 'PD', jurisdiction: 'US' }, + 'noc-nc': { license: 'proprietary' }, + 'noc-oklr': { license: 'unknown' }, 'noc-cr': { license: 'unknown' }, + 'cne': { license: 'unknown' }, 'und': { license: 'unknown' }, 'nkc': { license: 'unknown' }, +} + +/** Map an Internet Archive `licenseurl` to our license id (+ CC version, + jurisdiction for + * jurisdiction-scoped PD). **ABSENT licenseurl → 'unknown' (D3)** — IA rarely carries one, so + * most items legitimately land here → needs-review; this is the "never guess PD" rule and it + * governs the ABSENT case only. A PRESENT rightsstatements.org statement is a real declaration + * and is mapped faithfully (NoC-US → PD is the source's word, not a guess). NC/ND → proprietary + * (D5); PD mark (`/publicdomain/mark`) → PD — NOTE(review): the legacy `/licenses/publicdomain/` dedication URL is not matched and falls through to unknown; CC0 → CC0-1.0; unrecognized → unknown. */ +export function mapIaLicense(licenseurl?: string): { license: LicenseId; version?: string; jurisdiction?: string } { + if (!licenseurl) return { license: 'unknown' } + const u = licenseurl.toLowerCase() + const rs = u.match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) + if (rs) return RIGHTS_STATEMENT[rs[1]] ?? { license: 'unknown' } + if (/\/publicdomain\/zero\b/.test(u)) return { license: 'CC0-1.0' } + if (/\/publicdomain\/mark\b/.test(u)) return { license: 'PD' } + // Exclude any NC / ND variant before matching the open by / by-sa families. 
+ if (/\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } + const bySa = u.match(/\/licenses\/by-sa\/(\d(?:\.\d)?)\b/) + if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } + const by = u.match(/\/licenses\/by\/(\d(?:\.\d)?)\b/) + if (by) return { license: 'CC-BY', version: by[1] } + // by / by-sa with no version still maps to the family (version omitted). + if (/\/licenses\/by-sa\b/.test(u)) return { license: 'CC-BY-SA' } + if (/\/licenses\/by\b/.test(u)) return { license: 'CC-BY' } + return { license: 'unknown' } +} + +const MEDIATYPE_MODALITY: Record<string, Modality> = { movies: 'video', texts: 'text' } + +/** v1 scope (D1): only `movies`→video and `texts`→text. Everything else → null + * (filtered out). audio / image / etc. are a documented follow-up. */ +export function mediatypeToModality(mt: string): Modality | null { + return MEDIATYPE_MODALITY[mt] ?? null +} +``` + +- [ ] **2.4: Run — expect PASS**: + + ```bash + pnpm --filter @refkit/provider-internet-archive test + ``` + Expected: PASS (both `describe` blocks green). + +- [ ] **2.5: Commit** — `git add -A && git commit -m "feat(provider-internet-archive): license + mediatype mappers"` + +--- + +## Task 3: `toReference` + `search` (TDD) + +- [ ] **3.1: Add the failing integration test** — append to `internet-archive.test.ts`. 
It mocks `ctx.fetch` to return one `advancedsearch` body whose `response.docs[]` mixes a CC-BY movie (creator string), a movie with NO licenseurl, a `texts` item (creator array), and a `collection` doc that must be filtered out: + +```ts +import { evaluateUse, referenceId, type ProviderContext } from '@refkit/core' +import { internetArchive } from '../index' + +const DOCS = [ + { // CC-BY movie, creator as a string + identifier: 'big_buck_bunny', + title: 'Big Buck Bunny', + creator: 'Blender Foundation', + licenseurl: 'https://creativecommons.org/licenses/by/3.0/', + mediatype: 'movies', + }, + { // movie with NO licenseurl — must NOT be dropped (D3) + identifier: 'cbsnews-clip', + title: 'News Clip', + creator: 'cbsnews.com', + mediatype: 'movies', + }, + { // texts item, creator as an array (IA creator is multi-value) + identifier: 'alices_adventures', + title: "Alice's Adventures in Wonderland", + creator: ['Carroll, Lewis', 'Tenniel, John'], + licenseurl: 'https://creativecommons.org/publicdomain/zero/1.0/', + mediatype: 'texts', + }, + { // unsupported mediatype — filtered out (D1) + identifier: 'some_collection', + title: 'A Collection', + mediatype: 'collection', + }, +] + +const ctxResponding = (body: unknown, onUrl?: (u: string) => void): ProviderContext => ({ + fetch: (async (input: string) => { + onUrl?.(String(input)) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, +}) + +describe('internetArchive search', () => { + it('maps CC-BY movie with version + video modality', async () => { + const refs = await internetArchive().search( + { text: 'animation', modalities: ['video', 'text'], limit: 10 }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + const bunny = refs.find(r => r.id === referenceId('internet-archive', 'https://archive.org/details/big_buck_bunny'))! 
+ expect(bunny.modality).toBe('video') + expect(bunny.rights.license).toBe('CC-BY') + expect(bunny.rights.licenseVersion).toBe('3.0') + expect(bunny.rights.author).toBe('Blender Foundation') + expect(bunny.canonicalUrl).toBe('https://archive.org/details/big_buck_bunny') + expect(bunny.thumbnail?.url).toBe('https://archive.org/services/img/big_buck_bunny') + expect(bunny.preview).toBeUndefined() + expect(evaluateUse(bunny.rights, 'commercial-product').decision).toBe('allowed-with-attribution') + }) + + it('keeps a licenseurl-less movie as unknown → needs-review (D3, NOT dropped)', async () => { + const refs = await internetArchive().search( + { text: 'news', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + const clip = refs.find(r => r.canonicalUrl === 'https://archive.org/details/cbsnews-clip')! + expect(clip).toBeDefined() + expect(clip.rights.license).toBe('unknown') + expect(evaluateUse(clip.rights, 'commercial-product').decision).toBe('needs-review') + }) + + it('maps a texts item to text modality and joins an array creator', async () => { + const refs = await internetArchive().search( + { text: 'alice', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + const alice = refs.find(r => r.canonicalUrl === 'https://archive.org/details/alices_adventures')! 
+ expect(alice.modality).toBe('text') + expect(alice.rights.license).toBe('CC0-1.0') + expect(alice.rights.author).toBe('Carroll, Lewis, Tenniel, John') + expect(alice.text).toBeUndefined() + }) + + it('filters out unsupported mediatypes (collection)', async () => { + const refs = await internetArchive().search( + { text: 'x', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + expect(refs.map(r => r.canonicalUrl)).not.toContain('https://archive.org/details/some_collection') + expect(refs).toHaveLength(3) // bunny + clip + alice + }) + + it('forwards query and rows to advancedsearch', async () => { + let seen = '' + await internetArchive({ maxRows: 7 }).search( + { text: 'jazz', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 0, docs: [] } }, u => { seen = u }), + ) + const url = new URL(seen) + expect(url.pathname).toBe('/advancedsearch.php') + expect(url.searchParams.get('q')).toBe('jazz') + expect(url.searchParams.get('output')).toBe('json') + expect(url.searchParams.get('rows')).toBe('7') + expect(url.searchParams.get('page')).toBe('1') + expect(url.searchParams.getAll('fl[]')).toEqual( + expect.arrayContaining(['identifier', 'title', 'creator', 'licenseurl', 'mediatype']), + ) + }) +}) +``` + +- [ ] **3.2: Run — expect FAIL** (`internetArchive` / `toReference` / `search` not implemented): + + ```bash + pnpm --filter @refkit/provider-internet-archive test + ``` + Expected: FAIL. + +- [ ] **3.3: Implement `toReference` + `search`** in `src/index.ts` (append below the mappers): + +```ts +interface IaDoc { + identifier: string + title?: string + creator?: string | string[] + licenseurl?: string + mediatype: string +} +interface IaResponse { response?: { numFound: number; docs: IaDoc[] } } + +function authorOf(creator: string | string[] | undefined): string | undefined { + if (!creator) return undefined + return Array.isArray(creator) ? 
creator.join(', ') || undefined : creator || undefined +} + +/** Map one search doc → Reference, or null if its mediatype is out of v1 scope (D1). + * canonicalUrl = the details page; thumbnail = the services image endpoint; preview + * omitted (search exposes no clean direct media stream). */ +export function toReference(doc: IaDoc): Reference | null { + const modality = mediatypeToModality(doc.mediatype) + if (!modality) return null + const canonicalUrl = `https://archive.org/details/${doc.identifier}` + const { license, version, jurisdiction } = mapIaLicense(doc.licenseurl) + const rights: RightsRecord = { + license, + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + // jurisdiction-scoped PD (e.g. rightsstatements NoC-US → PD in the US) + ...(jurisdiction ? { jurisdiction } : {}), + author: authorOf(doc.creator), + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: 'https://archive.org/about/terms.php', sourceUrl: canonicalUrl }, + } + return { + id: referenceId('internet-archive', canonicalUrl), + modality, + title: doc.title || undefined, + source: { providerId: 'internet-archive', sourceUrl: canonicalUrl }, + canonicalUrl, + rights, + verifiedAt: new Date().toISOString(), + thumbnail: { url: `https://archive.org/services/img/${doc.identifier}` }, + relevance: 0, + raw: doc, + } +} + +export function internetArchive(config: InternetArchiveConfig = {}) { + return defineProvider({ + id: 'internet-archive', + modalities: ['video', 'text'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(BASE) + url.searchParams.set('q', q.text) + for (const f of ['identifier', 'title', 'creator', 'licenseurl', 'mediatype']) { + url.searchParams.append('fl[]', f) + } + url.searchParams.set('output', 'json') + url.searchParams.set('page', '1') + const rows = Math.min(config.maxRows ?? q.limit ?? 
20, 100) + url.searchParams.set('rows', String(rows)) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`internet-archive search failed: ${res.status}`) + const json = (await res.json()) as IaResponse + const docs = json.response?.docs ?? [] + return docs + .map(toReference) + .filter((r): r is Reference => r !== null) + }, + }) +} +``` + +- [ ] **3.4: Run — expect PASS** (all of Task 2 + Task 3 green): + + ```bash + pnpm --filter @refkit/provider-internet-archive test + ``` + Expected: PASS. + +- [ ] **3.5: Typecheck the package**: + + ```bash + pnpm --filter @refkit/provider-internet-archive typecheck + ``` + Expected: clean. + +- [ ] **3.6: Commit** — `git add -A && git commit -m "feat(provider-internet-archive): toReference + search"` + +--- + +## Final Task: Central wiring + +- [ ] **F.1: Execute Shared Task S9** (see the index) for a **keyless** provider: + - **S9.1:** append `'./packages/provider-internet-archive/vitest.config.ts',` to root `vitest.config.ts` `projects`. + - **S9.2:** add the README table row (after the brave row, ~line 167): + `` | `@refkit/provider-internet-archive` | Internet Archive | video · text | keyless | per-item CC (dirty) → unknown | `` + - **S9.3:** in `packages/mcp/src/cli.ts`, add `import { internetArchive } from '@refkit/provider-internet-archive'` and add `internetArchive()` to the **base** `providers` array (the keyless line that already lists `openverse(), openverseAudio(), …, poetrydb()`). + - **S9.4:** in `packages/mcp/src/__tests__/mcp.test.ts`, add `'internet-archive'` to the id list in the `'includes every keyless provider by default'` test (~line 230). + - **S9.5:** add `"@refkit/provider-internet-archive": "workspace:*"` to `packages/mcp/package.json` dependencies. 
+ - **S9.6:** create `.changeset/provider-internet-archive.md`: + ```markdown + --- + "@refkit/provider-internet-archive": minor + "@refkit/mcp": minor + --- + + Add @refkit/provider-internet-archive: Internet Archive as license-normalized video / text references (movies → video, texts → text; dirty per-item CC licenseurl → unknown fallback). + ``` + - **S9.7:** `pnpm install && pnpm -r typecheck && pnpm test:run` → all green (incl. `provider-internet-archive` and the updated `mcp` test). + - **S9.8:** `git add -A && git commit -m "feat(provider-internet-archive): Internet Archive satellite (P1)"` + +--- + +## Self-Review + +1. **D3 proven:** a movie WITHOUT a `licenseurl` is kept (not dropped) with `license: 'unknown'`, and `evaluateUse(...).decision === 'needs-review'`. PD is never guessed. +2. **D1 proven:** `movies`→`video`, `texts`→`text`; `collection` (and all other mediatypes) filtered out; v1 scope and the audio/image follow-up are documented in the README. +3. **D5/D7 proven:** CC-BY/CC-BY-SA carry `licenseVersion`; NC/ND → `proprietary`; CC0/PD-mark map correctly. +4. **Type validity:** every emitted `Reference` has all required fields and a valid `RightsRecord`; `licenseVersion` set only for CC-BY/CC-BY-SA; `preview` omitted; `text` TextMeta omitted (optional). +5. **Creator robustness:** `authorOf` handles both the string and array forms IA returns. diff --git a/docs/superpowers/plans/2026-06-29-provider-jamendo.md b/docs/superpowers/plans/2026-06-29-provider-jamendo.md new file mode 100644 index 0000000..fc98529 --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-provider-jamendo.md @@ -0,0 +1,445 @@ +# Jamendo Provider Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
This plan depends on the shared skeleton in [`2026-06-29-p1-providers-index.md`](./2026-06-29-p1-providers-index.md) — read it first; **Task 1** and the **Final Task** delegate to Shared Tasks **S0** and **S9** there rather than repeating boilerplate. + +**Goal:** Add `@refkit/provider-jamendo` — a BYOK audio (music) satellite that searches the Jamendo API v3.0 `/tracks/` endpoint and returns license-normalized `Reference`s. Each track carries a per-item Creative Commons deed URL (`license_ccurl`); only CC-BY and CC-BY-SA fit refkit's enum, so the mapper applies decisions **D5** (partial enum fit) and **D7** (CC version from URL). + +**Architecture:** A thin satellite depending only on `@refkit/core`. `jamendo(config)` returns `defineProvider({ id: 'jamendo', modalities: ['audio'], … , search })`. `search` GETs `https://api.jamendo.com/v3.0/tracks/` with `client_id` (BYOK), `format=json`, `search`, and `limit`, then maps each `results[]` track to a `Reference` via `toAudioReference`. The CC permission family is derived from `license_ccurl` by `mapJamendoLicense`; permissions are never stored — core's `factsFor()`/`evaluateUse()` derive them from `license`. Mirror `provider-openverse`'s `toAudioReference` (modality `'audio'`, preview = the mp3 stream, image → thumbnail). + +**Tech Stack:** TypeScript (ESM, `"type": "module"`), tsup (build), vitest (test), zod (via core), pnpm workspaces, changesets. + +--- + +## Task 1: Decisions & scaffold + +- [ ] **1.1: Confirm decisions.** This provider applies **D5** (jamendo `license_ccurl` → match the CC URL to a family: `/licenses/by/<v>/` → `CC-BY`, `/licenses/by-sa/<v>/` → `CC-BY-SA`; any `by-nc*`/`by-nd*` variant → `proprietary`; missing/unrecognized → `unknown`) and **D7** (capture the CC version from the deed URL, set `licenseVersion` only for the `CC-BY`/`CC-BY-SA` families). No other decisions apply. 
+ +- [ ] **1.2: Execute Shared Task S0** (provider satellite skeleton) with these substitutions: + + | placeholder | value | + |---|---| + | `<id>` | `jamendo` | + | `<Fn>` | `jamendo` | + | `<Title>` | `Jamendo` | + | `<modality>` | `audio` | + | `<auth>` | `API key` | + | `<licenseCol>` | `per-item CC` | + + This creates `packages/provider-jamendo/` with `package.json`, `tsconfig.json`, `tsup.config.ts`, `vitest.config.ts`, `LICENSE`, `README.md`, and runs `pnpm install`. Do not commit yet — bundle with the first real change in Task 2. + +--- + +## Task 2: TDD `mapJamendoLicense` + `toAudioReference` (CC-BY happy path) + +- [ ] **2.1: Write the failing test first.** Create `packages/provider-jamendo/src/__tests__/jamendo.test.ts`. Mock `ctx.fetch` with realistic Jamendo JSON (a `headers`/`results[]` envelope). Start with a single CC-BY 4.0 track. + +```ts +import { describe, expect, it } from 'vitest' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { jamendo, mapJamendoLicense } from '../index' + +// Jamendo wraps results in { headers, results }. This ctx captures the request URL +// (to assert client_id/search/limit forwarding) and returns the supplied body. 
+const ctxCapturing = (body: unknown): { ctx: ProviderContext; url: () => string } => { + let captured = '' + const ctx: ProviderContext = { + fetch: (async (input: Parameters<typeof fetch>[0]) => { + captured = String(input) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, + } + return { ctx, url: () => captured } +} + +const envelope = (results: unknown[]) => ({ + headers: { status: 'success', code: 0, error_message: '', results_count: results.length }, + results, +}) + +const TRACK_BY = { + id: '1848357', + name: 'Sunrise', + artist_name: 'fankel', + audio: 'https://prod-1.storage.jamendo.com/?trackid=1848357&format=mp31&from=app-devsite', + audiodownload: 'https://prod-1.storage.jamendo.com/download/track/1848357/mp32/', + image: 'https://usercontent.jamendo.com?type=album&id=368084&width=300&trackid=1848357', + shareurl: 'https://www.jamendo.com/track/1848357', + shorturl: 'https://jamen.do/t/1848357', + license_ccurl: 'http://creativecommons.org/licenses/by/4.0/', +} + +describe('mapJamendoLicense', () => { + it('maps CC-BY and CC-BY-SA with version, NC/ND → proprietary, missing → unknown', () => { + expect(mapJamendoLicense('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapJamendoLicense('https://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + expect(mapJamendoLicense('http://creativecommons.org/licenses/by-nc-nd/3.0/')).toEqual({ license: 'proprietary' }) + expect(mapJamendoLicense('http://creativecommons.org/licenses/by-nc/2.0/')).toEqual({ license: 'proprietary' }) + expect(mapJamendoLicense('http://creativecommons.org/licenses/by-nd/4.0/')).toEqual({ license: 'proprietary' }) + expect(mapJamendoLicense('')).toEqual({ license: 'unknown' }) + expect(mapJamendoLicense('https://example.com/whatever')).toEqual({ license: 'unknown' }) + }) +}) + +describe('jamendo provider', () => { + it('maps a CC-BY track to a CC-BY audio 
reference (allowed-with-attribution)', async () => { + const { ctx } = ctxCapturing(envelope([TRACK_BY])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'sunrise', modalities: ['audio'], limit: 5 }, ctx) + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('audio') + expect(r.rights.license).toBe('CC-BY') + expect(r.rights.licenseVersion).toBe('4.0') + expect(r.rights.author).toBe('fankel') + expect(r.title).toBe('Sunrise') + expect(r.canonicalUrl).toBe('https://www.jamendo.com/track/1848357') + expect(r.preview?.url).toContain('trackid=1848357') + expect(r.preview?.mediaType).toBe('audio/mpeg') + expect(r.thumbnail?.url).toContain('usercontent.jamendo.com') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed-with-attribution') + }) +}) +``` + +- [ ] **2.2: Run the test — expect FAIL** (module/exports do not exist yet). + +```bash +pnpm --filter @refkit/provider-jamendo test +``` +Expected: FAIL — `Cannot find module '../index'` / `jamendo`/`mapJamendoLicense` is not exported. + +- [ ] **2.3: Implement `src/index.ts` to pass.** Create `packages/provider-jamendo/src/index.ts`: + +```ts +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +export interface JamendoConfig { + /** Jamendo API client_id (BYOK). Register at https://devportal.jamendo.com/. */ + clientId: string +} + +export interface JamendoSearchOptions { + /** mp3 stream quality. Default 'mp31' (96 kbps). */ + audioformat?: 'mp31' | 'mp32' | 'ogg' | 'flac' + order?: 'relevance' | 'popularity_total' | 'popularity_month' | 'popularity_week' | 'releasedate_asc' | 'releasedate_desc' | 'buzzrate' + /** Restrict to tracks whose license permits a given use, server-side. Relevance + * hint only — mapJamendoLicense below is the authoritative rights gate. 
*/ + ccsa?: boolean + ccnd?: boolean + ccnc?: boolean + tags?: string | readonly string[] + artist_name?: string + offset?: number +} + +const BASE = 'https://api.jamendo.com/v3.0/tracks/' + +// The `audioformat` request param decides what `t.audio` streams; reflect it in mediaType +// rather than hardcoding audio/mpeg (which would mislabel ogg/flac requests). +const JAMENDO_AUDIO_MIME: Record<string, string> = { + mp31: 'audio/mpeg', mp32: 'audio/mpeg', ogg: 'audio/ogg', flac: 'audio/flac', +} + +interface JamendoTrack { + id: string + name: string + artist_name: string + audio: string + audiodownload?: string + image: string + shareurl: string + shorturl?: string + license_ccurl: string +} +interface JamendoResponse { + headers: { status: string; code: number; error_message?: string; results_count: number } + results: JamendoTrack[] +} + +// Jamendo deed URLs look like http(s)://creativecommons.org/licenses/<variant>/<v>/. +// Only by/by-sa fit our enum (D5); capture the version (D7). Any nc/nd variant is +// non-commercial or no-derivatives → 'proprietary'. Missing/unrecognized → 'unknown'. +export function mapJamendoLicense(ccurl: string): { license: LicenseId; version?: string } { + if (!ccurl) return { license: 'unknown' } + const by = ccurl.match(/\/licenses\/by\/(\d\.\d)\//) + if (by) return { license: 'CC-BY', version: by[1] } + const bySa = ccurl.match(/\/licenses\/by-sa\/(\d\.\d)\//) + if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } + if (/\/licenses\/by-(nc|nd)/.test(ccurl)) return { license: 'proprietary' } + return { license: 'unknown' } +} + +function toAudioReference(t: JamendoTrack, mediaType: string): Reference { + const { license, version } = mapJamendoLicense(t.license_ccurl) + const canonicalUrl = t.shareurl + const rights: RightsRecord = { + license, + // CC version is metadata only (attribution/audit), kept for the BY/BY-SA family. + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? 
version : undefined, + author: t.artist_name || undefined, + // governed by the per-item CC license; the mp3 stream is served directly by Jamendo + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: t.license_ccurl, sourceUrl: canonicalUrl }, + } + return { + id: referenceId('jamendo', canonicalUrl), + modality: 'audio', + title: t.name || undefined, + source: { providerId: 'jamendo', sourceUrl: canonicalUrl }, + canonicalUrl, + rights, + verifiedAt: new Date().toISOString(), + // audio has no native thumbnail; the album art is the closest visual handle + ...(t.image ? { thumbnail: { url: t.image } } : {}), + preview: { url: t.audio, mediaType }, + relevance: 0, // per-source order; mergeReferences assigns the final RRF relevance + raw: t, + } +} + +function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { + if (typeof value !== 'string' || !value) return + if (allowed && !allowed.includes(value)) return + url.searchParams.set(key, value) +} + +function setIfStringList(url: URL, key: string, value: unknown) { + if (typeof value === 'string' && value) url.searchParams.set(key, value) + if (Array.isArray(value) && value.length > 0 && value.every(v => typeof v === 'string' && v)) url.searchParams.set(key, value.join(' ')) +} + +function setIfBooleanFlag(url: URL, key: string, value: unknown) { + if (typeof value !== 'boolean') return + url.searchParams.set(key, value ? 
'true' : 'false') +} + +function setIfPositiveInt(url: URL, key: string, value: unknown) { + if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) return + url.searchParams.set(key, String(value)) +} + +export function jamendo(config: JamendoConfig) { + return defineProvider({ + id: 'jamendo', + modalities: ['audio'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(BASE) + url.searchParams.set('client_id', config.clientId) + url.searchParams.set('format', 'json') + url.searchParams.set('search', q.text) + url.searchParams.set('limit', String(Math.min(q.limit ?? 20, 200))) + const opts = q.providerOptions as JamendoSearchOptions | undefined + setIfString(url, 'audioformat', opts?.audioformat, ['mp31', 'mp32', 'ogg', 'flac']) + setIfString(url, 'order', opts?.order, ['relevance', 'popularity_total', 'popularity_month', 'popularity_week', 'releasedate_asc', 'releasedate_desc', 'buzzrate']) + setIfBooleanFlag(url, 'ccsa', opts?.ccsa) + setIfBooleanFlag(url, 'ccnd', opts?.ccnd) + setIfBooleanFlag(url, 'ccnc', opts?.ccnc) + setIfStringList(url, 'tags', opts?.tags) + setIfString(url, 'artist_name', opts?.artist_name) + setIfPositiveInt(url, 'offset', opts?.offset) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`jamendo search failed: ${res.status}`) + const json = (await res.json()) as JamendoResponse + if (json.headers?.status !== 'success') throw new Error(`jamendo search error: ${json.headers?.error_message || json.headers?.status}`) + const mediaType = JAMENDO_AUDIO_MIME[opts?.audioformat ?? 'mp31'] ?? 'audio/mpeg' + return (json.results ?? 
[]).map((t) => toAudioReference(t, mediaType)) + }, + }) +} +``` + +- [ ] **2.4: Run the test — expect PASS.** + +```bash +pnpm --filter @refkit/provider-jamendo test +``` +Expected: PASS — both the `mapJamendoLicense` table and the CC-BY reference test green. + +- [ ] **2.5: Commit.** + +```bash +git add -A +git commit -m "feat(provider-jamendo): scaffold + CC-BY audio mapping (P1)" +``` + +--- + +## Task 3: TDD non-commercial track → proprietary → denied + +- [ ] **3.1: Add the regression test.** Append to `jamendo.test.ts` a CC-BY-NC-ND track and assert it gates closed for commercial use. + +```ts +const TRACK_NC = { + ...TRACK_BY, + id: '2000001', + name: 'For Listening Only', + license_ccurl: 'http://creativecommons.org/licenses/by-nc-nd/3.0/', + shareurl: 'https://www.jamendo.com/track/2000001', +} + +it('maps a CC-BY-NC-ND track to proprietary → denied for commercial use', async () => { + const { ctx } = ctxCapturing(envelope([TRACK_NC])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'listen', modalities: ['audio'] }, ctx) + expect(refs).toHaveLength(1) + expect(refs[0].rights.license).toBe('proprietary') + expect(refs[0].rights.licenseVersion).toBeUndefined() + expect(evaluateUse(refs[0].rights, 'commercial-product').decision).toBe('denied') +}) +``` + +- [ ] **3.2: Run — expect PASS** (already handled by the `by-(nc|nd)` branch in `mapJamendoLicense`; this test locks the behavior in). + +```bash +pnpm --filter @refkit/provider-jamendo test +``` +Expected: PASS. (Why it's correct: the `CC-BY`/`CC-BY-SA` branches use literal-slash anchors — `/\/licenses\/by\/(\d\.\d)\//` and `/\/licenses\/by-sa\/(\d\.\d)\//` — so a `by-nc*`/`by-nd*` URL can never match them no matter the branch order; the `by-(nc|nd)` test then catches those → `proprietary`. Branch order is irrelevant here. If this FAILs, the bug is in a regex anchor, not the ordering.)
+ +- [ ] **3.3: Commit.** + +```bash +git add -A +git commit -m "test(provider-jamendo): NC/ND → proprietary denied for commercial" +``` + +--- + +## Task 4: TDD missing/unrecognized ccurl → unknown → needs-review + +- [ ] **4.1: Add the regression test.** Append a track with an empty `license_ccurl` (the non-CC-URL case is already pinned by the `mapJamendoLicense` table test from Task 2). + +```ts +const TRACK_NO_LICENSE = { + ...TRACK_BY, + id: '3000002', + name: 'Mystery Track', + license_ccurl: '', + shareurl: 'https://www.jamendo.com/track/3000002', +} + +it('maps a track with no recognizable license to unknown → needs-review', async () => { + const { ctx } = ctxCapturing(envelope([TRACK_NO_LICENSE])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'mystery', modalities: ['audio'] }, ctx) + expect(refs).toHaveLength(1) + expect(refs[0].rights.license).toBe('unknown') + expect(evaluateUse(refs[0].rights, 'commercial-product').decision).toBe('needs-review') +}) +``` + +- [ ] **4.2: Run — expect PASS** (the empty/unrecognized branch already returns `unknown`; core turns `unknown` into `needs-review`). + +```bash +pnpm --filter @refkit/provider-jamendo test +``` +Expected: PASS. + +- [ ] **4.3: Commit.** + +```bash +git add -A +git commit -m "test(provider-jamendo): missing/unknown ccurl → needs-review" +``` + +--- + +## Task 5: TDD request forwarding (client_id, search, limit, options) + +- [ ] **5.1: Add the contract test.** Assert the outgoing request carries the BYOK `client_id`, the `search` text, the `limit`, `format=json`, and forwarded provider options.
+ +```ts +it('forwards client_id, search, limit, format and documented options', async () => { + const { ctx, url } = ctxCapturing(envelope([])) + await jamendo({ clientId: 'my-client-id' }).search({ + text: 'ambient', + modalities: ['audio'], + limit: 7, + providerOptions: { audioformat: 'mp32', order: 'popularity_total', ccnc: false, tags: ['ambient', 'chill'], artist_name: 'fankel', offset: 20 }, + }, ctx) + const u = new URL(url()) + expect(u.searchParams.get('client_id')).toBe('my-client-id') + expect(u.searchParams.get('format')).toBe('json') + expect(u.searchParams.get('search')).toBe('ambient') + expect(u.searchParams.get('limit')).toBe('7') + expect(u.searchParams.get('audioformat')).toBe('mp32') + expect(u.searchParams.get('order')).toBe('popularity_total') + expect(u.searchParams.get('ccnc')).toBe('false') + expect(u.searchParams.get('tags')).toBe('ambient chill') + expect(u.searchParams.get('artist_name')).toBe('fankel') + expect(u.searchParams.get('offset')).toBe('20') +}) + +it('returns [] when Jamendo finds nothing', async () => { + const { ctx } = ctxCapturing(envelope([])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'zzzz', modalities: ['audio'] }, ctx) + expect(refs).toEqual([]) +}) +``` + +- [ ] **5.2: Run — expect PASS** (forwarding is implemented in Task 2's `search`; this test pins the contract). + +```bash +pnpm --filter @refkit/provider-jamendo test +``` +Expected: PASS — all jamendo tests green. + +- [ ] **5.3: Typecheck the package.** + +```bash +pnpm --filter @refkit/provider-jamendo typecheck +``` +Expected: clean (no type errors). 
+ +- [ ] **5.4: Commit.** + +```bash +git add -A +git commit -m "test(provider-jamendo): request forwarding (client_id/search/limit/options)" +``` + +--- + +## Final Task: Central wiring + +- [ ] **F.1: Execute Shared Task S9** (central wiring) with these substitutions: `<id>=jamendo`, `<Fn>=jamendo`, `<Title>=Jamendo`, `<modality>=audio`, `<auth>=API key`, `<licenseCol>=per-item CC`, **`<ENVVAR>=JAMENDO_CLIENT_ID`**. This covers S9.1 (root `vitest.config.ts` project), S9.2 (README provider table row), S9.5 (`mcp` devDep), S9.6 (changeset), S9.7 (repo-green verify), S9.8 (commit). + +- [ ] **F.2: S9.3 — BYOK gating in `packages/mcp/src/cli.ts`.** Jamendo is BYOK: + - add `import { jamendo } from '@refkit/provider-jamendo'` + - after the existing BYOK block, add: + ```ts + if (env.JAMENDO_CLIENT_ID) providers.push(jamendo({ clientId: env.JAMENDO_CLIENT_ID })) + ``` + +- [ ] **F.3: S9.4 — CLI wiring test in `packages/mcp/src/__tests__/mcp.test.ts`.** In the `describe('defaultProviders'…)` block, add an assertion mirroring the unsplash gate (id absent without env, present with the key): + ```ts + it('adds jamendo only when JAMENDO_CLIENT_ID is present', () => { + expect(defaultProviders({}).map(p => p.id)).not.toContain('jamendo') + expect(defaultProviders({ JAMENDO_CLIENT_ID: 'k' }).map(p => p.id)).toContain('jamendo') + }) + ``` + +- [ ] **F.4: Verify the whole repo green** (S9.7). + +```bash +pnpm install && pnpm -r typecheck && pnpm test:run +``` +Expected: typecheck clean; all vitest projects (including `provider-jamendo` and the extended `mcp` wiring test) PASS. + +- [ ] **F.5: Commit** (S9.8). + +```bash +git add -A +git commit -m "feat(provider-jamendo): Jamendo satellite (P1)" +``` + +--- + +## Self-Review + +1. 
**Reference validity:** every emitted `Reference` has `id, modality:'audio', source{providerId,sourceUrl}, canonicalUrl, rights, verifiedAt, relevance`; `RightsRecord` has `license, rehostPolicy:'cache-allowed', raw{sourceTerms,sourceUrl}`; `licenseVersion` set only for `CC-BY`/`CC-BY-SA`. +2. **Decision coverage:** D5 (CC URL → family; NC/ND → proprietary; unrecognized → unknown) and D7 (version from CC URL) are both implemented in `mapJamendoLicense` and tested. +3. **Conservative rights:** missing/unrecognized `license_ccurl` → `unknown` → `needs-review`; never fabricated. +4. **BYOK gating:** `jamendo` requires `clientId`; CLI adds it only when `JAMENDO_CLIENT_ID` is set, asserted by the wiring test. diff --git a/docs/superpowers/plans/2026-06-29-provider-polyhaven.md b/docs/superpowers/plans/2026-06-29-provider-polyhaven.md new file mode 100644 index 0000000..7ffef39 --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-provider-polyhaven.md @@ -0,0 +1,460 @@ +# Poly Haven + ambientCG Provider Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. This plan **builds on** `2026-06-29-p1-providers-index.md` — read it first. Where this plan says "Execute Shared Task S0/S9", run that task from the index with the substitutions given here; do **not** re-paste the index boilerplate. + +**Goal:** Add `@refkit/provider-polyhaven` — a single keyless satellite package exposing **two factories**, `polyhaven()` (Poly Haven) and `ambientcg()` (ambientCG), that return license-normalized `image` `Reference`s. Both sources are wholly CC0; every emitted reference is hardcoded to `license: 'CC0-1.0'`. + +**Architecture:** +- **Two factories, one package** (mirrors `pexels` + `pexelsVideo` living together in `@refkit/provider-pexels`). 
`polyhaven()` and `ambientcg()` are independent `defineProvider(...)` instances exported from the same `src/index.ts`; they share the CC0 `toReference` shape but hit different APIs. +- **D1 — modality ceiling (image only, skip 3D):** core defines exactly `image | video | audio | text` (no `3d`/`texture`). Both sources host textures/HDRIs as **image files**, so we emit `modality: 'image'`. We surface only the image-format preview per asset and **skip 3D model formats** (`.blend`/`.gltf`/`.fbx`/`.mtlx`/`.usd`) for v1 — no core change, no `3d` modality. The README must document this skip. +- **D2 — whole-source CC0 hardcode:** neither API exposes a per-item license field. We hardcode `license: 'CC0-1.0'`, `rehostPolicy: 'cache-allowed'`, and `rights.raw.sourceTerms = <license page URL>`. No `licenseVersion`, no per-item license parsing. Mirror `provider-met`'s hardcoded-CC0 shape exactly. +- **N+1 detail fetch (Poly Haven only):** Poly Haven's `/assets` list does not contain download URLs. To get a real image preview URL we fan out to `/files/<id>` per asset (same N+1 pattern as `provider-met`'s `/objects/<id>`). ambientCG's `full_json` returns preview URLs inline (no second fetch). + +**Tech Stack:** TypeScript (ESM, `"type": "module"`), tsup (build), vitest (test), zod (via core), pnpm workspaces, changesets. + +--- + +## API facts (web-verified 2026-06-29) + +**Poly Haven (keyless, all CC0):** +- List: `https://api.polyhaven.com/assets?t=textures` and `?t=hdris` → a **map of `id → asset`**. Each asset: `{ type, name, categories: string[], tags: string[], authors: { [name]: role }, thumbnail_url, max_resolution, ... }`. `type` is `1` for textures, `0` for HDRIs. (`thumbnail_url` example: `https://cdn.polyhaven.com/asset_img/thumbs/aerial_asphalt_01.png?width=256&height=256`.) +- Files: `https://api.polyhaven.com/files/<id>`. For a **texture**, image maps nest as `<MapKey> → <res> → <fmt> → { url, ... }`, e.g. 
`Diffuse → "1k" → "jpg" → url` = `https://dl.polyhaven.org/file/ph-assets/Textures/jpg/1k/aerial_asphalt_01/aerial_asphalt_01_diff_1k.jpg`. Non-image keys (`blend`, `gltf`, `mtlx`) also appear — **skip them (D1)**. For an **HDRI**, top-level keys are `tonemapped` (a real `.jpg`), `hdri` (`.hdr`/`.exr` by res), `colorchart`, `backplates`. We use `tonemapped.url` as the image preview and skip `hdri` (HDR/EXR are not standard web images, D1). +- License page (sourceTerms): `https://polyhaven.com/license` — "Our assets are all licensed as CC0…". + +**ambientCG (keyless, all CC0):** +- JSON: `https://ambientcg.com/api/v2/full_json?type=Material&include=displayData,imageData&limit=<n>&offset=<o>` (optionally add `&q=<query>`). Response top-level key: **`foundAssets`** (array). Each asset: `{ assetId, displayName, dataType, category, tags, previewImage: { "256-PNG": url, "512-PNG": url, ... }, downloadFolders, ... }`. +- `previewImage` keys are `<size>-<fmt>` strings: `64-PNG, 128-PNG, 256-PNG, 512-PNG, 1024-PNG, 2048-PNG, …-WEBP, …-JPG-FFFFFF, …`. Path to a preview: `foundAssets[0].previewImage["256-PNG"]` = `https://acg-media.struffelproductions.com/file/ambientCG-Web/media/thumbnail/256-PNG/Tiles141.png`. We use the PNG preview (e.g. `512-PNG`, falling back to `256-PNG`) and ignore the zipped 3D/material `downloadFolders` (D1). +- We pass `type=Material` (image-based PBR materials). Non-`Material` dataTypes (e.g. `3DModel`, `Atlas`, `Substance`) must not be emitted (D1) — but since we only ever request `type=Material`, also assert defensively in the mapper. +- License page (sourceTerms): `https://ambientcg.com/license/` — "All ambientCG assets are provided under the Creative Commons CC0 1.0 Universal License." 
+ +--- + +## Task 1: Decisions & scaffold + +- [ ] **1.1: Confirm decisions.** This provider applies: + - **D1 (modality ceiling):** emit `modality: 'image'` for textures/HDRIs/materials; surface only the image-format preview (`Diffuse → jpg` / `tonemapped.jpg` / `previewImage["*-PNG"]`); **skip 3D model formats** (`blend`/`gltf`/`fbx`/`mtlx`/`usd`/`hdr`/`exr`) for v1. No `3d` modality, no core change. + - **D2 (whole-source CC0 hardcode):** hardcode `license: 'CC0-1.0'`, `rehostPolicy: 'cache-allowed'`, `rights.raw.sourceTerms = <license page URL>` per source. No per-item license, no `licenseVersion`. + +- [ ] **1.2: Execute Shared Task S0** (skeleton) with this substitution row: + + | token | value | + |---|---| + | `<id>` | `polyhaven` | + | `<Fn>` | `polyhaven` | + | `<Title>` | `Poly Haven` | + | `<modality>` | `image` | + | `<auth>` | `keyless` | + | `<licenseCol>` | `CC0` | + + Notes when running S0: + - In `package.json` keywords add `"ambientcg"` alongside `"polyhaven"` so the sibling source is discoverable. + - The README (S0.3) must additionally: (a) state the **3D-model skip (D1)** — only image previews of textures/HDRIs/materials are returned; (b) document the **`ambientcg()` sibling factory** in the same package with a second usage snippet (`import { polyhaven, ambientcg } from '@refkit/provider-polyhaven'`). + +--- + +## Task 2: TDD `polyhaven()` factory — failing test + +- [ ] **2.1: Write `packages/provider-polyhaven/src/__tests__/polyhaven.test.ts`** (FAIL first — `polyhaven` not implemented). It routes the list endpoint and the per-id `/files/<id>` endpoint to fixtures, like `provider-met`'s N+1 router. + +```ts +import { describe, expect, it } from 'vitest' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { polyhaven } from '../index' + +// Poly Haven: /assets returns id→asset (no URLs); /files/<id> returns the download tree. 
+const ctxRouting = (list: unknown, files: Record<string, unknown>): ProviderContext => ({ + fetch: (async (input: string) => { + const u = String(input) + if (u.includes('/assets')) return new Response(JSON.stringify(list), { status: 200 }) + const m = u.match(/\/files\/([^/?]+)/) + if (m && files[m[1]]) return new Response(JSON.stringify(files[m[1]]), { status: 200 }) + return new Response('null', { status: 404 }) + }) as typeof fetch, +}) + +const LIST = { + aerial_asphalt_01: { + type: 1, name: 'Aerial Asphalt 01', categories: ['asphalt', 'road'], tags: ['flat'], + authors: { 'Rob Tuytel': 'All' }, + thumbnail_url: 'https://cdn.polyhaven.com/asset_img/thumbs/aerial_asphalt_01.png?width=256&height=256', + }, +} +const FILES_TEX = { + aerial_asphalt_01: { + Diffuse: { + '1k': { jpg: { url: 'https://dl.polyhaven.org/file/ph-assets/Textures/jpg/1k/aerial_asphalt_01/aerial_asphalt_01_diff_1k.jpg' } }, + }, + // non-image keys that must be ignored: + blend: { '1k': { blend: { url: 'https://dl.polyhaven.org/x.blend' } } }, + gltf: { '1k': { gltf: { url: 'https://dl.polyhaven.org/x.gltf' } } }, + }, +} + +describe('polyhaven provider', () => { + it('maps a texture to a CC0 image reference with a resolved jpg preview', async () => { + const refs = await polyhaven().search( + { text: 'asphalt', modalities: ['image'], limit: 5 }, + ctxRouting(LIST, FILES_TEX), + ) + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('image') + expect(r.title).toBe('Aerial Asphalt 01') + expect(r.rights.license).toBe('CC0-1.0') + expect(r.rights.author).toBe('Rob Tuytel') + expect(r.rights.rehostPolicy).toBe('cache-allowed') + expect(r.rights.raw.sourceTerms).toBe('https://polyhaven.com/license') + expect(r.preview?.url).toContain('aerial_asphalt_01_diff_1k.jpg') + expect(r.preview?.mediaType).toBe('image/jpeg') + expect(r.thumbnail?.url).toContain('thumbs/aerial_asphalt_01.png') + expect(r.canonicalUrl).toBe('https://polyhaven.com/a/aerial_asphalt_01') + 
expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('returns [] when the list is empty', async () => { + const refs = await polyhaven().search({ text: 'zzz', modalities: ['image'] }, ctxRouting({}, {})) + expect(refs).toEqual([]) + }) +}) +``` + +- [ ] **2.2: Run (expect FAIL).** + `pnpm --filter @refkit/provider-polyhaven test` + Expected: FAIL — `polyhaven` is not exported / file has no implementation. + +## Task 3: Implement `polyhaven()` — make Task 2 pass + +- [ ] **3.1: Write `packages/provider-polyhaven/src/index.ts`** with the `polyhaven()` factory. (`ambientcg()` is added in Task 5.) + +```ts +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +const PH_BASE = 'https://api.polyhaven.com' +const PH_TERMS = 'https://polyhaven.com/license' + +export interface PolyHavenConfig { + /** texture vs HDRI listing. Default 'textures'. */ + assetType?: 'textures' | 'hdris' + /** Max assets resolved per search; each costs one /files/<id> call (N+1). Default 12. */ + maxAssets?: number +} + +interface PolyHavenAsset { + type: number + name: string + categories?: string[] + tags?: string[] + authors?: Record<string, string> + thumbnail_url?: string +} +type PolyHavenList = Record<string, PolyHavenAsset> +// /files tree: maps/resolutions/formats → { url }. Loosely typed; we walk known image paths only. +type PolyHavenFiles = Record<string, unknown> + +interface PhFileLeaf { url?: string } + +/** First image URL for a texture: Diffuse (then a couple of fallbacks) → smallest res → jpg/png. */ +function textureImageUrl(files: PolyHavenFiles): string | undefined { + for (const mapKey of ['Diffuse', 'diff', 'Color', 'albedo']) { + const byRes = files[mapKey] as Record<string, Record<string, PhFileLeaf>> | undefined + if (!byRes) continue + for (const res of ['1k', '2k', '4k']) { + const byFmt = byRes[res] + const url = byFmt?.jpg?.url ?? 
byFmt?.png?.url + if (url) return url + } + } + return undefined +} + +/** HDRI image preview: the tonemapped .jpg (skip .hdr/.exr — D1). */ +function hdriImageUrl(files: PolyHavenFiles): string | undefined { + const tm = files.tonemapped as PhFileLeaf | undefined + return tm?.url +} + +function firstAuthor(authors?: Record<string, string>): string | undefined { + if (!authors) return undefined + const names = Object.keys(authors) + return names.length ? names.join(', ') : undefined +} + +function toReference(id: string, asset: PolyHavenAsset, imageUrl: string): Reference { + const canonical = `https://polyhaven.com/a/${id}` + const rights: RightsRecord = { + license: 'CC0-1.0', + author: firstAuthor(asset.authors), + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: PH_TERMS, sourceUrl: canonical }, + } + return { + id: referenceId('polyhaven', canonical), + modality: 'image', + title: asset.name || undefined, + source: { providerId: 'polyhaven', sourceUrl: canonical }, + canonicalUrl: canonical, + rights, + verifiedAt: new Date().toISOString(), + ...(asset.thumbnail_url ? { thumbnail: { url: asset.thumbnail_url } } : {}), + // textureImageUrl may resolve a .png fallback — derive the MIME from the extension + // rather than hardcoding jpeg (mislabeling a PNG as JPEG). + preview: { url: imageUrl, mediaType: imageUrl.toLowerCase().includes('.png') ? 'image/png' : 'image/jpeg' }, + relevance: 0, + raw: asset, + } +} + +export function polyhaven(config: PolyHavenConfig = {}) { + const assetType = config.assetType ?? 
'textures' + return defineProvider({ + id: 'polyhaven', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const listUrl = new URL(`${PH_BASE}/assets`) + listUrl.searchParams.set('t', assetType) + const res = await ctx.fetch(listUrl.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`polyhaven list failed: ${res.status}`) + const list = (await res.json()) as PolyHavenList + let entries = Object.entries(list) + // Client-side keyword filter — the list endpoint has no query param. + const text = q.text?.trim().toLowerCase() + if (text) { + entries = entries.filter(([id, a]) => + id.includes(text) || + a.name?.toLowerCase().includes(text) || + a.categories?.some((c) => c.toLowerCase().includes(text)) || + a.tags?.some((t) => t.toLowerCase().includes(text))) + } + const n = Math.min(config.maxAssets ?? q.limit ?? 12, 30) + const picked = entries.slice(0, n) + const refs = await Promise.all(picked.map(async ([id, asset]) => { + try { + const fr = await ctx.fetch(`${PH_BASE}/files/${id}`, { signal: ctx.signal }) + if (!fr.ok) return null + const files = (await fr.json()) as PolyHavenFiles + const imageUrl = assetType === 'hdris' ? hdriImageUrl(files) : textureImageUrl(files) + if (!imageUrl) return null // no image-format file → skip (D1) + return toReference(id, asset, imageUrl) + } catch { + return null // one bad files fetch must not drop the whole batch + } + })) + return refs.filter((r): r is Reference => r !== null) + }, + }) +} +``` + +- [ ] **3.2: Run (expect PASS).** + `pnpm --filter @refkit/provider-polyhaven test` + Expected: PASS — both `polyhaven` tests green. 
+ +- [ ] **3.3: Typecheck & commit.** + `pnpm --filter @refkit/provider-polyhaven typecheck` + Then: `git add -A && git commit -m "feat(provider-polyhaven): polyhaven() CC0 image satellite"` + +## Task 4: TDD `ambientcg()` factory — failing test + +- [ ] **4.1: Write `packages/provider-polyhaven/src/__tests__/ambientcg.test.ts`** (new file; FAIL first — `ambientcg` not exported). ambientCG returns preview URLs inline (no N+1). + +```ts +import { describe, expect, it } from 'vitest' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { ambientcg } from '../index' + +const ctxJson = (body: unknown, capture?: (u: string) => void): ProviderContext => ({ + fetch: (async (input: string) => { + capture?.(String(input)) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, +}) + +const FOUND = { + foundAssets: [ + { + assetId: 'Tiles141', displayName: 'Tiles 141', dataType: 'Material', + category: 'Tiles', tags: ['tiles', 'floor'], + previewImage: { + '256-PNG': 'https://acg-media.struffelproductions.com/file/ambientCG-Web/media/thumbnail/256-PNG/Tiles141.png', + '512-PNG': 'https://acg-media.struffelproductions.com/file/ambientCG-Web/media/thumbnail/512-PNG/Tiles141.png', + }, + }, + ], +} +const FOUND_NO_IMAGE = { + foundAssets: [ + // a non-image asset (e.g.
plugin/3D-only) with no previewImage → must not be emitted (D1) + { assetId: 'SomeModel', displayName: 'Some Model', dataType: '3DModel', tags: [] }, + ], +} + +describe('ambientcg provider', () => { + it('maps a Material to a CC0 image reference using the PNG preview', async () => { + let url = '' + const refs = await ambientcg().search( + { text: 'tiles', modalities: ['image'], limit: 5 }, + ctxJson(FOUND, (u) => { url = u }), + ) + expect(url).toContain('type=Material') + expect(url).toContain('q=tiles') + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('image') + expect(r.title).toBe('Tiles 141') + expect(r.rights.license).toBe('CC0-1.0') + expect(r.rights.rehostPolicy).toBe('cache-allowed') + expect(r.rights.raw.sourceTerms).toBe('https://ambientcg.com/license/') + expect(r.preview?.url).toContain('512-PNG/Tiles141.png') + expect(r.canonicalUrl).toBe('https://ambientcg.com/view?id=Tiles141') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('drops assets without an image preview (non-image dataType, D1)', async () => { + const refs = await ambientcg().search({ text: 'x', modalities: ['image'] }, ctxJson(FOUND_NO_IMAGE)) + expect(refs).toEqual([]) + }) +}) +``` + +- [ ] **4.2: Run (expect FAIL).** + `pnpm --filter @refkit/provider-polyhaven test` + Expected: FAIL on the ambientcg suite (`ambientcg` not exported); polyhaven suite still PASS. + +## Task 5: Implement `ambientcg()` — make Task 4 pass + +- [ ] **5.1: Append the `ambientcg()` factory to `packages/provider-polyhaven/src/index.ts`.** + +```ts +const ACG_BASE = 'https://ambientcg.com/api/v2/full_json' +const ACG_TERMS = 'https://ambientcg.com/license/' + +export interface AmbientCgConfig { + /** Max materials per search. Default 12. 
*/ + limit?: number +} + +interface AmbientCgAsset { + assetId: string + displayName?: string + dataType?: string + previewImage?: Record<string, string> +} +interface AmbientCgResponse { foundAssets?: AmbientCgAsset[] } + +/** Pick the largest available PNG preview (image-format only — D1). */ +function acgPreviewUrl(preview?: Record<string, string>): string | undefined { + if (!preview) return undefined + for (const key of ['1024-PNG', '512-PNG', '256-PNG', '128-PNG']) { + if (preview[key]) return preview[key] + } + return undefined +} + +function acgToReference(a: AmbientCgAsset, imageUrl: string): Reference { + const canonical = `https://ambientcg.com/view?id=${a.assetId}` + const rights: RightsRecord = { + license: 'CC0-1.0', + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: ACG_TERMS, sourceUrl: canonical }, + } + return { + id: referenceId('ambientcg', canonical), + modality: 'image', + title: a.displayName || undefined, + source: { providerId: 'ambientcg', sourceUrl: canonical }, + canonicalUrl: canonical, + rights, + verifiedAt: new Date().toISOString(), + thumbnail: { url: imageUrl }, + preview: { url: imageUrl, mediaType: 'image/png' }, + relevance: 0, + raw: a, + } +} + +export function ambientcg(config: AmbientCgConfig = {}) { + return defineProvider({ + id: 'ambientcg', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(ACG_BASE) + url.searchParams.set('type', 'Material') // image-based PBR materials only (D1) + url.searchParams.set('include', 'displayData,imageData') + url.searchParams.set('limit', String(Math.min(config.limit ?? q.limit ?? 
12, 30))) + if (q.text?.trim()) url.searchParams.set('q', q.text.trim()) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`ambientcg search failed: ${res.status}`) + const { foundAssets } = (await res.json()) as AmbientCgResponse + if (!foundAssets || foundAssets.length === 0) return [] + return foundAssets + .map((a) => { + // Defensive D1 guard: only Material assets carry an image previewImage. + if (a.dataType && a.dataType !== 'Material') return null + const imageUrl = acgPreviewUrl(a.previewImage) + return imageUrl ? acgToReference(a, imageUrl) : null + }) + .filter((r): r is Reference => r !== null) + }, + }) +} +``` + +- [ ] **5.2: Run (expect PASS).** + `pnpm --filter @refkit/provider-polyhaven test` + Expected: PASS — all four tests (2 polyhaven + 2 ambientcg) green. + +- [ ] **5.3: Typecheck & commit.** + `pnpm --filter @refkit/provider-polyhaven typecheck` + Then: `git add -A && git commit -m "feat(provider-polyhaven): ambientcg() sibling CC0 image satellite"` + +--- + +## Final Task: Central wiring — Execute Shared Task S9 + +Both `polyhaven()` and `ambientcg()` are **keyless**, so both join the base providers array (no env gate). When running S9, apply these source-specific details: + +- [ ] **S9.1 (leaf vitest project):** append `'./packages/provider-polyhaven/vitest.config.ts',` to root `vitest.config.ts` `projects`. 
+ +- [ ] **S9.2 (README table):** add **one** row to the `@refkit/provider-…` table in root `README.md` (one row per package, matching the pexels precedent — the two factories share the package): + - `| `@refkit/provider-polyhaven` | Poly Haven + ambientCG | image | keyless | CC0 |` + +- [ ] **S9.3 (CLI wiring) in `packages/mcp/src/cli.ts`:** + - add `import { polyhaven, ambientcg } from '@refkit/provider-polyhaven'` + - add **both** factories to the keyless base array: + `openverse(), openverseAudio(), wikimediaCommons(), met(), artic(), gutendex(), poetrydb(), polyhaven(), ambientcg(),` + +- [ ] **S9.4 (CLI wiring test) in `packages/mcp/src/__tests__/mcp.test.ts`:** add both ids to the keyless assertion list in `'includes every keyless provider by default'`: + `for (const id of ['openverse', 'wikimedia-commons', 'met', 'artic', 'gutendex', 'poetrydb', 'polyhaven', 'ambientcg'])` + +- [ ] **S9.5 (mcp devDep):** add `"@refkit/provider-polyhaven": "workspace:*"` to `packages/mcp/package.json`. + +- [ ] **S9.6 (changeset):** create `.changeset/provider-polyhaven.md`: +```markdown +--- +"@refkit/provider-polyhaven": minor +"@refkit/mcp": minor +--- + +Add @refkit/provider-polyhaven: Poly Haven and ambientCG (sibling factory `ambientcg`) as CC0-normalized image references (textures/HDRIs/materials; 3D model formats skipped for v1). +``` + +- [ ] **S9.7 (verify repo green):** `pnpm install && pnpm -r typecheck && pnpm test:run` — typecheck clean; all vitest projects (including `provider-polyhaven`) pass. + +- [ ] **S9.8 (commit):** `git add -A && git commit -m "feat(provider-polyhaven): Poly Haven + ambientCG satellite (P1)"` + +--- + +## Self-Review + +1. **Decisions:** D1 (image-only, skip 3D) + D2 (whole-source CC0 hardcode) stated in Task 1 and enforced in both mappers. +2. **Two factories, one package:** `polyhaven()` + `ambientcg()` exported from one `src/index.ts`, both keyless, both wired to the base array (mirrors pexels+pexelsVideo). +3. 
**Reference validity:** both emit required `id, modality, source{providerId,sourceUrl}, canonicalUrl, rights, verifiedAt, relevance`; `rights` has `license:'CC0-1.0'`, `rehostPolicy:'cache-allowed'`, `raw{sourceTerms,sourceUrl}`; no `licenseVersion` (correct — not CC-BY/CC-BY-SA). +4. **CC0 round-trip tested:** each suite asserts `rights.license==='CC0-1.0'`, `rights.raw.sourceTerms` is the license page, and `evaluateUse(r.rights,'commercial-product').decision==='allowed'`. +5. **Non-image skip tested:** polyhaven ignores `blend`/`gltf` keys; ambientcg drops non-`Material`/no-preview assets. diff --git a/docs/superpowers/plans/2026-06-29-provider-rijksmuseum.md b/docs/superpowers/plans/2026-06-29-provider-rijksmuseum.md new file mode 100644 index 0000000..1ccac22 --- /dev/null +++ b/docs/superpowers/plans/2026-06-29-provider-rijksmuseum.md @@ -0,0 +1,572 @@ +# Rijksmuseum Provider Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. This plan is a leaf of `2026-06-29-p1-providers-index.md` — read that index first; it owns the shared skeleton (Task **S0**), central wiring (Task **S9**), and cross-cutting decisions **D1–D8**. Do **not** repeat their boilerplate here; reference them. + +**Goal:** Add `@refkit/provider-rijksmuseum` — a thin, **keyless** satellite that searches the Rijksmuseum collection via the modern **Linked-Art** data services and returns license-normalized `Reference`s. Works carrying a CC0 / Public-Domain-Mark rights URI are surfaced as `CC0-1.0` / `PD`; items without a parseable open-rights URI map to `unknown` (→ `needs-review`). + +**Architecture:** One factory `rijksmuseum(config)` returning `defineProvider({ id: 'rijksmuseum', modalities: ['image'], queryFeatures: ['keyword'], capabilities, search })`. 
The modern Search API returns only Linked-Open-Data **IDs**, so this is an **N+1 provider mirroring `provider-met`**: `search` fetches the search list, then `Promise.all`-fans-out a per-item Linked-Art record fetch for each id (each wrapped in try/catch so one bad fetch never drops the batch), capped via `Math.min(config.maxObjects ?? q.limit ?? 12, 30)`. Each record's per-item rights URI is mapped to a `LicenseId` (D7-style URL match). Permissions are never stored — they derive from `license` via core's `factsFor()`/`evaluateUse()`. + +**Tech Stack:** TypeScript (ESM), tsup, vitest, zod (via `@refkit/core`), pnpm workspaces, changesets. + +### Modern API shape (verified 2026-06-29 — see Open Questions for residual field-path uncertainty) + +Decision (made by coordinator): target the **modern keyless Linked-Art API at `data.rijksmuseum.nl`**, not the deprecated classic Collection API. + +- **Search endpoint:** `https://data.rijksmuseum.nl/search/collection` — **keyless** ("No API key is needed"). Params are partial keyword matches: `title`, `creator`, `type`, `material`, `technique`, `description`, `imageAvailable`, `objectNumber`. Paging: each page caps at 100; the next page is the URL in the `next.id` field (server appends a `pageToken`). There is no single global `q=` — this plan uses `title` as the primary keyword param and forwards the others as documented options (flagged in Open Questions). 
+- **Search response (Linked-Art / ActivityStreams):** + ```json + { + "@context": "https://linked.art/ns/v1/search.json", + "type": "OrderedCollectionPage", + "partOf": { "type": "OrderedCollection", "totalItems": 1234, "first": {"id":"…"}, "last": {"id":"…"} }, + "orderedItems": [ + { "id": "https://id.rijksmuseum.nl/200100988", "type": "HumanMadeObject" } + ], + "next": { "id": "https://data.rijksmuseum.nl/search/collection?title=…&pageToken=…", "type": "OrderedCollectionPage" } + } + ``` + `orderedItems[]` carries **IDs only** — no title/image/rights — hence the N+1 fan-out. +- **Per-item record:** the `id` URL (`https://id.rijksmuseum.nl/{n}`) **303-redirects** to `https://data.rijksmuseum.nl/{n}`; request the Linked-Art JSON-LD with the content-negotiation query arg **`?_profile=la`**. Verified field anchors on a live record (`200100988`, "Misty Sea", Jan Toorop): + - `type`: `"HumanMadeObject"` + - **title:** an `identified_by[]` entry of type `"Name"` with a `"content"` string. + - **creator:** `produced_by` → (`part[].`)`carried_out_by[]` → actor with a `_label` / name. + - **rights URI (the key signal):** a `subject_to[]` / `subject_of[].subject_to[]` `Right` whose `classified_as[].id` is a creativecommons URI — verified value `https://creativecommons.org/publicdomain/zero/1.0/`. + - **image URL:** `subject_of[].digitally_carried_by[].access_point[].id`. A record can hold several `DigitalObject`s and **not all `access_point`s are images** — on the live record one resolved to a viewer/collection *page*, not a raw image. The reliable signal is the `DigitalObject`'s own `format` (a MIME type, e.g. `image/jpeg`) and/or an IIIF `conforms_to`. So `findImage()` (Task 3) selects an image-typed/IIIF DigitalObject first, then falls back to a URL heuristic, and **drops the item if neither yields an image** — it never puts a web page in `preview.url`. 
+ + Because the Linked-Art graph is deeply nested and varies per record, the provider extracts these with **defensive recursive walks** (find first creativecommons/rightsstatements URI anywhere; find first image-like `access_point` URL), not brittle fixed index paths. See Open Questions. + +**Auth:** **keyless.** No API key, no env var. (The `RijksmuseumConfig.apiKey` from the prior draft is removed.) + +--- + +## Task 1: Decisions & scaffold + +- [ ] **1.1: Confirm which cross-cutting decisions apply.** Applicable: **D7 — per-item rights URI is present**, so map the record's CC/PD URI → `LicenseId` via the index's D7-style URL match: `creativecommons.org/publicdomain/zero` → `CC0-1.0`; `creativecommons.org/publicdomain/mark` (and `rightsstatements.org/.../NoCopyright`) → `PD`; `creativecommons.org/licenses/by/<v>` → `CC-BY` (capture version) and `…/by-sa/<v>` → `CC-BY-SA` (capture version); anything else / missing → `unknown`. **D2 still applies as the practical reality** — Rijksmuseum open-access works are effectively CC0/PDM, so in practice the mapper resolves to `CC0-1.0`/`PD`; **CC-BY / CC-BY-SA are not expected** from this source (so `licenseVersion` is implemented for correctness but will normally be absent). Items with no parseable open-rights URI are **kept but marked `unknown`** (→ `needs-review`), not silently dropped — matches the conservative strict-deny convention. **D8 also applies** — `access_point`s can be viewer/collection *pages*, so `findImage()` picks an image-typed/IIIF DigitalObject (then a URL heuristic) and drops items with no real image rather than putting a page in `preview.url`. (The other index decisions — D3 dirty-license, D4, D5, D6 — primarily concern other providers.) 
+ +- [ ] **1.2: Execute Shared Task S0 from the index** with this substitution row: + + | token | value | + |---|---| + | `<id>` | `rijksmuseum` | + | `<Fn>` | `rijksmuseum` | + | `<Title>` | `Rijksmuseum` | + | `<modality>` | `image` | + | `<auth>` | `keyless` | + | `<licenseCol>` | `CC0 / PD` | + + This creates `packages/provider-rijksmuseum/` (`package.json`, `tsconfig.json`, `tsup.config.ts`, `vitest.config.ts`, `LICENSE`, `README.md`) and runs `pnpm install`. Keywords array should include `"rijksmuseum"`, `"art"`, `"museum"`, `"public-domain"`, `"linked-art"`. The README Usage block needs **no config** (`rijksmuseum()`). Do not commit yet — bundle with the first real change in Task 3. + +--- + +## Task 2: Failing tests for `src/index.ts` (TDD red) + +REQUIRED SUB-SKILL: superpowers:test-driven-development — write the test first, watch it fail for the right reason, then implement. + +- [ ] **2.1: Write `packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts`.** + +This is an N+1 provider, so the mock `ctx.fetch` routes like `met.test.ts`: the search endpoint → an `orderedItems` list body; each per-item URL → that item's Linked-Art record. Use **realistic** verified Linked-Art fixtures (CC0 record, a PDM record, and a rights-less record that must become `unknown`). + +```ts +import { describe, expect, it } from 'vitest' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { rijksmuseum } from '../index' + +// Search returns IDs only → N+1 record fetch. Route /search/collection to the +// list body, and each /{id} (with ?_profile=la) to its record body. 
+const ctxRouting = ( + list: unknown, + records: Record<string, unknown>, + capture?: (searchUrl: string) => void, +): ProviderContext => ({ + fetch: (async (input: Parameters<typeof fetch>[0]) => { + const u = String(input) + if (u.includes('/search/collection')) { + capture?.(u) + return new Response(JSON.stringify(list), { status: 200 }) + } + const m = u.match(/\/(\d+)(?:\?|$)/) + if (m && records[m[1]]) return new Response(JSON.stringify(records[m[1]]), { status: 200 }) + return new Response('null', { status: 404 }) + }) as typeof fetch, +}) + +const LIST = { + '@context': 'https://linked.art/ns/v1/search.json', + type: 'OrderedCollectionPage', + partOf: { type: 'OrderedCollection', totalItems: 3 }, + orderedItems: [ + { id: 'https://id.rijksmuseum.nl/200100988', type: 'HumanMadeObject' }, + { id: 'https://id.rijksmuseum.nl/200100777', type: 'HumanMadeObject' }, + { id: 'https://id.rijksmuseum.nl/200100666', type: 'HumanMadeObject' }, + ], + next: { id: 'https://data.rijksmuseum.nl/search/collection?title=sea&pageToken=abc', type: 'OrderedCollectionPage' }, +} + +// CC0 record (verified shape: title=identified_by[].content of type Name; creator +// via produced_by.carried_out_by; rights URI under subject_to.classified_as.id; +// image under digitally_carried_by.access_point.id). 
+const REC_CC0 = { + id: 'https://id.rijksmuseum.nl/200100988', + type: 'HumanMadeObject', + identified_by: [ + { type: 'Name', classified_as: [{ id: 'http://vocab.getty.edu/aat/300404670', _label: 'preferred terms' }], content: 'Misty Sea' }, + ], + produced_by: { + type: 'Production', + carried_out_by: [{ id: 'https://id.rijksmuseum.nl/person/toorop', type: 'Person', _label: 'Jan Toorop' }], + }, + subject_to: [ + { type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/zero/1.0/', _label: 'CC0 1.0' }] }, + ], + subject_of: [ + { type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://lh3.googleusercontent.com/cc0-image=s0', type: 'DigitalObject' }] }] }, + ], +} + +// Public Domain Mark record. +const REC_PDM = { + id: 'https://id.rijksmuseum.nl/200100777', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Old Engraving' }], + produced_by: { type: 'Production', carried_out_by: [{ type: 'Person', _label: 'Anonymous' }] }, + subject_to: [{ type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/mark/1.0/', _label: 'PDM' }] }], + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://lh3.googleusercontent.com/pdm-image=s0' }] }] }], +} + +// Rights-less record: no creativecommons/rightsstatements URI anywhere → unknown. 
+const REC_NO_RIGHTS = { + id: 'https://id.rijksmuseum.nl/200100666', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Untitled (rights unclear)' }], + produced_by: { type: 'Production', carried_out_by: [{ type: 'Person', _label: 'Unknown Maker' }] }, + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://lh3.googleusercontent.com/mystery=s0' }] }] }], +} + +describe('rijksmuseum provider', () => { + it('maps a CC0 record to a CC0 reference that clears a commercial-product use', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'], limit: 10 }, + ctxRouting(LIST, { '200100988': REC_CC0, '200100777': REC_PDM, '200100666': REC_NO_RIGHTS }), + ) + const cc0 = refs.find(r => r.title === 'Misty Sea')! + expect(cc0.modality).toBe('image') + expect(cc0.rights.license).toBe('CC0-1.0') + expect(cc0.rights.author).toBe('Jan Toorop') + expect(cc0.canonicalUrl).toBe('https://id.rijksmuseum.nl/200100988') + expect(cc0.preview?.url).toContain('googleusercontent') + expect(cc0.rights.licenseVersion).toBeUndefined() // CC0/PD never set version + expect(evaluateUse(cc0.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('maps a Public Domain Mark record to PD', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'] }, + ctxRouting(LIST, { '200100988': REC_CC0, '200100777': REC_PDM, '200100666': REC_NO_RIGHTS }), + ) + const pd = refs.find(r => r.title === 'Old Engraving')! 
+ expect(pd.rights.license).toBe('PD') + expect(evaluateUse(pd.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('marks a record with no parseable open-rights URI as unknown → needs-review (not dropped)', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'] }, + ctxRouting(LIST, { '200100988': REC_CC0, '200100777': REC_PDM, '200100666': REC_NO_RIGHTS }), + ) + const mystery = refs.find(r => r.title === 'Untitled (rights unclear)')! + expect(mystery).toBeDefined() // kept, not silently dropped + expect(mystery.rights.license).toBe('unknown') + expect(evaluateUse(mystery.rights, 'commercial-product').decision).toBe('needs-review') + }) + + it('returns [] when the search finds nothing', async () => { + const refs = await rijksmuseum().search( + { text: 'zzz', modalities: ['image'] }, + ctxRouting({ '@context': 'x', type: 'OrderedCollectionPage', orderedItems: [] }, {}), + ) + expect(refs).toEqual([]) + }) + + it('survives a single failed per-item fetch without dropping the batch', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'] }, + // 200100777 record omitted → its fetch 404s; the other two must still map. + ctxRouting(LIST, { '200100988': REC_CC0, '200100666': REC_NO_RIGHTS }), + ) + expect(refs.map(r => r.title).sort()).toEqual(['Misty Sea', 'Untitled (rights unclear)']) + }) + + it('drops a record whose only access_point is a viewer/collection page (never a non-image preview)', async () => { + // No `format`/IIIF on the DigitalObject and the access_point is a web page, not an + // image → findImage() returns undefined → the item is dropped (not surfaced with a + // webpage in preview.url). 
+ const REC_PAGE_ONLY = { + id: 'https://id.rijksmuseum.nl/200100555', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Viewer Only' }], + subject_to: [{ type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/zero/1.0/' }] }], + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://www.rijksmuseum.nl/en/collection/SK-A-1' }] }] }], + } + const ONE = { + type: 'OrderedCollectionPage', + orderedItems: [{ id: 'https://id.rijksmuseum.nl/200100555', type: 'HumanMadeObject' }], + } + const refs = await rijksmuseum().search( + { text: 'x', modalities: ['image'] }, + ctxRouting(ONE, { '200100555': REC_PAGE_ONLY }), + ) + expect(refs).toEqual([]) + }) + + it('prefers an image-typed (format/IIIF) DigitalObject over a non-image access_point', async () => { + // The first access_point is a page; a second DigitalObject is typed image/jpeg → + // findImage() must pick the typed one and carry its mediaType. 
+ const REC_TYPED = { + id: 'https://id.rijksmuseum.nl/200100444', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Typed Image' }], + subject_to: [{ type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/zero/1.0/' }] }], + subject_of: [ + { type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://www.rijksmuseum.nl/en/collection/SK-A-2' }] }] }, + { type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', format: 'image/jpeg', access_point: [{ id: 'https://iiif.example.org/image/abc/full/full/0/default.jpg' }] }] }, + ], + } + const ONE = { type: 'OrderedCollectionPage', orderedItems: [{ id: 'https://id.rijksmuseum.nl/200100444', type: 'HumanMadeObject' }] } + const refs = await rijksmuseum().search({ text: 'x', modalities: ['image'] }, ctxRouting(ONE, { '200100444': REC_TYPED })) + expect(refs).toHaveLength(1) + expect(refs[0].preview?.url).toBe('https://iiif.example.org/image/abc/full/full/0/default.jpg') + expect(refs[0].preview?.mediaType).toBe('image/jpeg') + }) + + it('forwards the keyword and documented search options + caps the page size to the limit', async () => { + let searchUrl = '' + await rijksmuseum().search( + { + text: 'vermeer', + modalities: ['image'], + limit: 5, + providerOptions: { type: 'painting', material: 'canvas', technique: 'oil paint', creator: 'Johannes Vermeer', imageAvailable: true }, + }, + ctxRouting({ type: 'OrderedCollectionPage', orderedItems: [] }, {}, (u) => { searchUrl = u }), + ) + const url = new URL(searchUrl) + expect(url.origin + url.pathname).toBe('https://data.rijksmuseum.nl/search/collection') + expect(url.searchParams.get('title')).toBe('vermeer') // primary keyword param + expect(url.searchParams.get('type')).toBe('painting') + expect(url.searchParams.get('material')).toBe('canvas') + expect(url.searchParams.get('technique')).toBe('oil paint') + expect(url.searchParams.get('creator')).toBe('Johannes Vermeer') + 
expect(url.searchParams.get('imageAvailable')).toBe('true') + expect(url.searchParams.get('pageSize')).toBe('5') // limit → page size cap + // keyless: never a key param + expect(url.searchParams.get('key')).toBeNull() + }) +}) +``` + +- [ ] **2.2: Run the tests — expect FAIL (module/exports do not exist yet).** + +```bash +pnpm --filter @refkit/provider-rijksmuseum test +``` +Expected: **FAIL** — `Cannot find module '../index'` / `rijksmuseum is not a function`. Confirms the red state. + +--- + +## Task 3: Implement `src/index.ts` (TDD green) + commit + +- [ ] **3.1: Write `packages/provider-rijksmuseum/src/index.ts`.** + +Full code — N+1 fan-out mirrors `provider-met`; the per-item rights-URI → `LicenseId` mapping mirrors `provider-flickr`/D7. Keyless (`RijksmuseumConfig` has no `apiKey`): + +```ts +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +export interface RijksmuseumConfig { + /** Max records fetched per search. Search returns only IDs, so each result + * costs one extra Linked-Art fetch — this bounds that N+1 fan-out. Default 12. */ + maxObjects?: number +} + +export interface RijksmuseumSearchOptions { + /** Object type, e.g. 'painting'. */ + type?: string + /** Material, e.g. 'canvas'. */ + material?: string + /** Technique, e.g. 'oil paint'. */ + technique?: string + /** Maker/artist (maps to `creator`). */ + creator?: string + /** Free-text description match. */ + description?: string + /** Restrict to objects with an image. */ + imageAvailable?: boolean +} + +const SEARCH = 'https://data.rijksmuseum.nl/search/collection' +const RIJKS_TERMS = 'https://www.rijksmuseum.nl/en/data/policy' + +// D7-style: map a CC deed URL to our LicenseId (+ CC version). Rijksmuseum open-access is +// effectively CC0/PDM; BY/BY-SA are implemented for correctness but not expected. 
CC-only — +// Rijksmuseum does not use rightsstatements.org, so this is replaced by core `mapCcDeedUrl` +// (NOT core `mapRightsUrl`) in helper-refactor Task 4. Named `mapRijksRights` to avoid clashing +// with the core `mapRightsUrl` helper, which additionally handles rightsstatements.org. +function mapRijksRights(url: string | undefined): { license: LicenseId; version?: string } { + if (!url) return { license: 'unknown' } + if (/creativecommons\.org\/publicdomain\/zero/.test(url)) return { license: 'CC0-1.0' } + if (/creativecommons\.org\/publicdomain\/mark/.test(url)) return { license: 'PD' } + if (/rightsstatements\.org\/(?:vocab|page)\/NoCopyright/i.test(url)) return { license: 'PD' } + const sa = url.match(/creativecommons\.org\/licenses\/by-sa\/(\d\.\d)/) + if (sa) return { license: 'CC-BY-SA', version: sa[1] } + const by = url.match(/creativecommons\.org\/licenses\/by\/(\d\.\d)/) + if (by) return { license: 'CC-BY', version: by[1] } + return { license: 'unknown' } +} + +// The Linked-Art graph is deeply nested and varies per record, so we extract by +// shape, not by fixed index paths (see plan Open Questions). + +/** First string anywhere in the record matching a known rights-deed host. */ +function findRightsUrl(node: unknown, depth = 0): string | undefined { + if (depth > 8 || node == null) return undefined + if (typeof node === 'string') { + return /creativecommons\.org\/(publicdomain|licenses)|rightsstatements\.org/.test(node) ? node : undefined + } + if (Array.isArray(node)) { + for (const v of node) { const hit = findRightsUrl(v, depth + 1); if (hit) return hit } + return undefined + } + if (typeof node === 'object') { + for (const v of Object.values(node as Record<string, unknown>)) { + const hit = findRightsUrl(v, depth + 1); if (hit) return hit + } + } + return undefined +} + +// We must not put a NON-image URL (a viewer/collection web page) into preview.url. 
+// The API carries the answer: a DigitalObject's `format` (a MIME type) and IIIF +// `conforms_to` say which access_point is the image. So: read the type first, then +// fall back to a cheap URL heuristic, then give up (no network probe — `core` never +// fetches bytes, and that would add an extra request per item). See Open Questions #1. +const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i + +/** URL-string heuristic only (no network): does this look like an image resource? */ +function isLikelyImageUrl(url: string): boolean { + return IMAGE_EXT.test(url) + || /iiif/i.test(url) // IIIF image endpoint + || /\/full\/[^/]+\/\d+\/default/i.test(url) // IIIF Image API request URL + || /googleusercontent\.com/.test(url) // Rijksmuseum/Met image CDN +} + +interface LaDigitalObject { + type?: string + format?: string + conforms_to?: Array<{ id?: string }> + access_point?: Array<{ id?: string }> +} + +/** Collect every node that carries an `access_point` (the DigitalObjects) anywhere. */ +function collectDigitalObjects(node: unknown, out: LaDigitalObject[] = [], depth = 0): LaDigitalObject[] { + if (depth > 8 || node == null) return out + if (Array.isArray(node)) { for (const v of node) collectDigitalObjects(v, out, depth + 1); return out } + if (typeof node === 'object') { + const o = node as Record<string, unknown> + if (Array.isArray(o.access_point)) out.push(o as LaDigitalObject) + for (const v of Object.values(o)) collectDigitalObjects(v, out, depth + 1) + } + return out +} + +/** Best usable IMAGE url + its mediaType, or undefined. + * Tier 1: a DigitalObject explicitly typed `image/*` or IIIF → trust it. + * Tier 2: any access_point whose URL heuristically looks like an image. + * Otherwise undefined → the item is dropped (an image provider with no image is useless). */ +function findImage(rec: Record<string, unknown>): { url: string; mediaType: string } | undefined { + const objs = collectDigitalObjects(rec) + // Tier 1 — explicit type from the data. 
+ for (const o of objs) { + const fmt = typeof o.format === 'string' ? o.format : undefined + const isIiif = Array.isArray(o.conforms_to) && o.conforms_to.some(c => typeof c?.id === 'string' && /iiif/i.test(c.id)) + if ((fmt && fmt.startsWith('image/')) || isIiif) { + const url = o.access_point?.find(a => typeof a?.id === 'string')?.id + if (url) return { url, mediaType: fmt && fmt.startsWith('image/') ? fmt : 'image/jpeg' } + } + } + // Tier 2 — URL heuristic fallback. + for (const o of objs) { + const hit = o.access_point?.find(a => typeof a?.id === 'string' && isLikelyImageUrl(a.id))?.id + if (hit) return { url: hit, mediaType: 'image/jpeg' } + } + return undefined +} + +interface LaName { type?: string; content?: string } +function findTitle(rec: Record<string, unknown>): string | undefined { + const names = rec.identified_by + if (Array.isArray(names)) { + for (const n of names as LaName[]) { + if (n?.type === 'Name' && typeof n.content === 'string' && n.content) return n.content + } + } + return undefined +} + +function findCreator(rec: Record<string, unknown>): string | undefined { + const prod = rec.produced_by as Record<string, unknown> | undefined + if (!prod) return undefined + const direct = prod.carried_out_by + const parts = Array.isArray(prod.part) ? (prod.part as Record<string, unknown>[]) : [] + const actors = [ + ...(Array.isArray(direct) ? (direct as Record<string, unknown>[]) : []), + ...parts.flatMap(p => (Array.isArray(p.carried_out_by) ? (p.carried_out_by as Record<string, unknown>[]) : [])), + ] + for (const a of actors) { + const label = a._label ?? (a as { notation?: unknown }).notation + if (typeof label === 'string' && label) return label + } + return undefined +} + +function toReference(rec: Record<string, unknown>): Reference | null { + const id = typeof rec.id === 'string' ? rec.id : undefined + if (!id) return null + const img = findImage(rec) + if (!img) return null // no usable IMAGE url (e.g. 
only a viewer/collection page) → drop + const { license, version } = mapRijksRights(findRightsUrl(rec)) + const rights: RightsRecord = { + license, + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + author: findCreator(rec) || undefined, + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: RIJKS_TERMS, sourceUrl: id }, + } + return { + id: referenceId('rijksmuseum', id), + modality: 'image', + title: findTitle(rec), + source: { providerId: 'rijksmuseum', sourceUrl: id }, + canonicalUrl: id, + rights, + verifiedAt: new Date().toISOString(), + thumbnail: { url: img.url }, + preview: { url: img.url, mediaType: img.mediaType }, + relevance: 0, + raw: rec, + } +} + +function setIfString(url: URL, key: string, value: unknown) { + if (typeof value !== 'string' || !value) return + url.searchParams.set(key, value) +} +function setIfBoolean(url: URL, key: string, value: unknown) { + if (typeof value !== 'boolean') return + url.searchParams.set(key, String(value)) +} + +interface SearchPage { orderedItems?: Array<{ id?: string }> } + +export function rijksmuseum(config: RijksmuseumConfig = {}) { + return defineProvider({ + id: 'rijksmuseum', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const opts = q.providerOptions as RijksmuseumSearchOptions | undefined + const n = Math.min(config.maxObjects ?? q.limit ?? 12, 30) + const searchUrl = new URL(SEARCH) + // No global free-text param; `title` is a partial keyword match → use it as the keyword. 
+ if (q.text) searchUrl.searchParams.set('title', q.text) + setIfString(searchUrl, 'type', opts?.type) + setIfString(searchUrl, 'material', opts?.material) + setIfString(searchUrl, 'technique', opts?.technique) + setIfString(searchUrl, 'creator', opts?.creator) + setIfString(searchUrl, 'description', opts?.description) + setIfBoolean(searchUrl, 'imageAvailable', opts?.imageAvailable) + searchUrl.searchParams.set('pageSize', String(n)) // best-effort cap; server caps at 100 + + const res = await ctx.fetch(searchUrl.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`rijksmuseum search failed: ${res.status}`) + const page = (await res.json()) as SearchPage + const ids = (page.orderedItems ?? []) + .map(it => it.id) + .filter((u): u is string => typeof u === 'string') + .slice(0, n) + if (ids.length === 0) return [] + + const records = await Promise.all(ids.map(async (idUrl) => { + try { + // Content-negotiate the Linked-Art JSON-LD. id.rijksmuseum.nl 303s to + // data.rijksmuseum.nl; ?_profile=la selects the Linked-Art profile. + const recUrl = `${idUrl}${idUrl.includes('?') ? '&' : '?'}_profile=la` + const r = await ctx.fetch(recUrl, { signal: ctx.signal }) + if (!r.ok) return null + return (await r.json()) as Record<string, unknown> + } catch { + return null // one bad record fetch must not drop the whole batch + } + })) + return records + .map(rec => (rec ? toReference(rec) : null)) + .filter((r): r is Reference => r !== null) + }, + }) +} +``` + +- [ ] **3.2: Run the tests — expect PASS.** + +```bash +pnpm --filter @refkit/provider-rijksmuseum test +``` +Expected: **PASS** — all tests green (CC0 → `allowed`; PDM → `PD` → `allowed`; rights-less → `unknown` → `needs-review`, kept; empty result; one-bad-fetch survives; keyword/options/`pageSize` forwarding, no `key`). 
+ +> If the URL-routing test mock mis-routes because the per-item regex also matches digits in the search query, tighten the mock router's `match(/\/(\d+)(?:\?|$)/)` — the real provider is unaffected. + +- [ ] **3.3: Typecheck the package.** + +```bash +pnpm --filter @refkit/provider-rijksmuseum typecheck +``` +Expected: **clean**. + +- [ ] **3.4: Commit the package.** + +```bash +git add packages/provider-rijksmuseum +git commit -m "feat(provider-rijksmuseum): keyless CC0/PD art search satellite (P1)" +``` + +--- + +## Task 4: Central wiring + +- [ ] **4.1: Execute Shared Task S9 from the index** with these concrete values: + - `<id>` = `rijksmuseum`, `<Fn>` = `rijksmuseum`, `<Title>` = `Rijksmuseum`, `<modality>` = `image`, `<auth>` = `keyless`, `<licenseCol>` = `CC0 / PD`. + - **No ENVVAR** — Rijksmuseum is **keyless**. + - **S9.3 (CLI):** in `packages/mcp/src/cli.ts` add `import { rijksmuseum } from '@refkit/provider-rijksmuseum'`, then add `rijksmuseum()` to the **base keyless `providers` array** (no env gate). + - **S9.4 (mcp.test.ts):** add `'rijksmuseum'` to the id list asserted by `'includes every keyless provider by default'`. Do **not** add a BYOK gate. + - Completes S9.1 (root `vitest.config.ts` project), S9.2 (README provider table row `` | `@refkit/provider-rijksmuseum` | Rijksmuseum | image | keyless | CC0 / PD | ``), S9.5 (`mcp` devDep `"@refkit/provider-rijksmuseum": "workspace:*"`), S9.6 (changeset), S9.7 (full-repo `pnpm install && pnpm -r typecheck && pnpm test:run` green), S9.8 (commit `feat(provider-rijksmuseum): Rijksmuseum satellite (P1)`). + +--- + +## Self-Review + +- [ ] Task 1 states applicable decisions: **D7 applies** (per-item rights URI → URL match to `CC0-1.0`/`PD`/`CC-BY`/`CC-BY-SA`; version only for BY/BY-SA, not expected here); **D2 is the practical reality** (effectively CC0/PDM); **D8 applies** (`findImage()` drops items with no real image rather than putting a page in `preview.url`); rights-less items → `unknown` (kept, not dropped). 
+- [ ] N+1 shape mirrors `provider-met`: search → IDs → `Promise.all` fan-out, per-item try/catch, `Math.min(maxObjects ?? limit ?? 12, 30)` cap. +- [ ] Emits a valid `Reference` (`id, modality, source{providerId,sourceUrl}, canonicalUrl, rights, verifiedAt, relevance`) and `RightsRecord` (`license, rehostPolicy, raw{sourceTerms,sourceUrl}`); `licenseVersion` only ever set for CC-BY/CC-BY-SA. +- [ ] Tests cover: CC0 → `allowed`; PDM → `PD`; rights-less → `unknown`/`needs-review` (kept); empty; one-bad-fetch resilience; keyword + options + `pageSize` forwarding; keyless (no key param). +- [ ] Keyless wiring: `rijksmuseum()` in the CLI base array; `'rijksmuseum'` in the keyless-by-default id list (no env gate). + +## Open Questions (for the reviewer) + +1. **Exact Linked-Art field paths could not be byte-verified end-to-end.** The endpoint, keyless auth, `orderedItems` ID-list shape, `pageToken`/`next.id` paging, the `?_profile=la` content-negotiation arg, and the *presence + value* of the CC0 rights URI were verified against the live record `200100988`. But the deep nesting (`subject_of[].subject_to[].classified_as[].id` for rights vs. a top-level `subject_to[]`; `subject_of[].digitally_carried_by[].access_point[].id` for the image) **varies per record**, so the provider uses defensive recursive walks (`findRightsUrl`/`findImage`) rather than fixed paths. Reviewer should fetch 3–5 live records with `?_profile=la` and confirm the walks pick the right rights URI and a usable image URL. **Image-URL handling (resolved in this plan):** because the live `200100988` `access_point` resolved to a viewer/collection page (not a raw image), `findImage()` now (a) prefers a `DigitalObject` typed `image/*` or IIIF-conforming, (b) falls back to a URL-string heuristic (`isLikelyImageUrl`: image extension / `iiif` / IIIF request path / known image CDN), and (c) **drops the item when neither yields an image** rather than emitting a page URL as `preview`. 
Residual check for the reviewer: confirm against live records that real records expose `format`/IIIF (so Tier 1 fires) or image-extension URLs (so Tier 2 fires); if Rijksmuseum only ever serves a IIIF *manifest* (not an Image-API URL), add a manifest→image-API resolution step or widen `isLikelyImageUrl`. +2. **Keyword query param.** The modern Search API has no single global `q=`; params (`title`, `creator`, `type`, …) are individually partial-keyword matches. This plan routes `q.text` into `title`. Confirm `title` is the best general keyword target, or whether a broader field (e.g. `description`, or issuing the term across multiple params) better matches user intent. +3. **`pageSize` parameter name.** Docs confirm 100-per-page caps and `pageToken` paging but did not explicitly name a page-size param; the plan sends `pageSize` best-effort and additionally caps client-side via `slice(0, n)` (authoritative regardless). Reviewer to confirm/adjust the param name (or drop it and rely solely on the client-side slice). +4. **Deprecated classic API fully removed** — resolved per coordinator decision; this plan targets only the modern `data.rijksmuseum.nl` services. No residual classic-vs-modern question. 
From 34dff0e448d1441383d7bb3a13751b219c825964 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:02:45 +0800 Subject: [PATCH 02/34] feat(core): shared provider helpers (setIf*, first, mapCcDeedUrl, image heuristics) --- .../src/__tests__/provider-helpers.test.ts | 125 +++++++++++++++ packages/core/src/index.ts | 5 + packages/core/src/provider-helpers.ts | 143 ++++++++++++++++++ 3 files changed, 273 insertions(+) create mode 100644 packages/core/src/__tests__/provider-helpers.test.ts create mode 100644 packages/core/src/provider-helpers.ts diff --git a/packages/core/src/__tests__/provider-helpers.test.ts b/packages/core/src/__tests__/provider-helpers.test.ts new file mode 100644 index 0000000..dcc2242 --- /dev/null +++ b/packages/core/src/__tests__/provider-helpers.test.ts @@ -0,0 +1,125 @@ +import { describe, expect, it } from 'vitest' +import { + setIfString, setIfBoolean, setIfStringList, + setIfInt, setIfPositiveInt, setIfNonNegativeInt, setIfNumber, + first, mapCcDeedUrl, mapRightsUrl, isLikelyImageUrl, imageMediaType, +} from '../provider-helpers' + +const params = (fn: (u: URL) => void) => { const u = new URL('https://x.test/'); fn(u); return u.searchParams } + +describe('setIfString', () => { + it('sets a non-empty string; skips non-strings and empty', () => { + expect(params(u => setIfString(u, 'a', 'x')).get('a')).toBe('x') + expect(params(u => setIfString(u, 'a', '')).get('a')).toBeNull() + expect(params(u => setIfString(u, 'a', 5)).get('a')).toBeNull() + }) + it('honors an allowlist', () => { + expect(params(u => setIfString(u, 'a', 'no', ['yes'])).get('a')).toBeNull() + expect(params(u => setIfString(u, 'a', 'yes', ['yes'])).get('a')).toBe('yes') + }) +}) + +describe('setIfBoolean', () => { + it('encodes as true/false strings; skips non-booleans', () => { + expect(params(u => setIfBoolean(u, 'b', true)).get('b')).toBe('true') + expect(params(u => setIfBoolean(u, 'b', false)).get('b')).toBe('false') + 
expect(params(u => setIfBoolean(u, 'b', 'true')).get('b')).toBeNull() + }) +}) + +describe('setIfStringList', () => { + it('joins arrays (default comma), accepts a string, supports a custom separator + allowlist', () => { + expect(params(u => setIfStringList(u, 't', ['a', 'b'])).get('t')).toBe('a,b') + expect(params(u => setIfStringList(u, 't', 'solo')).get('t')).toBe('solo') + expect(params(u => setIfStringList(u, 't', ['a', 'b'], { separator: ' ' })).get('t')).toBe('a b') + expect(params(u => setIfStringList(u, 't', ['a', 'x'], { allowed: ['a', 'b'] })).get('t')).toBeNull() + expect(params(u => setIfStringList(u, 't', [])).get('t')).toBeNull() + }) +}) + +describe('int/number setters', () => { + it('setIfInt respects min/max and integer-ness', () => { + expect(params(u => setIfInt(u, 'n', 5)).get('n')).toBe('5') + expect(params(u => setIfInt(u, 'n', 5.5)).get('n')).toBeNull() + expect(params(u => setIfInt(u, 'n', 0, { min: 1 })).get('n')).toBeNull() + expect(params(u => setIfInt(u, 'n', 999, { max: 100 })).get('n')).toBeNull() + }) + it('setIfPositiveInt defaults to min 1; setIfNonNegativeInt to min 0', () => { + expect(params(u => setIfPositiveInt(u, 'p', 0)).get('p')).toBeNull() + expect(params(u => setIfPositiveInt(u, 'p', 1)).get('p')).toBe('1') + expect(params(u => setIfPositiveInt(u, 'p', 999, { max: 500 })).get('p')).toBeNull() + expect(params(u => setIfNonNegativeInt(u, 'q', 0)).get('q')).toBe('0') + expect(params(u => setIfNonNegativeInt(u, 'q', -1)).get('q')).toBeNull() + }) + it('clamp:true clamps to max instead of rejecting (preserves provider Math.min behavior)', () => { + expect(params(u => setIfInt(u, 'n', 999, { max: 100, clamp: true })).get('n')).toBe('100') + expect(params(u => setIfPositiveInt(u, 'p', 999, { max: 500, clamp: true })).get('p')).toBe('500') + expect(params(u => setIfPositiveInt(u, 'p', 0, { max: 500, clamp: true })).get('p')).toBeNull() // min floor still rejects + expect(params(u => setIfNonNegativeInt(u, 'q', 999, { max: 200, 
clamp: true })).get('q')).toBe('200') + }) + it('setIfNumber allows non-integers', () => { + expect(params(u => setIfNumber(u, 'f', 1.5, { min: 0, max: 10 })).get('f')).toBe('1.5') + expect(params(u => setIfNumber(u, 'f', 20, { max: 10 })).get('f')).toBeNull() + }) +}) + +describe('first', () => { + it('returns the first element or undefined', () => { + expect(first(['a', 'b'])).toBe('a') + expect(first([])).toBeUndefined() + expect(first(undefined)).toBeUndefined() + }) +}) + +describe('mapCcDeedUrl', () => { + it('maps PD/CC0, BY/BY-SA (+version), NC/ND → proprietary, else unknown', () => { + expect(mapCcDeedUrl('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapCcDeedUrl('https://creativecommons.org/publicdomain/mark/1.0/')).toEqual({ license: 'PD' }) + expect(mapCcDeedUrl('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapCcDeedUrl('http://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + expect(mapCcDeedUrl('http://creativecommons.org/licenses/by-nc-nd/3.0/')).toEqual({ license: 'proprietary' }) + expect(mapCcDeedUrl('http://creativecommons.org/licenses/by-nd/4.0/')).toEqual({ license: 'proprietary' }) + // mapCcDeedUrl is CC-only — a rightsstatements URL has no CC pattern → unknown here + // (the faithful rightsstatements mapping lives in mapRightsUrl, tested below). 
+ expect(mapCcDeedUrl('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'unknown' }) + expect(mapCcDeedUrl(undefined)).toEqual({ license: 'unknown' }) + expect(mapCcDeedUrl('https://example.org/x')).toEqual({ license: 'unknown' }) + }) +}) + +describe('mapRightsUrl (CC deeds + faithful rightsstatements.org)', () => { + it('delegates CC deeds to mapCcDeedUrl', () => { + expect(mapRightsUrl('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapRightsUrl('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + }) + it('maps rightsstatements faithfully: InC→proprietary, NoC-US→PD+US, NoC-NC→proprietary', () => { + expect(mapRightsUrl('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/InC-OW-EU/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-US/1.0/')).toEqual({ license: 'PD', jurisdiction: 'US' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-NC/1.0/')).toEqual({ license: 'proprietary' }) + }) + it('maps opaque/undetermined rightsstatements → unknown', () => { + expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-OKLR/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/NoC-CR/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/CNE/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/UND/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl('http://rightsstatements.org/vocab/NKC/1.0/')).toEqual({ license: 'unknown' }) + expect(mapRightsUrl(undefined)).toEqual({ license: 'unknown' }) + }) +}) + +describe('image helpers', () => { + it('isLikelyImageUrl: extension / iiif / thumbnail / image CDN', () => { + expect(isLikelyImageUrl('https://x/y.jpg')).toBe(true) + 
expect(isLikelyImageUrl('https://iiif.x/a/full/full/0/default.jpg')).toBe(true) + expect(isLikelyImageUrl('https://api.europeana.eu/thumbnail/v3/200/a.jpg')).toBe(true) + expect(isLikelyImageUrl('https://lh3.googleusercontent.com/abc=s0')).toBe(true) + expect(isLikelyImageUrl('https://www.rijksmuseum.nl/en/collection/SK-A-1')).toBe(false) + }) + it('imageMediaType: MIME wins, else extension, else default', () => { + expect(imageMediaType('image/png', 'https://x/y')).toBe('image/png') + expect(imageMediaType(undefined, 'https://x/y.png')).toBe('image/png') + expect(imageMediaType(undefined, 'https://x/y.jpg')).toBe('image/jpeg') + expect(imageMediaType('application/octet-stream', 'https://x/y')).toBe('image/jpeg') + }) +}) diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index eb091c3..6db58a4 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -43,6 +43,11 @@ export type { ProviderOptionsById, KeyValueCache, } from './provider' +export { + setIfString, setIfBoolean, setIfStringList, + setIfInt, setIfPositiveInt, setIfNonNegativeInt, setIfNumber, + first, mapCcDeedUrl, mapRightsUrl, isLikelyImageUrl, imageMediaType, IMAGE_EXT, +} from './provider-helpers' export { normalizeQuery } from './query' export { createRefkit } from './client' export type { diff --git a/packages/core/src/provider-helpers.ts b/packages/core/src/provider-helpers.ts new file mode 100644 index 0000000..87f9b5a --- /dev/null +++ b/packages/core/src/provider-helpers.ts @@ -0,0 +1,143 @@ +import type { LicenseId } from './license' + +// — URL query-param setters (shared by every provider's search()) — + +/** Set `key=value` when value is a non-empty string (optionally within an allowlist). 
*/ +export function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]): void { + if (typeof value !== 'string' || !value) return + if (allowed && !allowed.includes(value)) return + url.searchParams.set(key, value) +} + +/** Set `key=true|false` when value is a boolean. */ +export function setIfBoolean(url: URL, key: string, value: unknown): void { + if (typeof value !== 'boolean') return + url.searchParams.set(key, String(value)) +} + +/** Set `key` to a joined list from a string or string[] (default separator ","). + * Optional allowlist rejects the whole value if any element is not allowed. */ +export function setIfStringList( + url: URL, key: string, value: unknown, + opts?: { separator?: string; allowed?: readonly string[] }, +): void { + const sep = opts?.separator ?? ',' + const allowed = opts?.allowed + const ok = (v: string) => !allowed || allowed.includes(v) + if (typeof value === 'string' && value && ok(value)) { url.searchParams.set(key, value); return } + if (Array.isArray(value) && value.length > 0 && value.every(v => typeof v === 'string' && v && ok(v))) { + url.searchParams.set(key, value.join(sep)) + } +} + +/** Set `key` when value is an integer. `min` is a reject floor (value < min → skip). + * For `max`: the default REJECTS when value > max; with `clamp: true` it instead sets + * `max` — preserving the `Math.min(value, max)` clamp several providers rely on. */ +export function setIfInt(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number; clamp?: boolean }): void { + if (typeof value !== 'number' || !Number.isInteger(value)) return + if (opts?.min !== undefined && value < opts.min) return + if (opts?.max !== undefined && value > opts.max) { + if (opts.clamp) { url.searchParams.set(key, String(opts.max)); return } + return + } + url.searchParams.set(key, String(value)) +} + +/** Integer ≥ (opts.min ?? 1). Pass `clamp: true` to clamp to `max` instead of rejecting. 
*/ +export function setIfPositiveInt(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number; clamp?: boolean }): void { + setIfInt(url, key, value, { min: opts?.min ?? 1, max: opts?.max, clamp: opts?.clamp }) +} + +/** Integer ≥ (opts.min ?? 0). Pass `clamp: true` to clamp to `max` instead of rejecting. */ +export function setIfNonNegativeInt(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number; clamp?: boolean }): void { + setIfInt(url, key, value, { min: opts?.min ?? 0, max: opts?.max, clamp: opts?.clamp }) +} + +/** Set `key` when value is a finite number (non-integers allowed) within [min, max]. */ +export function setIfNumber(url: URL, key: string, value: unknown, opts?: { min?: number; max?: number }): void { + if (typeof value !== 'number' || !Number.isFinite(value)) return + if (opts?.min !== undefined && value < opts.min) return + if (opts?.max !== undefined && value > opts.max) return + url.searchParams.set(key, String(value)) +} + +// — array helper — + +/** First element of an array-typed field, or undefined. */ +export function first<T>(arr: T[] | undefined | null): T | undefined { + return Array.isArray(arr) && arr.length > 0 ? arr[0] : undefined +} + +// — license: CC deed URL → LicenseId (the moat; shared by URL-based sources) — + +/** Map a Creative Commons deed URL to a core LicenseId (+ CC version for the BY/BY-SA + * families). Conservative: NC/ND variants → proprietary; PD mark / CC0 → PD / CC0-1.0; + * absent/unrecognized → unknown. **CC deeds only** — rightsstatements.org is handled by + * `mapRightsUrl`. Match is on the path so http/https both work. 
*/ +export function mapCcDeedUrl(url: string | undefined | null): { license: LicenseId; version?: string } { + if (!url) return { license: 'unknown' } + const u = url.toLowerCase() + if (u.includes('creativecommons.org/publicdomain/zero')) return { license: 'CC0-1.0' } + if (u.includes('creativecommons.org/publicdomain/mark')) return { license: 'PD' } + // NC / ND are NOT open grants — check before plain by/by-sa ("by-nc-sa" contains "by-sa"). + if (/creativecommons\.org\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } + const sa = u.match(/creativecommons\.org\/licenses\/by-sa\/(\d(?:\.\d)?)/) + if (sa) return { license: 'CC-BY-SA', version: sa[1] } + const by = u.match(/creativecommons\.org\/licenses\/by\/(\d(?:\.\d)?)/) + if (by) return { license: 'CC-BY', version: by[1] } + if (/creativecommons\.org\/licenses\/by-sa\b/.test(u)) return { license: 'CC-BY-SA' } + if (/creativecommons\.org\/licenses\/by\b/.test(u)) return { license: 'CC-BY' } + return { license: 'unknown' } +} + +// rightsstatements.org is a controlled vocabulary of rights STATUS statements (not license +// grants). Map each token FAITHFULLY to the closest true refkit representation rather than +// collapsing all to unknown — discarding a signal the source did give us is not "honest": +// • In-Copyright (InC*) → proprietary — we KNOW it's copyrighted with no grant (commercial +// denied), which is more faithful than "needs-review". +// • NoC-US → PD scoped to the US via the jurisdiction field (RightsRecord.jurisdiction +// exists for exactly this; a jurisdiction-aware caller is gated, default stays lenient). +// • NoC-NC → proprietary — no copyright BUT non-commercial only, so commercial is definitely +// out (closest honest gate; loses the "non-commercial derivatives OK" nuance, which no +// LicenseId can express — acceptable approximation). +// • Opaque/undetermined (NoC-OKLR, NoC-CR, CNE, UND, NKC) → unknown (genuinely needs review). 
// Lookup table keyed by the LOWER-CASED rightsstatements.org token (the path segment after
// /vocab/ or /page/). Always read it through an own-property check: it is a plain object, so
// a bare index would also resolve inherited names such as "constructor".
const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = {
  'inc': { license: 'proprietary' },
  'inc-ow-eu': { license: 'proprietary' },
  'inc-edu': { license: 'proprietary' },
  'inc-nc': { license: 'proprietary' },
  'inc-ruu': { license: 'proprietary' },
  'noc-us': { license: 'PD', jurisdiction: 'US' },
  'noc-nc': { license: 'proprietary' },
  'noc-oklr': { license: 'unknown' },
  'noc-cr': { license: 'unknown' },
  'cne': { license: 'unknown' },
  'und': { license: 'unknown' },
  'nkc': { license: 'unknown' },
}

/**
 * Map any rights URI — a CC deed OR a rightsstatements.org statement — to a faithful
 * LicenseId (+ CC version / source jurisdiction). For sources whose rights field can be
 * either (europeana `edm:rights`, internet-archive `licenseurl`). CC-only sources should
 * call `mapCcDeedUrl` directly.
 *
 * @param url Rights URI as delivered by the source; may be absent.
 * @returns A fresh object on every call (never a reference into the lookup table, so a
 *   caller mutating the result cannot poison later lookups). Absent input and unknown
 *   rightsstatements tokens → `{ license: 'unknown' }`; any non-rightsstatements URI is
 *   delegated to `mapCcDeedUrl`.
 */
export function mapRightsUrl(url: string | undefined | null): { license: LicenseId; version?: string; jurisdiction?: string } {
  if (!url) return { license: 'unknown' }
  const token = url.toLowerCase().match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/)?.[1]
  if (token === undefined) return mapCcDeedUrl(url)
  // Own-property check: the token comes from external data and "[a-z-]+" matches names
  // like "constructor" that live on Object.prototype.
  const entry = Object.prototype.hasOwnProperty.call(RIGHTS_STATEMENT, token)
    ? RIGHTS_STATEMENT[token]
    : undefined
  return entry ? { ...entry } : { license: 'unknown' }
}

// — image-URL heuristics (decision D8): preview.url must be an image, never a web page —

/** Image file extension at the end of the URL or immediately before its query string. */
export const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i

/**
 * URL-string heuristic only (no network): does this look like an image resource?
 * True when any one signal matches: image extension, "iiif" anywhere in the URL, a
 * IIIF Image API request path, a `/thumbnail/` segment, or the googleusercontent CDN.
 */
export function isLikelyImageUrl(url: string): boolean {
  return IMAGE_EXT.test(url)
    || /iiif/i.test(url)
    || /\/full\/[^/]+\/\d+\/default/i.test(url) // IIIF Image API request path
    || /\/thumbnail\//i.test(url)
    || /googleusercontent\.com/.test(url) // Rijksmuseum/Met image CDN
}

/** Best image mediaType: declared MIME if image/*, else inferred from extension, else default.
*/ +export function imageMediaType(mime: string | undefined, url: string): string { + if (mime && mime.startsWith('image/')) return mime + const m = url.match(IMAGE_EXT) + if (m) { const e = m[1].toLowerCase(); return e === 'jpg' ? 'image/jpeg' : `image/${e === 'tif' ? 'tiff' : e}` } + return 'image/jpeg' +} From c12ca1eb6a32a4aef2ecd3efe2a8c31a716dac9c Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:15:34 +0800 Subject: [PATCH 03/34] feat(provider-rijksmuseum): keyless CC0/PD art search satellite (P1) --- packages/provider-rijksmuseum/LICENSE | 201 +++++++++++++++ packages/provider-rijksmuseum/README.md | 20 ++ packages/provider-rijksmuseum/package.json | 48 ++++ .../src/__tests__/rijksmuseum.test.ts | 195 ++++++++++++++ packages/provider-rijksmuseum/src/index.ts | 238 ++++++++++++++++++ packages/provider-rijksmuseum/tsconfig.json | 5 + packages/provider-rijksmuseum/tsup.config.ts | 10 + .../provider-rijksmuseum/vitest.config.ts | 2 + pnpm-lock.yaml | 6 + 9 files changed, 725 insertions(+) create mode 100644 packages/provider-rijksmuseum/LICENSE create mode 100644 packages/provider-rijksmuseum/README.md create mode 100644 packages/provider-rijksmuseum/package.json create mode 100644 packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts create mode 100644 packages/provider-rijksmuseum/src/index.ts create mode 100644 packages/provider-rijksmuseum/tsconfig.json create mode 100644 packages/provider-rijksmuseum/tsup.config.ts create mode 100644 packages/provider-rijksmuseum/vitest.config.ts diff --git a/packages/provider-rijksmuseum/LICENSE b/packages/provider-rijksmuseum/LICENSE new file mode 100644 index 0000000..c1c4eb0 --- /dev/null +++ b/packages/provider-rijksmuseum/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, excluding + those notices that do not pertain to any part of the Derivative + Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2026 refkit authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/packages/provider-rijksmuseum/README.md b/packages/provider-rijksmuseum/README.md new file mode 100644 index 0000000..5cc52ba --- /dev/null +++ b/packages/provider-rijksmuseum/README.md @@ -0,0 +1,20 @@ +# @refkit/provider-rijksmuseum + +Search **Rijksmuseum** as license-tagged image references — a provider satellite for **refkit** (use with [`@refkit/core`](https://www.npmjs.com/package/@refkit/core)). 
+ +- **Source:** Rijksmuseum +- **Auth:** keyless +- **Modality:** image +- **License:** CC0 / PD + +## Usage + +```ts +import { createRefkit } from '@refkit/core' +import { rijksmuseum } from '@refkit/provider-rijksmuseum' + +const refkit = createRefkit({ providers: [rijksmuseum()] }) +const refs = await refkit.search({ query: 'cat', modalities: ['image'] }) +``` + +Gate by intended use with `refkit.evaluateUse(ref, 'commercial-product')`. See [`@refkit/core`](https://www.npmjs.com/package/@refkit/core) for the full API. diff --git a/packages/provider-rijksmuseum/package.json b/packages/provider-rijksmuseum/package.json new file mode 100644 index 0000000..a5b7b3d --- /dev/null +++ b/packages/provider-rijksmuseum/package.json @@ -0,0 +1,48 @@ +{ + "name": "@refkit/provider-rijksmuseum", + "version": "0.1.0", + "description": "Rijksmuseum provider satellite for refkit.", + "type": "module", + "license": "Apache-2.0", + "keywords": [ + "refkit", + "reference-retrieval", + "license", + "attribution", + "refkit-provider", + "rijksmuseum", + "art", + "museum", + "public-domain", + "linked-art" + ], + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "scripts": { + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest watch", + "build": "tsup", + "prepublishOnly": "tsup" + }, + "dependencies": { + "@refkit/core": "workspace:*" + }, + "files": [ + "dist", + "LICENSE" + ], + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + } + } +} diff --git a/packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts b/packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts new file mode 100644 index 0000000..fdb7b63 --- /dev/null +++ b/packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts @@ -0,0 +1,195 @@ +import { describe, expect, it } from 'vitest' +import { 
evaluateUse, type ProviderContext } from '@refkit/core' +import { rijksmuseum } from '../index' + +// Search returns IDs only → N+1 record fetch. Route /search/collection to the +// list body, and each /{id} (with ?_profile=la) to its record body. +const ctxRouting = ( + list: unknown, + records: Record<string, unknown>, + capture?: (searchUrl: string) => void, +): ProviderContext => ({ + fetch: (async (input: Parameters<typeof fetch>[0]) => { + const u = String(input) + if (u.includes('/search/collection')) { + capture?.(u) + return new Response(JSON.stringify(list), { status: 200 }) + } + const m = u.match(/\/(\d+)(?:\?|$)/) + if (m && records[m[1]]) return new Response(JSON.stringify(records[m[1]]), { status: 200 }) + return new Response('null', { status: 404 }) + }) as typeof fetch, +}) + +const LIST = { + '@context': 'https://linked.art/ns/v1/search.json', + type: 'OrderedCollectionPage', + partOf: { type: 'OrderedCollection', totalItems: 3 }, + orderedItems: [ + { id: 'https://id.rijksmuseum.nl/200100988', type: 'HumanMadeObject' }, + { id: 'https://id.rijksmuseum.nl/200100777', type: 'HumanMadeObject' }, + { id: 'https://id.rijksmuseum.nl/200100666', type: 'HumanMadeObject' }, + ], + next: { id: 'https://data.rijksmuseum.nl/search/collection?title=sea&pageToken=abc', type: 'OrderedCollectionPage' }, +} + +// CC0 record (verified shape: title=identified_by[].content of type Name; creator +// via produced_by.carried_out_by; rights URI under subject_to.classified_as.id; +// image under digitally_carried_by.access_point.id). 
+const REC_CC0 = { + id: 'https://id.rijksmuseum.nl/200100988', + type: 'HumanMadeObject', + identified_by: [ + { type: 'Name', classified_as: [{ id: 'http://vocab.getty.edu/aat/300404670', _label: 'preferred terms' }], content: 'Misty Sea' }, + ], + produced_by: { + type: 'Production', + carried_out_by: [{ id: 'https://id.rijksmuseum.nl/person/toorop', type: 'Person', _label: 'Jan Toorop' }], + }, + subject_to: [ + { type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/zero/1.0/', _label: 'CC0 1.0' }] }, + ], + subject_of: [ + { type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://lh3.googleusercontent.com/cc0-image=s0', type: 'DigitalObject' }] }] }, + ], +} + +// Public Domain Mark record. +const REC_PDM = { + id: 'https://id.rijksmuseum.nl/200100777', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Old Engraving' }], + produced_by: { type: 'Production', carried_out_by: [{ type: 'Person', _label: 'Anonymous' }] }, + subject_to: [{ type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/mark/1.0/', _label: 'PDM' }] }], + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://lh3.googleusercontent.com/pdm-image=s0' }] }] }], +} + +// Rights-less record: no creativecommons/rightsstatements URI anywhere → unknown. 
+const REC_NO_RIGHTS = { + id: 'https://id.rijksmuseum.nl/200100666', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Untitled (rights unclear)' }], + produced_by: { type: 'Production', carried_out_by: [{ type: 'Person', _label: 'Unknown Maker' }] }, + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://lh3.googleusercontent.com/mystery=s0' }] }] }], +} + +describe('rijksmuseum provider', () => { + it('maps a CC0 record to a CC0 reference that clears a commercial-product use', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'], limit: 10 }, + ctxRouting(LIST, { '200100988': REC_CC0, '200100777': REC_PDM, '200100666': REC_NO_RIGHTS }), + ) + const cc0 = refs.find(r => r.title === 'Misty Sea')! + expect(cc0.modality).toBe('image') + expect(cc0.rights.license).toBe('CC0-1.0') + expect(cc0.rights.author).toBe('Jan Toorop') + expect(cc0.canonicalUrl).toBe('https://id.rijksmuseum.nl/200100988') + expect(cc0.preview?.url).toContain('googleusercontent') + expect(cc0.rights.licenseVersion).toBeUndefined() // CC0/PD never set version + expect(evaluateUse(cc0.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('maps a Public Domain Mark record to PD', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'] }, + ctxRouting(LIST, { '200100988': REC_CC0, '200100777': REC_PDM, '200100666': REC_NO_RIGHTS }), + ) + const pd = refs.find(r => r.title === 'Old Engraving')! 
+ expect(pd.rights.license).toBe('PD') + expect(evaluateUse(pd.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('marks a record with no parseable open-rights URI as unknown → needs-review (not dropped)', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'] }, + ctxRouting(LIST, { '200100988': REC_CC0, '200100777': REC_PDM, '200100666': REC_NO_RIGHTS }), + ) + const mystery = refs.find(r => r.title === 'Untitled (rights unclear)')! + expect(mystery).toBeDefined() // kept, not silently dropped + expect(mystery.rights.license).toBe('unknown') + expect(evaluateUse(mystery.rights, 'commercial-product').decision).toBe('needs-review') + }) + + it('returns [] when the search finds nothing', async () => { + const refs = await rijksmuseum().search( + { text: 'zzz', modalities: ['image'] }, + ctxRouting({ '@context': 'x', type: 'OrderedCollectionPage', orderedItems: [] }, {}), + ) + expect(refs).toEqual([]) + }) + + it('survives a single failed per-item fetch without dropping the batch', async () => { + const refs = await rijksmuseum().search( + { text: 'sea', modalities: ['image'] }, + // 200100777 record omitted → its fetch 404s; the other two must still map. + ctxRouting(LIST, { '200100988': REC_CC0, '200100666': REC_NO_RIGHTS }), + ) + expect(refs.map(r => r.title).sort()).toEqual(['Misty Sea', 'Untitled (rights unclear)']) + }) + + it('drops a record whose only access_point is a viewer/collection page (never a non-image preview)', async () => { + // No `format`/IIIF on the DigitalObject and the access_point is a web page, not an + // image → findImage() returns undefined → the item is dropped (not surfaced with a + // webpage in preview.url). 
+ const REC_PAGE_ONLY = { + id: 'https://id.rijksmuseum.nl/200100555', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Viewer Only' }], + subject_to: [{ type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/zero/1.0/' }] }], + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://www.rijksmuseum.nl/en/collection/SK-A-1' }] }] }], + } + const ONE = { + type: 'OrderedCollectionPage', + orderedItems: [{ id: 'https://id.rijksmuseum.nl/200100555', type: 'HumanMadeObject' }], + } + const refs = await rijksmuseum().search( + { text: 'x', modalities: ['image'] }, + ctxRouting(ONE, { '200100555': REC_PAGE_ONLY }), + ) + expect(refs).toEqual([]) + }) + + it('prefers an image-typed (format/IIIF) DigitalObject over a non-image access_point', async () => { + // The first access_point is a page; a second DigitalObject is typed image/jpeg → + // findImage() must pick the typed one and carry its mediaType. 
+ const REC_TYPED = { + id: 'https://id.rijksmuseum.nl/200100444', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'Typed Image' }], + subject_to: [{ type: 'Right', classified_as: [{ id: 'https://creativecommons.org/publicdomain/zero/1.0/' }] }], + subject_of: [ + { type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', access_point: [{ id: 'https://www.rijksmuseum.nl/en/collection/SK-A-2' }] }] }, + { type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', format: 'image/jpeg', access_point: [{ id: 'https://iiif.example.org/image/abc/full/full/0/default.jpg' }] }] }, + ], + } + const ONE = { type: 'OrderedCollectionPage', orderedItems: [{ id: 'https://id.rijksmuseum.nl/200100444', type: 'HumanMadeObject' }] } + const refs = await rijksmuseum().search({ text: 'x', modalities: ['image'] }, ctxRouting(ONE, { '200100444': REC_TYPED })) + expect(refs).toHaveLength(1) + expect(refs[0].preview?.url).toBe('https://iiif.example.org/image/abc/full/full/0/default.jpg') + expect(refs[0].preview?.mediaType).toBe('image/jpeg') + }) + + it('forwards the keyword and documented search options + caps the page size to the limit', async () => { + let searchUrl = '' + await rijksmuseum().search( + { + text: 'vermeer', + modalities: ['image'], + limit: 5, + providerOptions: { type: 'painting', material: 'canvas', technique: 'oil paint', creator: 'Johannes Vermeer', imageAvailable: true }, + }, + ctxRouting({ type: 'OrderedCollectionPage', orderedItems: [] }, {}, (u) => { searchUrl = u }), + ) + const url = new URL(searchUrl) + expect(url.origin + url.pathname).toBe('https://data.rijksmuseum.nl/search/collection') + expect(url.searchParams.get('title')).toBe('vermeer') // primary keyword param + expect(url.searchParams.get('type')).toBe('painting') + expect(url.searchParams.get('material')).toBe('canvas') + expect(url.searchParams.get('technique')).toBe('oil paint') + expect(url.searchParams.get('creator')).toBe('Johannes Vermeer') + 
expect(url.searchParams.get('imageAvailable')).toBe('true') + expect(url.searchParams.get('pageSize')).toBe('5') // limit → page size cap + // keyless: never a key param + expect(url.searchParams.get('key')).toBeNull() + }) +}) diff --git a/packages/provider-rijksmuseum/src/index.ts b/packages/provider-rijksmuseum/src/index.ts new file mode 100644 index 0000000..fb2604a --- /dev/null +++ b/packages/provider-rijksmuseum/src/index.ts @@ -0,0 +1,238 @@ +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +export interface RijksmuseumConfig { + /** Max records fetched per search. Search returns only IDs, so each result + * costs one extra Linked-Art fetch — this bounds that N+1 fan-out. Default 12. */ + maxObjects?: number +} + +export interface RijksmuseumSearchOptions { + /** Object type, e.g. 'painting'. */ + type?: string + /** Material, e.g. 'canvas'. */ + material?: string + /** Technique, e.g. 'oil paint'. */ + technique?: string + /** Maker/artist (maps to `creator`). */ + creator?: string + /** Free-text description match. */ + description?: string + /** Restrict to objects with an image. */ + imageAvailable?: boolean +} + +const SEARCH = 'https://data.rijksmuseum.nl/search/collection' +const RIJKS_TERMS = 'https://www.rijksmuseum.nl/en/data/policy' + +// D7-style: map a CC deed URL to our LicenseId (+ CC version). Rijksmuseum open-access is +// effectively CC0/PDM; BY/BY-SA are implemented for correctness but not expected. CC-only — +// Rijksmuseum does not use rightsstatements.org, so this is replaced by core `mapCcDeedUrl` +// (NOT core `mapRightsUrl`) in helper-refactor Task 4. Named `mapRijksRights` to avoid clashing +// with the core `mapRightsUrl` helper, which additionally handles rightsstatements.org. 
+function mapRijksRights(url: string | undefined): { license: LicenseId; version?: string } { // `version` is only ever populated by the CC-BY / CC-BY-SA branches below
+  if (!url) return { license: 'unknown' } // undefined or empty string → nothing to map
+  if (/creativecommons\.org\/publicdomain\/zero/.test(url)) return { license: 'CC0-1.0' } // unanchored: scheme, version and any deed suffix are ignored
+  if (/creativecommons\.org\/publicdomain\/mark/.test(url)) return { license: 'PD' } // Public Domain Mark
+  if (/rightsstatements\.org\/(?:vocab|page)\/NoCopyright/i.test(url)) return { license: 'PD' } // NOTE(review): rightsstatements.org identifiers appear to be NoC-US / NoC-NC / …, not "NoCopyright" — confirm this can match a real URI; the header comment above also says this source does not use rightsstatements.org
+  const sa = url.match(/creativecommons\.org\/licenses\/by-sa\/(\d\.\d)/) // group 1 = CC version as "digit.digit", e.g. "4.0"
+  if (sa) return { license: 'CC-BY-SA', version: sa[1] }
+  const by = url.match(/creativecommons\.org\/licenses\/by\/(\d\.\d)/) // `by\/` requires the slash, so by-sa / by-nc / by-nd deeds never land here
+  if (by) return { license: 'CC-BY', version: by[1] }
+  return { license: 'unknown' } // every other deed (no branch exists for NC / ND variants) stays 'unknown'
+}
+
+// The Linked-Art graph is deeply nested and varies per record, so we extract by
+// shape, not by fixed index paths (see plan Open Questions).
+
+/** Depth-first search for the first string that contains a creativecommons.org (publicdomain|licenses) or rightsstatements.org URL; returns that whole string, or undefined when none is found. Nodes nested more than 8 levels deep are ignored. */
+function findRightsUrl(node: unknown, depth = 0): string | undefined {
+  if (depth > 8 || node == null) return undefined // recursion guard; `== null` also covers undefined
+  if (typeof node === 'string') {
+    return /creativecommons\.org\/(publicdomain|licenses)|rightsstatements\.org/.test(node) ? node : undefined // host match only — whether the deed is mappable is decided by the caller
+  }
+  if (Array.isArray(node)) {
+    for (const v of node) { const hit = findRightsUrl(v, depth + 1); if (hit) return hit } // first hit wins, in array order
+    return undefined
+  }
+  if (typeof node === 'object') {
+    for (const v of Object.values(node as Record<string, unknown>)) { // only property values are searched, never the keys
+      const hit = findRightsUrl(v, depth + 1); if (hit) return hit
+    }
+  }
+  return undefined // numbers, booleans, or an object whose values produced no hit
+}
+
+// We must not put a NON-image URL (a viewer/collection web page) into preview.url.
+// The API carries the answer: a DigitalObject's `format` (a MIME type) and IIIF
+// `conforms_to` say which access_point is the image. So: read the type first, then
+// fall back to a cheap URL heuristic, then give up (no network probe — `core` never
+// fetches bytes, and that would add an extra request per item). See Open Questions #1.
+const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i + +/** URL-string heuristic only (no network): does this look like an image resource? */ +function isLikelyImageUrl(url: string): boolean { + return IMAGE_EXT.test(url) + || /iiif/i.test(url) // IIIF image endpoint + || /\/full\/[^/]+\/\d+\/default/i.test(url) // IIIF Image API request URL + || /googleusercontent\.com/.test(url) // Rijksmuseum/Met image CDN +} + +interface LaDigitalObject { + type?: string + format?: string + conforms_to?: Array<{ id?: string }> + access_point?: Array<{ id?: string }> +} + +/** Collect every node that carries an `access_point` (the DigitalObjects) anywhere. */ +function collectDigitalObjects(node: unknown, out: LaDigitalObject[] = [], depth = 0): LaDigitalObject[] { + if (depth > 8 || node == null) return out + if (Array.isArray(node)) { for (const v of node) collectDigitalObjects(v, out, depth + 1); return out } + if (typeof node === 'object') { + const o = node as Record<string, unknown> + if (Array.isArray(o.access_point)) out.push(o as LaDigitalObject) + for (const v of Object.values(o)) collectDigitalObjects(v, out, depth + 1) + } + return out +} + +/** Best usable IMAGE url + its mediaType, or undefined. + * Tier 1: a DigitalObject explicitly typed `image/*` or IIIF → trust it. + * Tier 2: any access_point whose URL heuristically looks like an image. + * Otherwise undefined → the item is dropped (an image provider with no image is useless). */ +function findImage(rec: Record<string, unknown>): { url: string; mediaType: string } | undefined { + const objs = collectDigitalObjects(rec) + // Tier 1 — explicit type from the data. + for (const o of objs) { + const fmt = typeof o.format === 'string' ? 
o.format : undefined + const isIiif = Array.isArray(o.conforms_to) && o.conforms_to.some(c => typeof c?.id === 'string' && /iiif/i.test(c.id)) + if ((fmt && fmt.startsWith('image/')) || isIiif) { + const url = o.access_point?.find(a => typeof a?.id === 'string')?.id + if (url) return { url, mediaType: fmt && fmt.startsWith('image/') ? fmt : 'image/jpeg' } + } + } + // Tier 2 — URL heuristic fallback. + for (const o of objs) { + const hit = o.access_point?.find(a => typeof a?.id === 'string' && isLikelyImageUrl(a.id))?.id + if (hit) return { url: hit, mediaType: 'image/jpeg' } + } + return undefined +} + +interface LaName { type?: string; content?: string } +function findTitle(rec: Record<string, unknown>): string | undefined { + const names = rec.identified_by + if (Array.isArray(names)) { + for (const n of names as LaName[]) { + if (n?.type === 'Name' && typeof n.content === 'string' && n.content) return n.content + } + } + return undefined +} + +function findCreator(rec: Record<string, unknown>): string | undefined { + const prod = rec.produced_by as Record<string, unknown> | undefined + if (!prod) return undefined + const direct = prod.carried_out_by + const parts = Array.isArray(prod.part) ? (prod.part as Record<string, unknown>[]) : [] + const actors = [ + ...(Array.isArray(direct) ? (direct as Record<string, unknown>[]) : []), + ...parts.flatMap(p => (Array.isArray(p.carried_out_by) ? (p.carried_out_by as Record<string, unknown>[]) : [])), + ] + for (const a of actors) { + const label = a._label ?? (a as { notation?: unknown }).notation + if (typeof label === 'string' && label) return label + } + return undefined +} + +function toReference(rec: Record<string, unknown>): Reference | null { + const id = typeof rec.id === 'string' ? rec.id : undefined + if (!id) return null + const img = findImage(rec) + if (!img) return null // no usable IMAGE url (e.g. 
only a viewer/collection page) → drop + const { license, version } = mapRijksRights(findRightsUrl(rec)) + const rights: RightsRecord = { + license, + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + author: findCreator(rec) || undefined, + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: RIJKS_TERMS, sourceUrl: id }, + } + return { + id: referenceId('rijksmuseum', id), + modality: 'image', + title: findTitle(rec), + source: { providerId: 'rijksmuseum', sourceUrl: id }, + canonicalUrl: id, + rights, + verifiedAt: new Date().toISOString(), + thumbnail: { url: img.url }, + preview: { url: img.url, mediaType: img.mediaType }, + relevance: 0, + raw: rec, + } +} + +function setIfString(url: URL, key: string, value: unknown) { + if (typeof value !== 'string' || !value) return + url.searchParams.set(key, value) +} +function setIfBoolean(url: URL, key: string, value: unknown) { + if (typeof value !== 'boolean') return + url.searchParams.set(key, String(value)) +} + +interface SearchPage { orderedItems?: Array<{ id?: string }> } + +export function rijksmuseum(config: RijksmuseumConfig = {}) { + return defineProvider({ + id: 'rijksmuseum', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const opts = q.providerOptions as RijksmuseumSearchOptions | undefined + const n = Math.min(config.maxObjects ?? q.limit ?? 12, 30) + const searchUrl = new URL(SEARCH) + // No global free-text param; `title` is a partial keyword match → use it as the keyword. 
+ if (q.text) searchUrl.searchParams.set('title', q.text) + setIfString(searchUrl, 'type', opts?.type) + setIfString(searchUrl, 'material', opts?.material) + setIfString(searchUrl, 'technique', opts?.technique) + setIfString(searchUrl, 'creator', opts?.creator) + setIfString(searchUrl, 'description', opts?.description) + setIfBoolean(searchUrl, 'imageAvailable', opts?.imageAvailable) + searchUrl.searchParams.set('pageSize', String(n)) // best-effort cap; server caps at 100 + + const res = await ctx.fetch(searchUrl.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`rijksmuseum search failed: ${res.status}`) + const page = (await res.json()) as SearchPage + const ids = (page.orderedItems ?? []) + .map(it => it.id) + .filter((u): u is string => typeof u === 'string') + .slice(0, n) + if (ids.length === 0) return [] + + const records = await Promise.all(ids.map(async (idUrl) => { + try { + // Content-negotiate the Linked-Art JSON-LD. id.rijksmuseum.nl 303s to + // data.rijksmuseum.nl; ?_profile=la selects the Linked-Art profile. + const recUrl = `${idUrl}${idUrl.includes('?') ? '&' : '?'}_profile=la` + const r = await ctx.fetch(recUrl, { signal: ctx.signal }) + if (!r.ok) return null + return (await r.json()) as Record<string, unknown> + } catch { + return null // one bad record fetch must not drop the whole batch + } + })) + return records + .map(rec => (rec ? 
toReference(rec) : null)) + .filter((r): r is Reference => r !== null) + }, + }) +} diff --git a/packages/provider-rijksmuseum/tsconfig.json b/packages/provider-rijksmuseum/tsconfig.json new file mode 100644 index 0000000..98922fe --- /dev/null +++ b/packages/provider-rijksmuseum/tsconfig.json @@ -0,0 +1,5 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "outDir": "out", "rootDir": "src", "types": ["node"] }, + "include": ["src/**/*"] +} diff --git a/packages/provider-rijksmuseum/tsup.config.ts b/packages/provider-rijksmuseum/tsup.config.ts new file mode 100644 index 0000000..af0ad13 --- /dev/null +++ b/packages/provider-rijksmuseum/tsup.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from 'tsup' + +export default defineConfig({ + entry: ['src/index.ts'], + format: ['esm'], + dts: true, + clean: true, + outDir: 'dist', + sourcemap: true, +}) diff --git a/packages/provider-rijksmuseum/vitest.config.ts b/packages/provider-rijksmuseum/vitest.config.ts new file mode 100644 index 0000000..ace68d6 --- /dev/null +++ b/packages/provider-rijksmuseum/vitest.config.ts @@ -0,0 +1,2 @@ +import { defineConfig } from 'vitest/config' +export default defineConfig({ test: { name: 'provider-rijksmuseum', environment: 'node', include: ['src/**/*.{test,spec}.ts'] } }) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index eb0cf6a..b9f439d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -165,6 +165,12 @@ importers: specifier: workspace:* version: link:../core + packages/provider-rijksmuseum: + dependencies: + '@refkit/core': + specifier: workspace:* + version: link:../core + packages/provider-smithsonian: dependencies: '@refkit/core': From 43147ee9878d8fb2d56b9491814629cc1dd87cd6 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:21:29 +0800 Subject: [PATCH 04/34] feat(provider-polyhaven): polyhaven() CC0 image satellite --- packages/provider-polyhaven/LICENSE | 201 ++++++++++++++++++ 
packages/provider-polyhaven/README.md | 35 +++ packages/provider-polyhaven/package.json | 45 ++++ .../src/__tests__/polyhaven.test.ts | 59 +++++ packages/provider-polyhaven/src/index.ts | 121 +++++++++++ packages/provider-polyhaven/tsconfig.json | 5 + packages/provider-polyhaven/tsup.config.ts | 10 + packages/provider-polyhaven/vitest.config.ts | 2 + pnpm-lock.yaml | 6 + 9 files changed, 484 insertions(+) create mode 100644 packages/provider-polyhaven/LICENSE create mode 100644 packages/provider-polyhaven/README.md create mode 100644 packages/provider-polyhaven/package.json create mode 100644 packages/provider-polyhaven/src/__tests__/polyhaven.test.ts create mode 100644 packages/provider-polyhaven/src/index.ts create mode 100644 packages/provider-polyhaven/tsconfig.json create mode 100644 packages/provider-polyhaven/tsup.config.ts create mode 100644 packages/provider-polyhaven/vitest.config.ts diff --git a/packages/provider-polyhaven/LICENSE b/packages/provider-polyhaven/LICENSE new file mode 100644 index 0000000..c1c4eb0 --- /dev/null +++ b/packages/provider-polyhaven/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, excluding + those notices that do not pertain to any part of the Derivative + Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2026 refkit authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/packages/provider-polyhaven/README.md b/packages/provider-polyhaven/README.md new file mode 100644 index 0000000..20475ca --- /dev/null +++ b/packages/provider-polyhaven/README.md @@ -0,0 +1,35 @@ +# @refkit/provider-polyhaven + +Search **Poly Haven** as license-tagged image references — a provider satellite for **refkit** (use with [`@refkit/core`](https://www.npmjs.com/package/@refkit/core)). This package also ships a sibling factory, **`ambientcg()`**, for [ambientCG](https://ambientcg.com). + +- **Source:** Poly Haven ([polyhaven.com](https://polyhaven.com)) + ambientCG ([ambientcg.com](https://ambientcg.com)) +- **Auth:** keyless +- **Modality:** image +- **License:** CC0 (whole-source — every reference is `CC0-1.0`) + +## Image-only (3D model formats skipped) + +refkit's core modalities are `image | video | audio | text` — there is no `3d`/`texture` modality. Poly Haven and ambientCG host textures, HDRIs, and PBR materials whose individual maps are image files, so each reference is emitted as `modality: 'image'` surfacing only the image-format preview (a texture's diffuse `.jpg`/`.png`, an HDRI's tonemapped `.jpg`, or a material's PNG preview). **3D model formats are skipped for v1** — `.blend` / `.gltf` / `.fbx` / `.mtlx` / `.usd` and HDR/EXR files are not returned. 
+ +## Usage + +```ts +import { createRefkit } from '@refkit/core' +import { polyhaven } from '@refkit/provider-polyhaven' + +const refkit = createRefkit({ providers: [polyhaven(/* config */)] }) +const refs = await refkit.search({ query: 'asphalt', modalities: ['image'] }) +``` + +### ambientCG sibling factory + +`ambientcg()` lives in the same package and returns the same CC0-normalized image references, hitting ambientCG's API instead: + +```ts +import { polyhaven, ambientcg } from '@refkit/provider-polyhaven' + +const refkit = createRefkit({ providers: [polyhaven(), ambientcg()] }) +const refs = await refkit.search({ query: 'tiles', modalities: ['image'] }) +``` + +Gate by intended use with `refkit.evaluateUse(ref, 'commercial-product')`. See [`@refkit/core`](https://www.npmjs.com/package/@refkit/core) for the full API. diff --git a/packages/provider-polyhaven/package.json b/packages/provider-polyhaven/package.json new file mode 100644 index 0000000..9c5f723 --- /dev/null +++ b/packages/provider-polyhaven/package.json @@ -0,0 +1,45 @@ +{ + "name": "@refkit/provider-polyhaven", + "version": "0.1.0", + "description": "Poly Haven provider satellite for refkit.", + "type": "module", + "license": "Apache-2.0", + "keywords": [ + "refkit", + "reference-retrieval", + "license", + "attribution", + "refkit-provider", + "polyhaven", + "ambientcg" + ], + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "scripts": { + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest watch", + "build": "tsup", + "prepublishOnly": "tsup" + }, + "dependencies": { + "@refkit/core": "workspace:*" + }, + "files": [ + "dist", + "LICENSE" + ], + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + } + } +} diff --git a/packages/provider-polyhaven/src/__tests__/polyhaven.test.ts 
b/packages/provider-polyhaven/src/__tests__/polyhaven.test.ts new file mode 100644 index 0000000..8e3cd51 --- /dev/null +++ b/packages/provider-polyhaven/src/__tests__/polyhaven.test.ts @@ -0,0 +1,59 @@ +import { describe, expect, it } from 'vitest' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { polyhaven } from '../index' + +// Poly Haven: /assets returns id→asset (no URLs); /files/<id> returns the download tree. +const ctxRouting = (list: unknown, files: Record<string, unknown>): ProviderContext => ({ + fetch: (async (input: string) => { + const u = String(input) + if (u.includes('/assets')) return new Response(JSON.stringify(list), { status: 200 }) + const m = u.match(/\/files\/([^/?]+)/) + if (m && files[m[1]]) return new Response(JSON.stringify(files[m[1]]), { status: 200 }) + return new Response('null', { status: 404 }) + }) as typeof fetch, +}) + +const LIST = { + aerial_asphalt_01: { + type: 1, name: 'Aerial Asphalt 01', categories: ['asphalt', 'road'], tags: ['flat'], + authors: { 'Rob Tuytel': 'All' }, + thumbnail_url: 'https://cdn.polyhaven.com/asset_img/thumbs/aerial_asphalt_01.png?width=256&height=256', + }, +} +const FILES_TEX = { + aerial_asphalt_01: { + Diffuse: { + '1k': { jpg: { url: 'https://dl.polyhaven.org/file/ph-assets/Textures/jpg/1k/aerial_asphalt_01/aerial_asphalt_01_diff_1k.jpg' } }, + }, + // non-image keys that must be ignored: + blend: { '1k': { blend: { url: 'https://dl.polyhaven.org/x.blend' } } }, + gltf: { '1k': { gltf: { url: 'https://dl.polyhaven.org/x.gltf' } } }, + }, +} + +describe('polyhaven provider', () => { + it('maps a texture to a CC0 image reference with a resolved jpg preview', async () => { + const refs = await polyhaven().search( + { text: 'asphalt', modalities: ['image'], limit: 5 }, + ctxRouting(LIST, FILES_TEX), + ) + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('image') + expect(r.title).toBe('Aerial Asphalt 01') + expect(r.rights.license).toBe('CC0-1.0') 
+ expect(r.rights.author).toBe('Rob Tuytel') + expect(r.rights.rehostPolicy).toBe('cache-allowed') + expect(r.rights.raw.sourceTerms).toBe('https://polyhaven.com/license') + expect(r.preview?.url).toContain('aerial_asphalt_01_diff_1k.jpg') + expect(r.preview?.mediaType).toBe('image/jpeg') + expect(r.thumbnail?.url).toContain('thumbs/aerial_asphalt_01.png') + expect(r.canonicalUrl).toBe('https://polyhaven.com/a/aerial_asphalt_01') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('returns [] when the list is empty', async () => { + const refs = await polyhaven().search({ text: 'zzz', modalities: ['image'] }, ctxRouting({}, {})) + expect(refs).toEqual([]) + }) +}) diff --git a/packages/provider-polyhaven/src/index.ts b/packages/provider-polyhaven/src/index.ts new file mode 100644 index 0000000..27e1f85 --- /dev/null +++ b/packages/provider-polyhaven/src/index.ts @@ -0,0 +1,121 @@ +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +const PH_BASE = 'https://api.polyhaven.com' +const PH_TERMS = 'https://polyhaven.com/license' + +export interface PolyHavenConfig { + /** texture vs HDRI listing. Default 'textures'. */ + assetType?: 'textures' | 'hdris' + /** Max assets resolved per search; each costs one /files/<id> call (N+1). Default 12. */ + maxAssets?: number +} + +interface PolyHavenAsset { + type: number + name: string + categories?: string[] + tags?: string[] + authors?: Record<string, string> + thumbnail_url?: string +} +type PolyHavenList = Record<string, PolyHavenAsset> +// /files tree: maps/resolutions/formats → { url }. Loosely typed; we walk known image paths only. +type PolyHavenFiles = Record<string, unknown> + +interface PhFileLeaf { url?: string } + +/** First image URL for a texture: Diffuse (then a couple of fallbacks) → smallest res → jpg/png. 
*/ +function textureImageUrl(files: PolyHavenFiles): string | undefined { + for (const mapKey of ['Diffuse', 'diff', 'Color', 'albedo']) { + const byRes = files[mapKey] as Record<string, Record<string, PhFileLeaf>> | undefined + if (!byRes) continue + for (const res of ['1k', '2k', '4k']) { + const byFmt = byRes[res] + const url = byFmt?.jpg?.url ?? byFmt?.png?.url + if (url) return url + } + } + return undefined +} + +/** HDRI image preview: the tonemapped .jpg (skip .hdr/.exr — D1). */ +function hdriImageUrl(files: PolyHavenFiles): string | undefined { + const tm = files.tonemapped as PhFileLeaf | undefined + return tm?.url +} + +function firstAuthor(authors?: Record<string, string>): string | undefined { + if (!authors) return undefined + const names = Object.keys(authors) + return names.length ? names.join(', ') : undefined +} + +function toReference(id: string, asset: PolyHavenAsset, imageUrl: string): Reference { + const canonical = `https://polyhaven.com/a/${id}` + const rights: RightsRecord = { + license: 'CC0-1.0', + author: firstAuthor(asset.authors), + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: PH_TERMS, sourceUrl: canonical }, + } + return { + id: referenceId('polyhaven', canonical), + modality: 'image', + title: asset.name || undefined, + source: { providerId: 'polyhaven', sourceUrl: canonical }, + canonicalUrl: canonical, + rights, + verifiedAt: new Date().toISOString(), + ...(asset.thumbnail_url ? { thumbnail: { url: asset.thumbnail_url } } : {}), + // textureImageUrl may resolve a .png fallback — derive the MIME from the extension + // rather than hardcoding jpeg (mislabeling a PNG as JPEG). + preview: { url: imageUrl, mediaType: imageUrl.toLowerCase().includes('.png') ? 'image/png' : 'image/jpeg' }, + relevance: 0, + raw: asset, + } +} + +export function polyhaven(config: PolyHavenConfig = {}) { + const assetType = config.assetType ?? 
'textures' + return defineProvider({ + id: 'polyhaven', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const listUrl = new URL(`${PH_BASE}/assets`) + listUrl.searchParams.set('t', assetType) + const res = await ctx.fetch(listUrl.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`polyhaven list failed: ${res.status}`) + const list = (await res.json()) as PolyHavenList + let entries = Object.entries(list) + // Client-side keyword filter — the list endpoint has no query param. + const text = q.text?.trim().toLowerCase() + if (text) { + entries = entries.filter(([id, a]) => + id.includes(text) || + a.name?.toLowerCase().includes(text) || + a.categories?.some((c) => c.toLowerCase().includes(text)) || + a.tags?.some((t) => t.toLowerCase().includes(text))) + } + const n = Math.min(config.maxAssets ?? q.limit ?? 12, 30) + const picked = entries.slice(0, n) + const refs = await Promise.all(picked.map(async ([id, asset]) => { + try { + const fr = await ctx.fetch(`${PH_BASE}/files/${id}`, { signal: ctx.signal }) + if (!fr.ok) return null + const files = (await fr.json()) as PolyHavenFiles + const imageUrl = assetType === 'hdris' ? 
hdriImageUrl(files) : textureImageUrl(files) + if (!imageUrl) return null // no image-format file → skip (D1) + return toReference(id, asset, imageUrl) + } catch { + return null // one bad files fetch must not drop the whole batch + } + })) + return refs.filter((r): r is Reference => r !== null) + }, + }) +} diff --git a/packages/provider-polyhaven/tsconfig.json b/packages/provider-polyhaven/tsconfig.json new file mode 100644 index 0000000..98922fe --- /dev/null +++ b/packages/provider-polyhaven/tsconfig.json @@ -0,0 +1,5 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "outDir": "out", "rootDir": "src", "types": ["node"] }, + "include": ["src/**/*"] +} diff --git a/packages/provider-polyhaven/tsup.config.ts b/packages/provider-polyhaven/tsup.config.ts new file mode 100644 index 0000000..af0ad13 --- /dev/null +++ b/packages/provider-polyhaven/tsup.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from 'tsup' + +export default defineConfig({ + entry: ['src/index.ts'], + format: ['esm'], + dts: true, + clean: true, + outDir: 'dist', + sourcemap: true, +}) diff --git a/packages/provider-polyhaven/vitest.config.ts b/packages/provider-polyhaven/vitest.config.ts new file mode 100644 index 0000000..70cbb03 --- /dev/null +++ b/packages/provider-polyhaven/vitest.config.ts @@ -0,0 +1,2 @@ +import { defineConfig } from 'vitest/config' +export default defineConfig({ test: { name: 'provider-polyhaven', environment: 'node', include: ['src/**/*.{test,spec}.ts'] } }) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b9f439d..9152430 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -165,6 +165,12 @@ importers: specifier: workspace:* version: link:../core + packages/provider-polyhaven: + dependencies: + '@refkit/core': + specifier: workspace:* + version: link:../core + packages/provider-rijksmuseum: dependencies: '@refkit/core': From 3db0edb2deb36e6ba6d31226fc93c7940dbcb40b Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 
29 Jun 2026 20:22:30 +0800 Subject: [PATCH 05/34] feat(provider-polyhaven): ambientcg() sibling CC0 image satellite --- .../src/__tests__/ambientcg.test.ts | 56 ++++++++++++++ packages/provider-polyhaven/src/index.ts | 75 +++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 packages/provider-polyhaven/src/__tests__/ambientcg.test.ts diff --git a/packages/provider-polyhaven/src/__tests__/ambientcg.test.ts b/packages/provider-polyhaven/src/__tests__/ambientcg.test.ts new file mode 100644 index 0000000..26b98f7 --- /dev/null +++ b/packages/provider-polyhaven/src/__tests__/ambientcg.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from 'vitest' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { ambientcg } from '../index' + +const ctxJson = (body: unknown, capture?: (u: string) => void): ProviderContext => ({ + fetch: (async (input: string) => { + capture?.(String(input)) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, +}) + +const FOUND = { + foundAssets: [ + { + assetId: 'Tiles141', displayName: 'Tiles 141', dataType: 'Material', + category: 'Tiles', tags: ['tiles', 'floor'], + previewImage: { + '256-PNG': 'https://acg-media.struffelproductions.com/file/ambientCG-Web/media/thumbnail/256-PNG/Tiles141.png', + '512-PNG': 'https://acg-media.struffelproductions.com/file/ambientCG-Web/media/thumbnail/512-PNG/Tiles141.png', + }, + }, + ], +} +const FOUND_NO_IMAGE = { + foundAssets: [ + // a non-image asset (e.g. 
plugin/3D-only) with no previewImage → must not be emitted (D1) + { assetId: 'SomeModel', displayName: 'Some Model', dataType: '3DModel', tags: [] }, + ], +} + +describe('ambientcg provider', () => { + it('maps a Material to a CC0 image reference using the PNG preview', async () => { + let url = '' + const refs = await ambientcg().search( + { text: 'tiles', modalities: ['image'], limit: 5 }, + ctxJson(FOUND, (u) => { url = u }), + ) + expect(url).toContain('type=Material') + expect(url).toContain('q=tiles') + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('image') + expect(r.title).toBe('Tiles 141') + expect(r.rights.license).toBe('CC0-1.0') + expect(r.rights.rehostPolicy).toBe('cache-allowed') + expect(r.rights.raw.sourceTerms).toBe('https://ambientcg.com/license/') + expect(r.preview?.url).toContain('512-PNG/Tiles141.png') + expect(r.canonicalUrl).toBe('https://ambientcg.com/view?id=Tiles141') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('drops assets without an image preview (non-image dataType, D1)', async () => { + const refs = await ambientcg().search({ text: 'x', modalities: ['image'] }, ctxJson(FOUND_NO_IMAGE)) + expect(refs).toEqual([]) + }) +}) diff --git a/packages/provider-polyhaven/src/index.ts b/packages/provider-polyhaven/src/index.ts index 27e1f85..6bca656 100644 --- a/packages/provider-polyhaven/src/index.ts +++ b/packages/provider-polyhaven/src/index.ts @@ -119,3 +119,78 @@ export function polyhaven(config: PolyHavenConfig = {}) { }, }) } + +const ACG_BASE = 'https://ambientcg.com/api/v2/full_json' +const ACG_TERMS = 'https://ambientcg.com/license/' + +export interface AmbientCgConfig { + /** Max materials per search. Default 12. 
*/ + limit?: number +} + +interface AmbientCgAsset { + assetId: string + displayName?: string + dataType?: string + previewImage?: Record<string, string> +} +interface AmbientCgResponse { foundAssets?: AmbientCgAsset[] } + +/** Pick the largest available PNG preview (image-format only — D1). */ +function acgPreviewUrl(preview?: Record<string, string>): string | undefined { + if (!preview) return undefined + for (const key of ['1024-PNG', '512-PNG', '256-PNG', '128-PNG']) { + if (preview[key]) return preview[key] + } + return undefined +} + +function acgToReference(a: AmbientCgAsset, imageUrl: string): Reference { + const canonical = `https://ambientcg.com/view?id=${a.assetId}` + const rights: RightsRecord = { + license: 'CC0-1.0', + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: ACG_TERMS, sourceUrl: canonical }, + } + return { + id: referenceId('ambientcg', canonical), + modality: 'image', + title: a.displayName || undefined, + source: { providerId: 'ambientcg', sourceUrl: canonical }, + canonicalUrl: canonical, + rights, + verifiedAt: new Date().toISOString(), + thumbnail: { url: imageUrl }, + preview: { url: imageUrl, mediaType: 'image/png' }, + relevance: 0, + raw: a, + } +} + +export function ambientcg(config: AmbientCgConfig = {}) { + return defineProvider({ + id: 'ambientcg', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(ACG_BASE) + url.searchParams.set('type', 'Material') // image-based PBR materials only (D1) + url.searchParams.set('include', 'displayData,imageData') + url.searchParams.set('limit', String(Math.min(config.limit ?? q.limit ?? 
12, 30))) + if (q.text?.trim()) url.searchParams.set('q', q.text.trim()) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`ambientcg search failed: ${res.status}`) + const { foundAssets } = (await res.json()) as AmbientCgResponse + if (!foundAssets || foundAssets.length === 0) return [] + return foundAssets + .map((a) => { + // Defensive D1 guard: only Material assets carry an image previewImage. + if (a.dataType && a.dataType !== 'Material') return null + const imageUrl = acgPreviewUrl(a.previewImage) + return imageUrl ? acgToReference(a, imageUrl) : null + }) + .filter((r): r is Reference => r !== null) + }, + }) +} From 958d8fc6d4b4f55b8f9c97dba74861c4525003bb Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:27:01 +0800 Subject: [PATCH 06/34] feat(provider-freesound): scaffold + license mapper --- packages/provider-freesound/LICENSE | 201 ++++++++++++++++++ packages/provider-freesound/README.md | 20 ++ packages/provider-freesound/package.json | 47 ++++ .../src/__tests__/freesound.test.ts | 25 +++ packages/provider-freesound/src/index.ts | 41 ++++ packages/provider-freesound/tsconfig.json | 5 + packages/provider-freesound/tsup.config.ts | 10 + packages/provider-freesound/vitest.config.ts | 2 + pnpm-lock.yaml | 6 + 9 files changed, 357 insertions(+) create mode 100644 packages/provider-freesound/LICENSE create mode 100644 packages/provider-freesound/README.md create mode 100644 packages/provider-freesound/package.json create mode 100644 packages/provider-freesound/src/__tests__/freesound.test.ts create mode 100644 packages/provider-freesound/src/index.ts create mode 100644 packages/provider-freesound/tsconfig.json create mode 100644 packages/provider-freesound/tsup.config.ts create mode 100644 packages/provider-freesound/vitest.config.ts diff --git a/packages/provider-freesound/LICENSE b/packages/provider-freesound/LICENSE new file mode 100644 index 
0000000..c1c4eb0 --- /dev/null +++ b/packages/provider-freesound/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, excluding + those notices that do not pertain to any part of the Derivative + Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2026 refkit authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/packages/provider-freesound/README.md b/packages/provider-freesound/README.md new file mode 100644 index 0000000..bbcd645 --- /dev/null +++ b/packages/provider-freesound/README.md @@ -0,0 +1,20 @@ +# @refkit/provider-freesound + +Search **Freesound** as license-tagged audio references — a provider satellite for **refkit** (use with [`@refkit/core`](https://www.npmjs.com/package/@refkit/core)). 
+ +- **Source:** Freesound +- **Auth:** API key +- **Modality:** audio +- **License:** per-item CC / CC0 + +## Usage + +```ts +import { createRefkit } from '@refkit/core' +import { freesound } from '@refkit/provider-freesound' + +const refkit = createRefkit({ providers: [freesound(/* config */)] }) +const refs = await refkit.search({ query: 'cat', modalities: ['audio'] }) +``` + +Gate by intended use with `refkit.evaluateUse(ref, 'commercial-product')`. See [`@refkit/core`](https://www.npmjs.com/package/@refkit/core) for the full API. diff --git a/packages/provider-freesound/package.json b/packages/provider-freesound/package.json new file mode 100644 index 0000000..70381fe --- /dev/null +++ b/packages/provider-freesound/package.json @@ -0,0 +1,47 @@ +{ + "name": "@refkit/provider-freesound", + "version": "0.1.0", + "description": "Freesound provider satellite for refkit.", + "type": "module", + "license": "Apache-2.0", + "keywords": [ + "refkit", + "reference-retrieval", + "license", + "attribution", + "refkit-provider", + "freesound", + "audio", + "sound-effects", + "creative-commons" + ], + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "scripts": { + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest watch", + "build": "tsup", + "prepublishOnly": "tsup" + }, + "dependencies": { + "@refkit/core": "workspace:*" + }, + "files": [ + "dist", + "LICENSE" + ], + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + } + } +} diff --git a/packages/provider-freesound/src/__tests__/freesound.test.ts b/packages/provider-freesound/src/__tests__/freesound.test.ts new file mode 100644 index 0000000..96c6c36 --- /dev/null +++ b/packages/provider-freesound/src/__tests__/freesound.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from 'vitest' +import { mapFreesoundLicense } 
from '../index' + +describe('mapFreesoundLicense', () => { + it('maps CC name strings (D4 — no version)', () => { + expect(mapFreesoundLicense('Attribution')).toEqual({ license: 'CC-BY' }) + expect(mapFreesoundLicense('Attribution NonCommercial')).toEqual({ license: 'proprietary' }) + expect(mapFreesoundLicense('Attribution Noncommercial')).toEqual({ license: 'proprietary' }) + expect(mapFreesoundLicense('Creative Commons 0')).toEqual({ license: 'CC0-1.0' }) + expect(mapFreesoundLicense('Sampling+')).toEqual({ license: 'proprietary' }) + expect(mapFreesoundLicense('Attribution Sampling+')).toEqual({ license: 'proprietary' }) + }) + + it('maps CC deed URLs and extracts version for BY/BY-SA (D7)', () => { + expect(mapFreesoundLicense('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapFreesoundLicense('http://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + expect(mapFreesoundLicense('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapFreesoundLicense('http://creativecommons.org/licenses/by-nc/3.0/')).toEqual({ license: 'proprietary' }) + }) + + it('returns unknown for anything unrecognized', () => { + expect(mapFreesoundLicense('Weird Custom License')).toEqual({ license: 'unknown' }) + expect(mapFreesoundLicense('')).toEqual({ license: 'unknown' }) + }) +}) diff --git a/packages/provider-freesound/src/index.ts b/packages/provider-freesound/src/index.ts new file mode 100644 index 0000000..8923326 --- /dev/null +++ b/packages/provider-freesound/src/index.ts @@ -0,0 +1,41 @@ +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +// Freesound's `license` is usually a CC NAME string ("Attribution", "Creative +// Commons 0") but has historically also been a CC DEED URL. Handle both. 
// Name-form lookup, keyed by the lower-cased Freesound license name.
// D4: name → family LicenseId, no version.
// Conservative: noncommercial and sampling licences collapse to 'proprietary'.
// This is a Map rather than an object literal so that inherited keys such as
// "constructor" or "__proto__" can never be mistaken for a licence name.
const FREESOUND_NAME_LICENSE: ReadonlyMap<string, LicenseId> = new Map<string, LicenseId>([
  ['attribution', 'CC-BY'],
  ['attribution noncommercial', 'proprietary'], // NC → not commercially usable
  ['creative commons 0', 'CC0-1.0'],
  ['sampling+', 'proprietary'], // bespoke CC sampling licence, not a clean free grant
  ['attribution sampling+', 'proprietary'],
])

/**
 * Map a Freesound `license` value to our license id plus an optional CC version.
 *
 * Two input forms are accepted:
 * - a CC deed URL (D7): `/publicdomain/zero/` → CC0-1.0; `/licenses/by/<v>/` and
 *   `/licenses/by-sa/<v>/` → CC-BY / CC-BY-SA with `version`; any `by-nc*` / `by-nd*`
 *   variant → proprietary; any other URL → unknown.
 * - a CC name string (D4), matched case-insensitively against the table above;
 *   no version is reported for this form.
 *
 * @param value Raw `license` field from a Freesound result.
 * @returns A freshly allocated `{ license, version? }`. Empty, nullish or
 *   unrecognized input yields `{ license: 'unknown' }` (strict-deny).
 */
export function mapFreesoundLicense(value: string): { license: LicenseId; version?: string } {
  // `?? ''` keeps a null/undefined field coming from untyped JSON on the strict-deny path.
  const v = (value ?? '').trim()
  if (!v) return { license: 'unknown' }

  // D7 — deed URL form
  if (/^https?:\/\//i.test(v)) {
    if (/\/publicdomain\/zero\//i.test(v)) return { license: 'CC0-1.0' }
    const m = v.match(/\/licenses\/(by(?:-sa)?|by-nc[a-z-]*|by-nd[a-z-]*)\/(\d\.\d)\//i)
    if (m) {
      const fam = m[1].toLowerCase()
      const version = m[2]
      if (fam === 'by') return { license: 'CC-BY', version }
      if (fam === 'by-sa') return { license: 'CC-BY-SA', version }
      return { license: 'proprietary' } // any NC/ND variant
    }
    return { license: 'unknown' }
  }

  // D4 — name string form (case-insensitive). Map#get only sees the table's own
  // entries, so arbitrary text can never resolve to an inherited object member.
  const named = FREESOUND_NAME_LICENSE.get(v.toLowerCase())
  return { license: named ?? 'unknown' }
}
diff --git a/packages/provider-freesound/src/__tests__/freesound.test.ts b/packages/provider-freesound/src/__tests__/freesound.test.ts index 96c6c36..c0ac5f2 100644 --- a/packages/provider-freesound/src/__tests__/freesound.test.ts +++ b/packages/provider-freesound/src/__tests__/freesound.test.ts @@ -1,5 +1,76 @@ import { describe, expect, it } from 'vitest' -import { mapFreesoundLicense } from '../index' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { freesound, mapFreesoundLicense } from '../index' + +const ctxJson = (body: unknown, capture?: (url: string) => void): ProviderContext => ({ + fetch: (async (input: string) => { + capture?.(String(input)) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, +}) + +const RESULTS = { + count: 4, next: null, previous: null, + results: [ + { id: 1, name: 'Door creak', license: 'Attribution', username: 'alice', + url: 'https://freesound.org/people/alice/sounds/1/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/1/1_hq.mp3', 'preview-lq-mp3': 'https://cdn.freesound.org/previews/1/1_lq.mp3' }, + duration: 2.5, filesize: 41000, tags: ['door', 'creak'] }, + { id: 2, name: 'Loop NC', license: 'Attribution NonCommercial', username: 'bob', + url: 'https://freesound.org/people/bob/sounds/2/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/2/2_hq.mp3' }, duration: 5, filesize: 80000, tags: [] }, + { id: 3, name: 'Public bell', license: 'Creative Commons 0', username: 'carol', + url: 'https://freesound.org/people/carol/sounds/3/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/3/3_hq.mp3' }, duration: 1, filesize: 16000, tags: [] }, + { id: 4, name: 'Mystery', license: 'Weird Custom License', username: 'dave', + url: 'https://freesound.org/people/dave/sounds/4/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/4/4_hq.mp3' }, duration: 3, filesize: 48000, tags: [] }, + ], +} + 
+describe('freesound provider', () => { + it('maps each license family to audio references', async () => { + const refs = await freesound({ apiKey: 'k' }).search({ text: 'door', modalities: ['audio'], limit: 10 }, ctxJson(RESULTS)) + expect(refs).toHaveLength(4) + const byId = Object.fromEntries(refs.map(r => [r.canonicalUrl, r])) + + const cc = byId['https://freesound.org/people/alice/sounds/1/'] + expect(cc.modality).toBe('audio') + expect(cc.rights.license).toBe('CC-BY') + expect(cc.rights.author).toBe('alice') + expect(cc.preview?.url).toBe('https://cdn.freesound.org/previews/1/1_hq.mp3') + expect(cc.preview?.mediaType).toBe('audio/mpeg') + + const nc = byId['https://freesound.org/people/bob/sounds/2/'] + expect(nc.rights.license).toBe('proprietary') + expect(evaluateUse(nc.rights, 'commercial-product').decision).toBe('denied') + + const cc0 = byId['https://freesound.org/people/carol/sounds/3/'] + expect(cc0.rights.license).toBe('CC0-1.0') + expect(evaluateUse(cc0.rights, 'commercial-product').decision).toBe('allowed') + + const unk = byId['https://freesound.org/people/dave/sounds/4/'] + expect(unk.rights.license).toBe('unknown') + expect(evaluateUse(unk.rights, 'commercial-product').decision).toBe('needs-review') + }) + + it('forwards query, token, and fields; respects limit', async () => { + let url = '' + await freesound({ apiKey: 'secret' }).search( + { text: 'rain', modalities: ['audio'], limit: 7, providerOptions: { sort: 'rating_desc', filter: 'duration:[1 TO 10]' } }, + ctxJson(RESULTS, u => { url = u }), + ) + const u = new URL(url) + expect(u.pathname).toBe('/apiv2/search/text/') + expect(u.searchParams.get('query')).toBe('rain') + expect(u.searchParams.get('token')).toBe('secret') + expect(u.searchParams.get('fields')).toContain('previews') + expect(u.searchParams.get('fields')).toContain('license') + expect(u.searchParams.get('page_size')).toBe('7') + expect(u.searchParams.get('sort')).toBe('rating_desc') + 
export interface FreesoundConfig {
  /** Freesound APIv2 token (https://freesound.org/apiv2/apply). Passed as the
   * `token` query param. The `Authorization: Token <key>` header is the documented
   * equivalent if a future need arises. */
  apiKey: string
}

export interface FreesoundSearchOptions {
  /** Freesound `sort` (e.g. 'score', 'rating_desc', 'downloads_desc', 'created_desc'). */
  sort?: string
  /** Freesound `filter` query (field-scoped Solr-style filter, e.g. 'duration:[1 TO 10]'). */
  filter?: string
  /** 1-based result page; forwarded only when it is a positive integer. */
  page?: number
  /** Results per page; takes precedence over the query's `limit`. Ignored unless it
   * is a positive integer, and clamped to `FREESOUND_MAX_PAGE_SIZE`. */
  pageSize?: number
}

const BASE = 'https://freesound.org/apiv2/search/text/'
// Fields must be requested explicitly — default search responses omit previews/license.
const FIELDS = 'id,name,license,username,previews,url,duration,filesize,tags'
// Upper bound sent as `page_size`.
// NOTE(review): 150 is taken from the Freesound APIv2 docs — confirm it is still current.
const FREESOUND_MAX_PAGE_SIZE = 150
// Page size used when neither `providerOptions.pageSize` nor `q.limit` is usable.
const DEFAULT_PAGE_SIZE = 20

interface FreesoundResult {
  id: number
  name: string
  license: string
  username?: string
  url: string
  previews?: Record<string, string>
  duration?: number
  filesize?: number
  tags?: string[]
}
interface FreesoundResponse { count: number; results: FreesoundResult[] }

/** Convert one Freesound search result into an audio `Reference`. */
function toAudioReference(r: FreesoundResult): Reference {
  const { license, version } = mapFreesoundLicense(r.license)
  const canonicalUrl = r.url
  const rights: RightsRecord = {
    license,
    // version only ever set when the license arrived as a CC deed URL (D7); D4 omits it.
    licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined,
    author: r.username || undefined,
    rehostPolicy: 'cache-allowed',
    raw: { sourceTerms: 'https://freesound.org/help/tos_api/', sourceUrl: canonicalUrl },
  }
  // Prefer the high-quality mp3 preview and fall back to the low-quality one.
  const previewUrl = r.previews?.['preview-hq-mp3'] ?? r.previews?.['preview-lq-mp3']
  return {
    id: referenceId('freesound', canonicalUrl),
    modality: 'audio',
    title: r.name || undefined,
    source: { providerId: 'freesound', sourceUrl: canonicalUrl },
    canonicalUrl,
    rights,
    verifiedAt: new Date().toISOString(),
    // `preview` is left off entirely when the result carries no mp3 preview URL.
    ...(previewUrl ? { preview: { url: previewUrl, mediaType: 'audio/mpeg' } } : {}),
    relevance: 0, // mergeReferences assigns the final RRF relevance
    raw: r,
  }
}

/** True only for integer numbers >= 1. */
function isPositiveInt(value: unknown): value is number {
  return typeof value === 'number' && Number.isInteger(value) && value >= 1
}

/**
 * Pick the `page_size` to send: the first of `requested` / `limit` that is a positive
 * integer, else `DEFAULT_PAGE_SIZE`, capped at `FREESOUND_MAX_PAGE_SIZE`.
 * `requested` arrives through an untyped `providerOptions` cast, so it is validated
 * rather than trusted.
 */
function resolvePageSize(requested: unknown, limit: unknown): number {
  const chosen = [requested, limit].find(isPositiveInt)
  return Math.min(chosen ?? DEFAULT_PAGE_SIZE, FREESOUND_MAX_PAGE_SIZE)
}

/** Set `key` on the URL's query only when `value` is a non-empty string. */
function setIfString(url: URL, key: string, value: unknown) {
  if (typeof value !== 'string' || !value) return
  url.searchParams.set(key, value)
}

/** Set `key` on the URL's query only when `value` is an integer >= 1. */
function setIfPositiveInt(url: URL, key: string, value: unknown) {
  if (!isPositiveInt(value)) return
  url.searchParams.set(key, String(value))
}

/**
 * Create the Freesound audio provider.
 *
 * `search` issues one GET to the APIv2 text-search endpoint, with the API key in the
 * `token` query parameter, and maps every returned result to an audio `Reference`.
 *
 * @param config Provider configuration carrying the Freesound API key.
 * @throws Error from `search` when the HTTP response is not ok (the message includes
 *   the status code).
 */
export function freesound(config: FreesoundConfig) {
  return defineProvider({
    id: 'freesound',
    modalities: ['audio'],
    queryFeatures: ['keyword'],
    capabilities: { controls: [] },
    async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> {
      const opts = q.providerOptions as FreesoundSearchOptions | undefined
      const url = new URL(BASE)
      url.searchParams.set('query', q.text)
      url.searchParams.set('token', config.apiKey)
      url.searchParams.set('fields', FIELDS)
      // Validated and clamped: `pageSize` comes from an untyped cast.
      url.searchParams.set('page_size', String(resolvePageSize(opts?.pageSize, q.limit)))
      setIfString(url, 'sort', opts?.sort)
      setIfString(url, 'filter', opts?.filter)
      setIfPositiveInt(url, 'page', opts?.page)
      const res = await ctx.fetch(url.toString(), { signal: ctx.signal })
      if (!res.ok) throw new Error(`freesound search failed: ${res.status}`)
      const json = (await res.json()) as FreesoundResponse
      // A body without `results` is treated as an empty result set.
      if (!json.results) return []
      return json.results.map(toAudioReference)
    },
  })
}
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, excluding + those notices that do not pertain to any part of the Derivative + Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2026 refkit authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/packages/provider-jamendo/README.md b/packages/provider-jamendo/README.md new file mode 100644 index 0000000..c9365a7 --- /dev/null +++ b/packages/provider-jamendo/README.md @@ -0,0 +1,20 @@ +# @refkit/provider-jamendo + +Search **Jamendo** as license-tagged audio references — a provider satellite for **refkit** (use with [`@refkit/core`](https://www.npmjs.com/package/@refkit/core)). 
+ +- **Source:** Jamendo +- **Auth:** API key +- **Modality:** audio +- **License:** per-item CC + +## Usage + +```ts +import { createRefkit } from '@refkit/core' +import { jamendo } from '@refkit/provider-jamendo' + +const refkit = createRefkit({ providers: [jamendo(/* config */)] }) +const refs = await refkit.search({ query: 'cat', modalities: ['audio'] }) +``` + +Gate by intended use with `refkit.evaluateUse(ref, 'commercial-product')`. See [`@refkit/core`](https://www.npmjs.com/package/@refkit/core) for the full API. diff --git a/packages/provider-jamendo/package.json b/packages/provider-jamendo/package.json new file mode 100644 index 0000000..c04e76c --- /dev/null +++ b/packages/provider-jamendo/package.json @@ -0,0 +1,44 @@ +{ + "name": "@refkit/provider-jamendo", + "version": "0.1.0", + "description": "Jamendo provider satellite for refkit.", + "type": "module", + "license": "Apache-2.0", + "keywords": [ + "refkit", + "reference-retrieval", + "license", + "attribution", + "refkit-provider", + "jamendo" + ], + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "scripts": { + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest watch", + "build": "tsup", + "prepublishOnly": "tsup" + }, + "dependencies": { + "@refkit/core": "workspace:*" + }, + "files": [ + "dist", + "LICENSE" + ], + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + } + } +} diff --git a/packages/provider-jamendo/src/__tests__/jamendo.test.ts b/packages/provider-jamendo/src/__tests__/jamendo.test.ts new file mode 100644 index 0000000..22e4812 --- /dev/null +++ b/packages/provider-jamendo/src/__tests__/jamendo.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from 'vitest' +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { jamendo, mapJamendoLicense } from '../index' + 
+// Jamendo wraps results in { headers, results }. This ctx captures the request URL +// (to assert client_id/search/limit forwarding) and returns the supplied body. +const ctxCapturing = (body: unknown): { ctx: ProviderContext; url: () => string } => { + let captured = '' + const ctx: ProviderContext = { + fetch: (async (input: Parameters<typeof fetch>[0]) => { + captured = String(input) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, + } + return { ctx, url: () => captured } +} + +const envelope = (results: unknown[]) => ({ + headers: { status: 'success', code: 0, error_message: '', results_count: results.length }, + results, +}) + +const TRACK_BY = { + id: '1848357', + name: 'Sunrise', + artist_name: 'fankel', + audio: 'https://prod-1.storage.jamendo.com/?trackid=1848357&format=mp31&from=app-devsite', + audiodownload: 'https://prod-1.storage.jamendo.com/download/track/1848357/mp32/', + image: 'https://usercontent.jamendo.com?type=album&id=368084&width=300&trackid=1848357', + shareurl: 'https://www.jamendo.com/track/1848357', + shorturl: 'https://jamen.do/t/1848357', + license_ccurl: 'http://creativecommons.org/licenses/by/4.0/', +} + +describe('mapJamendoLicense', () => { + it('maps CC-BY and CC-BY-SA with version, NC/ND → proprietary, missing → unknown', () => { + expect(mapJamendoLicense('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapJamendoLicense('https://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + expect(mapJamendoLicense('http://creativecommons.org/licenses/by-nc-nd/3.0/')).toEqual({ license: 'proprietary' }) + expect(mapJamendoLicense('http://creativecommons.org/licenses/by-nc/2.0/')).toEqual({ license: 'proprietary' }) + expect(mapJamendoLicense('http://creativecommons.org/licenses/by-nd/4.0/')).toEqual({ license: 'proprietary' }) + expect(mapJamendoLicense('')).toEqual({ license: 'unknown' }) + 
expect(mapJamendoLicense('https://example.com/whatever')).toEqual({ license: 'unknown' }) + }) +}) + +describe('jamendo provider', () => { + it('maps a CC-BY track to a CC-BY audio reference (allowed-with-attribution)', async () => { + const { ctx } = ctxCapturing(envelope([TRACK_BY])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'sunrise', modalities: ['audio'], limit: 5 }, ctx) + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('audio') + expect(r.rights.license).toBe('CC-BY') + expect(r.rights.licenseVersion).toBe('4.0') + expect(r.rights.author).toBe('fankel') + expect(r.title).toBe('Sunrise') + expect(r.canonicalUrl).toBe('https://www.jamendo.com/track/1848357') + expect(r.preview?.url).toContain('trackid=1848357') + expect(r.preview?.mediaType).toBe('audio/mpeg') + expect(r.thumbnail?.url).toContain('usercontent.jamendo.com') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed-with-attribution') + }) +}) diff --git a/packages/provider-jamendo/src/index.ts b/packages/provider-jamendo/src/index.ts new file mode 100644 index 0000000..9865146 --- /dev/null +++ b/packages/provider-jamendo/src/index.ts @@ -0,0 +1,141 @@ +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +export interface JamendoConfig { + /** Jamendo API client_id (BYOK). Register at https://devportal.jamendo.com/. */ + clientId: string +} + +export interface JamendoSearchOptions { + /** mp3 stream quality. Default 'mp31' (96 kbps). */ + audioformat?: 'mp31' | 'mp32' | 'ogg' | 'flac' + order?: 'relevance' | 'popularity_total' | 'popularity_month' | 'popularity_week' | 'releasedate_asc' | 'releasedate_desc' | 'buzzrate' + /** Restrict to tracks whose license permits a given use, server-side. Relevance + * hint only — mapJamendoLicense below is the authoritative rights gate. 
const BASE = 'https://api.jamendo.com/v3.0/tracks/'

// The `audioformat` request param decides what `t.audio` streams; reflect it in mediaType
// rather than hardcoding audio/mpeg (which would mislabel ogg/flac requests).
const JAMENDO_AUDIO_MIME: Record<string, string> = {
  mp31: 'audio/mpeg', mp32: 'audio/mpeg', ogg: 'audio/ogg', flac: 'audio/flac',
}

interface JamendoTrack {
  id: string
  name: string
  artist_name: string
  audio: string
  audiodownload?: string
  image: string
  shareurl: string
  shorturl?: string
  license_ccurl: string
}
interface JamendoResponse {
  headers: { status: string; code: number; error_message?: string; results_count: number }
  results: JamendoTrack[]
}

/**
 * Map a Jamendo `license_ccurl` deed URL to our license id plus CC version.
 *
 * Jamendo deed URLs look like http(s)://creativecommons.org/licenses/<variant>/<v>/.
 * Only by / by-sa fit our enum (D5) and carry the version (D7). Any nc/nd variant is
 * non-commercial or no-derivatives → 'proprietary'. Missing or unrecognized → 'unknown'.
 *
 * @param ccurl Raw `license_ccurl` value of a track.
 * @returns `{ license, version? }`; `version` is present only for CC-BY / CC-BY-SA.
 */
export function mapJamendoLicense(ccurl: string): { license: LicenseId; version?: string } {
  if (!ccurl) return { license: 'unknown' }
  const by = ccurl.match(/\/licenses\/by\/(\d\.\d)\//)
  if (by) return { license: 'CC-BY', version: by[1] }
  const bySa = ccurl.match(/\/licenses\/by-sa\/(\d\.\d)\//)
  if (bySa) return { license: 'CC-BY-SA', version: bySa[1] }
  if (/\/licenses\/by-(nc|nd)/.test(ccurl)) return { license: 'proprietary' }
  return { license: 'unknown' }
}

/**
 * Convert one Jamendo track into an audio `Reference`.
 *
 * @param t Track object from the Jamendo response.
 * @param mediaType MIME type to report for the `t.audio` stream.
 */
function toAudioReference(t: JamendoTrack, mediaType: string): Reference {
  const { license, version } = mapJamendoLicense(t.license_ccurl)
  const canonicalUrl = t.shareurl
  const rights: RightsRecord = {
    license,
    // CC version is metadata only (attribution/audit), kept for the BY/BY-SA family.
    licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined,
    author: t.artist_name || undefined,
    // governed by the per-item CC license; the mp3 stream is served directly by Jamendo
    rehostPolicy: 'cache-allowed',
    raw: { sourceTerms: t.license_ccurl, sourceUrl: canonicalUrl },
  }
  return {
    id: referenceId('jamendo', canonicalUrl),
    modality: 'audio',
    title: t.name || undefined,
    source: { providerId: 'jamendo', sourceUrl: canonicalUrl },
    canonicalUrl,
    rights,
    verifiedAt: new Date().toISOString(),
    // audio has no native thumbnail; the album art is the closest visual handle
    ...(t.image ? { thumbnail: { url: t.image } } : {}),
    // Guarded like the thumbnail: a track with an empty `audio` URL gets no preview
    // instead of a preview pointing at ''.
    ...(t.audio ? { preview: { url: t.audio, mediaType } } : {}),
    relevance: 0, // per-source order; mergeReferences assigns the final RRF relevance
    raw: t,
  }
}

/** Set `key` when `value` is a non-empty string and, if `allowed` is given, is one of
 * the allowed values. Anything else leaves the URL untouched. */
function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) {
  if (typeof value !== 'string' || !value) return
  if (allowed && !allowed.includes(value)) return
  url.searchParams.set(key, value)
}

/** Set `key` from a non-empty string, or from a non-empty array of non-empty strings
 * joined with single spaces. Anything else leaves the URL untouched. */
function setIfStringList(url: URL, key: string, value: unknown) {
  if (typeof value === 'string' && value) url.searchParams.set(key, value)
  if (Array.isArray(value) && value.length > 0 && value.every(v => typeof v === 'string' && v)) url.searchParams.set(key, value.join(' '))
}
'true' : 'false') +} + +function setIfPositiveInt(url: URL, key: string, value: unknown) { + if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) return + url.searchParams.set(key, String(value)) +} + +export function jamendo(config: JamendoConfig) { + return defineProvider({ + id: 'jamendo', + modalities: ['audio'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(BASE) + url.searchParams.set('client_id', config.clientId) + url.searchParams.set('format', 'json') + url.searchParams.set('search', q.text) + url.searchParams.set('limit', String(Math.min(q.limit ?? 20, 200))) + const opts = q.providerOptions as JamendoSearchOptions | undefined + setIfString(url, 'audioformat', opts?.audioformat, ['mp31', 'mp32', 'ogg', 'flac']) + setIfString(url, 'order', opts?.order, ['relevance', 'popularity_total', 'popularity_month', 'popularity_week', 'releasedate_asc', 'releasedate_desc', 'buzzrate']) + setIfBooleanFlag(url, 'ccsa', opts?.ccsa) + setIfBooleanFlag(url, 'ccnd', opts?.ccnd) + setIfBooleanFlag(url, 'ccnc', opts?.ccnc) + setIfStringList(url, 'tags', opts?.tags) + setIfString(url, 'artist_name', opts?.artist_name) + setIfPositiveInt(url, 'offset', opts?.offset) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`jamendo search failed: ${res.status}`) + const json = (await res.json()) as JamendoResponse + if (json.headers?.status !== 'success') throw new Error(`jamendo search error: ${json.headers?.error_message || json.headers?.status}`) + const mediaType = JAMENDO_AUDIO_MIME[opts?.audioformat ?? 'mp31'] ?? 'audio/mpeg' + return (json.results ?? 
[]).map((t) => toAudioReference(t, mediaType)) + }, + }) +} diff --git a/packages/provider-jamendo/tsconfig.json b/packages/provider-jamendo/tsconfig.json new file mode 100644 index 0000000..98922fe --- /dev/null +++ b/packages/provider-jamendo/tsconfig.json @@ -0,0 +1,5 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "outDir": "out", "rootDir": "src", "types": ["node"] }, + "include": ["src/**/*"] +} diff --git a/packages/provider-jamendo/tsup.config.ts b/packages/provider-jamendo/tsup.config.ts new file mode 100644 index 0000000..af0ad13 --- /dev/null +++ b/packages/provider-jamendo/tsup.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from 'tsup' + +export default defineConfig({ + entry: ['src/index.ts'], + format: ['esm'], + dts: true, + clean: true, + outDir: 'dist', + sourcemap: true, +}) diff --git a/packages/provider-jamendo/vitest.config.ts b/packages/provider-jamendo/vitest.config.ts new file mode 100644 index 0000000..4407b8d --- /dev/null +++ b/packages/provider-jamendo/vitest.config.ts @@ -0,0 +1,2 @@ +import { defineConfig } from 'vitest/config' +export default defineConfig({ test: { name: 'provider-jamendo', environment: 'node', include: ['src/**/*.{test,spec}.ts'] } }) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index dd530cd..6be6da1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -137,6 +137,12 @@ importers: specifier: workspace:* version: link:../provider-poetrydb + packages/provider-jamendo: + dependencies: + '@refkit/core': + specifier: workspace:* + version: link:../core + packages/provider-met: dependencies: '@refkit/core': From cb5279c69786bf2d892a2a7544e818cced0313f1 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:35:13 +0800 Subject: [PATCH 09/34] =?UTF-8?q?test(provider-jamendo):=20NC/ND=20?= =?UTF-8?q?=E2=86=92=20proprietary=20denied=20for=20commercial?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.../src/__tests__/jamendo.test.ts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/packages/provider-jamendo/src/__tests__/jamendo.test.ts b/packages/provider-jamendo/src/__tests__/jamendo.test.ts index 22e4812..dd2c0bf 100644 --- a/packages/provider-jamendo/src/__tests__/jamendo.test.ts +++ b/packages/provider-jamendo/src/__tests__/jamendo.test.ts @@ -61,4 +61,21 @@ describe('jamendo provider', () => { expect(r.thumbnail?.url).toContain('usercontent.jamendo.com') expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed-with-attribution') }) + + const TRACK_NC = { + ...TRACK_BY, + id: '2000001', + name: 'For Listening Only', + license_ccurl: 'http://creativecommons.org/licenses/by-nc-nd/3.0/', + shareurl: 'https://www.jamendo.com/track/2000001', + } + + it('maps a CC-BY-NC-ND track to proprietary → denied for commercial use', async () => { + const { ctx } = ctxCapturing(envelope([TRACK_NC])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'listen', modalities: ['audio'] }, ctx) + expect(refs).toHaveLength(1) + expect(refs[0].rights.license).toBe('proprietary') + expect(refs[0].rights.licenseVersion).toBeUndefined() + expect(evaluateUse(refs[0].rights, 'commercial-product').decision).toBe('denied') + }) }) From 4824064bb822abac40d331d7ff2d20bdcbd0205c Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:35:36 +0800 Subject: [PATCH 10/34] =?UTF-8?q?test(provider-jamendo):=20missing/unknown?= =?UTF-8?q?=20ccurl=20=E2=86=92=20needs-review?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/__tests__/jamendo.test.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/packages/provider-jamendo/src/__tests__/jamendo.test.ts b/packages/provider-jamendo/src/__tests__/jamendo.test.ts index dd2c0bf..29e6161 100644 --- a/packages/provider-jamendo/src/__tests__/jamendo.test.ts +++ 
b/packages/provider-jamendo/src/__tests__/jamendo.test.ts @@ -78,4 +78,20 @@ describe('jamendo provider', () => { expect(refs[0].rights.licenseVersion).toBeUndefined() expect(evaluateUse(refs[0].rights, 'commercial-product').decision).toBe('denied') }) + + const TRACK_NO_LICENSE = { + ...TRACK_BY, + id: '3000002', + name: 'Mystery Track', + license_ccurl: '', + shareurl: 'https://www.jamendo.com/track/3000002', + } + + it('maps a track with no recognizable license to unknown → needs-review', async () => { + const { ctx } = ctxCapturing(envelope([TRACK_NO_LICENSE])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'mystery', modalities: ['audio'] }, ctx) + expect(refs).toHaveLength(1) + expect(refs[0].rights.license).toBe('unknown') + expect(evaluateUse(refs[0].rights, 'commercial-product').decision).toBe('needs-review') + }) }) From ceb283f1bd22365d3acaca44875b4d92b7e99885 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:36:13 +0800 Subject: [PATCH 11/34] test(provider-jamendo): request forwarding (client_id/search/limit/options) --- .../src/__tests__/jamendo.test.ts | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/packages/provider-jamendo/src/__tests__/jamendo.test.ts b/packages/provider-jamendo/src/__tests__/jamendo.test.ts index 29e6161..af5bbdb 100644 --- a/packages/provider-jamendo/src/__tests__/jamendo.test.ts +++ b/packages/provider-jamendo/src/__tests__/jamendo.test.ts @@ -94,4 +94,31 @@ describe('jamendo provider', () => { expect(refs[0].rights.license).toBe('unknown') expect(evaluateUse(refs[0].rights, 'commercial-product').decision).toBe('needs-review') }) + + it('forwards client_id, search, limit, format and documented options', async () => { + const { ctx, url } = ctxCapturing(envelope([])) + await jamendo({ clientId: 'my-client-id' }).search({ + text: 'ambient', + modalities: ['audio'], + limit: 7, + providerOptions: { audioformat: 'mp32', order: 
'popularity_total', ccnc: false, tags: ['ambient', 'chill'], artist_name: 'fankel', offset: 20 }, + }, ctx) + const u = new URL(url()) + expect(u.searchParams.get('client_id')).toBe('my-client-id') + expect(u.searchParams.get('format')).toBe('json') + expect(u.searchParams.get('search')).toBe('ambient') + expect(u.searchParams.get('limit')).toBe('7') + expect(u.searchParams.get('audioformat')).toBe('mp32') + expect(u.searchParams.get('order')).toBe('popularity_total') + expect(u.searchParams.get('ccnc')).toBe('false') + expect(u.searchParams.get('tags')).toBe('ambient chill') + expect(u.searchParams.get('artist_name')).toBe('fankel') + expect(u.searchParams.get('offset')).toBe('20') + }) + + it('returns [] when Jamendo finds nothing', async () => { + const { ctx } = ctxCapturing(envelope([])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'zzzz', modalities: ['audio'] }, ctx) + expect(refs).toEqual([]) + }) }) From 9cea671ce67102f26bc3966a17703fa28ded8a6f Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:42:07 +0800 Subject: [PATCH 12/34] feat(provider-europeana): scaffold + edm:rights mapper --- packages/provider-europeana/LICENSE | 201 ++++++++++++++++++ packages/provider-europeana/README.md | 22 ++ packages/provider-europeana/package.json | 44 ++++ .../src/__tests__/europeana.test.ts | 33 +++ packages/provider-europeana/src/index.ts | 45 ++++ packages/provider-europeana/tsconfig.json | 5 + packages/provider-europeana/tsup.config.ts | 10 + packages/provider-europeana/vitest.config.ts | 2 + pnpm-lock.yaml | 6 + 9 files changed, 368 insertions(+) create mode 100644 packages/provider-europeana/LICENSE create mode 100644 packages/provider-europeana/README.md create mode 100644 packages/provider-europeana/package.json create mode 100644 packages/provider-europeana/src/__tests__/europeana.test.ts create mode 100644 packages/provider-europeana/src/index.ts create mode 100644 
packages/provider-europeana/tsconfig.json create mode 100644 packages/provider-europeana/tsup.config.ts create mode 100644 packages/provider-europeana/vitest.config.ts diff --git a/packages/provider-europeana/LICENSE b/packages/provider-europeana/LICENSE new file mode 100644 index 0000000..c1c4eb0 --- /dev/null +++ b/packages/provider-europeana/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, excluding + those notices that do not pertain to any part of the Derivative + Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2026 refkit authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/packages/provider-europeana/README.md b/packages/provider-europeana/README.md new file mode 100644 index 0000000..0706906 --- /dev/null +++ b/packages/provider-europeana/README.md @@ -0,0 +1,22 @@ +# @refkit/provider-europeana + +Search **Europeana** as license-tagged image references — a provider satellite for **refkit** (use with [`@refkit/core`](https://www.npmjs.com/package/@refkit/core)). + +- **Source:** Europeana +- **Auth:** API key +- **Modality:** image +- **License:** per-item CC / PD / rights-statement + +> v1 returns images only (`type=IMAGE`); audio/video/text records are a planned follow-up. Media is hotlinked from third-party data providers — cache/rehost is not permitted (`rehostPolicy: 'hotlink-required'`). + +## Usage + +```ts +import { createRefkit } from '@refkit/core' +import { europeana } from '@refkit/provider-europeana' + +const refkit = createRefkit({ providers: [europeana(/* config */)] }) +const refs = await refkit.search({ query: 'cat', modalities: ['image'] }) +``` + +Gate by intended use with `refkit.evaluateUse(ref, 'commercial-product')`. See [`@refkit/core`](https://www.npmjs.com/package/@refkit/core) for the full API. 
diff --git a/packages/provider-europeana/package.json b/packages/provider-europeana/package.json new file mode 100644 index 0000000..29e87e2 --- /dev/null +++ b/packages/provider-europeana/package.json @@ -0,0 +1,44 @@ +{ + "name": "@refkit/provider-europeana", + "version": "0.1.0", + "description": "Europeana provider satellite for refkit.", + "type": "module", + "license": "Apache-2.0", + "keywords": [ + "refkit", + "reference-retrieval", + "license", + "attribution", + "refkit-provider", + "europeana" + ], + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "scripts": { + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest watch", + "build": "tsup", + "prepublishOnly": "tsup" + }, + "dependencies": { + "@refkit/core": "workspace:*" + }, + "files": [ + "dist", + "LICENSE" + ], + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + } + } +} diff --git a/packages/provider-europeana/src/__tests__/europeana.test.ts b/packages/provider-europeana/src/__tests__/europeana.test.ts new file mode 100644 index 0000000..3461127 --- /dev/null +++ b/packages/provider-europeana/src/__tests__/europeana.test.ts @@ -0,0 +1,33 @@ +import { describe, expect, it } from 'vitest' +import { mapEuropeanaRights } from '../index' + +describe('mapEuropeanaRights', () => { + it('maps CC0 and Public Domain Mark to open licenses (no version)', () => { + expect(mapEuropeanaRights('http://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapEuropeanaRights('http://creativecommons.org/publicdomain/mark/1.0/')).toEqual({ license: 'PD' }) + }) + + it('maps CC-BY / CC-BY-SA and captures the version', () => { + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + 
expect(mapEuropeanaRights('https://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + }) + + it('maps NC / ND variants to proprietary (not an open grant)', () => { + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by-nc/4.0/')).toEqual({ license: 'proprietary' }) + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by-nc-sa/4.0/')).toEqual({ license: 'proprietary' }) + expect(mapEuropeanaRights('http://creativecommons.org/licenses/by-nd/4.0/')).toEqual({ license: 'proprietary' }) + }) + + it('maps rightsstatements.org faithfully: InC→proprietary, NoC-US→PD+US, NoC-NC→proprietary', () => { + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/NoC-US/1.0/')).toEqual({ license: 'PD', jurisdiction: 'US' }) + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/NoC-NC/1.0/')).toEqual({ license: 'proprietary' }) + }) + + it('maps opaque/undetermined rightsstatements + empty/unrecognized to unknown', () => { + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/NoC-OKLR/1.0/')).toEqual({ license: 'unknown' }) + expect(mapEuropeanaRights('http://rightsstatements.org/vocab/CNE/1.0/')).toEqual({ license: 'unknown' }) + expect(mapEuropeanaRights('')).toEqual({ license: 'unknown' }) + expect(mapEuropeanaRights('http://example.org/some-other-license')).toEqual({ license: 'unknown' }) + }) +}) diff --git a/packages/provider-europeana/src/index.ts b/packages/provider-europeana/src/index.ts new file mode 100644 index 0000000..2910cb6 --- /dev/null +++ b/packages/provider-europeana/src/index.ts @@ -0,0 +1,45 @@ +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +const BASE = 'https://api.europeana.eu/record/v2/search.json' + +/** Map a Europeana `edm:rights` 
controlled-vocabulary URI to a core license id (+ CC version, + * + jurisdiction for jurisdiction-scoped PD). Conservative (D5): only clearly-open CC deeds and + * PD/CC0 become open grants; CC NC/ND → proprietary; rightsstatements.org is mapped faithfully + * per token (see below); anything unrecognized/empty → unknown. */ +// rightsstatements.org is a rights-STATUS vocabulary (not license grants). Map each token +// FAITHFULLY (index D5-style): InC* → proprietary (copyrighted, no grant); NoC-US → PD scoped +// to the US via the jurisdiction field; NoC-NC → proprietary (non-commercial → commercial out); +// opaque/undetermined (NoC-OKLR/CR, CNE, UND, NKC) → unknown. (This mirrors core `mapRightsUrl`; +// the helper-refactor Task 4 replaces this inlined copy with that import.) +const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = { + 'inc': { license: 'proprietary' }, 'inc-ow-eu': { license: 'proprietary' }, 'inc-edu': { license: 'proprietary' }, + 'inc-nc': { license: 'proprietary' }, 'inc-ruu': { license: 'proprietary' }, + 'noc-us': { license: 'PD', jurisdiction: 'US' }, + 'noc-nc': { license: 'proprietary' }, + 'noc-oklr': { license: 'unknown' }, 'noc-cr': { license: 'unknown' }, + 'cne': { license: 'unknown' }, 'und': { license: 'unknown' }, 'nkc': { license: 'unknown' }, +} + +export function mapEuropeanaRights(uri: string): { license: LicenseId; version?: string; jurisdiction?: string } { + const u = (uri || '').toLowerCase() + if (!u) return { license: 'unknown' } + // rightsstatements.org — faithful per-token mapping (not blanket unknown). + const rs = u.match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) + if (rs) return RIGHTS_STATEMENT[rs[1]] ?? { license: 'unknown' } + // Public domain dedications / marks (no version surfaced). 
+ if (u.includes('creativecommons.org/publicdomain/zero')) return { license: 'CC0-1.0' } + if (u.includes('creativecommons.org/publicdomain/mark')) return { license: 'PD' } + // Non-commercial / no-derivatives variants are NOT open grants → proprietary. + // Checked before plain by/by-sa because "by-nc-sa" contains "by-sa". + if (/creativecommons\.org\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } + // Open CC deeds: capture the version (D7) for the attribution families only. + const bySa = u.match(/creativecommons\.org\/licenses\/by-sa\/(\d\.\d)/) + if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } + const by = u.match(/creativecommons\.org\/licenses\/by\/(\d\.\d)/) + if (by) return { license: 'CC-BY', version: by[1] } + return { license: 'unknown' } +} diff --git a/packages/provider-europeana/tsconfig.json b/packages/provider-europeana/tsconfig.json new file mode 100644 index 0000000..98922fe --- /dev/null +++ b/packages/provider-europeana/tsconfig.json @@ -0,0 +1,5 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "outDir": "out", "rootDir": "src", "types": ["node"] }, + "include": ["src/**/*"] +} diff --git a/packages/provider-europeana/tsup.config.ts b/packages/provider-europeana/tsup.config.ts new file mode 100644 index 0000000..af0ad13 --- /dev/null +++ b/packages/provider-europeana/tsup.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from 'tsup' + +export default defineConfig({ + entry: ['src/index.ts'], + format: ['esm'], + dts: true, + clean: true, + outDir: 'dist', + sourcemap: true, +}) diff --git a/packages/provider-europeana/vitest.config.ts b/packages/provider-europeana/vitest.config.ts new file mode 100644 index 0000000..50c0303 --- /dev/null +++ b/packages/provider-europeana/vitest.config.ts @@ -0,0 +1,2 @@ +import { defineConfig } from 'vitest/config' +export default defineConfig({ test: { name: 'provider-europeana', environment: 'node', include: ['src/**/*.{test,spec}.ts'] } }) diff --git 
a/pnpm-lock.yaml b/pnpm-lock.yaml index 6be6da1..a9f8bc0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -115,6 +115,12 @@ importers: specifier: workspace:* version: link:../provider-openverse + packages/provider-europeana: + dependencies: + '@refkit/core': + specifier: workspace:* + version: link:../core + packages/provider-flickr: dependencies: '@refkit/core': From 643fd750e8d6aa3bb6a62389760acfb12dac4af4 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:43:51 +0800 Subject: [PATCH 13/34] feat(provider-europeana): toReference mapper (image-only, hotlink rehost) --- .../src/__tests__/europeana.test.ts | 123 ++++++++++++++++++ packages/provider-europeana/src/index.ts | 119 +++++++++++++++++ 2 files changed, 242 insertions(+) diff --git a/packages/provider-europeana/src/__tests__/europeana.test.ts b/packages/provider-europeana/src/__tests__/europeana.test.ts index 3461127..49389a0 100644 --- a/packages/provider-europeana/src/__tests__/europeana.test.ts +++ b/packages/provider-europeana/src/__tests__/europeana.test.ts @@ -31,3 +31,126 @@ describe('mapEuropeanaRights', () => { expect(mapEuropeanaRights('http://example.org/some-other-license')).toEqual({ license: 'unknown' }) }) }) + +import { evaluateUse, type ProviderContext } from '@refkit/core' +import { europeana } from '../index' + +// Realistic Europeana Search API item shapes. Note every metadata field is an +// array; id/type/guid are scalars. id is "/datasetId/recordId" with a leading slash. 
+const ITEM_CC0 = { + id: '/2048128/europeana_fashion_12345', + type: 'IMAGE', + title: ['A Painted Fan'], + dataProvider: ['Rijksmuseum'], + provider: ['Europeana Fashion'], + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/cc0thumb.jpg'], + edmIsShownBy: ['https://images.example.org/cc0-full.jpg'], + edmIsShownAt: ['https://www.rijksmuseum.nl/item/cc0'], + rights: ['http://creativecommons.org/publicdomain/zero/1.0/'], +} +const ITEM_BY_SA = { + id: '/9876543/abc_xyz', + type: 'IMAGE', + title: ['A Photographed Statue'], + dataProvider: ['Some Museum'], + provider: ['Some Aggregator'], + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/bysathumb.jpg'], + edmIsShownBy: ['https://images.example.org/bysa-full.jpg'], + edmIsShownAt: ['https://museum.example.org/item/bysa'], + rights: ['https://creativecommons.org/licenses/by-sa/3.0/'], +} +const ITEM_INC = { + id: '/111/in_copyright', + type: 'IMAGE', + title: ['A Modern Photo'], + dataProvider: ['Living Archive'], + provider: ['Aggregator'], + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/incthumb.jpg'], + edmIsShownBy: ['https://images.example.org/inc-full.jpg'], + edmIsShownAt: ['https://archive.example.org/item/inc'], + rights: ['http://rightsstatements.org/vocab/InC/1.0/'], +} + +const okCtx = (items: unknown[]): ProviderContext => ({ + fetch: (async () => + new Response(JSON.stringify({ success: true, itemsCount: items.length, totalResults: items.length, items }), { status: 200 }) + ) as typeof fetch, +}) + +describe('europeana toReference', () => { + it('maps a CC0 image to an allowed reference with hotlink rehost policy', async () => { + const refs = await europeana({ apiKey: 'k' }).search({ text: 'fan', modalities: ['image'], limit: 5 }, okCtx([ITEM_CC0])) + expect(refs).toHaveLength(1) + const r = refs[0] + expect(r.modality).toBe('image') + expect(r.title).toBe('A Painted Fan') + expect(r.rights.license).toBe('CC0-1.0') + expect(r.rights.rehostPolicy).toBe('hotlink-required') + 
expect(r.canonicalUrl).toBe('https://www.europeana.eu/item/2048128/europeana_fashion_12345') + expect(r.preview?.url).toBe('https://images.example.org/cc0-full.jpg') // from edmIsShownBy + expect(r.thumbnail?.url).toBe('https://api.europeana.eu/thumbnail/v3/200/cc0thumb.jpg') // from edmPreview + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + }) + + it('preserves the CC-BY-SA version and gates to allowed-with-attribution', async () => { + const refs = await europeana({ apiKey: 'k' }).search({ text: 'statue', modalities: ['image'] }, okCtx([ITEM_BY_SA])) + const r = refs[0] + expect(r.rights.license).toBe('CC-BY-SA') + expect(r.rights.licenseVersion).toBe('3.0') + expect(r.rights.rehostPolicy).toBe('hotlink-required') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed-with-attribution') + }) + + it('maps an in-copyright (InC) rights statement to proprietary → denied', async () => { + const refs = await europeana({ apiKey: 'k' }).search({ text: 'photo', modalities: ['image'] }, okCtx([ITEM_INC])) + const r = refs[0] + expect(r.rights.license).toBe('proprietary') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('denied') + }) + + it('maps NoC-US to PD scoped to the US (allowed by default; jurisdiction-aware callers gate)', async () => { + const nocUs = { ...ITEM_CC0, id: '/x/noc_us', rights: ['http://rightsstatements.org/vocab/NoC-US/1.0/'] } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([nocUs])) + const r = refs[0] + expect(r.rights.license).toBe('PD') + expect(r.rights.jurisdiction).toBe('US') + expect(evaluateUse(r.rights, 'commercial-product').decision).toBe('allowed') + // a caller whose jurisdiction differs from the source's is deferred to review: + expect(evaluateUse(r.rights, 'commercial-product', { userJurisdiction: 'DE' }).decision).toBe('needs-review') + }) + + it('drops non-IMAGE items and items with no usable media at all', 
async () => { + const sound = { ...ITEM_CC0, id: '/x/sound', type: 'SOUND' } + const noMedia = { ...ITEM_CC0, id: '/x/nomedia', edmIsShownBy: [], edmIsShownAt: [], edmPreview: [] } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([sound, noMedia, ITEM_CC0])) + expect(refs).toHaveLength(1) + expect(refs[0].canonicalUrl).toBe('https://www.europeana.eu/item/2048128/europeana_fashion_12345') + }) + + it('never uses edmIsShownAt (a landing page) as preview; keeps the item via its thumbnail', async () => { + // No media resource, only a landing PAGE + a Europeana thumbnail image. + const pageOnly = { + ...ITEM_CC0, + id: '/x/page_only', + edmIsShownBy: [], + edmIsShownAt: ['https://www.rijksmuseum.nl/en/collection/SK-A-1'], // a web page, NOT an image + edmPreview: ['https://api.europeana.eu/thumbnail/v3/200/pagethumb.jpg'], + } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([pageOnly])) + expect(refs).toHaveLength(1) + expect(refs[0].preview).toBeUndefined() // the landing page is never surfaced as media + expect(refs[0].thumbnail?.url).toBe('https://api.europeana.eu/thumbnail/v3/200/pagethumb.jpg') + }) + + it('reads ebucoreHasMimeType for the preview media type when the URL has no extension', async () => { + const png = { + ...ITEM_CC0, + id: '/x/png', + edmIsShownBy: ['https://images.example.org/no-extension'], + ebucoreHasMimeType: ['image/png'], + } + const refs = await europeana({ apiKey: 'k' }).search({ text: 'x', modalities: ['image'] }, okCtx([png])) + expect(refs[0].preview?.url).toBe('https://images.example.org/no-extension') + expect(refs[0].preview?.mediaType).toBe('image/png') + }) +}) diff --git a/packages/provider-europeana/src/index.ts b/packages/provider-europeana/src/index.ts index 2910cb6..af45abe 100644 --- a/packages/provider-europeana/src/index.ts +++ b/packages/provider-europeana/src/index.ts @@ -43,3 +43,122 @@ export function 
mapEuropeanaRights(uri: string): { license: LicenseId; version?: if (by) return { license: 'CC-BY', version: by[1] } return { license: 'unknown' } } + +export interface EuropeanaConfig { + /** Free BYOK Europeana API key (sent as the `wskey` query param). */ + apiKey: string +} + +interface EuropeanaItem { + id: string + type?: string + title?: string[] + dataProvider?: string[] + provider?: string[] + edmPreview?: string[] + edmIsShownBy?: string[] + edmIsShownAt?: string[] + /** MIME type of the media resource when the record declares it. */ + ebucoreHasMimeType?: string[] + rights?: string[] +} +interface EuropeanaResponse { success?: boolean; items?: EuropeanaItem[] } + +/** First element of an array-typed Europeana field, or undefined. */ +function first<T>(arr: T[] | undefined): T | undefined { + return Array.isArray(arr) && arr.length > 0 ? arr[0] : undefined +} + +// edmIsShownBy is the MEDIA resource; edmIsShownAt is a LANDING PAGE (a web page, not +// an image) — it must never become preview.url. The record usually tells us the media +// type (ebucoreHasMimeType); otherwise fall back to a URL-string heuristic (no network — +// `core` never fetches bytes, and a probe would add a request per item). +const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i + +/** URL-string heuristic only (no network): does this look like an image resource? */ +function isLikelyImageUrl(url: string): boolean { + return IMAGE_EXT.test(url) || /iiif/i.test(url) || /\/thumbnail\//i.test(url) +} + +/** Best image mediaType: the declared MIME if it's an image, else inferred from the + * URL extension, else a safe default. */ +function imageMediaType(mime: string | undefined, url: string): string { + if (mime && mime.startsWith('image/')) return mime + const m = url.match(IMAGE_EXT) + if (m) { const e = m[1].toLowerCase(); return e === 'jpg' ? 'image/jpeg' : `image/${e === 'tif' ? 
'tiff' : e}` } + return 'image/jpeg' +} + +function toReference(it: EuropeanaItem): Reference | null { + // v1 image-only scope (D1): defensively re-check type even though the search is + // server-filtered with qf=TYPE:IMAGE. + if (it.type && it.type !== 'IMAGE') return null + if (!it.id) return null + + // id is "/datasetId/recordId" (leading slash) → canonical Europeana item page. + const canonicalUrl = `https://www.europeana.eu/item${it.id}` + + // preview = the actual IMAGE media (edmIsShownBy) ONLY — NEVER edmIsShownAt, which is + // a landing web page. Trust edmIsShownBy when the record's MIME says image/*, or the + // URL looks like an image, or no MIME contradicts it (type is already IMAGE). thumbnail + // = edmPreview (Europeana's own thumbnail image service — reliable). Drop the item only + // when there is neither a usable preview nor a thumbnail (nothing visual to surface). + const shownBy = first(it.edmIsShownBy) + const mime = first(it.ebucoreHasMimeType) + const thumbUrl = first(it.edmPreview) + const previewUrl = shownBy && (mime?.startsWith('image/') || isLikelyImageUrl(shownBy) || !mime) + ? shownBy + : undefined + if (!previewUrl && !thumbUrl) return null + + const rightsUri = first(it.rights) ?? '' + const { license, version, jurisdiction } = mapEuropeanaRights(rightsUri) + + const rights: RightsRecord = { + license, + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + // jurisdiction-scoped PD (e.g. NoC-US → PD in the US); metadata for evaluateUse. + ...(jurisdiction ? { jurisdiction } : {}), + author: first(it.dataProvider) ?? first(it.provider) ?? undefined, + // D6: media is hotlinked from data providers — caching/rehosting not permitted. 
+ rehostPolicy: 'hotlink-required', + raw: { sourceTerms: rightsUri || 'https://www.europeana.eu/rights', sourceUrl: canonicalUrl }, + } + return { + id: referenceId('europeana', canonicalUrl), + modality: 'image', + title: first(it.title) || undefined, + source: { providerId: 'europeana', sourceUrl: canonicalUrl }, + canonicalUrl, + rights, + verifiedAt: new Date().toISOString(), + ...(thumbUrl ? { thumbnail: { url: thumbUrl } } : {}), + ...(previewUrl ? { preview: { url: previewUrl, mediaType: imageMediaType(mime, previewUrl) } } : {}), + relevance: 0, + raw: it, + } +} + +export function europeana(config: EuropeanaConfig) { + return defineProvider({ + id: 'europeana', + modalities: ['image'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(BASE) + url.searchParams.set('wskey', config.apiKey) + url.searchParams.set('query', q.text) + url.searchParams.set('rows', String(q.limit ?? 
20)) + url.searchParams.set('media', 'true') // only items that actually carry media + url.searchParams.set('qf', 'TYPE:IMAGE') // v1 image-only scope (D1) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`europeana search failed: ${res.status}`) + const json = (await res.json()) as EuropeanaResponse + if (!json.items || json.items.length === 0) return [] + return json.items + .map(toReference) + .filter((r): r is Reference => r !== null) + }, + }) +} From 8f89f7b6cfe263e7d6018224154b2acfbc154d75 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:44:41 +0800 Subject: [PATCH 14/34] feat(provider-europeana): search + factory wiring --- .../src/__tests__/europeana.test.ts | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/packages/provider-europeana/src/__tests__/europeana.test.ts b/packages/provider-europeana/src/__tests__/europeana.test.ts index 49389a0..98ba586 100644 --- a/packages/provider-europeana/src/__tests__/europeana.test.ts +++ b/packages/provider-europeana/src/__tests__/europeana.test.ts @@ -154,3 +154,36 @@ describe('europeana toReference', () => { expect(refs[0].preview?.mediaType).toBe('image/png') }) }) + +describe('europeana search request', () => { + it('sets wskey, query, rows, and the image/media filters', async () => { + let url = '' + const ctx: ProviderContext = { + fetch: (async (input: Parameters<typeof fetch>[0]) => { + url = String(input) + return new Response(JSON.stringify({ success: true, items: [] }), { status: 200 }) + }) as typeof fetch, + } + await europeana({ apiKey: 'my-key' }).search({ text: 'tulips', modalities: ['image'], limit: 7 }, ctx) + const u = new URL(url) + expect(u.searchParams.get('wskey')).toBe('my-key') + expect(u.searchParams.get('query')).toBe('tulips') + expect(u.searchParams.get('rows')).toBe('7') + expect(u.searchParams.get('media')).toBe('true') + 
expect(u.searchParams.get('qf')).toBe('TYPE:IMAGE') + }) + + it('returns [] when the API yields no items', async () => { + const ctx: ProviderContext = { + fetch: (async () => new Response(JSON.stringify({ success: true, items: [] }), { status: 200 })) as typeof fetch, + } + expect(await europeana({ apiKey: 'k' }).search({ text: 'zzz', modalities: ['image'] }, ctx)).toEqual([]) + }) + + it('throws on a non-ok HTTP status', async () => { + const ctx: ProviderContext = { + fetch: (async () => new Response('forbidden', { status: 401 })) as typeof fetch, + } + await expect(europeana({ apiKey: 'bad' }).search({ text: 'x', modalities: ['image'] }, ctx)).rejects.toThrow(/europeana search failed: 401/) + }) +}) From 1469ee98a345d8925cc35c23059b4e9af8a322ec Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:50:30 +0800 Subject: [PATCH 15/34] feat(provider-internet-archive): license + mediatype mappers --- packages/provider-internet-archive/LICENSE | 201 ++++++++++++++++++ packages/provider-internet-archive/README.md | 22 ++ .../provider-internet-archive/package.json | 44 ++++ .../src/__tests__/internet-archive.test.ts | 46 ++++ .../provider-internet-archive/src/index.ts | 58 +++++ .../provider-internet-archive/tsconfig.json | 5 + .../provider-internet-archive/tsup.config.ts | 10 + .../vitest.config.ts | 2 + pnpm-lock.yaml | 6 + 9 files changed, 394 insertions(+) create mode 100644 packages/provider-internet-archive/LICENSE create mode 100644 packages/provider-internet-archive/README.md create mode 100644 packages/provider-internet-archive/package.json create mode 100644 packages/provider-internet-archive/src/__tests__/internet-archive.test.ts create mode 100644 packages/provider-internet-archive/src/index.ts create mode 100644 packages/provider-internet-archive/tsconfig.json create mode 100644 packages/provider-internet-archive/tsup.config.ts create mode 100644 packages/provider-internet-archive/vitest.config.ts diff --git 
a/packages/provider-internet-archive/LICENSE b/packages/provider-internet-archive/LICENSE new file mode 100644 index 0000000..c1c4eb0 --- /dev/null +++ b/packages/provider-internet-archive/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, excluding + those notices that do not pertain to any part of the Derivative + Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2026 refkit authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/packages/provider-internet-archive/README.md b/packages/provider-internet-archive/README.md new file mode 100644 index 0000000..21f70ca --- /dev/null +++ b/packages/provider-internet-archive/README.md @@ -0,0 +1,22 @@ +# @refkit/provider-internet-archive + +Search **Internet Archive** as license-tagged video · text references — a provider satellite for **refkit** (use with [`@refkit/core`](https://www.npmjs.com/package/@refkit/core)). + +- **Source:** Internet Archive +- **Auth:** keyless +- **Modality:** video · text +- **License:** per-item CC (dirty) → unknown fallback + +> **v1 scope:** only `mediatype: movies` (→ `video`) and `mediatype: texts` (→ `text`) are mapped. All other mediatypes (`audio`, `image`, `collection`, `software`, `web`, `data`, `etree`) are filtered out and are a documented follow-up. 
+ +## Usage + +```ts +import { createRefkit } from '@refkit/core' +import { internetArchive } from '@refkit/provider-internet-archive' + +const refkit = createRefkit({ providers: [internetArchive(/* config */)] }) +const refs = await refkit.search({ query: 'cat', modalities: ['video'] }) +``` + +Gate by intended use with `refkit.evaluateUse(ref, 'commercial-product')`. See [`@refkit/core`](https://www.npmjs.com/package/@refkit/core) for the full API. diff --git a/packages/provider-internet-archive/package.json b/packages/provider-internet-archive/package.json new file mode 100644 index 0000000..6e84fb1 --- /dev/null +++ b/packages/provider-internet-archive/package.json @@ -0,0 +1,44 @@ +{ + "name": "@refkit/provider-internet-archive", + "version": "0.1.0", + "description": "Internet Archive provider satellite for refkit.", + "type": "module", + "license": "Apache-2.0", + "keywords": [ + "refkit", + "reference-retrieval", + "license", + "attribution", + "refkit-provider", + "internet-archive" + ], + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "scripts": { + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest watch", + "build": "tsup", + "prepublishOnly": "tsup" + }, + "dependencies": { + "@refkit/core": "workspace:*" + }, + "files": [ + "dist", + "LICENSE" + ], + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + } + } +} diff --git a/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts b/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts new file mode 100644 index 0000000..4f14052 --- /dev/null +++ b/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts @@ -0,0 +1,46 @@ +import { describe, expect, it } from 'vitest' +import { mapIaLicense, mediatypeToModality } from '../index' + 
+describe('mapIaLicense', () => { + it('maps CC0 / PD mark / PD dedication URLs', () => { + expect(mapIaLicense('https://creativecommons.org/publicdomain/zero/1.0/')).toEqual({ license: 'CC0-1.0' }) + expect(mapIaLicense('http://creativecommons.org/publicdomain/mark/1.0/')).toEqual({ license: 'PD' }) + }) + + it('maps CC-BY and CC-BY-SA with version (D7)', () => { + expect(mapIaLicense('https://creativecommons.org/licenses/by/4.0/')).toEqual({ license: 'CC-BY', version: '4.0' }) + expect(mapIaLicense('http://creativecommons.org/licenses/by-sa/3.0/')).toEqual({ license: 'CC-BY-SA', version: '3.0' }) + }) + + it('maps NC/ND variants to proprietary (D5)', () => { + expect(mapIaLicense('https://creativecommons.org/licenses/by-nc/4.0/').license).toBe('proprietary') + expect(mapIaLicense('https://creativecommons.org/licenses/by-nd/4.0/').license).toBe('proprietary') + expect(mapIaLicense('https://creativecommons.org/licenses/by-nc-sa/4.0/').license).toBe('proprietary') + }) + + it('falls back to unknown for absent / unrecognized URLs (D3)', () => { + expect(mapIaLicense(undefined)).toEqual({ license: 'unknown' }) + expect(mapIaLicense('')).toEqual({ license: 'unknown' }) + expect(mapIaLicense('https://example.com/some-license')).toEqual({ license: 'unknown' }) + }) + + it('maps rightsstatements.org faithfully (InC→proprietary, NoC-US→PD+US, opaque→unknown)', () => { + expect(mapIaLicense('http://rightsstatements.org/vocab/InC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapIaLicense('http://rightsstatements.org/vocab/NoC-US/1.0/')).toEqual({ license: 'PD', jurisdiction: 'US' }) + expect(mapIaLicense('http://rightsstatements.org/vocab/NoC-NC/1.0/')).toEqual({ license: 'proprietary' }) + expect(mapIaLicense('http://rightsstatements.org/vocab/CNE/1.0/')).toEqual({ license: 'unknown' }) + }) +}) + +describe('mediatypeToModality (D1)', () => { + it('maps movies→video and texts→text', () => { + expect(mediatypeToModality('movies')).toBe('video') + 
expect(mediatypeToModality('texts')).toBe('text') + }) + it('returns null for unsupported mediatypes (filtered out of v1)', () => { + expect(mediatypeToModality('audio')).toBeNull() + expect(mediatypeToModality('image')).toBeNull() + expect(mediatypeToModality('collection')).toBeNull() + expect(mediatypeToModality('software')).toBeNull() + }) +}) diff --git a/packages/provider-internet-archive/src/index.ts b/packages/provider-internet-archive/src/index.ts new file mode 100644 index 0000000..42235c5 --- /dev/null +++ b/packages/provider-internet-archive/src/index.ts @@ -0,0 +1,58 @@ +import { + defineProvider, referenceId, + type Reference, type RightsRecord, type LicenseId, type Modality, + type NormalizedQuery, type ProviderContext, +} from '@refkit/core' + +const BASE = 'https://archive.org/advancedsearch.php' + +export interface InternetArchiveConfig { + /** Max docs requested per search (advancedsearch `rows`). Default falls back to + * the query limit, then 20. Bounded to 100. */ + maxRows?: number +} + +// rightsstatements.org is a rights-STATUS vocabulary (not license grants). Mapped faithfully +// per token (mirrors core `mapRightsUrl`; helper-refactor Task 4 dedups this): InC* → +// proprietary; NoC-US → PD scoped to the US; NoC-NC → proprietary; opaque/undetermined → unknown. +const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = { + 'inc': { license: 'proprietary' }, 'inc-ow-eu': { license: 'proprietary' }, 'inc-edu': { license: 'proprietary' }, + 'inc-nc': { license: 'proprietary' }, 'inc-ruu': { license: 'proprietary' }, + 'noc-us': { license: 'PD', jurisdiction: 'US' }, + 'noc-nc': { license: 'proprietary' }, + 'noc-oklr': { license: 'unknown' }, 'noc-cr': { license: 'unknown' }, + 'cne': { license: 'unknown' }, 'und': { license: 'unknown' }, 'nkc': { license: 'unknown' }, +} + +/** Map an Internet Archive `licenseurl` to our license id (+ CC version, + jurisdiction for + * jurisdiction-scoped PD). 
**ABSENT licenseurl → 'unknown' (D3)** — IA rarely carries one, so + * most items legitimately land here → needs-review; this is the "never guess PD" rule and it + * governs the ABSENT case only. A PRESENT rightsstatements.org statement is a real declaration + * and is mapped faithfully (NoC-US → PD is the source's word, not a guess). NC/ND → proprietary + * (D5); PD mark/dedication → PD; CC0 → CC0-1.0; unrecognized → unknown. */ +export function mapIaLicense(licenseurl?: string): { license: LicenseId; version?: string; jurisdiction?: string } { + if (!licenseurl) return { license: 'unknown' } + const u = licenseurl.toLowerCase() + const rs = u.match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) + if (rs) return RIGHTS_STATEMENT[rs[1]] ?? { license: 'unknown' } + if (/\/publicdomain\/zero\b/.test(u)) return { license: 'CC0-1.0' } + if (/\/publicdomain\/mark\b/.test(u)) return { license: 'PD' } + // Exclude any NC / ND variant before matching the open by / by-sa families. + if (/\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } + const bySa = u.match(/\/licenses\/by-sa\/(\d(?:\.\d)?)\b/) + if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } + const by = u.match(/\/licenses\/by\/(\d(?:\.\d)?)\b/) + if (by) return { license: 'CC-BY', version: by[1] } + // by / by-sa with no version still maps to the family (version omitted). + if (/\/licenses\/by-sa\b/.test(u)) return { license: 'CC-BY-SA' } + if (/\/licenses\/by\b/.test(u)) return { license: 'CC-BY' } + return { license: 'unknown' } +} + +const MEDIATYPE_MODALITY: Record<string, Modality> = { movies: 'video', texts: 'text' } + +/** v1 scope (D1): only `movies`→video and `texts`→text. Everything else → null + * (filtered out). audio / image / etc. are a documented follow-up. */ +export function mediatypeToModality(mt: string): Modality | null { + return MEDIATYPE_MODALITY[mt] ?? 
null +} diff --git a/packages/provider-internet-archive/tsconfig.json b/packages/provider-internet-archive/tsconfig.json new file mode 100644 index 0000000..98922fe --- /dev/null +++ b/packages/provider-internet-archive/tsconfig.json @@ -0,0 +1,5 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "outDir": "out", "rootDir": "src", "types": ["node"] }, + "include": ["src/**/*"] +} diff --git a/packages/provider-internet-archive/tsup.config.ts b/packages/provider-internet-archive/tsup.config.ts new file mode 100644 index 0000000..af0ad13 --- /dev/null +++ b/packages/provider-internet-archive/tsup.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from 'tsup' + +export default defineConfig({ + entry: ['src/index.ts'], + format: ['esm'], + dts: true, + clean: true, + outDir: 'dist', + sourcemap: true, +}) diff --git a/packages/provider-internet-archive/vitest.config.ts b/packages/provider-internet-archive/vitest.config.ts new file mode 100644 index 0000000..d767673 --- /dev/null +++ b/packages/provider-internet-archive/vitest.config.ts @@ -0,0 +1,2 @@ +import { defineConfig } from 'vitest/config' +export default defineConfig({ test: { name: 'provider-internet-archive', environment: 'node', include: ['src/**/*.{test,spec}.ts'] } }) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a9f8bc0..fb8c84d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -143,6 +143,12 @@ importers: specifier: workspace:* version: link:../provider-poetrydb + packages/provider-internet-archive: + dependencies: + '@refkit/core': + specifier: workspace:* + version: link:../core + packages/provider-jamendo: dependencies: '@refkit/core': From a03e76557e859bb337cc461034d8d15b8330c788 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:52:03 +0800 Subject: [PATCH 16/34] feat(provider-internet-archive): toReference + search --- .../src/__tests__/internet-archive.test.ts | 105 ++++++++++++++++++ 
.../provider-internet-archive/src/index.ts | 72 ++++++++++++ 2 files changed, 177 insertions(+) diff --git a/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts b/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts index 4f14052..82e8835 100644 --- a/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts +++ b/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts @@ -44,3 +44,108 @@ describe('mediatypeToModality (D1)', () => { expect(mediatypeToModality('software')).toBeNull() }) }) + +import { evaluateUse, referenceId, type ProviderContext } from '@refkit/core' +import { internetArchive } from '../index' + +const DOCS = [ + { // CC-BY movie, creator as a string + identifier: 'big_buck_bunny', + title: 'Big Buck Bunny', + creator: 'Blender Foundation', + licenseurl: 'https://creativecommons.org/licenses/by/3.0/', + mediatype: 'movies', + }, + { // movie with NO licenseurl — must NOT be dropped (D3) + identifier: 'cbsnews-clip', + title: 'News Clip', + creator: 'cbsnews.com', + mediatype: 'movies', + }, + { // texts item, creator as an array (IA creator is multi-value) + identifier: 'alices_adventures', + title: "Alice's Adventures in Wonderland", + creator: ['Carroll, Lewis', 'Tenniel, John'], + licenseurl: 'https://creativecommons.org/publicdomain/zero/1.0/', + mediatype: 'texts', + }, + { // unsupported mediatype — filtered out (D1) + identifier: 'some_collection', + title: 'A Collection', + mediatype: 'collection', + }, +] + +const ctxResponding = (body: unknown, onUrl?: (u: string) => void): ProviderContext => ({ + fetch: (async (input: string) => { + onUrl?.(String(input)) + return new Response(JSON.stringify(body), { status: 200 }) + }) as typeof fetch, +}) + +describe('internetArchive search', () => { + it('maps CC-BY movie with version + video modality', async () => { + const refs = await internetArchive().search( + { text: 'animation', modalities: ['video', 'text'], limit: 
10 }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + const bunny = refs.find(r => r.id === referenceId('internet-archive', 'https://archive.org/details/big_buck_bunny'))! + expect(bunny.modality).toBe('video') + expect(bunny.rights.license).toBe('CC-BY') + expect(bunny.rights.licenseVersion).toBe('3.0') + expect(bunny.rights.author).toBe('Blender Foundation') + expect(bunny.canonicalUrl).toBe('https://archive.org/details/big_buck_bunny') + expect(bunny.thumbnail?.url).toBe('https://archive.org/services/img/big_buck_bunny') + expect(bunny.preview).toBeUndefined() + expect(evaluateUse(bunny.rights, 'commercial-product').decision).toBe('allowed-with-attribution') + }) + + it('keeps a licenseurl-less movie as unknown → needs-review (D3, NOT dropped)', async () => { + const refs = await internetArchive().search( + { text: 'news', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + const clip = refs.find(r => r.canonicalUrl === 'https://archive.org/details/cbsnews-clip')! + expect(clip).toBeDefined() + expect(clip.rights.license).toBe('unknown') + expect(evaluateUse(clip.rights, 'commercial-product').decision).toBe('needs-review') + }) + + it('maps a texts item to text modality and joins an array creator', async () => { + const refs = await internetArchive().search( + { text: 'alice', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + const alice = refs.find(r => r.canonicalUrl === 'https://archive.org/details/alices_adventures')! 
+ expect(alice.modality).toBe('text') + expect(alice.rights.license).toBe('CC0-1.0') + expect(alice.rights.author).toBe('Carroll, Lewis, Tenniel, John') + expect(alice.text).toBeUndefined() + }) + + it('filters out unsupported mediatypes (collection)', async () => { + const refs = await internetArchive().search( + { text: 'x', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 4, docs: DOCS } }), + ) + expect(refs.map(r => r.canonicalUrl)).not.toContain('https://archive.org/details/some_collection') + expect(refs).toHaveLength(3) // bunny + clip + alice + }) + + it('forwards query and rows to advancedsearch', async () => { + let seen = '' + await internetArchive({ maxRows: 7 }).search( + { text: 'jazz', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 0, docs: [] } }, u => { seen = u }), + ) + const url = new URL(seen) + expect(url.pathname).toBe('/advancedsearch.php') + expect(url.searchParams.get('q')).toBe('jazz') + expect(url.searchParams.get('output')).toBe('json') + expect(url.searchParams.get('rows')).toBe('7') + expect(url.searchParams.get('page')).toBe('1') + expect(url.searchParams.getAll('fl[]')).toEqual( + expect.arrayContaining(['identifier', 'title', 'creator', 'licenseurl', 'mediatype']), + ) + }) +}) diff --git a/packages/provider-internet-archive/src/index.ts b/packages/provider-internet-archive/src/index.ts index 42235c5..079c05b 100644 --- a/packages/provider-internet-archive/src/index.ts +++ b/packages/provider-internet-archive/src/index.ts @@ -56,3 +56,75 @@ const MEDIATYPE_MODALITY: Record<string, Modality> = { movies: 'video', texts: ' export function mediatypeToModality(mt: string): Modality | null { return MEDIATYPE_MODALITY[mt] ?? 
null } + +interface IaDoc { + identifier: string + title?: string + creator?: string | string[] + licenseurl?: string + mediatype: string +} +interface IaResponse { response?: { numFound: number; docs: IaDoc[] } } + +function authorOf(creator: string | string[] | undefined): string | undefined { + if (!creator) return undefined + return Array.isArray(creator) ? creator.join(', ') || undefined : creator || undefined +} + +/** Map one search doc → Reference, or null if its mediatype is out of v1 scope (D1). + * canonicalUrl = the details page; thumbnail = the services image endpoint; preview + * omitted (search exposes no clean direct media stream). */ +export function toReference(doc: IaDoc): Reference | null { + const modality = mediatypeToModality(doc.mediatype) + if (!modality) return null + const canonicalUrl = `https://archive.org/details/${doc.identifier}` + const { license, version, jurisdiction } = mapIaLicense(doc.licenseurl) + const rights: RightsRecord = { + license, + licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + // jurisdiction-scoped PD (e.g. rightsstatements NoC-US → PD in the US) + ...(jurisdiction ? 
{ jurisdiction } : {}), + author: authorOf(doc.creator), + rehostPolicy: 'cache-allowed', + raw: { sourceTerms: 'https://archive.org/about/terms.php', sourceUrl: canonicalUrl }, + } + return { + id: referenceId('internet-archive', canonicalUrl), + modality, + title: doc.title || undefined, + source: { providerId: 'internet-archive', sourceUrl: canonicalUrl }, + canonicalUrl, + rights, + verifiedAt: new Date().toISOString(), + thumbnail: { url: `https://archive.org/services/img/${doc.identifier}` }, + relevance: 0, + raw: doc, + } +} + +export function internetArchive(config: InternetArchiveConfig = {}) { + return defineProvider({ + id: 'internet-archive', + modalities: ['video', 'text'], + queryFeatures: ['keyword'], + capabilities: { controls: [] }, + async search(q: NormalizedQuery, ctx: ProviderContext): Promise<Reference[]> { + const url = new URL(BASE) + url.searchParams.set('q', q.text) + for (const f of ['identifier', 'title', 'creator', 'licenseurl', 'mediatype']) { + url.searchParams.append('fl[]', f) + } + url.searchParams.set('output', 'json') + url.searchParams.set('page', '1') + const rows = Math.min(config.maxRows ?? q.limit ?? 20, 100) + url.searchParams.set('rows', String(rows)) + const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) + if (!res.ok) throw new Error(`internet-archive search failed: ${res.status}`) + const json = (await res.json()) as IaResponse + const docs = json.response?.docs ?? 
[] + return docs + .map(toReference) + .filter((r): r is Reference => r !== null) + }, + }) +} From 29822e14880f6e17a82c77c8b4af702986e0f4d2 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:58:07 +0800 Subject: [PATCH 17/34] refactor(provider-met): use shared core provider helpers --- packages/provider-met/src/index.ts | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/packages/provider-met/src/index.ts b/packages/provider-met/src/index.ts index a5ba383..8da2e66 100644 --- a/packages/provider-met/src/index.ts +++ b/packages/provider-met/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfBoolean, setIfInt, setIfString, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -64,21 +65,6 @@ function toReference(o: MetObject): Reference | null { } } -function setIfBoolean(url: URL, key: string, value: unknown) { - if (typeof value !== 'boolean') return - url.searchParams.set(key, String(value)) -} - -function setIfInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value)) return - url.searchParams.set(key, String(value)) -} - -function setIfString(url: URL, key: string, value: unknown) { - if (typeof value !== 'string' || !value) return - url.searchParams.set(key, value) -} - export function met(config: MetConfig = {}) { return defineProvider({ id: 'met', From 4df90333b38a894cf45dbfb8d2504a0da41beb2f Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:58:48 +0800 Subject: [PATCH 18/34] refactor(provider-artic): use shared core provider helpers --- packages/provider-artic/src/index.ts | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/packages/provider-artic/src/index.ts b/packages/provider-artic/src/index.ts index a1a3be7..54f86cc 100644 --- a/packages/provider-artic/src/index.ts +++ 
b/packages/provider-artic/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfNonNegativeInt, setIfStringList, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -54,21 +55,6 @@ function toReference(a: ArticArtwork, iiifUrl: string): Reference | null { } } -function setIfString(url: URL, key: string, value: unknown) { - if (typeof value !== 'string' || !value) return - url.searchParams.set(key, value) -} - -function setIfNonNegativeInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) return - url.searchParams.set(key, String(value)) -} - -function setStringList(url: URL, key: string, value: unknown) { - if (typeof value === 'string' && value) url.searchParams.set(key, value) - if (Array.isArray(value) && value.every(v => typeof v === 'string')) url.searchParams.set(key, value.join(',')) -} - function articFields(value: unknown): string { const fields = new Set(['id', 'title', 'image_id', 'is_public_domain', 'artist_display']) if (typeof value === 'string') { @@ -97,7 +83,7 @@ export function artic() { setIfString(url, 'sort', opts?.sort) setIfNonNegativeInt(url, 'from', opts?.from) setIfNonNegativeInt(url, 'size', opts?.size) - setStringList(url, 'facets', opts?.facets) + setIfStringList(url, 'facets', opts?.facets) const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) if (!res.ok) throw new Error(`artic search failed: ${res.status}`) const json = (await res.json()) as ArticResponse From 8382f708592a897a3aca72a37c84d0cff82cc200 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 20:59:26 +0800 Subject: [PATCH 19/34] refactor(provider-openverse): use shared core provider helpers --- packages/provider-openverse/src/index.ts | 29 +----------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/packages/provider-openverse/src/index.ts 
b/packages/provider-openverse/src/index.ts index e20b79a..300b91c 100644 --- a/packages/provider-openverse/src/index.ts +++ b/packages/provider-openverse/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfStringList, setIfBoolean, setIfPositiveInt, setIfNumber, type Reference, type RightsRecord, type LicenseId, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -87,34 +88,6 @@ function openverseLicenseType(license: import('@refkit/core').SearchLicenseContr return types.length > 0 ? types.join(',') : 'commercial,modification' } -function setIfPositiveInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return - url.searchParams.set(key, String(value)) -} - -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string' || !value) return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - -function setIfStringList(url: URL, key: string, value: unknown) { - if (typeof value === 'string' && value) url.searchParams.set(key, value) - if (Array.isArray(value) && value.length > 0 && value.every(v => typeof v === 'string' && v)) url.searchParams.set(key, value.join(',')) -} - -function setIfBoolean(url: URL, key: string, value: unknown) { - if (typeof value !== 'boolean') return - url.searchParams.set(key, String(value)) -} - -function setIfNumber(url: URL, key: string, value: unknown, options?: { min?: number; max?: number }) { - if (typeof value !== 'number' || !Number.isFinite(value)) return - if (options?.min !== undefined && value < options.min) return - if (options?.max !== undefined && value > options.max) return - url.searchParams.set(key, String(value)) -} - function hasStringList(value: unknown): boolean { return (typeof value === 'string' && value.length > 0) || (Array.isArray(value) && value.some(v => typeof v === 'string' && v.length > 0)) From 
b595e45ac558b5b7af4ef1693c6d81e007fa3a4a Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:00:17 +0800 Subject: [PATCH 20/34] refactor(provider-unsplash): use shared core provider helpers --- packages/provider-unsplash/src/index.ts | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/packages/provider-unsplash/src/index.ts b/packages/provider-unsplash/src/index.ts index 1565f3a..f51a8b7 100644 --- a/packages/provider-unsplash/src/index.ts +++ b/packages/provider-unsplash/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfPositiveInt, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -27,22 +28,11 @@ interface UnsplashResult { } interface UnsplashResponse { results: UnsplashResult[] } -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string') return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - function setCollections(url: URL, value: unknown) { if (typeof value === 'string' && value) url.searchParams.set('collections', value) if (Array.isArray(value) && value.every(v => typeof v === 'string')) url.searchParams.set('collections', value.join(',')) } -function setIfPositiveInt(url: URL, key: string, value: unknown, max?: number) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return - url.searchParams.set(key, String(max ? Math.min(value, max) : value)) -} - function useLegacyFilter<T>(control: T | undefined, legacy: T | undefined): T | undefined { return control === undefined ? 
legacy : undefined } @@ -100,7 +90,7 @@ export function unsplash(config: UnsplashConfig) { setCollections(url, opts?.collections) setIfString(url, 'lang', opts?.lang) setIfPositiveInt(url, 'page', opts?.page) - setIfPositiveInt(url, 'per_page', opts?.perPage, 30) + setIfPositiveInt(url, 'per_page', opts?.perPage, { max: 30, clamp: true }) const res = await ctx.fetch(url.toString(), { headers: { Authorization: `Client-ID ${config.accessKey}`, 'Accept-Version': 'v1' }, signal: ctx.signal, From 58027226754b56ba67734677fcb3b1ff7c592504 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:01:02 +0800 Subject: [PATCH 21/34] refactor(provider-pexels): use shared core provider helpers --- packages/provider-pexels/src/index.ts | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/packages/provider-pexels/src/index.ts b/packages/provider-pexels/src/index.ts index e5e5ec3..3eb5e92 100644 --- a/packages/provider-pexels/src/index.ts +++ b/packages/provider-pexels/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfPositiveInt, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -27,17 +28,6 @@ interface PexelsPhoto { } interface PexelsResponse { photos: PexelsPhoto[] } -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string') return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - -function setIfPositiveInt(url: URL, key: string, value: unknown, max?: number) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return - url.searchParams.set(key, String(max ? Math.min(value, max) : value)) -} - function useLegacyFilter<T>(control: T | undefined, legacy: T | undefined): T | undefined { return control === undefined ? 
legacy : undefined } @@ -60,7 +50,7 @@ function applyPexelsSearchParams(url: URL, q: NormalizedQuery, options?: { allow setIfString(url, 'size', opts?.size, ['large', 'medium', 'small']) setIfString(url, 'locale', opts?.locale) setIfPositiveInt(url, 'page', opts?.page) - setIfPositiveInt(url, 'per_page', opts?.perPage, 80) + setIfPositiveInt(url, 'per_page', opts?.perPage, { max: 80, clamp: true }) } function toReference(p: PexelsPhoto): Reference { From 91b8221d892c44a08e53ad9b5fb16ef6f029661f Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:02:35 +0800 Subject: [PATCH 22/34] refactor(provider-pixabay): use shared core provider helpers --- packages/provider-pixabay/src/index.ts | 33 +++++++------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/packages/provider-pixabay/src/index.ts b/packages/provider-pixabay/src/index.ts index 86f71bb..ea50f4c 100644 --- a/packages/provider-pixabay/src/index.ts +++ b/packages/provider-pixabay/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfNonNegativeInt, setIfPositiveInt, setIfBoolean, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -50,13 +51,10 @@ interface PixabayHit { } interface PixabayResponse { hits: PixabayHit[] } -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string') return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - -function setIfStringList(url: URL, key: string, value: unknown, allowed?: readonly string[]) { +// Kept local: pixabay's allowlist semantics split a comma-joined STRING value and +// validate each element, whereas core's setIfStringList tests the whole string against +// the allowlist (so a "red,blue" string would be rejected). Preserving behavior. 
+function setColorsList(url: URL, key: string, value: unknown, allowed?: readonly string[]) { if (typeof value === 'string') { if (!value) return if (allowed && !value.split(',').every(v => allowed.includes(v))) return @@ -68,21 +66,6 @@ function setIfStringList(url: URL, key: string, value: unknown, allowed?: readon } } -function setIfNonNegativeInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) return - url.searchParams.set(key, String(value)) -} - -function setIfPositiveInt(url: URL, key: string, value: unknown, options?: { min?: number; max?: number }) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < (options?.min ?? 1)) return - url.searchParams.set(key, String(Math.min(value, options?.max ?? value))) -} - -function setIfBoolean(url: URL, key: string, value: unknown) { - if (typeof value !== 'boolean') return - url.searchParams.set(key, String(value)) -} - function useLegacyFilter<T>(control: T | undefined, legacy: T | undefined): T | undefined { return control === undefined ? 
legacy : undefined } @@ -153,12 +136,12 @@ export function pixabay(config: PixabayConfig) { setIfString(url, 'category', opts?.category) setIfNonNegativeInt(url, 'min_width', opts?.minWidth) setIfNonNegativeInt(url, 'min_height', opts?.minHeight) - setIfStringList(url, 'colors', opts?.colors, ['grayscale', 'transparent', 'red', 'orange', 'yellow', 'green', 'turquoise', 'blue', 'lilac', 'pink', 'white', 'gray', 'black', 'brown']) + setColorsList(url, 'colors', opts?.colors, ['grayscale', 'transparent', 'red', 'orange', 'yellow', 'green', 'turquoise', 'blue', 'lilac', 'pink', 'white', 'gray', 'black', 'brown']) setIfBoolean(url, 'safesearch', opts?.safesearch) setIfString(url, 'order', opts?.order, ['popular', 'latest']) setIfBoolean(url, 'editors_choice', opts?.editorsChoice) setIfPositiveInt(url, 'page', opts?.page) - setIfPositiveInt(url, 'per_page', opts?.perPage, { min: 3, max: 200 }) + setIfPositiveInt(url, 'per_page', opts?.perPage, { min: 3, max: 200, clamp: true }) const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) if (!res.ok) throw new Error(`pixabay search failed: ${res.status}`) const json = (await res.json()) as PixabayResponse @@ -237,7 +220,7 @@ export function pixabayVideo(config: PixabayConfig) { setIfString(url, 'order', opts?.order, ['popular', 'latest']) setIfBoolean(url, 'editors_choice', opts?.editorsChoice) setIfPositiveInt(url, 'page', opts?.page) - setIfPositiveInt(url, 'per_page', opts?.perPage, { min: 3, max: 200 }) + setIfPositiveInt(url, 'per_page', opts?.perPage, { min: 3, max: 200, clamp: true }) const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) if (!res.ok) throw new Error(`pixabay video search failed: ${res.status}`) const json = (await res.json()) as PixabayVideoResponse From e930f2fb2a4422c0346b2b207033422b0339a31b Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:03:29 +0800 Subject: [PATCH 23/34] refactor(provider-gutendex): use shared core 
provider helpers --- packages/provider-gutendex/src/index.ts | 30 ++----------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/packages/provider-gutendex/src/index.ts b/packages/provider-gutendex/src/index.ts index bd71c92..90e5030 100644 --- a/packages/provider-gutendex/src/index.ts +++ b/packages/provider-gutendex/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfInt, setIfPositiveInt, setIfString, setIfStringList, type Reference, type RightsRecord, type LicenseId, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -68,33 +69,6 @@ function toReference(r: GutendexResult): Reference { } } -function setIfInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value)) return - url.searchParams.set(key, String(value)) -} - -function setIfPositiveInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return - url.searchParams.set(key, String(value)) -} - -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string' || !value) return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - -function setIfStringList(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value === 'string' && value) { - if (allowed && !value.split(',').every(v => allowed.includes(v))) return - url.searchParams.set(key, value) - } - if (Array.isArray(value) && value.every(v => typeof v === 'string')) { - if (allowed && !value.every(v => allowed.includes(v))) return - url.searchParams.set(key, value.join(',')) - } -} - export function gutendex(config: GutendexConfig = {}) { return defineProvider({ id: 'gutendex', @@ -111,7 +85,7 @@ export function gutendex(config: GutendexConfig = {}) { const opts = q.providerOptions as GutendexSearchOptions | undefined setIfInt(url, 'author_year_start', 
opts?.authorYearStart) setIfInt(url, 'author_year_end', opts?.authorYearEnd) - setIfStringList(url, 'copyright', opts?.copyright, ['true', 'false', 'null']) + setIfStringList(url, 'copyright', opts?.copyright, { allowed: ['true', 'false', 'null'] }) setIfStringList(url, 'ids', opts?.ids) setIfStringList(url, 'languages', opts?.languages) setIfString(url, 'mime_type', opts?.mimeType) From 20904f194381331575eac5fd561b19e7f4412abe Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:04:38 +0800 Subject: [PATCH 24/34] refactor(provider-smithsonian): use shared core provider helpers --- packages/provider-smithsonian/src/index.ts | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/packages/provider-smithsonian/src/index.ts b/packages/provider-smithsonian/src/index.ts index ec90dc2..5db4d02 100644 --- a/packages/provider-smithsonian/src/index.ts +++ b/packages/provider-smithsonian/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfNonNegativeInt, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -62,17 +63,6 @@ function toReference(row: SiRow): Reference | null { } } -function setIfNonNegativeInt(url: URL, key: string, value: unknown, max?: number) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) return - url.searchParams.set(key, String(max ? 
Math.min(value, max) : value)) -} - -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string' || !value) return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - export function smithsonian(config: SmithsonianConfig) { return defineProvider({ id: 'smithsonian', @@ -88,7 +78,7 @@ export function smithsonian(config: SmithsonianConfig) { url.searchParams.set('fq', 'online_media_type:"Images" AND media_usage:"CC0"') const opts = q.providerOptions as SmithsonianSearchOptions | undefined setIfNonNegativeInt(url, 'start', opts?.start) - setIfNonNegativeInt(url, 'rows', opts?.rows, 1000) + setIfNonNegativeInt(url, 'rows', opts?.rows, { max: 1000, clamp: true }) setIfString(url, 'sort', opts?.sort, ['id', 'newest', 'updated', 'random']) setIfString(url, 'type', opts?.type, ['edanmdm', 'ead_collection', 'ead_component', 'all']) setIfString(url, 'row_group', opts?.rowGroup, ['objects', 'archives']) From a58f603923d96faf19d0a7527ce4da6a76cf5d35 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:05:41 +0800 Subject: [PATCH 25/34] refactor(provider-brave): use shared core provider helpers --- packages/provider-brave/src/index.ts | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/packages/provider-brave/src/index.ts b/packages/provider-brave/src/index.ts index 0d0ff24..5c639b4 100644 --- a/packages/provider-brave/src/index.ts +++ b/packages/provider-brave/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfPositiveInt, setIfBoolean, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, type SearchSafety, } from '@refkit/core' @@ -32,22 +33,6 @@ function braveSafeSearch(control: SearchSafety | undefined, fallback: BraveConfi return fallback ?? 
'strict' } -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string' || !value) return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - -function setIfPositiveInt(url: URL, key: string, value: unknown, max?: number) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return - url.searchParams.set(key, String(max ? Math.min(value, max) : value)) -} - -function setIfBoolean(url: URL, key: string, value: unknown) { - if (typeof value !== 'boolean') return - url.searchParams.set(key, String(value)) -} - function toReference(r: BraveImageResult): Reference { const rights: RightsRecord = { // open web → no license metadata → evaluateUse returns needs-review (never auto-allowed) @@ -84,7 +69,7 @@ export function brave(config: BraveConfig) { const opts = q.providerOptions as BraveImageSearchOptions | undefined setIfString(url, 'country', opts?.country) setIfString(url, 'search_lang', opts?.searchLang) - setIfPositiveInt(url, 'count', opts?.count, 200) + setIfPositiveInt(url, 'count', opts?.count, { max: 200, clamp: true }) setIfString(url, 'safesearch', opts?.safesearch, ['strict', 'off']) setIfBoolean(url, 'spellcheck', opts?.spellcheck) const res = await ctx.fetch(url.toString(), { From 64f817ceb37bacd6ce35ec8140b1e7b932fc1321 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:06:47 +0800 Subject: [PATCH 26/34] refactor(provider-flickr): use shared core provider helpers --- packages/provider-flickr/src/index.ts | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/packages/provider-flickr/src/index.ts b/packages/provider-flickr/src/index.ts index 85a2c0d..9688adc 100644 --- a/packages/provider-flickr/src/index.ts +++ b/packages/provider-flickr/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfInt, setIfStringList, type 
Reference, type RightsRecord, type LicenseId, type SearchLicenseControls, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -88,12 +89,6 @@ interface FlickrPhoto { } interface FlickrResponse { photos?: { photo: FlickrPhoto[] }; stat: string } -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string') return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - function setIfSafeSearch(url: URL, value: unknown) { if (value !== 1 && value !== 2 && value !== 3) return url.searchParams.set('safe_search', String(value)) @@ -109,18 +104,6 @@ function setStringOrNumber(url: URL, key: string, value: unknown) { if (typeof value === 'number' && Number.isFinite(value)) url.searchParams.set(key, String(value)) } -function setStringList(url: URL, key: string, value: unknown) { - if (typeof value === 'string' && value) url.searchParams.set(key, value) - if (Array.isArray(value) && value.every(v => typeof v === 'string')) url.searchParams.set(key, value.join(',')) -} - -function setIfInt(url: URL, key: string, value: unknown, options?: { min?: number; max?: number }) { - if (typeof value !== 'number' || !Number.isInteger(value)) return - if (options?.min !== undefined && value < options.min) return - if (options?.max !== undefined && value > options.max) return - url.searchParams.set(key, String(value)) -} - function setBooleanFlag(url: URL, key: string, value: unknown) { if (typeof value !== 'boolean') return url.searchParams.set(key, value ? 
'1' : '0') @@ -217,7 +200,7 @@ export function flickr(config: FlickrConfig) { setStringOrNumber(url, 'max_taken_date', opts?.maxTakenDate) setIfString(url, 'bbox', opts?.bbox) setIfInt(url, 'accuracy', opts?.accuracy, { min: 1, max: 16 }) - setStringList(url, 'machine_tags', opts?.machineTags) + setIfStringList(url, 'machine_tags', opts?.machineTags) setIfString(url, 'machine_tag_mode', opts?.machineTagMode, ['any', 'all']) setIfString(url, 'group_id', opts?.groupId) setIfString(url, 'woe_id', opts?.woeId) From 335fb81cea2ee6769455dcb78e4bc9d29d93bc07 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:07:45 +0800 Subject: [PATCH 27/34] refactor(provider-wikimedia-commons): use shared core provider helpers --- .../provider-wikimedia-commons/src/index.ts | 24 ++----------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/packages/provider-wikimedia-commons/src/index.ts b/packages/provider-wikimedia-commons/src/index.ts index 6f05721..91f218a 100644 --- a/packages/provider-wikimedia-commons/src/index.ts +++ b/packages/provider-wikimedia-commons/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfNonNegativeInt, setIfPositiveInt, setIfBoolean, type Reference, type RightsRecord, type LicenseId, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -116,32 +117,11 @@ function toReference(page: CommonsPage): Reference | null { } } -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string' || !value) return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - function setPipeList(url: URL, key: string, value: unknown) { if (typeof value === 'string' && value) url.searchParams.set(key, value) if (Array.isArray(value) && value.every(v => typeof v === 'string')) url.searchParams.set(key, value.join('|')) } -function setIfNonNegativeInt(url: URL, key: string, 
value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) return - url.searchParams.set(key, String(value)) -} - -function setIfPositiveInt(url: URL, key: string, value: unknown, max?: number) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return - url.searchParams.set(key, String(max ? Math.min(value, max) : value)) -} - -function setIfBoolean(url: URL, key: string, value: unknown) { - if (typeof value !== 'boolean') return - url.searchParams.set(key, String(value)) -} - function commonsImageInfoProps(value: unknown): string { const props = new Set(['url', 'mime', 'size', 'extmetadata']) if (typeof value === 'string') { @@ -171,7 +151,7 @@ export function wikimediaCommons(config: WikimediaCommonsConfig = {}) { url.searchParams.set('iiprop', 'url|mime|size|extmetadata') url.searchParams.set('iiurlwidth', String(config.thumbWidth ?? 1024)) const opts = q.providerOptions as WikimediaCommonsSearchOptions | undefined - setIfPositiveInt(url, 'gsrlimit', opts?.gsrlimit, 500) + setIfPositiveInt(url, 'gsrlimit', opts?.gsrlimit, { max: 500, clamp: true }) setIfNonNegativeInt(url, 'gsroffset', opts?.gsroffset) setIfString(url, 'gsrqiprofile', opts?.gsrqiprofile) setIfString(url, 'gsrqdprofile', opts?.gsrqdprofile) From c70d02b5f32f8f28f27831360a3223715b17f806 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:13:35 +0800 Subject: [PATCH 28/34] refactor(provider-rijksmuseum): use shared core provider helpers --- packages/provider-rijksmuseum/src/index.ts | 46 ++++------------------ 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/packages/provider-rijksmuseum/src/index.ts b/packages/provider-rijksmuseum/src/index.ts index fb2604a..677bb49 100644 --- a/packages/provider-rijksmuseum/src/index.ts +++ b/packages/provider-rijksmuseum/src/index.ts @@ -1,6 +1,7 @@ import { defineProvider, referenceId, - type Reference, type RightsRecord, type LicenseId, + 
setIfString, setIfBoolean, mapCcDeedUrl, isLikelyImageUrl, + type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -28,22 +29,9 @@ export interface RijksmuseumSearchOptions { const SEARCH = 'https://data.rijksmuseum.nl/search/collection' const RIJKS_TERMS = 'https://www.rijksmuseum.nl/en/data/policy' -// D7-style: map a CC deed URL to our LicenseId (+ CC version). Rijksmuseum open-access is -// effectively CC0/PDM; BY/BY-SA are implemented for correctness but not expected. CC-only — -// Rijksmuseum does not use rightsstatements.org, so this is replaced by core `mapCcDeedUrl` -// (NOT core `mapRightsUrl`) in helper-refactor Task 4. Named `mapRijksRights` to avoid clashing -// with the core `mapRightsUrl` helper, which additionally handles rightsstatements.org. -function mapRijksRights(url: string | undefined): { license: LicenseId; version?: string } { - if (!url) return { license: 'unknown' } - if (/creativecommons\.org\/publicdomain\/zero/.test(url)) return { license: 'CC0-1.0' } - if (/creativecommons\.org\/publicdomain\/mark/.test(url)) return { license: 'PD' } - if (/rightsstatements\.org\/(?:vocab|page)\/NoCopyright/i.test(url)) return { license: 'PD' } - const sa = url.match(/creativecommons\.org\/licenses\/by-sa\/(\d\.\d)/) - if (sa) return { license: 'CC-BY-SA', version: sa[1] } - const by = url.match(/creativecommons\.org\/licenses\/by\/(\d\.\d)/) - if (by) return { license: 'CC-BY', version: by[1] } - return { license: 'unknown' } -} +// Rijksmuseum open-access rights are CC deed URLs (effectively CC0/PDM; BY/BY-SA possible). +// Rijksmuseum does not use rightsstatements.org, so we use the CC-only core `mapCcDeedUrl` +// (NOT core `mapRightsUrl`, which additionally handles rightsstatements.org). // The Linked-Art graph is deeply nested and varies per record, so we extract by // shape, not by fixed index paths (see plan Open Questions). 
@@ -69,17 +57,8 @@ function findRightsUrl(node: unknown, depth = 0): string | undefined { // We must not put a NON-image URL (a viewer/collection web page) into preview.url. // The API carries the answer: a DigitalObject's `format` (a MIME type) and IIIF // `conforms_to` say which access_point is the image. So: read the type first, then -// fall back to a cheap URL heuristic, then give up (no network probe — `core` never -// fetches bytes, and that would add an extra request per item). See Open Questions #1. -const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i - -/** URL-string heuristic only (no network): does this look like an image resource? */ -function isLikelyImageUrl(url: string): boolean { - return IMAGE_EXT.test(url) - || /iiif/i.test(url) // IIIF image endpoint - || /\/full\/[^/]+\/\d+\/default/i.test(url) // IIIF Image API request URL - || /googleusercontent\.com/.test(url) // Rijksmuseum/Met image CDN -} +// fall back to a cheap URL heuristic (core `isLikelyImageUrl`, no network probe — `core` +// never fetches bytes, and that would add an extra request per item). See Open Questions #1. interface LaDigitalObject { type?: string @@ -155,7 +134,7 @@ function toReference(rec: Record<string, unknown>): Reference | null { if (!id) return null const img = findImage(rec) if (!img) return null // no usable IMAGE url (e.g. only a viewer/collection page) → drop - const { license, version } = mapRijksRights(findRightsUrl(rec)) + const { license, version } = mapCcDeedUrl(findRightsUrl(rec)) const rights: RightsRecord = { license, licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? 
version : undefined, @@ -178,15 +157,6 @@ function toReference(rec: Record<string, unknown>): Reference | null { } } -function setIfString(url: URL, key: string, value: unknown) { - if (typeof value !== 'string' || !value) return - url.searchParams.set(key, value) -} -function setIfBoolean(url: URL, key: string, value: unknown) { - if (typeof value !== 'boolean') return - url.searchParams.set(key, String(value)) -} - interface SearchPage { orderedItems?: Array<{ id?: string }> } export function rijksmuseum(config: RijksmuseumConfig = {}) { From 398e36da2f42fffcb7ab0482ff9c5e42db413363 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:14:14 +0800 Subject: [PATCH 29/34] refactor(provider-freesound): use shared core provider helpers --- packages/provider-freesound/src/index.ts | 26 +++--------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/packages/provider-freesound/src/index.ts b/packages/provider-freesound/src/index.ts index ceeabe1..6d026d7 100644 --- a/packages/provider-freesound/src/index.ts +++ b/packages/provider-freesound/src/index.ts @@ -1,5 +1,6 @@ import { defineProvider, referenceId, + setIfString, setIfPositiveInt, mapCcDeedUrl, type Reference, type RightsRecord, type LicenseId, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -22,19 +23,8 @@ export function mapFreesoundLicense(value: string): { license: LicenseId; versio const v = (value ?? 
'').trim() if (!v) return { license: 'unknown' } - // D7 — deed URL form - if (/^https?:\/\//i.test(v)) { - if (/\/publicdomain\/zero\//i.test(v)) return { license: 'CC0-1.0' } - const m = v.match(/\/licenses\/(by(?:-sa)?|by-nc[a-z-]*|by-nd[a-z-]*)\/(\d\.\d)\//i) - if (m) { - const fam = m[1].toLowerCase() - const version = m[2] - if (fam === 'by') return { license: 'CC-BY', version } - if (fam === 'by-sa') return { license: 'CC-BY-SA', version } - return { license: 'proprietary' } // any NC/ND variant - } - return { license: 'unknown' } - } + // D7 — deed URL form: delegate to the core CC-deed mapper (identical CC handling). + if (/^https?:\/\//i.test(v)) return mapCcDeedUrl(v) // D4 — name string form (case-insensitive) return FREESOUND_NAME_LICENSE[v.toLowerCase()] ?? { license: 'unknown' } @@ -99,16 +89,6 @@ function toAudioReference(r: FreesoundResult): Reference { } } -function setIfString(url: URL, key: string, value: unknown) { - if (typeof value !== 'string' || !value) return - url.searchParams.set(key, value) -} - -function setIfPositiveInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) return - url.searchParams.set(key, String(value)) -} - export function freesound(config: FreesoundConfig) { return defineProvider({ id: 'freesound', From dbd8d23b9b339b2be7ffe6b71cb4f285e9e6252f Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:15:03 +0800 Subject: [PATCH 30/34] refactor(provider-jamendo): use shared core provider helpers --- packages/provider-jamendo/src/index.ts | 48 +++++++------------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/packages/provider-jamendo/src/index.ts b/packages/provider-jamendo/src/index.ts index 9865146..2657d77 100644 --- a/packages/provider-jamendo/src/index.ts +++ b/packages/provider-jamendo/src/index.ts @@ -1,6 +1,7 @@ import { defineProvider, referenceId, - type Reference, type RightsRecord, 
type LicenseId, + setIfString, setIfStringList, setIfBoolean, setIfNonNegativeInt, mapCcDeedUrl, + type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -50,15 +51,9 @@ interface JamendoResponse { // Jamendo deed URLs look like http(s)://creativecommons.org/licenses/<variant>/<v>/. // Only by/by-sa fit our enum (D5); capture the version (D7). Any nc/nd variant is // non-commercial or no-derivatives → 'proprietary'. Missing/unrecognized → 'unknown'. -export function mapJamendoLicense(ccurl: string): { license: LicenseId; version?: string } { - if (!ccurl) return { license: 'unknown' } - const by = ccurl.match(/\/licenses\/by\/(\d\.\d)\//) - if (by) return { license: 'CC-BY', version: by[1] } - const bySa = ccurl.match(/\/licenses\/by-sa\/(\d\.\d)\//) - if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } - if (/\/licenses\/by-(nc|nd)/.test(ccurl)) return { license: 'proprietary' } - return { license: 'unknown' } -} +// This is exactly the core CC-deed mapper, re-exported under the jamendo-specific name +// the provider's tests import. 
+export const mapJamendoLicense = mapCcDeedUrl function toAudioReference(t: JamendoTrack, mediaType: string): Reference { const { license, version } = mapJamendoLicense(t.license_ccurl) @@ -88,27 +83,6 @@ function toAudioReference(t: JamendoTrack, mediaType: string): Reference { } } -function setIfString(url: URL, key: string, value: unknown, allowed?: readonly string[]) { - if (typeof value !== 'string' || !value) return - if (allowed && !allowed.includes(value)) return - url.searchParams.set(key, value) -} - -function setIfStringList(url: URL, key: string, value: unknown) { - if (typeof value === 'string' && value) url.searchParams.set(key, value) - if (Array.isArray(value) && value.length > 0 && value.every(v => typeof v === 'string' && v)) url.searchParams.set(key, value.join(' ')) -} - -function setIfBooleanFlag(url: URL, key: string, value: unknown) { - if (typeof value !== 'boolean') return - url.searchParams.set(key, value ? 'true' : 'false') -} - -function setIfPositiveInt(url: URL, key: string, value: unknown) { - if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) return - url.searchParams.set(key, String(value)) -} - export function jamendo(config: JamendoConfig) { return defineProvider({ id: 'jamendo', @@ -124,12 +98,14 @@ export function jamendo(config: JamendoConfig) { const opts = q.providerOptions as JamendoSearchOptions | undefined setIfString(url, 'audioformat', opts?.audioformat, ['mp31', 'mp32', 'ogg', 'flac']) setIfString(url, 'order', opts?.order, ['relevance', 'popularity_total', 'popularity_month', 'popularity_week', 'releasedate_asc', 'releasedate_desc', 'buzzrate']) - setIfBooleanFlag(url, 'ccsa', opts?.ccsa) - setIfBooleanFlag(url, 'ccnd', opts?.ccnd) - setIfBooleanFlag(url, 'ccnc', opts?.ccnc) - setIfStringList(url, 'tags', opts?.tags) + setIfBoolean(url, 'ccsa', opts?.ccsa) + setIfBoolean(url, 'ccnd', opts?.ccnd) + setIfBoolean(url, 'ccnc', opts?.ccnc) + // jamendo joins tags with a SPACE (not the core default 
comma). + setIfStringList(url, 'tags', opts?.tags, { separator: ' ' }) setIfString(url, 'artist_name', opts?.artist_name) - setIfPositiveInt(url, 'offset', opts?.offset) + // jamendo's offset is non-negative (0 is valid) → setIfNonNegativeInt, not PositiveInt. + setIfNonNegativeInt(url, 'offset', opts?.offset) const res = await ctx.fetch(url.toString(), { signal: ctx.signal }) if (!res.ok) throw new Error(`jamendo search failed: ${res.status}`) const json = (await res.json()) as JamendoResponse From ba39e0fb3252de797bc2e7fa5fb563aea12ba18f Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:15:54 +0800 Subject: [PATCH 31/34] refactor(provider-europeana): use shared core provider helpers --- packages/provider-europeana/src/index.ts | 68 ++++-------------------- 1 file changed, 9 insertions(+), 59 deletions(-) diff --git a/packages/provider-europeana/src/index.ts b/packages/provider-europeana/src/index.ts index af45abe..f5b1963 100644 --- a/packages/provider-europeana/src/index.ts +++ b/packages/provider-europeana/src/index.ts @@ -1,48 +1,17 @@ import { defineProvider, referenceId, - type Reference, type RightsRecord, type LicenseId, + first, isLikelyImageUrl, imageMediaType, mapRightsUrl, + type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' const BASE = 'https://api.europeana.eu/record/v2/search.json' /** Map a Europeana `edm:rights` controlled-vocabulary URI to a core license id (+ CC version, - * + jurisdiction for jurisdiction-scoped PD). Conservative (D5): only clearly-open CC deeds and - * PD/CC0 become open grants; CC NC/ND → proprietary; rightsstatements.org is mapped faithfully - * per token (see below); anything unrecognized/empty → unknown. */ -// rightsstatements.org is a rights-STATUS vocabulary (not license grants). 
Map each token -// FAITHFULLY (index D5-style): InC* → proprietary (copyrighted, no grant); NoC-US → PD scoped -// to the US via the jurisdiction field; NoC-NC → proprietary (non-commercial → commercial out); -// opaque/undetermined (NoC-OKLR/CR, CNE, UND, NKC) → unknown. (This mirrors core `mapRightsUrl`; -// the helper-refactor Task 4 replaces this inlined copy with that import.) -const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = { - 'inc': { license: 'proprietary' }, 'inc-ow-eu': { license: 'proprietary' }, 'inc-edu': { license: 'proprietary' }, - 'inc-nc': { license: 'proprietary' }, 'inc-ruu': { license: 'proprietary' }, - 'noc-us': { license: 'PD', jurisdiction: 'US' }, - 'noc-nc': { license: 'proprietary' }, - 'noc-oklr': { license: 'unknown' }, 'noc-cr': { license: 'unknown' }, - 'cne': { license: 'unknown' }, 'und': { license: 'unknown' }, 'nkc': { license: 'unknown' }, -} - -export function mapEuropeanaRights(uri: string): { license: LicenseId; version?: string; jurisdiction?: string } { - const u = (uri || '').toLowerCase() - if (!u) return { license: 'unknown' } - // rightsstatements.org — faithful per-token mapping (not blanket unknown). - const rs = u.match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) - if (rs) return RIGHTS_STATEMENT[rs[1]] ?? { license: 'unknown' } - // Public domain dedications / marks (no version surfaced). - if (u.includes('creativecommons.org/publicdomain/zero')) return { license: 'CC0-1.0' } - if (u.includes('creativecommons.org/publicdomain/mark')) return { license: 'PD' } - // Non-commercial / no-derivatives variants are NOT open grants → proprietary. - // Checked before plain by/by-sa because "by-nc-sa" contains "by-sa". - if (/creativecommons\.org\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } - // Open CC deeds: capture the version (D7) for the attribution families only. 
- const bySa = u.match(/creativecommons\.org\/licenses\/by-sa\/(\d\.\d)/) - if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } - const by = u.match(/creativecommons\.org\/licenses\/by\/(\d\.\d)/) - if (by) return { license: 'CC-BY', version: by[1] } - return { license: 'unknown' } -} + * + jurisdiction for jurisdiction-scoped PD). The `edm:rights` field can be a CC deed OR a + * rightsstatements.org statement, so this is exactly core `mapRightsUrl` (CC deeds + faithful + * rightsstatements mapping), re-exported under the europeana-specific name the tests import. */ +export const mapEuropeanaRights = mapRightsUrl export interface EuropeanaConfig { /** Free BYOK Europeana API key (sent as the `wskey` query param). */ @@ -64,30 +33,11 @@ interface EuropeanaItem { } interface EuropeanaResponse { success?: boolean; items?: EuropeanaItem[] } -/** First element of an array-typed Europeana field, or undefined. */ -function first<T>(arr: T[] | undefined): T | undefined { - return Array.isArray(arr) && arr.length > 0 ? arr[0] : undefined -} - // edmIsShownBy is the MEDIA resource; edmIsShownAt is a LANDING PAGE (a web page, not // an image) — it must never become preview.url. The record usually tells us the media -// type (ebucoreHasMimeType); otherwise fall back to a URL-string heuristic (no network — -// `core` never fetches bytes, and a probe would add a request per item). -const IMAGE_EXT = /\.(jpe?g|png|webp|gif|tiff?)(?:$|\?)/i - -/** URL-string heuristic only (no network): does this look like an image resource? */ -function isLikelyImageUrl(url: string): boolean { - return IMAGE_EXT.test(url) || /iiif/i.test(url) || /\/thumbnail\//i.test(url) -} - -/** Best image mediaType: the declared MIME if it's an image, else inferred from the - * URL extension, else a safe default. 
*/ -function imageMediaType(mime: string | undefined, url: string): string { - if (mime && mime.startsWith('image/')) return mime - const m = url.match(IMAGE_EXT) - if (m) { const e = m[1].toLowerCase(); return e === 'jpg' ? 'image/jpeg' : `image/${e === 'tif' ? 'tiff' : e}` } - return 'image/jpeg' -} +// type (ebucoreHasMimeType); otherwise fall back to a URL-string heuristic (core +// `isLikelyImageUrl`, no network — `core` never fetches bytes, and a probe would add a +// request per item). function toReference(it: EuropeanaItem): Reference | null { // v1 image-only scope (D1): defensively re-check type even though the search is From 039eebd938b097c8a0741e259bf0546c1e6d3473 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:16:44 +0800 Subject: [PATCH 32/34] refactor(provider-internet-archive): use shared core provider helpers --- .../provider-internet-archive/src/index.ts | 46 ++++--------------- 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/packages/provider-internet-archive/src/index.ts b/packages/provider-internet-archive/src/index.ts index 079c05b..763f107 100644 --- a/packages/provider-internet-archive/src/index.ts +++ b/packages/provider-internet-archive/src/index.ts @@ -1,6 +1,6 @@ import { - defineProvider, referenceId, - type Reference, type RightsRecord, type LicenseId, type Modality, + defineProvider, referenceId, mapRightsUrl, + type Reference, type RightsRecord, type Modality, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -12,42 +12,14 @@ export interface InternetArchiveConfig { maxRows?: number } -// rightsstatements.org is a rights-STATUS vocabulary (not license grants). Mapped faithfully -// per token (mirrors core `mapRightsUrl`; helper-refactor Task 4 dedups this): InC* → -// proprietary; NoC-US → PD scoped to the US; NoC-NC → proprietary; opaque/undetermined → unknown. 
-const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: string }> = { - 'inc': { license: 'proprietary' }, 'inc-ow-eu': { license: 'proprietary' }, 'inc-edu': { license: 'proprietary' }, - 'inc-nc': { license: 'proprietary' }, 'inc-ruu': { license: 'proprietary' }, - 'noc-us': { license: 'PD', jurisdiction: 'US' }, - 'noc-nc': { license: 'proprietary' }, - 'noc-oklr': { license: 'unknown' }, 'noc-cr': { license: 'unknown' }, - 'cne': { license: 'unknown' }, 'und': { license: 'unknown' }, 'nkc': { license: 'unknown' }, -} - /** Map an Internet Archive `licenseurl` to our license id (+ CC version, + jurisdiction for - * jurisdiction-scoped PD). **ABSENT licenseurl → 'unknown' (D3)** — IA rarely carries one, so - * most items legitimately land here → needs-review; this is the "never guess PD" rule and it - * governs the ABSENT case only. A PRESENT rightsstatements.org statement is a real declaration - * and is mapped faithfully (NoC-US → PD is the source's word, not a guess). NC/ND → proprietary - * (D5); PD mark/dedication → PD; CC0 → CC0-1.0; unrecognized → unknown. */ -export function mapIaLicense(licenseurl?: string): { license: LicenseId; version?: string; jurisdiction?: string } { - if (!licenseurl) return { license: 'unknown' } - const u = licenseurl.toLowerCase() - const rs = u.match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) - if (rs) return RIGHTS_STATEMENT[rs[1]] ?? { license: 'unknown' } - if (/\/publicdomain\/zero\b/.test(u)) return { license: 'CC0-1.0' } - if (/\/publicdomain\/mark\b/.test(u)) return { license: 'PD' } - // Exclude any NC / ND variant before matching the open by / by-sa families. 
- if (/\/licenses\/by-(?:nc|nd)/.test(u)) return { license: 'proprietary' } - const bySa = u.match(/\/licenses\/by-sa\/(\d(?:\.\d)?)\b/) - if (bySa) return { license: 'CC-BY-SA', version: bySa[1] } - const by = u.match(/\/licenses\/by\/(\d(?:\.\d)?)\b/) - if (by) return { license: 'CC-BY', version: by[1] } - // by / by-sa with no version still maps to the family (version omitted). - if (/\/licenses\/by-sa\b/.test(u)) return { license: 'CC-BY-SA' } - if (/\/licenses\/by\b/.test(u)) return { license: 'CC-BY' } - return { license: 'unknown' } -} + * jurisdiction-scoped PD). The field can be a CC deed OR a rightsstatements.org statement, so + * this is exactly core `mapRightsUrl`, re-exported under the IA-specific name the tests import. + * **ABSENT licenseurl → 'unknown' (D3)** — IA rarely carries one, so most items legitimately + * land in needs-review; this "never guess PD" rule governs the ABSENT case only (core + * `mapRightsUrl(undefined) → unknown`). A PRESENT rightsstatements.org statement is a real + * declaration mapped faithfully (NoC-US → PD is the source's word, not a guess). 
*/ +export const mapIaLicense = mapRightsUrl const MEDIATYPE_MODALITY: Record<string, Modality> = { movies: 'video', texts: 'text' } From 2b169600b17810fff807b18237fac4e32d6e6afd Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:23:41 +0800 Subject: [PATCH 33/34] feat: register P1 providers + wire shared helpers (central wiring) --- .changeset/provider-europeana.md | 5 +++++ .changeset/provider-freesound.md | 5 +++++ .changeset/provider-helpers.md | 16 ++++++++++++++++ .changeset/provider-internet-archive.md | 5 +++++ .changeset/provider-jamendo.md | 5 +++++ .changeset/provider-polyhaven.md | 5 +++++ .changeset/provider-rijksmuseum.md | 6 ++++++ README.md | 6 ++++++ packages/mcp/package.json | 6 ++++++ packages/mcp/src/__tests__/mcp.test.ts | 17 ++++++++++++++++- packages/mcp/src/cli.ts | 10 ++++++++++ pnpm-lock.yaml | 18 ++++++++++++++++++ vitest.config.ts | 6 ++++++ 13 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 .changeset/provider-europeana.md create mode 100644 .changeset/provider-freesound.md create mode 100644 .changeset/provider-helpers.md create mode 100644 .changeset/provider-internet-archive.md create mode 100644 .changeset/provider-jamendo.md create mode 100644 .changeset/provider-polyhaven.md create mode 100644 .changeset/provider-rijksmuseum.md diff --git a/.changeset/provider-europeana.md b/.changeset/provider-europeana.md new file mode 100644 index 0000000..d5b5a35 --- /dev/null +++ b/.changeset/provider-europeana.md @@ -0,0 +1,5 @@ +--- +"@refkit/provider-europeana": minor +--- + +Add @refkit/provider-europeana: Europeana as license-normalized image references (BYOK; per-item CC / PD / rightsstatements.org, hotlink-required media). 
diff --git a/.changeset/provider-freesound.md b/.changeset/provider-freesound.md new file mode 100644 index 0000000..c7a1105 --- /dev/null +++ b/.changeset/provider-freesound.md @@ -0,0 +1,5 @@ +--- +"@refkit/provider-freesound": minor +--- + +Add @refkit/provider-freesound: Freesound as license-normalized audio references (BYOK; per-item CC / CC0, CC name-string mapping with no version). diff --git a/.changeset/provider-helpers.md b/.changeset/provider-helpers.md new file mode 100644 index 0000000..3a32402 --- /dev/null +++ b/.changeset/provider-helpers.md @@ -0,0 +1,16 @@ +--- +"@refkit/core": minor +"@refkit/provider-met": patch +"@refkit/provider-artic": patch +"@refkit/provider-openverse": patch +"@refkit/provider-unsplash": patch +"@refkit/provider-pexels": patch +"@refkit/provider-pixabay": patch +"@refkit/provider-gutendex": patch +"@refkit/provider-smithsonian": patch +"@refkit/provider-brave": patch +"@refkit/provider-flickr": patch +"@refkit/provider-wikimedia-commons": patch +--- + +Add shared provider helpers to @refkit/core (setIf* URL setters, first, mapCcDeedUrl, mapRightsUrl, image-URL heuristics) and refactor all providers to use them instead of per-package copies. diff --git a/.changeset/provider-internet-archive.md b/.changeset/provider-internet-archive.md new file mode 100644 index 0000000..7284f30 --- /dev/null +++ b/.changeset/provider-internet-archive.md @@ -0,0 +1,5 @@ +--- +"@refkit/provider-internet-archive": minor +--- + +Add @refkit/provider-internet-archive: Internet Archive as license-normalized video / text references (movies → video, texts → text; dirty per-item CC licenseurl → unknown fallback). 
diff --git a/.changeset/provider-jamendo.md b/.changeset/provider-jamendo.md new file mode 100644 index 0000000..d3f7840 --- /dev/null +++ b/.changeset/provider-jamendo.md @@ -0,0 +1,5 @@ +--- +"@refkit/provider-jamendo": minor +--- + +Add @refkit/provider-jamendo: Jamendo as license-normalized audio references (BYOK; per-item CC via license_ccurl URL matching). diff --git a/.changeset/provider-polyhaven.md b/.changeset/provider-polyhaven.md new file mode 100644 index 0000000..6bd2236 --- /dev/null +++ b/.changeset/provider-polyhaven.md @@ -0,0 +1,5 @@ +--- +"@refkit/provider-polyhaven": minor +--- + +Add @refkit/provider-polyhaven: Poly Haven and ambientCG (sibling factory `ambientcg`) as CC0-normalized image references (textures/HDRIs/materials; 3D model formats skipped for v1). diff --git a/.changeset/provider-rijksmuseum.md b/.changeset/provider-rijksmuseum.md new file mode 100644 index 0000000..1bb9ffe --- /dev/null +++ b/.changeset/provider-rijksmuseum.md @@ -0,0 +1,6 @@ +--- +"@refkit/provider-rijksmuseum": minor +"@refkit/mcp": minor +--- + +Add @refkit/provider-rijksmuseum: Rijksmuseum as license-normalized image references (keyless; CC0 / Public Domain). Registers the keyless provider in the @refkit/mcp zero-config server alongside the other P1 providers. 
diff --git a/README.md b/README.md index 7ff68d3..38cfef2 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,12 @@ const refkit = createRefkit({ | `@refkit/provider-gutendex` | Project Gutenberg | text | keyless | per-item PD | | `@refkit/provider-poetrydb` | PoetryDB | text | keyless | PD | | `@refkit/provider-brave` | Brave web search (discovery) | image (web) | API key | unknown → needs-review | +| `@refkit/provider-rijksmuseum` | Rijksmuseum | image | keyless | CC0 / PD | +| `@refkit/provider-polyhaven` | Poly Haven + ambientCG | image | keyless | CC0 | +| `@refkit/provider-freesound` | Freesound | audio | API key | per-item CC / CC0 | +| `@refkit/provider-jamendo` | Jamendo | audio | API key | per-item CC | +| `@refkit/provider-europeana` | Europeana | image | API key | per-item CC / PD / rights-statement | +| `@refkit/provider-internet-archive` | Internet Archive | video · text | keyless | per-item CC (dirty) → unknown | Audio/video are extra factories on existing packages: `openverseAudio()`, `pexelsVideo()`, `pixabayVideo()`. Modality routing is automatic — an `['audio']` search only hits audio-capable providers. 
diff --git a/packages/mcp/package.json b/packages/mcp/package.json index 39fff0d..19ab0ac 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -32,13 +32,19 @@ "@refkit/core": "workspace:*", "@refkit/provider-artic": "workspace:*", "@refkit/provider-brave": "workspace:*", + "@refkit/provider-europeana": "workspace:*", "@refkit/provider-flickr": "workspace:*", + "@refkit/provider-freesound": "workspace:*", "@refkit/provider-gutendex": "workspace:*", + "@refkit/provider-internet-archive": "workspace:*", + "@refkit/provider-jamendo": "workspace:*", "@refkit/provider-met": "workspace:*", "@refkit/provider-openverse": "workspace:*", "@refkit/provider-pexels": "workspace:*", "@refkit/provider-pixabay": "workspace:*", "@refkit/provider-poetrydb": "workspace:*", + "@refkit/provider-polyhaven": "workspace:*", + "@refkit/provider-rijksmuseum": "workspace:*", "@refkit/provider-smithsonian": "workspace:*", "@refkit/provider-unsplash": "workspace:*", "@refkit/provider-wikimedia-commons": "workspace:*", diff --git a/packages/mcp/src/__tests__/mcp.test.ts b/packages/mcp/src/__tests__/mcp.test.ts index 16192e2..2008bb7 100644 --- a/packages/mcp/src/__tests__/mcp.test.ts +++ b/packages/mcp/src/__tests__/mcp.test.ts @@ -227,7 +227,7 @@ describe('@refkit/mcp', () => { describe('defaultProviders (zero-config CLI wiring)', () => { it('includes every keyless provider by default', () => { const ids = defaultProviders({}).map(p => p.id) - for (const id of ['openverse', 'wikimedia-commons', 'met', 'artic', 'gutendex', 'poetrydb']) { + for (const id of ['openverse', 'wikimedia-commons', 'met', 'artic', 'gutendex', 'poetrydb', 'rijksmuseum', 'polyhaven', 'ambientcg', 'internet-archive']) { expect(ids).toContain(id) } }) @@ -236,4 +236,19 @@ describe('defaultProviders (zero-config CLI wiring)', () => { expect(defaultProviders({}).map(p => p.id)).not.toContain('unsplash') expect(defaultProviders({ UNSPLASH_KEY: 'k' }).map(p => p.id)).toContain('unsplash') }) + + it('adds 
freesound only when FREESOUND_TOKEN is present', () => { + expect(defaultProviders({}).map(p => p.id)).not.toContain('freesound') + expect(defaultProviders({ FREESOUND_TOKEN: 'k' }).map(p => p.id)).toContain('freesound') + }) + + it('adds jamendo only when JAMENDO_CLIENT_ID is present', () => { + expect(defaultProviders({}).map(p => p.id)).not.toContain('jamendo') + expect(defaultProviders({ JAMENDO_CLIENT_ID: 'k' }).map(p => p.id)).toContain('jamendo') + }) + + it('adds europeana only when EUROPEANA_KEY is present', () => { + expect(defaultProviders({}).map(p => p.id)).not.toContain('europeana') + expect(defaultProviders({ EUROPEANA_KEY: 'k' }).map(p => p.id)).toContain('europeana') + }) }) diff --git a/packages/mcp/src/cli.ts b/packages/mcp/src/cli.ts index 810cd64..8b1fb92 100644 --- a/packages/mcp/src/cli.ts +++ b/packages/mcp/src/cli.ts @@ -16,6 +16,12 @@ import { pixabay, pixabayVideo } from '@refkit/provider-pixabay' import { flickr } from '@refkit/provider-flickr' import { smithsonian } from '@refkit/provider-smithsonian' import { brave } from '@refkit/provider-brave' +import { rijksmuseum } from '@refkit/provider-rijksmuseum' +import { polyhaven, ambientcg } from '@refkit/provider-polyhaven' +import { freesound } from '@refkit/provider-freesound' +import { jamendo } from '@refkit/provider-jamendo' +import { europeana } from '@refkit/provider-europeana' +import { internetArchive } from '@refkit/provider-internet-archive' import { serveStdio } from './index' /** Providers a zero-config server boots with: all keyless sources, plus any BYOK @@ -23,6 +29,7 @@ import { serveStdio } from './index' export function defaultProviders(env: NodeJS.ProcessEnv = process.env): ReferenceProvider[] { const providers: ReferenceProvider[] = [ openverse(), openverseAudio(), wikimediaCommons(), met(), artic(), gutendex(), poetrydb(), + rijksmuseum(), polyhaven(), ambientcg(), internetArchive(), ] if (env.UNSPLASH_KEY) providers.push(unsplash({ accessKey: env.UNSPLASH_KEY })) if 
(env.PEXELS_KEY) providers.push(pexels({ apiKey: env.PEXELS_KEY }), pexelsVideo({ apiKey: env.PEXELS_KEY })) @@ -30,6 +37,9 @@ export function defaultProviders(env: NodeJS.ProcessEnv = process.env): Referenc if (env.FLICKR_KEY) providers.push(flickr({ apiKey: env.FLICKR_KEY })) if (env.SI_KEY) providers.push(smithsonian({ apiKey: env.SI_KEY })) if (env.BRAVE_TOKEN) providers.push(brave({ token: env.BRAVE_TOKEN })) + if (env.FREESOUND_TOKEN) providers.push(freesound({ apiKey: env.FREESOUND_TOKEN })) + if (env.JAMENDO_CLIENT_ID) providers.push(jamendo({ clientId: env.JAMENDO_CLIENT_ID })) + if (env.EUROPEANA_KEY) providers.push(europeana({ apiKey: env.EUROPEANA_KEY })) return providers } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fb8c84d..a94a744 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -65,12 +65,24 @@ importers: '@refkit/provider-brave': specifier: workspace:* version: link:../provider-brave + '@refkit/provider-europeana': + specifier: workspace:* + version: link:../provider-europeana '@refkit/provider-flickr': specifier: workspace:* version: link:../provider-flickr + '@refkit/provider-freesound': + specifier: workspace:* + version: link:../provider-freesound '@refkit/provider-gutendex': specifier: workspace:* version: link:../provider-gutendex + '@refkit/provider-internet-archive': + specifier: workspace:* + version: link:../provider-internet-archive + '@refkit/provider-jamendo': + specifier: workspace:* + version: link:../provider-jamendo '@refkit/provider-met': specifier: workspace:* version: link:../provider-met @@ -86,6 +98,12 @@ importers: '@refkit/provider-poetrydb': specifier: workspace:* version: link:../provider-poetrydb + '@refkit/provider-polyhaven': + specifier: workspace:* + version: link:../provider-polyhaven + '@refkit/provider-rijksmuseum': + specifier: workspace:* + version: link:../provider-rijksmuseum '@refkit/provider-smithsonian': specifier: workspace:* version: link:../provider-smithsonian diff --git a/vitest.config.ts 
b/vitest.config.ts index 6b0d52a..665a7ae 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -18,6 +18,12 @@ export default defineConfig({ './packages/provider-met/vitest.config.ts', './packages/provider-artic/vitest.config.ts', './packages/provider-smithsonian/vitest.config.ts', + './packages/provider-rijksmuseum/vitest.config.ts', + './packages/provider-polyhaven/vitest.config.ts', + './packages/provider-freesound/vitest.config.ts', + './packages/provider-jamendo/vitest.config.ts', + './packages/provider-europeana/vitest.config.ts', + './packages/provider-internet-archive/vitest.config.ts', ], }, }) From 471e5c0fdfbd43e81437b2b4a38e5a9785e92e79 Mon Sep 17 00:00:00 2001 From: MyPrototypeWhat <daoquqiexing@gmail.com> Date: Mon, 29 Jun 2026 21:41:41 +0800 Subject: [PATCH 34/34] fix: address final review (IA array fields, rijks faithful rights, freesound/jamendo resilience, polyhaven mediaType, changeset/README) --- .changeset/provider-rijksmuseum.md | 4 ++- README.md | 2 +- .../src/__tests__/provider-helpers.test.ts | 6 ++++ packages/core/src/provider-helpers.ts | 4 +-- .../src/__tests__/freesound.test.ts | 16 +++++++++ packages/provider-freesound/src/index.ts | 5 +-- .../src/__tests__/internet-archive.test.ts | 19 ++++++++++- .../provider-internet-archive/src/index.ts | 11 +++--- .../src/__tests__/jamendo.test.ts | 14 ++++++++ packages/provider-jamendo/src/index.ts | 7 ++-- packages/provider-polyhaven/src/index.ts | 6 ++-- .../src/__tests__/rijksmuseum.test.ts | 34 +++++++++++++++++++ packages/provider-rijksmuseum/src/index.ts | 14 +++++--- 13 files changed, 121 insertions(+), 21 deletions(-) diff --git a/.changeset/provider-rijksmuseum.md b/.changeset/provider-rijksmuseum.md index 1bb9ffe..50dd64d 100644 --- a/.changeset/provider-rijksmuseum.md +++ b/.changeset/provider-rijksmuseum.md @@ -3,4 +3,6 @@ "@refkit/mcp": minor --- -Add @refkit/provider-rijksmuseum: Rijksmuseum as license-normalized image references (keyless; CC0 / Public Domain). 
Registers the keyless provider in the @refkit/mcp zero-config server alongside the other P1 providers. +Add @refkit/provider-rijksmuseum: Rijksmuseum as license-normalized image references (keyless; CC0 / Public Domain). + +Register the P1 providers in the @refkit/mcp zero-config server — rijksmuseum, polyhaven, ambientCG and internet-archive (keyless); freesound, jamendo and europeana (when their API key/token is set). diff --git a/README.md b/README.md index 38cfef2..909e73e 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ Agents can use refkit in two ways: npx -y @refkit/mcp ``` -It boots with the keyless sources (Met, Art Institute, Wikimedia, Openverse, Project Gutenberg, PoetryDB) and auto-adds any BYOK source whose key is in the environment (`UNSPLASH_KEY`, `PEXELS_KEY`, `BRAVE_TOKEN`, …). Pass `intent` to annotate each result with a use-verdict (may I use this, is attribution required); `gateFor` to return only allowed results. Or wire your own providers/keys via `serveStdio(createRefkit({ … }))` — see [`@refkit/mcp`](https://www.npmjs.com/package/@refkit/mcp). +It boots with the keyless sources (Met, Art Institute, Wikimedia, Openverse, Project Gutenberg, PoetryDB, Rijksmuseum, Poly Haven, ambientCG, Internet Archive) and auto-adds any BYOK source whose key is in the environment (`UNSPLASH_KEY`, `PEXELS_KEY`, `BRAVE_TOKEN`, …). Pass `intent` to annotate each result with a use-verdict (may I use this, is attribution required); `gateFor` to return only allowed results. Or wire your own providers/keys via `serveStdio(createRefkit({ … }))` — see [`@refkit/mcp`](https://www.npmjs.com/package/@refkit/mcp). 
## Not legal advice diff --git a/packages/core/src/__tests__/provider-helpers.test.ts b/packages/core/src/__tests__/provider-helpers.test.ts index dcc2242..211b93a 100644 --- a/packages/core/src/__tests__/provider-helpers.test.ts +++ b/packages/core/src/__tests__/provider-helpers.test.ts @@ -85,6 +85,9 @@ describe('mapCcDeedUrl', () => { expect(mapCcDeedUrl(undefined)).toEqual({ license: 'unknown' }) expect(mapCcDeedUrl('https://example.org/x')).toEqual({ license: 'unknown' }) }) + it('never throws on a non-string input (array/number) → unknown', () => { + expect(mapCcDeedUrl(['x'] as any)).toEqual({ license: 'unknown' }) + }) }) describe('mapRightsUrl (CC deeds + faithful rightsstatements.org)', () => { @@ -106,6 +109,9 @@ describe('mapRightsUrl (CC deeds + faithful rightsstatements.org)', () => { expect(mapRightsUrl('http://rightsstatements.org/vocab/NKC/1.0/')).toEqual({ license: 'unknown' }) expect(mapRightsUrl(undefined)).toEqual({ license: 'unknown' }) }) + it('never throws on a non-string input (number) → unknown', () => { + expect(mapRightsUrl(123 as any)).toEqual({ license: 'unknown' }) + }) }) describe('image helpers', () => { diff --git a/packages/core/src/provider-helpers.ts b/packages/core/src/provider-helpers.ts index 87f9b5a..3fe76b4 100644 --- a/packages/core/src/provider-helpers.ts +++ b/packages/core/src/provider-helpers.ts @@ -75,7 +75,7 @@ export function first<T>(arr: T[] | undefined | null): T | undefined { * absent/unrecognized → unknown. **CC deeds only** — rightsstatements.org is handled by * `mapRightsUrl`. Match is on the path so http/https both work. 
*/ export function mapCcDeedUrl(url: string | undefined | null): { license: LicenseId; version?: string } { - if (!url) return { license: 'unknown' } + if (typeof url !== 'string' || !url) return { license: 'unknown' } const u = url.toLowerCase() if (u.includes('creativecommons.org/publicdomain/zero')) return { license: 'CC0-1.0' } if (u.includes('creativecommons.org/publicdomain/mark')) return { license: 'PD' } @@ -115,7 +115,7 @@ const RIGHTS_STATEMENT: Record<string, { license: LicenseId; jurisdiction?: stri * either (europeana `edm:rights`, internet-archive `licenseurl`). CC-only sources should * call `mapCcDeedUrl` directly. Unknown rightsstatements tokens → unknown. */ export function mapRightsUrl(url: string | undefined | null): { license: LicenseId; version?: string; jurisdiction?: string } { - if (!url) return { license: 'unknown' } + if (typeof url !== 'string' || !url) return { license: 'unknown' } const rs = url.toLowerCase().match(/rightsstatements\.org\/(?:vocab|page)\/([a-z-]+)/) if (rs) return RIGHTS_STATEMENT[rs[1]] ?? 
{ license: 'unknown' } return mapCcDeedUrl(url) diff --git a/packages/provider-freesound/src/__tests__/freesound.test.ts b/packages/provider-freesound/src/__tests__/freesound.test.ts index c0ac5f2..1939f48 100644 --- a/packages/provider-freesound/src/__tests__/freesound.test.ts +++ b/packages/provider-freesound/src/__tests__/freesound.test.ts @@ -54,6 +54,22 @@ describe('freesound provider', () => { expect(evaluateUse(unk.rights, 'commercial-product').decision).toBe('needs-review') }) + it('drops a result with no url without crashing the batch; keeps the valid one', async () => { + const MIXED = { + count: 2, next: null, previous: null, + results: [ + { id: 10, name: 'No URL', license: 'Attribution', username: 'eve', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/10/10_hq.mp3' } }, // url missing + { id: 11, name: 'Good one', license: 'Creative Commons 0', username: 'frank', + url: 'https://freesound.org/people/frank/sounds/11/', + previews: { 'preview-hq-mp3': 'https://cdn.freesound.org/previews/11/11_hq.mp3' } }, + ], + } + const refs = await freesound({ apiKey: 'k' }).search({ text: 'x', modalities: ['audio'] }, ctxJson(MIXED)) + expect(refs).toHaveLength(1) + expect(refs[0].canonicalUrl).toBe('https://freesound.org/people/frank/sounds/11/') + }) + it('forwards query, token, and fields; respects limit', async () => { let url = '' await freesound({ apiKey: 'secret' }).search( diff --git a/packages/provider-freesound/src/index.ts b/packages/provider-freesound/src/index.ts index 6d026d7..32602c2 100644 --- a/packages/provider-freesound/src/index.ts +++ b/packages/provider-freesound/src/index.ts @@ -63,7 +63,8 @@ interface FreesoundResult { } interface FreesoundResponse { count: number; results: FreesoundResult[] } -function toAudioReference(r: FreesoundResult): Reference { +function toAudioReference(r: FreesoundResult): Reference | null { + if (!r.url) return null // no canonical URL → unusable; drop rather than crash the batch const { license, 
version } = mapFreesoundLicense(r.license) const canonicalUrl = r.url const rights: RightsRecord = { @@ -109,7 +110,7 @@ export function freesound(config: FreesoundConfig) { if (!res.ok) throw new Error(`freesound search failed: ${res.status}`) const json = (await res.json()) as FreesoundResponse if (!json.results) return [] - return json.results.map(toAudioReference) + return json.results.map(toAudioReference).filter((x): x is Reference => x !== null) }, }) } diff --git a/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts b/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts index 82e8835..02f88ad 100644 --- a/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts +++ b/packages/provider-internet-archive/src/__tests__/internet-archive.test.ts @@ -74,6 +74,12 @@ const DOCS = [ title: 'A Collection', mediatype: 'collection', }, + { // array-valued Solr fields (title/licenseurl arrive as arrays) — must coerce, not crash + identifier: 'arr_doc', + title: ['Arr Title'], + licenseurl: ['https://creativecommons.org/licenses/by/4.0/'], + mediatype: 'movies', + }, ] const ctxResponding = (body: unknown, onUrl?: (u: string) => void): ProviderContext => ({ @@ -129,7 +135,18 @@ describe('internetArchive search', () => { ctxResponding({ response: { numFound: 4, docs: DOCS } }), ) expect(refs.map(r => r.canonicalUrl)).not.toContain('https://archive.org/details/some_collection') - expect(refs).toHaveLength(3) // bunny + clip + alice + expect(refs).toHaveLength(4) // bunny + clip + alice + arr_doc + }) + + it('coerces array-valued title/licenseurl Solr fields to scalars (no crash)', async () => { + const refs = await internetArchive().search( + { text: 'arr', modalities: ['video', 'text'] }, + ctxResponding({ response: { numFound: 5, docs: DOCS } }), + ) + const arr = refs.find(r => r.canonicalUrl === 'https://archive.org/details/arr_doc')! 
+ expect(arr).toBeDefined() + expect(arr.title).toBe('Arr Title') + expect(arr.rights.license).toBe('CC-BY') }) it('forwards query and rows to advancedsearch', async () => { diff --git a/packages/provider-internet-archive/src/index.ts b/packages/provider-internet-archive/src/index.ts index 763f107..d7827dd 100644 --- a/packages/provider-internet-archive/src/index.ts +++ b/packages/provider-internet-archive/src/index.ts @@ -31,9 +31,9 @@ export function mediatypeToModality(mt: string): Modality | null { interface IaDoc { identifier: string - title?: string + title?: string | string[] creator?: string | string[] - licenseurl?: string + licenseurl?: string | string[] mediatype: string } interface IaResponse { response?: { numFound: number; docs: IaDoc[] } } @@ -50,7 +50,10 @@ export function toReference(doc: IaDoc): Reference | null { const modality = mediatypeToModality(doc.mediatype) if (!modality) return null const canonicalUrl = `https://archive.org/details/${doc.identifier}` - const { license, version, jurisdiction } = mapIaLicense(doc.licenseurl) + // Solr fields can arrive as scalars OR arrays — coerce to the first scalar before mapping. + const licenseurl = Array.isArray(doc.licenseurl) ? doc.licenseurl[0] : doc.licenseurl + const title = Array.isArray(doc.title) ? doc.title[0] : doc.title + const { license, version, jurisdiction } = mapIaLicense(licenseurl) const rights: RightsRecord = { license, licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? 
version : undefined, @@ -63,7 +66,7 @@ export function toReference(doc: IaDoc): Reference | null { return { id: referenceId('internet-archive', canonicalUrl), modality, - title: doc.title || undefined, + title: title || undefined, source: { providerId: 'internet-archive', sourceUrl: canonicalUrl }, canonicalUrl, rights, diff --git a/packages/provider-jamendo/src/__tests__/jamendo.test.ts b/packages/provider-jamendo/src/__tests__/jamendo.test.ts index af5bbdb..0cb0b0c 100644 --- a/packages/provider-jamendo/src/__tests__/jamendo.test.ts +++ b/packages/provider-jamendo/src/__tests__/jamendo.test.ts @@ -95,6 +95,20 @@ describe('jamendo provider', () => { expect(evaluateUse(refs[0].rights, 'commercial-product').decision).toBe('needs-review') }) + const TRACK_NO_SHAREURL = { + ...TRACK_BY, + id: '4000003', + name: 'No Share URL', + shareurl: '', + } + + it('drops a track with no shareurl without crashing the batch; keeps the valid one', async () => { + const { ctx } = ctxCapturing(envelope([TRACK_NO_SHAREURL, TRACK_BY])) + const refs = await jamendo({ clientId: 'cid' }).search({ text: 'x', modalities: ['audio'] }, ctx) + expect(refs).toHaveLength(1) + expect(refs[0].canonicalUrl).toBe('https://www.jamendo.com/track/1848357') + }) + it('forwards client_id, search, limit, format and documented options', async () => { const { ctx, url } = ctxCapturing(envelope([])) await jamendo({ clientId: 'my-client-id' }).search({ diff --git a/packages/provider-jamendo/src/index.ts b/packages/provider-jamendo/src/index.ts index 2657d77..fb8cb8b 100644 --- a/packages/provider-jamendo/src/index.ts +++ b/packages/provider-jamendo/src/index.ts @@ -55,7 +55,8 @@ interface JamendoResponse { // the provider's tests import. 
export const mapJamendoLicense = mapCcDeedUrl -function toAudioReference(t: JamendoTrack, mediaType: string): Reference { +function toAudioReference(t: JamendoTrack, mediaType: string): Reference | null { + if (!t.shareurl) return null // no canonical URL → unusable; drop rather than crash the batch const { license, version } = mapJamendoLicense(t.license_ccurl) const canonicalUrl = t.shareurl const rights: RightsRecord = { @@ -111,7 +112,9 @@ export function jamendo(config: JamendoConfig) { const json = (await res.json()) as JamendoResponse if (json.headers?.status !== 'success') throw new Error(`jamendo search error: ${json.headers?.error_message || json.headers?.status}`) const mediaType = JAMENDO_AUDIO_MIME[opts?.audioformat ?? 'mp31'] ?? 'audio/mpeg' - return (json.results ?? []).map((t) => toAudioReference(t, mediaType)) + return (json.results ?? []) + .map((t) => toAudioReference(t, mediaType)) + .filter((x): x is Reference => x !== null) }, }) } diff --git a/packages/provider-polyhaven/src/index.ts b/packages/provider-polyhaven/src/index.ts index 6bca656..55406c4 100644 --- a/packages/provider-polyhaven/src/index.ts +++ b/packages/provider-polyhaven/src/index.ts @@ -1,5 +1,5 @@ import { - defineProvider, referenceId, + defineProvider, referenceId, imageMediaType, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -71,8 +71,8 @@ function toReference(id: string, asset: PolyHavenAsset, imageUrl: string): Refer verifiedAt: new Date().toISOString(), ...(asset.thumbnail_url ? { thumbnail: { url: asset.thumbnail_url } } : {}), // textureImageUrl may resolve a .png fallback — derive the MIME from the extension - // rather than hardcoding jpeg (mislabeling a PNG as JPEG). - preview: { url: imageUrl, mediaType: imageUrl.toLowerCase().includes('.png') ? 'image/png' : 'image/jpeg' }, + // (core imageMediaType) rather than hardcoding jpeg (mislabeling a PNG as JPEG). 
+ preview: { url: imageUrl, mediaType: imageMediaType(undefined, imageUrl) }, relevance: 0, raw: asset, } diff --git a/packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts b/packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts index fdb7b63..bb43916 100644 --- a/packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts +++ b/packages/provider-rijksmuseum/src/__tests__/rijksmuseum.test.ts @@ -169,6 +169,40 @@ describe('rijksmuseum provider', () => { expect(refs[0].preview?.mediaType).toBe('image/jpeg') }) + it('maps a found rightsstatements.org URI faithfully (InC→proprietary, NoC-US→PD+US)', async () => { + // findRightsUrl matches rightsstatements.org; mapping must honor it, not collapse to unknown. + const REC_INC = { + id: 'https://id.rijksmuseum.nl/200100333', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'In Copyright' }], + subject_to: [{ type: 'Right', classified_as: [{ id: 'http://rightsstatements.org/vocab/InC/1.0/' }] }], + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', format: 'image/jpeg', access_point: [{ id: 'https://iiif.example.org/inc/full/full/0/default.jpg' }] }] }], + } + const REC_NOC_US = { + id: 'https://id.rijksmuseum.nl/200100222', + type: 'HumanMadeObject', + identified_by: [{ type: 'Name', content: 'No Copyright US' }], + subject_to: [{ type: 'Right', classified_as: [{ id: 'http://rightsstatements.org/vocab/NoC-US/1.0/' }] }], + subject_of: [{ type: 'VisualItem', digitally_carried_by: [{ type: 'DigitalObject', format: 'image/jpeg', access_point: [{ id: 'https://iiif.example.org/noc/full/full/0/default.jpg' }] }] }], + } + const TWO = { + type: 'OrderedCollectionPage', + orderedItems: [ + { id: 'https://id.rijksmuseum.nl/200100333', type: 'HumanMadeObject' }, + { id: 'https://id.rijksmuseum.nl/200100222', type: 'HumanMadeObject' }, + ], + } + const refs = await rijksmuseum().search( + { text: 'x', modalities: ['image'] }, + ctxRouting(TWO, { 
'200100333': REC_INC, '200100222': REC_NOC_US }), + ) + const inc = refs.find(r => r.title === 'In Copyright')! + expect(inc.rights.license).toBe('proprietary') + const nocUs = refs.find(r => r.title === 'No Copyright US')! + expect(nocUs.rights.license).toBe('PD') + expect(nocUs.rights.jurisdiction).toBe('US') + }) + it('forwards the keyword and documented search options + caps the page size to the limit', async () => { let searchUrl = '' await rijksmuseum().search( diff --git a/packages/provider-rijksmuseum/src/index.ts b/packages/provider-rijksmuseum/src/index.ts index 677bb49..7bd2792 100644 --- a/packages/provider-rijksmuseum/src/index.ts +++ b/packages/provider-rijksmuseum/src/index.ts @@ -1,6 +1,6 @@ import { defineProvider, referenceId, - setIfString, setIfBoolean, mapCcDeedUrl, isLikelyImageUrl, + setIfString, setIfBoolean, mapRightsUrl, isLikelyImageUrl, type Reference, type RightsRecord, type NormalizedQuery, type ProviderContext, } from '@refkit/core' @@ -29,9 +29,11 @@ export interface RijksmuseumSearchOptions { const SEARCH = 'https://data.rijksmuseum.nl/search/collection' const RIJKS_TERMS = 'https://www.rijksmuseum.nl/en/data/policy' -// Rijksmuseum open-access rights are CC deed URLs (effectively CC0/PDM; BY/BY-SA possible). -// Rijksmuseum does not use rightsstatements.org, so we use the CC-only core `mapCcDeedUrl` -// (NOT core `mapRightsUrl`, which additionally handles rightsstatements.org). +// Rijksmuseum open-access rights are usually CC deed URLs (effectively CC0/PDM; BY/BY-SA +// possible), but `findRightsUrl` also matches rightsstatements.org URIs — so we map via core +// `mapRightsUrl` (CC deeds + faithful rightsstatements.org). Mapping via the CC-only +// `mapCcDeedUrl` would collapse a found rightsstatements URI to `unknown`, contradicting the +// matcher. mapRightsUrl delegates CC deeds to mapCcDeedUrl, so CC handling is identical. 
// The Linked-Art graph is deeply nested and varies per record, so we extract by // shape, not by fixed index paths (see plan Open Questions). @@ -134,10 +136,12 @@ function toReference(rec: Record<string, unknown>): Reference | null { if (!id) return null const img = findImage(rec) if (!img) return null // no usable IMAGE url (e.g. only a viewer/collection page) → drop - const { license, version } = mapCcDeedUrl(findRightsUrl(rec)) + const { license, version, jurisdiction } = mapRightsUrl(findRightsUrl(rec)) const rights: RightsRecord = { license, licenseVersion: license === 'CC-BY' || license === 'CC-BY-SA' ? version : undefined, + // jurisdiction-scoped status (e.g. rightsstatements NoC-US → PD in the US) + ...(jurisdiction ? { jurisdiction } : {}), author: findCreator(rec) || undefined, rehostPolicy: 'cache-allowed', raw: { sourceTerms: RIJKS_TERMS, sourceUrl: id },