From 295077ea70c99fd6c4ca0ef8c304781de07120c7 Mon Sep 17 00:00:00 2001
From: Alex Potsides
Date: Fri, 19 Jan 2024 10:04:11 +0100
Subject: [PATCH 1/2] feat: add blockReadConcurrency option to exporter (#361)

By default we attempt to load all siblings in a given layer of a DAG at once
to allow slow/async loading routines extra time to fetch data before it is
needed.

Some blockstores (e.g. CAR files) require the exporter to only request the
next sequential CID in a DAG.

Add a `blockReadConcurrency` option (named similarly to the importer's
`blockWriteConcurrency` option) to control this behaviour.

Fixes #359

---------

Co-authored-by: Rod Vagg
---
 packages/ipfs-unixfs-exporter/package.json    |   1 +
 packages/ipfs-unixfs-exporter/src/index.ts    |  36 ++-
 .../resolvers/unixfs-v1/content/directory.ts  |   5 +-
 .../src/resolvers/unixfs-v1/content/file.ts   |   3 +-
 .../content/hamt-sharded-directory.ts         |   5 +-
 .../test/exporter.spec.ts                     | 226 ++++++++++++++++++
 6 files changed, 272 insertions(+), 4 deletions(-)

diff --git a/packages/ipfs-unixfs-exporter/package.json b/packages/ipfs-unixfs-exporter/package.json
index 6fa9e5d8..4f108edf 100644
--- a/packages/ipfs-unixfs-exporter/package.json
+++ b/packages/ipfs-unixfs-exporter/package.json
@@ -78,6 +78,7 @@
     "iso-random-stream": "^2.0.2",
     "it-all": "^3.0.2",
     "it-buffer-stream": "^3.0.0",
+    "it-drain": "^3.0.5",
     "it-first": "^3.0.2",
     "it-to-buffer": "^4.0.2",
     "merge-options": "^3.0.4",
diff --git a/packages/ipfs-unixfs-exporter/src/index.ts b/packages/ipfs-unixfs-exporter/src/index.ts
index 5d31070d..b1adc319 100644
--- a/packages/ipfs-unixfs-exporter/src/index.ts
+++ b/packages/ipfs-unixfs-exporter/src/index.ts
@@ -94,9 +94,40 @@ export type ExporterProgressEvents =
   ProgressEvent<'unixfs:exporter:walk:raw', ExportWalk>
 
 export interface ExporterOptions extends ProgressOptions {
+  /**
+   * An optional offset to start reading at.
+   *
+   * If the CID resolves to a file this will be a byte offset within that file,
+   * otherwise if it's a directory it will be a directory entry offset within
+   * the directory listing. (default: undefined)
+   */
   offset?: number
+
+  /**
+   * An optional length to read.
+   *
+   * If the CID resolves to a file this will be the number of bytes read from
+   * the file, otherwise if it's a directory it will be the number of directory
+   * entries read from the directory listing. (default: undefined)
+   */
   length?: number
+
+  /**
+   * This signal can be used to abort any long-lived operations such as fetching
+   * blocks from the network. (default: undefined)
+   */
   signal?: AbortSignal
+
+  /**
+   * When a DAG layer is encountered, all child nodes are loaded in parallel but
+   * processed as they arrive. This allows us to load sibling nodes in advance
+   * of yielding their bytes. Pass a value here to control the number of blocks
+   * loaded in parallel. If a strict depth-first traversal is required, this
+   * value should be set to `1`, otherwise the traversal order will tend to
+   * resemble a breadth-first fan-out and will not have a stable ordering.
+ * (default: undefined) + */ + blockReadConcurrency?: number } export interface Exportable { @@ -143,6 +174,8 @@ export interface Exportable { size: bigint /** + * @example File content + * * When `entry` is a file or a `raw` node, `offset` and/or `length` arguments can be passed to `entry.content()` to return slices of data: * * ```javascript @@ -162,6 +195,8 @@ export interface Exportable { * return data * ``` * + * @example Directory content + * * If `entry` is a directory, passing `offset` and/or `length` to `entry.content()` will limit the number of files returned from the directory. * * ```javascript @@ -176,7 +211,6 @@ export interface Exportable { * * // `entries` contains the first 5 files/directories in the directory * ``` - * */ content(options?: ExporterOptions): AsyncGenerator } diff --git a/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/directory.ts b/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/directory.ts index bfa1d61d..afab2634 100644 --- a/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/directory.ts +++ b/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/directory.ts @@ -25,7 +25,10 @@ const directoryContent: UnixfsV1Resolver = (cid, node, unixfs, path, resolve, de return result.entry } }), - source => parallel(source, { ordered: true }), + source => parallel(source, { + ordered: true, + concurrency: options.blockReadConcurrency + }), source => filter(source, entry => entry != null) ) } diff --git a/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/file.ts b/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/file.ts index 1da18056..f65a449a 100644 --- a/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/file.ts +++ b/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/file.ts @@ -84,7 +84,8 @@ async function walkDAG (blockstore: ReadableStorage, node: dagPb.PBNode | Uint8A } }), (source) => parallel(source, { - ordered: true + ordered: true, + concurrency: options.blockReadConcurrency }), async (source) => { for await (const { link, block, blockStart } of source) { diff --git a/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/hamt-sharded-directory.ts b/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/hamt-sharded-directory.ts index 9e59d7c9..1c482c68 100644 --- a/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/hamt-sharded-directory.ts +++ b/packages/ipfs-unixfs-exporter/src/resolvers/unixfs-v1/content/hamt-sharded-directory.ts @@ -62,7 +62,10 @@ async function * listDirectory (node: PBNode, path: string, resolve: Resolve, de } } }), - source => parallel(source, { ordered: true }) + source => parallel(source, { + ordered: true, + concurrency: options.blockReadConcurrency + }) ) for await (const { entries } of results) { diff --git a/packages/ipfs-unixfs-exporter/test/exporter.spec.ts b/packages/ipfs-unixfs-exporter/test/exporter.spec.ts index f1c5fd95..a8c5aa74 100644 --- a/packages/ipfs-unixfs-exporter/test/exporter.spec.ts +++ b/packages/ipfs-unixfs-exporter/test/exporter.spec.ts @@ -12,6 +12,7 @@ import { fixedSize } from 'ipfs-unixfs-importer/chunker' import { balanced, type FileLayout, flat, trickle } from 'ipfs-unixfs-importer/layout' import all from 'it-all' import randomBytes from 'it-buffer-stream' +import drain from 'it-drain' import first from 'it-first' import last from 'it-last' import toBuffer from 'it-to-buffer' @@ -20,6 +21,7 @@ import * as raw from 'multiformats/codecs/raw' import { identity } from 
'multiformats/hashes/identity' import { sha256 } from 'multiformats/hashes/sha2' import { Readable } from 'readable-stream' +import Sinon from 'sinon' import { concat as uint8ArrayConcat } from 'uint8arrays/concat' import { fromString as uint8ArrayFromString } from 'uint8arrays/from-string' import { toString as uint8ArrayToString } from 'uint8arrays/to-string' @@ -1343,4 +1345,228 @@ describe('exporter', () => { dataSizeInBytes *= 10 } }) + + it('should allow control of block read concurrency for files', async () => { + // create a multi-layered DAG of a manageable size + const imported = await first(importer([{ + path: '1.2MiB.txt', + content: asAsyncIterable(smallFile) + }], block, { + rawLeaves: true, + chunker: fixedSize({ chunkSize: 50 }), + layout: balanced({ maxChildrenPerNode: 2 }) + })) + + if (imported == null) { + throw new Error('Nothing imported') + } + + const node = dagPb.decode(await block.get(imported.cid)) + expect(node.Links).to.have.lengthOf(2, 'imported node had too many children') + + const child1 = dagPb.decode(await block.get(node.Links[0].Hash)) + expect(child1.Links).to.have.lengthOf(2, 'layer 1 node had too many children') + + const child2 = dagPb.decode(await block.get(node.Links[1].Hash)) + expect(child2.Links).to.have.lengthOf(2, 'layer 1 node had too many children') + + // should be raw nodes + expect(child1.Links[0].Hash.code).to.equal(raw.code, 'layer 2 node had wrong codec') + expect(child1.Links[1].Hash.code).to.equal(raw.code, 'layer 2 node had wrong codec') + expect(child2.Links[0].Hash.code).to.equal(raw.code, 'layer 2 node had wrong codec') + expect(child2.Links[1].Hash.code).to.equal(raw.code, 'layer 2 node had wrong codec') + + // export file + const file = await exporter(imported.cid, block) + + // export file data with default settings + const blockReadSpy = Sinon.spy(block, 'get') + const contentWithDefaultBlockConcurrency = await toBuffer(file.content()) + + // blocks should be loaded in default order - a whole level of sibling nodes at a time + expect(blockReadSpy.getCalls().map(call => call.args[0].toString())).to.deep.equal([ + node.Links[0].Hash.toString(), + node.Links[1].Hash.toString(), + child1.Links[0].Hash.toString(), + child1.Links[1].Hash.toString(), + child2.Links[0].Hash.toString(), + child2.Links[1].Hash.toString() + ]) + + // export file data overriding read concurrency + blockReadSpy.resetHistory() + const contentWitSmallBlockConcurrency = await toBuffer(file.content({ + blockReadConcurrency: 1 + })) + + // blocks should be loaded in traversal order + expect(blockReadSpy.getCalls().map(call => call.args[0].toString())).to.deep.equal([ + node.Links[0].Hash.toString(), + child1.Links[0].Hash.toString(), + child1.Links[1].Hash.toString(), + node.Links[1].Hash.toString(), + child2.Links[0].Hash.toString(), + child2.Links[1].Hash.toString() + ]) + + // ensure exported bytes are the same + expect(contentWithDefaultBlockConcurrency).to.equalBytes(contentWitSmallBlockConcurrency) + }) + + it('should allow control of block read concurrency for directories', async () => { + const entries = 1024 + + // create a largeish directory + const imported = await last(importer((async function * () { + for (let i = 0; i < entries; i++) { + yield { + path: `file-${i}.txt`, + content: Uint8Array.from([i]) + } + } + })(), block, { + wrapWithDirectory: true + })) + + if (imported == null) { + throw new Error('Nothing imported') + } + + const node = dagPb.decode(await block.get(imported.cid)) + expect(node.Links).to.have.lengthOf(entries, 'imported node 
had too many children') + + for (const link of node.Links) { + // should be raw nodes + expect(link.Hash.code).to.equal(raw.code, 'child node had wrong codec') + } + + // export directory + const directory = await exporter(imported.cid, block) + + // export file data with default settings + const originalGet = block.get.bind(block) + + const expectedInvocations: string[] = [] + + for (const link of node.Links) { + expectedInvocations.push(`${link.Hash.toString()}-start`) + expectedInvocations.push(`${link.Hash.toString()}-end`) + } + + const actualInvocations: string[] = [] + + block.get = async (cid) => { + actualInvocations.push(`${cid.toString()}-start`) + + // introduce a small delay - if running in parallel actualInvocations will + // be: + // `foo-start`, `bar-start`, `baz-start`, `foo-end`, `bar-end`, `baz-end` + // if in series it will be: + // `foo-start`, `foo-end`, `bar-start`, `bar-end`, `baz-start`, `baz-end` + await delay(1) + + actualInvocations.push(`${cid.toString()}-end`) + + return originalGet(cid) + } + + const blockReadSpy = Sinon.spy(block, 'get') + await drain(directory.content({ + blockReadConcurrency: 1 + })) + + // blocks should be loaded in default order - a whole level of sibling nodes at a time + expect(blockReadSpy.getCalls().map(call => call.args[0].toString())).to.deep.equal( + node.Links.map(link => link.Hash.toString()) + ) + + expect(actualInvocations).to.deep.equal(expectedInvocations) + }) + + it('should allow control of block read concurrency for HAMT sharded directories', async () => { + const entries = 1024 + + // create a sharded directory + const imported = await last(importer((async function * () { + for (let i = 0; i < entries; i++) { + yield { + path: `file-${i}.txt`, + content: Uint8Array.from([i]) + } + } + })(), block, { + wrapWithDirectory: true, + shardSplitThresholdBytes: 10 + })) + + if (imported == null) { + throw new Error('Nothing imported') + } + + const node = dagPb.decode(await block.get(imported.cid)) + const data = UnixFS.unmarshal(node.Data ?? 
new Uint8Array(0)) + expect(data.type).to.equal('hamt-sharded-directory') + + // traverse the shard, collect all the CIDs + async function collectCIDs (node: PBNode): Promise { + const children: CID[] = [] + + for (const link of node.Links) { + children.push(link.Hash) + + if (link.Hash.code === dagPb.code) { + const buf = await block.get(link.Hash) + const childNode = dagPb.decode(buf) + + children.push(...(await collectCIDs(childNode))) + } + } + + return children + } + + const children: CID[] = await collectCIDs(node) + + // export directory + const directory = await exporter(imported.cid, block) + + // export file data with default settings + const originalGet = block.get.bind(block) + + const expectedInvocations: string[] = [] + + for (const cid of children) { + expectedInvocations.push(`${cid.toString()}-start`) + expectedInvocations.push(`${cid.toString()}-end`) + } + + const actualInvocations: string[] = [] + + block.get = async (cid) => { + actualInvocations.push(`${cid.toString()}-start`) + + // introduce a small delay - if running in parallel actualInvocations will + // be: + // `foo-start`, `bar-start`, `baz-start`, `foo-end`, `bar-end`, `baz-end` + // if in series it will be: + // `foo-start`, `foo-end`, `bar-start`, `bar-end`, `baz-start`, `baz-end` + await delay(1) + + actualInvocations.push(`${cid.toString()}-end`) + + return originalGet(cid) + } + + const blockReadSpy = Sinon.spy(block, 'get') + await drain(directory.content({ + blockReadConcurrency: 1 + })) + + // blocks should be loaded in default order - a whole level of sibling nodes at a time + expect(blockReadSpy.getCalls().map(call => call.args[0].toString())).to.deep.equal( + children.map(link => link.toString()) + ) + + expect(actualInvocations).to.deep.equal(expectedInvocations) + }) }) From 62918171c1d5ffa3c3e6cd0acb7dd5c3ab9b7f20 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 19 Jan 2024 09:10:22 +0000 Subject: [PATCH 2/2] chore(release): 13.4.0 [skip ci] ## ipfs-unixfs-exporter [13.4.0](https://github.com/ipfs/js-ipfs-unixfs/compare/ipfs-unixfs-exporter-13.3.1...ipfs-unixfs-exporter-13.4.0) (2024-01-19) ### Features * add blockReadConcurrency option to exporter ([#361](https://github.com/ipfs/js-ipfs-unixfs/issues/361)) ([295077e](https://github.com/ipfs/js-ipfs-unixfs/commit/295077ea70c99fd6c4ca0ef8c304781de07120c7)), closes [#359](https://github.com/ipfs/js-ipfs-unixfs/issues/359) --- packages/ipfs-unixfs-exporter/CHANGELOG.md | 7 +++++++ packages/ipfs-unixfs-exporter/package.json | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/ipfs-unixfs-exporter/CHANGELOG.md b/packages/ipfs-unixfs-exporter/CHANGELOG.md index ea372edc..67feb227 100644 --- a/packages/ipfs-unixfs-exporter/CHANGELOG.md +++ b/packages/ipfs-unixfs-exporter/CHANGELOG.md @@ -1,3 +1,10 @@ +## ipfs-unixfs-exporter [13.4.0](https://github.com/ipfs/js-ipfs-unixfs/compare/ipfs-unixfs-exporter-13.3.1...ipfs-unixfs-exporter-13.4.0) (2024-01-19) + + +### Features + +* add blockReadConcurrency option to exporter ([#361](https://github.com/ipfs/js-ipfs-unixfs/issues/361)) ([295077e](https://github.com/ipfs/js-ipfs-unixfs/commit/295077ea70c99fd6c4ca0ef8c304781de07120c7)), closes [#359](https://github.com/ipfs/js-ipfs-unixfs/issues/359) + ## ipfs-unixfs-exporter 
[13.3.1](https://github.com/ipfs/js-ipfs-unixfs/compare/ipfs-unixfs-exporter-13.3.0...ipfs-unixfs-exporter-13.3.1) (2024-01-19) diff --git a/packages/ipfs-unixfs-exporter/package.json b/packages/ipfs-unixfs-exporter/package.json index 4f108edf..ffc4006b 100644 --- a/packages/ipfs-unixfs-exporter/package.json +++ b/packages/ipfs-unixfs-exporter/package.json @@ -1,6 +1,6 @@ { "name": "ipfs-unixfs-exporter", - "version": "13.3.1", + "version": "13.4.0", "description": "JavaScript implementation of the UnixFs exporter used by IPFS", "license": "Apache-2.0 OR MIT", "homepage": "https://github.com/ipfs/js-ipfs-unixfs/tree/master/packages/ipfs-unixfs-exporter#readme",
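
Example usage of the new option (a minimal sketch, not part of the patch above): `blockstore` and `cid` are placeholder declarations, and the `Blockstore` type from `interface-blockstore` is assumed to be available in the consuming project. As in the tests above, `blockReadConcurrency` is passed to `entry.content()`.

```typescript
import { exporter } from 'ipfs-unixfs-exporter'
import toBuffer from 'it-to-buffer'
import type { Blockstore } from 'interface-blockstore'
import type { CID } from 'multiformats/cid'

// placeholders - substitute a real blockstore (e.g. one backed by a CAR file
// that can only serve blocks in the order they were written) and the root CID
// of the DAG to export
declare const blockstore: Blockstore
declare const cid: CID

const entry = await exporter(cid, blockstore)

if (entry.type === 'file' || entry.type === 'raw') {
  // blockReadConcurrency: 1 forces a strict depth-first traversal - the
  // exporter requests one block at a time instead of fetching a whole layer
  // of sibling blocks in parallel
  const bytes = await toBuffer(entry.content({
    blockReadConcurrency: 1
  }))

  console.log(`exported ${bytes.byteLength} bytes`)
}
```

Leaving `blockReadConcurrency` unset keeps the default behaviour, where all siblings in a DAG layer are requested in parallel and yielded in order as they resolve.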