url: add fileURLToPathBuffer API

The existing `fileURLToPath()` does not handle the case
where the input URL contains percent-encoded characters
that are not valid UTF-8 sequences. This can lead to
issues, for instance, when the URL is constructed
using file names in non-Unicode encodings (like
Shift-JIS). This commit introduces a new API,
`fileURLToPathBuffer()`, which returns a `Buffer`
representing the path, allowing for accurate
conversion of file URLs to paths without attempting
to decode the percent-encoded bytes into characters.

PR-URL: https://github.com/nodejs/node/pull/58700
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: Ethan Arrowood <ethan@arrowood.dev>
Reviewed-By: LiviaMedeiros <livia@cirno.name>
This commit is contained in:
James M Snell 2025-06-12 13:43:56 -07:00
parent f582d4569f
commit 3f6ad56846
6 changed files with 215 additions and 0 deletions

View file

@ -1358,6 +1358,26 @@ new URL('file:///hello world').pathname; // Incorrect: /hello%20world
fileURLToPath('file:///hello world'); // Correct: /hello world (POSIX)
```
### `url.fileURLToPathBuffer(url[, options])`
<!-- YAML
added: REPLACEME
-->
* `url` {URL | string} The file URL string or URL object to convert to a path.
* `options` {Object}
* `windows` {boolean|undefined} `true` if the `path` should be
returned as a Windows filepath, `false` for POSIX, and
`undefined` for the system default.
**Default:** `undefined`.
* Returns: {Buffer} The fully-resolved platform-specific Node.js file path
as a {Buffer}.
Like `url.fileURLToPath(...)` except that instead of returning a string
representation of the path, a `Buffer` is returned. This conversion is
helpful when the input URL contains percent-encoded segments that are
not valid UTF-8 / Unicode sequences.
### `url.format(URL[, options])`
<!-- YAML

View file

@ -349,4 +349,5 @@ function isomorphicDecode(input) {
module.exports = {
dataURLProcessor,
percentDecode,
};

View file

@ -29,6 +29,9 @@ const {
Symbol,
SymbolIterator,
SymbolToStringTag,
TypedArrayPrototypeGetBuffer,
TypedArrayPrototypeGetByteLength,
TypedArrayPrototypeGetByteOffset,
decodeURIComponent,
} = primordials;
@ -81,13 +84,17 @@ const {
CHAR_LOWERCASE_Z,
CHAR_PERCENT,
CHAR_PLUS,
CHAR_COLON,
} = require('internal/constants');
const path = require('path');
const { Buffer } = require('buffer');
const {
validateFunction,
} = require('internal/validators');
const { percentDecode } = require('internal/data_url');
const querystring = require('querystring');
const bindingUrl = internalBinding('url');
@ -1482,6 +1489,76 @@ function getPathFromURLWin32(url) {
return StringPrototypeSlice(pathname, 1);
}
/**
 * Converts a `file:` URL into a Windows file path expressed as raw bytes.
 *
 * The string-returning getPathFromURLWin32() scans for percent-encoded
 * slashes and backslashes and rejects them. That check is not meaningful
 * here: in a non-UTF-8 encoding such as Shift-JIS, `%5C` is the yen sign
 * rather than a backslash, and a single path may even mix encodings, so
 * there is no reliable way to tell what an encoded byte was meant to be.
 * This variant therefore performs no such scan and simply treats the
 * pathname as a sequence of bytes. The resulting ambiguity with the Windows
 * separator under Shift-JIS is a known limitation that is not solved here.
 * @param {URL} url The parsed `file:` URL.
 * @returns {Buffer} `\\host\path` bytes when the URL has a hostname,
 *   otherwise the `X:\path` bytes of a local drive path.
 * @throws {ERR_INVALID_FILE_URL_PATH} If the URL has no hostname and the
 *   decoded path does not begin with a drive letter followed by `:`.
 */
function getPathBufferFromURLWin32(url) {
  const { hostname, pathname } = url;

  // Only the literal forward slashes become backslashes; percent-encoded
  // forward slashes are intentionally left alone at this point.
  const backslashed =
    SideEffectFreeRegExpPrototypeSymbolReplace(FORWARD_SLASH, pathname, '\\');

  // Percent-decode at the byte level. Invalid escapes such as `%ZZ` are
  // passed through literally. The decoded bytes are then re-wrapped as a
  // Buffer over the same underlying memory.
  const rawBytes = percentDecode(Buffer.from(backslashed, 'utf8'));
  const pathBytes = Buffer.from(
    TypedArrayPrototypeGetBuffer(rawBytes),
    TypedArrayPrototypeGetByteOffset(rawBytes),
    TypedArrayPrototypeGetByteLength(rawBytes),
  );

  if (hostname === '') {
    // Local path: byte 0 is the leading separator, byte 1 must be an ASCII
    // drive letter and byte 2 must be an ASCII colon. OR-ing with 0x20 folds
    // 'A'..'Z' onto 'a'..'z' so a single range comparison covers both cases.
    const driveLetter = pathBytes[1] | 0x20;
    const hasDriveLetter = driveLetter >= CHAR_LOWERCASE_A &&
                           driveLetter <= CHAR_LOWERCASE_Z;
    if (!hasDriveLetter || pathBytes[2] !== CHAR_COLON) {
      throw new ERR_INVALID_FILE_URL_PATH('must be absolute');
    }
    // Drop the leading separator so the result starts at the drive letter.
    return pathBytes.subarray(1);
  }

  // A hostname means a UNC path. The hostname goes through domainToUnicode()
  // in case it is a punycode (`xn--`) IDN; percent-encoding in the hostname
  // has already been dealt with by the URL parser.
  return Buffer.concat([
    Buffer.from('\\\\', 'ascii'),
    Buffer.from(domainToUnicode(hostname), 'utf8'),
    pathBytes,
  ]);
}
function getPathFromURLPosix(url) {
if (url.hostname !== '') {
throw new ERR_INVALID_FILE_URL_HOST(platform);
@ -1500,6 +1577,28 @@ function getPathFromURLPosix(url) {
return decodeURIComponent(pathname);
}
/**
 * Converts a `file:` URL into a POSIX file path expressed as raw bytes.
 *
 * The string-returning getPathFromURLPosix() scans for percent-encoded
 * forward slashes and rejects them. That check can conflict with non-UTF-8
 * encodings, and a single path may mix several encodings, so this variant
 * performs no such scan: the pathname is percent-decoded byte by byte and
 * returned as-is.
 * @param {URL} url The parsed `file:` URL.
 * @returns {Buffer} The percent-decoded bytes of the URL's pathname.
 * @throws {ERR_INVALID_FILE_URL_HOST} If the URL has a non-empty hostname.
 */
function getPathBufferFromURLPosix(url) {
  const hasHost = url.hostname !== '';
  if (hasHost) {
    throw new ERR_INVALID_FILE_URL_HOST(platform);
  }

  const encodedBytes = Buffer.from(url.pathname, 'utf8');
  const decodedBytes = percentDecode(encodedBytes);

  // Re-wrap the decoded bytes as a Buffer over the same underlying memory.
  const backingStore = TypedArrayPrototypeGetBuffer(decodedBytes);
  const start = TypedArrayPrototypeGetByteOffset(decodedBytes);
  const size = TypedArrayPrototypeGetByteLength(decodedBytes);
  return Buffer.from(backingStore, start, size);
}
function fileURLToPath(path, options = kEmptyObject) {
const windows = options?.windows;
if (typeof path === 'string')
@ -1511,6 +1610,24 @@ function fileURLToPath(path, options = kEmptyObject) {
return (windows ?? isWindows) ? getPathFromURLWin32(path) : getPathFromURLPosix(path);
}
/**
 * Buffer-returning counterpart of fileURLToPath().
 *
 * fileURLToPath() does not handle percent-encoded bytes that are not valid
 * UTF-8, so its conversion to a string can fail for such URLs. This function
 * returns the path as bytes instead.
 * @param {URL | string} path The file URL string or URL object to convert.
 * @param {{ windows?: boolean }} [options] `windows: true` selects the
 *   Windows conversion, `false` the POSIX one; when it is `undefined` (or
 *   `null`) the module-level `isWindows` value decides.
 * @returns {Buffer} The path bytes.
 * @throws {ERR_INVALID_ARG_TYPE} If `path` is neither a string nor a URL.
 * @throws {ERR_INVALID_URL_SCHEME} If the URL's protocol is not `file:`.
 */
function fileURLToPathBuffer(path, options = kEmptyObject) {
  const useWindowsRules = options?.windows ?? isWindows;

  let fileUrl;
  if (typeof path === 'string') {
    fileUrl = new URL(path);
  } else if (isURL(path)) {
    fileUrl = path;
  } else {
    throw new ERR_INVALID_ARG_TYPE('path', ['string', 'URL'], path);
  }

  if (fileUrl.protocol !== 'file:') {
    throw new ERR_INVALID_URL_SCHEME('file');
  }

  const convert = useWindowsRules ?
    getPathBufferFromURLWin32 :
    getPathBufferFromURLPosix;
  return convert(fileUrl);
}
function pathToFileURL(filepath, options = kEmptyObject) {
const windows = options?.windows ?? isWindows;
const isUNC = windows && StringPrototypeStartsWith(filepath, '\\\\');
@ -1571,6 +1688,7 @@ function getURLOrigin(url) {
module.exports = {
fileURLToPath,
fileURLToPathBuffer,
pathToFileURL,
toPathIfFileURL,
installObjectURLMethods,

View file

@ -60,6 +60,7 @@ const {
domainToASCII,
domainToUnicode,
fileURLToPath,
fileURLToPathBuffer,
pathToFileURL: _pathToFileURL,
urlToHttpOptions,
unsafeProtocol,
@ -1037,5 +1038,6 @@ module.exports = {
// Utilities
pathToFileURL,
fileURLToPath,
fileURLToPathBuffer,
urlToHttpOptions,
};

View file

@ -105,6 +105,8 @@ expected.beforePreExec = new Set([
'Internal Binding wasm_web_api',
'NativeModule internal/events/abort_listener',
'NativeModule internal/modules/typescript',
'NativeModule internal/data_url',
'NativeModule internal/mime',
]);
expected.atRunTime = new Set([

View file

@ -0,0 +1,72 @@
'use strict';

// Checks that url.fileURLToPathBuffer() can turn a file URL whose name is
// percent-encoded Shift-JIS (not valid UTF-8) into a path that fs accepts,
// whereas the string-based conversions throw a URIError for the same URL.

const common = require('../common');

// macOS is skipped because of the way it handles non-Unicode sequences in
// file names.
if (common.isMacOS) {
  common.skip('Test unsupported on OSX');
}

// Windows is skipped as well: when writeFileSync normalizes the path it
// replaces the non-Unicode characters with replacement characters.
if (common.isWindows) {
  common.skip('Test unsupported on Windows');
}

const assert = require('node:assert');
const fs = require('node:fs');
const path = require('node:path');
const url = require('node:url');
const tmpdir = require('../common/tmpdir');

tmpdir.refresh();

// The same file name, once percent-encoded and once as raw bytes.
const kShiftJisName = '%82%A0%82%A2%82%A4';
const kShiftJisBuffer = Buffer.from([0x82, 0xA0, 0x82, 0xA2, 0x82, 0xA4]);

const tmpdirWithSep = `${tmpdir.path}${path.sep}`;

const testPath = new URL(kShiftJisName, url.pathToFileURL(tmpdirWithSep));
assert.ok(testPath.pathname.endsWith(`/${kShiftJisName}`));

const testPathBuffer = Buffer.concat([
  Buffer.from(tmpdirWithSep, 'utf8'),
  kShiftJisBuffer,
]);

// Passing the URL straight to fs fails, because it contains non-Unicode
// percent-encoded characters ...
assert.throws(() => fs.writeFileSync(testPath, 'test'), {
  name: 'URIError',
});

// ... but the Buffer form of the same path can be used to create the file
// and to check that it exists.
fs.writeFileSync(testPathBuffer, 'test');
assert.ok(fs.existsSync(testPathBuffer));

// fileURLToPath() fails for the same reason as passing the URL directly.
assert.throws(() => fs.existsSync(url.fileURLToPath(testPath)), {
  name: 'URIError',
});

// fileURLToPathBuffer() succeeds, since it converts the URL to bytes without
// trying to interpret the percent-encoded characters.
assert.ok(fs.existsSync(url.fileURLToPathBuffer(testPath)));