Another (and hopefully last) major streams commit.

This moves unicode conversion to the filter layer
(rather than at the lower streams layer).
unicode_filter.c has been moved from ext/unicode to main/streams,
as it's an integral part of the streams unicode conversion process.

There are now three ways to set encoding on a stream:

(1) By context
$ctx = stream_context_create(NULL,array('encoding'=>'latin1'));
$fp = fopen('somefile', 'r+t', false, $ctx);

(2) By stream_encoding()
$fp = fopen('somefile', 'r+');
stream_encoding($fp, 'latin1');

(3) By filter
$fp = fopen('somefile', 'r+');
stream_filter_append($fp, 'unicode.from.latin1', STREAM_FILTER_READ);
stream_filter_append($fp, 'unicode.to.latin1', STREAM_FILTER_WRITE);

Note: Methods 1 and 2 are convenience wrappers around method 3.
This commit is contained in:
Sara Golemon 2006-03-29 01:20:43 +00:00
parent f028fcecb5
commit 30a2bd1d11
15 changed files with 275 additions and 238 deletions

View file

@ -589,6 +589,7 @@ zend_function_entry basic_functions[] = {
PHP_FE(stream_filter_prepend, NULL)
PHP_FE(stream_filter_append, NULL)
PHP_FE(stream_filter_remove, NULL)
PHP_FE(stream_encoding, NULL)
PHP_FE(stream_socket_client, second_and_third_args_force_ref)
PHP_FE(stream_socket_server, second_and_third_args_force_ref)
PHP_FE(stream_socket_accept, third_arg_force_ref)

View file

@ -1008,14 +1008,14 @@ PHPAPI PHP_FUNCTION(fgets)
php_stream_from_zval(stream, &zstream);
buf.v = php_stream_get_line_ex(stream, php_stream_reads_unicode(stream) ? IS_UNICODE : IS_STRING, NULL_ZSTR, 0, length, &retlen);
buf.v = php_stream_get_line_ex(stream, stream->readbuf_type, NULL_ZSTR, 0, length, &retlen);
if (!buf.v) {
RETURN_FALSE;
}
if (php_stream_reads_unicode(stream)) {
if (stream->readbuf_type == IS_UNICODE) {
RETURN_UNICODEL(buf.u, retlen, 0);
} else {
} else { /* IS_STRING */
RETURN_STRINGL(buf.s, retlen, 0);
}
}
@ -1034,7 +1034,7 @@ PHPAPI PHP_FUNCTION(fgetc)
PHP_STREAM_TO_ZVAL(stream, arg1);
if (php_stream_reads_unicode(stream)) {
if (stream->readbuf_type == IS_UNICODE) {
int buflen = 1;
UChar *buf = php_stream_read_unicode_chars(stream, &buflen);
@ -1042,7 +1042,7 @@ PHPAPI PHP_FUNCTION(fgetc)
RETURN_FALSE;
}
RETURN_UNICODEL(buf, buflen, 0);
} else {
} else { /* IS_STRING */
char buf[2];
buf[0] = php_stream_getc(stream);
@ -1068,7 +1068,7 @@ PHPAPI PHP_FUNCTION(fgetss)
php_stream_from_zval(stream, &zstream);
if (php_stream_reads_unicode(stream)) {
if (stream->readbuf_type == IS_UNICODE) {
UChar *buf = php_stream_get_line_ex(stream, IS_UNICODE, NULL_ZSTR, 0, length, &retlen);
UChar *allowed = NULL;
int allowed_len = 0;
@ -1085,7 +1085,7 @@ PHPAPI PHP_FUNCTION(fgetss)
retlen = php_u_strip_tags(buf, retlen, &stream->fgetss_state, allowed, allowed_len TSRMLS_CC);
RETURN_UNICODEL(buf, retlen, 0);
} else {
} else { /* IS_STRING */
char *buf = php_stream_get_line_ex(stream, IS_STRING, NULL_ZSTR, 0, length, &retlen);
char *allowed = NULL;
int allowed_len = 0;
@ -1752,7 +1752,7 @@ PHPAPI PHP_FUNCTION(fread)
RETURN_FALSE;
}
if (php_stream_reads_unicode(stream)) {
if (stream->readbuf_type == IS_UNICODE) {
int buflen = len;
UChar *buf = php_stream_read_unicode_chars(stream, &buflen);
@ -1761,7 +1761,7 @@ PHPAPI PHP_FUNCTION(fread)
}
RETURN_UNICODEL(buf, buflen, 0);
} else {
} else { /* IS_STRING */
char *buf = emalloc(len + 1);
int buflen = php_stream_read(stream, buf, len);

View file

@ -489,11 +489,11 @@ PHP_FUNCTION(stream_get_meta_data)
add_assoc_zval(return_value, "write_filters", newval);
}
if (php_stream_reads_unicode(stream)) {
if (stream->readbuf_type == IS_UNICODE) {
int readbuf_len = u_countChar32(stream->readbuf.u + stream->readpos, stream->writepos - stream->readpos);
add_assoc_long(return_value, "unread_bytes", UBYTES(stream->writepos - stream->readpos));
add_assoc_long(return_value, "unread_chars", readbuf_len);
} else {
} else { /* IS_STRING */
add_assoc_long(return_value, "unread_bytes", stream->writepos - stream->readpos);
add_assoc_long(return_value, "unread_chars", stream->writepos - stream->readpos);
}
@ -1275,7 +1275,7 @@ PHP_FUNCTION(stream_get_line)
php_stream_from_zval(stream, &zstream);
if (php_stream_reads_unicode(stream)) {
if (stream->readbuf_type == IS_UNICODE) {
UChar *buf;
UChar *d = NULL;
int dlen = 0;
@ -1294,7 +1294,7 @@ PHP_FUNCTION(stream_get_line)
}
RETURN_UNICODEL(buf, buf_size, 0);
} else {
} else { /* IS_STRING */
char *buf;
char *d = NULL;
int dlen = 0;
@ -1462,6 +1462,67 @@ PHP_FUNCTION(stream_socket_enable_crypto)
}
/* }}} */
/* {{{ proto bool stream_encoding(resource stream[, string encoding])
   Set the character set used for unicode conversion on a stream.
   Replaces a trailing unicode.from.* / unicode.to.* filter if one is present,
   then appends fresh conversion filters for the requested encoding.
   When encoding is omitted, "utf8" is used.
   Returns TRUE on success; FALSE (with a warning) if the filter chains do not
   allow the encoding to be changed or a conversion filter could not be created.
   UTODO: Return current encoding charset
 */
PHP_FUNCTION(stream_encoding)
{
	zval *zstream;
	php_stream *stream;
	char *encoding = NULL;
	int encoding_len = 0;
	int remove_read_tail = 0, remove_write_tail = 0;
	int write_result, read_result;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r|s", &zstream, &encoding, &encoding_len) == FAILURE) {
		return;
	}

	php_stream_from_zval(stream, &zstream);

	/* The encoding argument is optional; without this fallback a NULL pointer
	   would be handed to php_stream_encoding_apply(), which strlen()s it. */
	if (encoding == NULL) {
		encoding = "utf8";
	}

	/* Inspect both filter chains before touching either of them, so that a
	   refusal leaves the stream exactly as it was */
	if (stream->readfilters.tail) {
		if (stream->readfilters.tail->fops == &php_unicode_from_string_filter_ops) {
			/* Remove the current unicode.from.* filter,
			   the filter layer will transcode anything in the read buffer back to binary
			   or invalidate the read buffer */
			remove_read_tail = 1;
		} else if (stream->readbuf_type == IS_UNICODE) {
			/* There's an encoding on the stream already, but then there's filtering happening after that point
			   It's asking too much for PHP to figure out what the user wants, throw an error back in their face */
			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Cannot change encoding on filtered stream");
			RETURN_FALSE;
		}
	}

	if (stream->writefilters.tail) {
		if (stream->writefilters.tail->fops == &php_unicode_to_string_filter_ops) {
			/* Remove the current unicode.to.* filter */
			remove_write_tail = 1;
		} else if ((stream->writefilters.tail->fops->flags & PSFO_FLAG_OUTPUTS_UNICODE) == 0) {
			/* conversion to binary is happening, then another filter is doing something
			   bailout for same reason as read filters */
			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Cannot change encoding on filtered stream");
			RETURN_FALSE;
		}
	}

	if (remove_read_tail) {
		php_stream_filter_remove(stream->readfilters.tail, 1 TSRMLS_CC);
	}
	if (remove_write_tail) {
		php_stream_filter_remove(stream->writefilters.tail, 1 TSRMLS_CC);
	}

	/* UTODO: Allow overriding error handling for converters */
	/* Attempt both directions (write chain first, as before) even if one fails,
	   then report failure if either filter could not be installed */
	write_result = php_stream_encoding_apply(stream, 1, encoding, UG(from_error_mode), UG(from_subst_char));
	read_result = php_stream_encoding_apply(stream, 0, encoding, UG(to_error_mode), NULL);

	if (write_result == FAILURE || read_result == FAILURE) {
		RETURN_FALSE;
	}

	RETURN_TRUE;
}
/* }}} */
/*
* Local variables:
* tab-width: 4

View file

@ -53,6 +53,7 @@ PHP_FUNCTION(stream_context_get_default);
PHP_FUNCTION(stream_filter_prepend);
PHP_FUNCTION(stream_filter_append);
PHP_FUNCTION(stream_filter_remove);
PHP_FUNCTION(stream_encoding);
PHP_FUNCTION(stream_socket_enable_crypto);
PHP_FUNCTION(stream_socket_pair);

View file

@ -4,4 +4,4 @@ dnl
PHP_SUBST(UNICODE_SHARED_LIBADD)
AC_DEFINE(HAVE_UNICODE, 1, [ ])
PHP_NEW_EXTENSION(unicode, unicode.c locale.c unicode_filter.c unicode_iterators.c collator.c, $ext_shared)
PHP_NEW_EXTENSION(unicode, unicode.c locale.c unicode_iterators.c collator.c, $ext_shared)

View file

@ -1,5 +1,5 @@
// $Id$
// vim:ft=javascript
EXTENSION("unicode", "unicode.c unicode_filter.c unicode_iterators.c collator.c locale.c");
EXTENSION("unicode", "unicode.c unicode_iterators.c collator.c locale.c");
AC_DEFINE('HAVE_UNICODE', 1, 'ICU API extension');

View file

@ -67,7 +67,6 @@ PHP_FUNCTION(collator_get_attribute);
PHP_METHOD(collator, __construct);
void php_init_collation(TSRMLS_D);
extern php_stream_filter_factory php_unicode_filter_factory;
#ifdef __cplusplus
} // extern "C"

View file

@ -273,10 +273,6 @@ ZEND_GET_MODULE(unicode)
/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(unicode)
{
if (php_stream_filter_register_factory("unicode.*", &php_unicode_filter_factory TSRMLS_CC) == FAILURE) {
return FAILURE;
}
php_register_unicode_iterators(TSRMLS_C);
php_init_collation(TSRMLS_C);
@ -287,9 +283,6 @@ PHP_MINIT_FUNCTION(unicode)
/* {{{ PHP_MSHUTDOWN_FUNCTION */
PHP_MSHUTDOWN_FUNCTION(unicode)
{
if (php_stream_filter_unregister_factory("unicode.*" TSRMLS_CC) == FAILURE) {
return FAILURE;
}
/* add your stuff here */

View file

@ -1611,6 +1611,12 @@ int php_module_startup(sapi_module_struct *sf, zend_module_entry *additional_mod
return FAILURE;
}
/* Initialize unicode filters */
if (php_stream_filter_register_factory("unicode.*", &php_unicode_filter_factory TSRMLS_CC) == FAILURE) {
php_printf("PHP: Unable to initialize unicode stream filters.\n");
return FAILURE;
}
/* initialize registry for images to be used in phpinfo()
(this uses configuration parameters from php.ini)
*/
@ -1744,6 +1750,7 @@ void php_module_shutdown(TSRMLS_D)
zend_shutdown(TSRMLS_C);
/* Destroys filter & transport registries too */
php_shutdown_stream_wrappers(module_number TSRMLS_CC);
php_shutdown_info_logos();

View file

@ -206,12 +206,9 @@ struct _php_stream {
php_stream_context *context;
int flags; /* PHP_STREAM_FLAG_XXX */
/* unicode */
UConverter *input_encoding;
UConverter *output_encoding;
/* buffer */
off_t position; /* of underlying stream */
zend_uchar readbuf_type;
zstr readbuf; /* readbuf.s or readbuf.u */
size_t readbuflen; /* Length in units (char or UChar) */
off_t readpos; /* Position in units (char or UChar) */
@ -252,8 +249,6 @@ END_EXTERN_C()
#define php_stream_from_zval_no_verify(xstr, ppzval) (xstr) = (php_stream*)zend_fetch_resource((ppzval) TSRMLS_CC, -1, "stream", NULL, 2, php_file_le_stream(), php_file_le_pstream())
#define PS_ULEN(is_unicode, len) ((is_unicode) ? UBYTES(len) : (len))
#define php_stream_reads_unicode(stream) ((stream->input_encoding) ? 1 : 0)
#define php_stream_writes_unicode(stream) ((stream->output_encoding) ? 1 : 0)
BEGIN_EXTERN_C()
PHPAPI int php_stream_from_persistent_id(const char *persistent_id, php_stream **stream TSRMLS_DC);

View file

@ -396,15 +396,16 @@ PHPAPI void _php_stream_filter_append(php_stream_filter_chain *chain, php_stream
chain->tail = filter;
filter->chain = chain;
if (&(stream->readfilters) == chain && (stream->writepos - stream->readpos) > 0) {
if (&(stream->readfilters) == chain) {
/* Let's go ahead and wind anything in the buffer through this filter */
php_stream_bucket_brigade brig_in = { NULL, NULL }, brig_out = { NULL, NULL };
php_stream_bucket_brigade *brig_inp = &brig_in, *brig_outp = &brig_out;
php_stream_filter_status_t status;
php_stream_filter_status_t status = PSFS_FEED_ME;
php_stream_bucket *bucket;
size_t consumed = 0;
if (stream->input_encoding) {
if ((stream->writepos - stream->readpos) > 0) {
if (stream->readbuf_type == IS_UNICODE) {
bucket = php_stream_bucket_new_unicode(stream, stream->readbuf.u + stream->readpos, stream->writepos - stream->readpos, 0, 0 TSRMLS_CC);
} else {
bucket = php_stream_bucket_new(stream, stream->readbuf.s + stream->readpos, stream->writepos - stream->readpos, 0, 0 TSRMLS_CC);
@ -416,9 +417,9 @@ PHPAPI void _php_stream_filter_append(php_stream_filter_chain *chain, php_stream
/* No behaving filter should cause this. */
status = PSFS_ERR_FATAL;
}
}
switch (status) {
case PSFS_ERR_FATAL:
if (status == PSFS_ERR_FATAL) {
/* If this first cycle simply fails then there's something wrong with the filter.
Pull the filter off the chain and leave the read buffer alone. */
if (chain->head == filter) {
@ -431,15 +432,27 @@ PHPAPI void _php_stream_filter_append(php_stream_filter_chain *chain, php_stream
php_stream_bucket_unlink(bucket TSRMLS_CC);
php_stream_bucket_delref(bucket TSRMLS_CC);
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Filter failed to process pre-buffered data. Not adding to filterchain.");
break;
case PSFS_FEED_ME:
} else {
/* This filter addition may change the readbuffer type.
Since all the previously held data is in the bucket brigade,
we can reappropriate the buffer that already exists (if one does) */
if (stream->readbuf_type == IS_UNICODE && (filter->fops->flags & PSFO_FLAG_OUTPUTS_UNICODE) == 0) {
/* Buffer is currently based on unicode characters, but filter only outputs STRING adjust counting */
stream->readbuf_type = IS_STRING;
stream->readbuflen *= UBYTES(1);
} else if (stream->readbuf_type == IS_STRING && (filter->fops->flags & PSFO_FLAG_OUTPUTS_STRING) == 0) {
/* Buffer is currently based on binary characters, but filter only outputs UNICODE adjust counting */
stream->readbuf_type = IS_UNICODE;
stream->readbuflen /= UBYTES(1);
}
if (status == PSFS_FEED_ME) {
/* We don't actually need data yet,
leave this filter in a feed me state until data is needed.
Reset stream's internal read buffer since the filter is "holding" it. */
stream->readpos = 0;
stream->writepos = 0;
break;
case PSFS_PASS_ON:
} else if (status == PSFS_PASS_ON) {
/* Put any filtered data onto the readbuffer stack.
Previously read data has been at least partially consumed. */
stream->readpos += consumed;
@ -454,23 +467,20 @@ PHPAPI void _php_stream_filter_append(php_stream_filter_chain *chain, php_stream
bucket = brig_outp->head;
/* Convert for stream type */
if (bucket->buf_type != IS_UNICODE && stream->input_encoding) {
/* Stream expects unicode, convert using stream encoding */
php_stream_bucket_convert(bucket, IS_UNICODE, stream->input_encoding);
} else if (bucket->buf_type == IS_UNICODE && !stream->input_encoding) {
/* Stream expects binary, filter provided unicode, just take the buffer as is */
php_stream_bucket_convert_notranscode(bucket, IS_STRING);
if (bucket->buf_type != stream->readbuf_type) {
/* Stream expects different type than bucket contains, convert sloppily */
php_stream_bucket_convert_notranscode(bucket, stream->readbuf_type);
}
/* Grow buffer to hold this bucket if need be.
TODO: See warning in main/stream/streams.c::php_stream_fill_read_buffer */
if (stream->readbuflen - stream->writepos < bucket->buflen) {
stream->readbuflen += bucket->buflen;
stream->readbuf.v = perealloc(stream->readbuf.v, PS_ULEN(stream->input_encoding, stream->readbuflen), stream->is_persistent);
stream->readbuf.v = perealloc(stream->readbuf.v, PS_ULEN(stream->readbuf_type == IS_UNICODE, stream->readbuflen), stream->is_persistent);
}
/* Append to readbuf */
if (stream->input_encoding) {
if (stream->readbuf_type == IS_UNICODE) {
memcpy(stream->readbuf.u + stream->writepos, bucket->buf.u, UBYTES(bucket->buflen));
} else {
memcpy(stream->readbuf.s + stream->writepos, bucket->buf.s, bucket->buflen);
@ -480,10 +490,9 @@ PHPAPI void _php_stream_filter_append(php_stream_filter_chain *chain, php_stream
php_stream_bucket_unlink(bucket TSRMLS_CC);
php_stream_bucket_delref(bucket TSRMLS_CC);
}
break;
}
}
} /* end of readfilters specific code */
}
PHPAPI int _php_stream_filter_check_chain(php_stream_filter_chain *chain TSRMLS_DC)
@ -597,26 +606,23 @@ PHPAPI int _php_stream_filter_flush(php_stream_filter *filter, int finish TSRMLS
/* Dump any newly flushed data to the read buffer */
if (stream->readpos > stream->chunk_size) {
/* Back the buffer up */
memcpy(stream->readbuf.s, stream->readbuf.s + PS_ULEN(stream->input_encoding, stream->readpos), PS_ULEN(stream->input_encoding, stream->writepos - stream->readpos));
memcpy(stream->readbuf.s, stream->readbuf.s + PS_ULEN(stream->readbuf_type == IS_UNICODE, stream->readpos), PS_ULEN(stream->readbuf_type == IS_UNICODE, stream->writepos - stream->readpos));
stream->writepos -= stream->readpos;
stream->readpos = 0;
}
if (flushed_size > (stream->readbuflen - stream->writepos)) {
/* Grow the buffer */
stream->readbuf.v = perealloc(stream->readbuf.v, PS_ULEN(stream->input_encoding, stream->writepos + flushed_size + stream->chunk_size), stream->is_persistent);
stream->readbuf.v = perealloc(stream->readbuf.v, PS_ULEN(stream->readbuf_type == IS_UNICODE, stream->writepos + flushed_size + stream->chunk_size), stream->is_persistent);
}
while ((bucket = inp->head)) {
/* Convert if necessary */
if (bucket->buf_type != IS_UNICODE && stream->input_encoding) {
/* Stream expects unicode, convert using stream encoding */
php_stream_bucket_convert(bucket, IS_UNICODE, stream->input_encoding);
} else if (bucket->buf_type == IS_UNICODE && !stream->input_encoding) {
/* Stream expects binary, filter provided unicode, just take the buffer as is */
php_stream_bucket_convert_notranscode(bucket, IS_STRING);
if (bucket->buf_type != stream->readbuf_type) {
/* Stream expects different type than what's in bucket, convert sloppily */
php_stream_bucket_convert_notranscode(bucket, stream->readbuf_type);
}
/* Append to readbuf */
if (stream->input_encoding) {
if (stream->readbuf_type == IS_UNICODE) {
memcpy(stream->readbuf.u + stream->writepos, bucket->buf.u, UBYTES(bucket->buflen));
} else {
memcpy(stream->readbuf.s + stream->writepos, bucket->buf.s, bucket->buflen);
@ -632,14 +638,9 @@ PHPAPI int _php_stream_filter_flush(php_stream_filter *filter, int finish TSRMLS
while ((bucket = inp->head)) {
/* Convert if necessary */
if (bucket->buf_type == IS_UNICODE) {
if (stream->output_encoding) {
/* Stream has a configured output encoding, convert to appropriate type */
php_stream_bucket_convert(bucket, IS_STRING, stream->output_encoding);
} else {
/* Stream is binary, write ugly UChars as is */
/* Force data to binary, adjusting buflen */
php_stream_bucket_convert_notranscode(bucket, IS_STRING);
}
}
/* Must be binary by this point */
stream->ops->write(stream, bucket->buf.s, bucket->buflen TSRMLS_CC);
@ -654,6 +655,9 @@ PHPAPI int _php_stream_filter_flush(php_stream_filter *filter, int finish TSRMLS
PHPAPI php_stream_filter *php_stream_filter_remove(php_stream_filter *filter, int call_dtor TSRMLS_DC)
{
/* UTODO: Figure out a sane way to "defilter" so that unicode converters can be swapped around
For now, at least fopen(,'b') + stream_encoding($fp, 'charset') works since there's nothing to remove */
if (filter->prev) {
filter->prev->next = filter->next;
} else {
@ -770,6 +774,42 @@ PHPAPI int _php_stream_bucket_convert(php_stream_bucket *bucket, unsigned char t
return FAILURE;
}
/* Create a unicode conversion filter for the given charset and append it to
 * one of the stream's filter chains.
 *
 * stream      stream to receive the filter
 * writechain  non-zero: build "unicode.to.<encoding>" and append it to the
 *             write chain; zero: build "unicode.from.<encoding>" and append
 *             it to the read chain
 * encoding    charset name appended to the filter name; must not be NULL
 * error_mode  passed to the filter as the "error_mode" parameter
 * subst       optional substitution character, passed as "subst_char" when
 *             non-NULL
 *
 * Returns SUCCESS when the filter was created and handed to
 * php_stream_filter_append(), FAILURE (with an E_WARNING) when no encoding
 * was given or the filter could not be created.
 */
PHPAPI int _php_stream_encoding_apply(php_stream *stream, int writechain, const char *encoding, uint16_t error_mode, UChar *subst TSRMLS_DC)
{
	static const char to_prefix[] = "unicode.to.";
	static const char from_prefix[] = "unicode.from.";
	const char *prefix;
	size_t prefix_len, encoding_len;
	char *filtername;
	php_stream_filter *filter;
	zval *filterparams;

	/* strlen(NULL) is undefined behaviour; refuse instead of crashing */
	if (encoding == NULL) {
		php_error_docref(NULL TSRMLS_CC, E_WARNING, "No charset specified for stream encoding");
		return FAILURE;
	}

	if (writechain) {
		prefix = to_prefix;
		prefix_len = sizeof(to_prefix) - 1;
	} else {
		prefix = from_prefix;
		prefix_len = sizeof(from_prefix) - 1;
	}

	/* Filter name is "<prefix><encoding>"; the +1 copies encoding's NUL too */
	encoding_len = strlen(encoding);
	filtername = emalloc(prefix_len + encoding_len + 1);
	memcpy(filtername, prefix, prefix_len);
	memcpy(filtername + prefix_len, encoding, encoding_len + 1);

	ALLOC_INIT_ZVAL(filterparams);
	array_init(filterparams);
	add_assoc_long(filterparams, "error_mode", error_mode);
	if (subst) {
		add_assoc_unicode(filterparams, "subst_char", subst, 1);
	}

	filter = php_stream_filter_create(filtername, filterparams, php_stream_is_persistent(stream) TSRMLS_CC);

	/* Name and parameters are only needed for creation; release them before
	   looking at the result so neither exit path has to */
	efree(filtername);
	zval_ptr_dtor(&filterparams);

	if (!filter) {
		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to apply encoding for charset: %s\n", encoding);
		return FAILURE;
	}

	php_stream_filter_append(writechain ? &stream->writefilters : &stream->readfilters, filter);

	return SUCCESS;
}
/*
* Local variables:
* tab-width: 4

View file

@ -157,6 +157,7 @@ PHPAPI int _php_stream_filter_flush(php_stream_filter *filter, int finish TSRMLS
PHPAPI php_stream_filter *php_stream_filter_remove(php_stream_filter *filter, int call_dtor TSRMLS_DC);
PHPAPI void php_stream_filter_free(php_stream_filter *filter TSRMLS_DC);
PHPAPI php_stream_filter *_php_stream_filter_alloc(php_stream_filter_ops *fops, void *abstract, int persistent STREAMS_DC TSRMLS_DC);
PHPAPI int _php_stream_encoding_apply(php_stream *stream, int writechain, const char *encoding, uint16_t error_mode, UChar *subst TSRMLS_DC);
END_EXTERN_C()
#define php_stream_filter_alloc(fops, thisptr, persistent) _php_stream_filter_alloc((fops), (thisptr), (persistent) STREAMS_CC TSRMLS_CC)
#define php_stream_filter_alloc_rel(fops, thisptr, persistent) _php_stream_filter_alloc((fops), (thisptr), (persistent) STREAMS_REL_CC TSRMLS_CC)
@ -165,6 +166,8 @@ END_EXTERN_C()
#define php_stream_filter_flush(filter, finish) _php_stream_filter_flush((filter), (finish) TSRMLS_CC)
#define php_stream_filter_check_chain(chain) _php_stream_filter_check_chain((chain) TSRMLS_CC)
#define php_stream_filter_output_prefer_unicode(filter) _php_stream_filter_output_prefer_unicode((filter) TSRMLS_CC)
#define php_stream_encoding_apply(stream, writechain, encoding, error_mode, subst) \
_php_stream_encoding_apply((stream), (writechain), (encoding), (error_mode), (subst) TSRMLS_CC)
#define php_stream_is_filtered(stream) ((stream)->readfilters.head || (stream)->writefilters.head)
@ -179,6 +182,12 @@ PHPAPI int php_stream_filter_register_factory_volatile(const char *filterpattern
PHPAPI php_stream_filter *php_stream_filter_create(const char *filtername, zval *filterparams, int persistent TSRMLS_DC);
END_EXTERN_C()
/* unicode_filter.c exports */
extern php_stream_filter_ops php_unicode_to_string_filter_ops;
extern php_stream_filter_ops php_unicode_from_string_filter_ops;
extern php_stream_filter_ops php_unicode_tidy_filter_ops;
extern php_stream_filter_factory php_unicode_filter_factory;
/*
* Local variables:
* tab-width: 4

View file

@ -239,6 +239,7 @@ fprintf(stderr, "stream_alloc: %s:%p persistent=%s\n", ops->label, ret, persiste
ret->abstract = abstract;
ret->is_persistent = persistent_id ? 1 : 0;
ret->chunk_size = FG(def_chunk_size);
ret->readbuf_type = IS_STRING;
if (FG(auto_detect_line_endings)) {
ret->flags |= PHP_STREAM_FLAG_DETECT_EOL;
@ -483,12 +484,9 @@ static void php_stream_fill_read_buffer(php_stream *stream, size_t size TSRMLS_D
* stream read buffer */
while (brig_inp->head) {
bucket = brig_inp->head;
if (bucket->buf_type != IS_UNICODE && stream->input_encoding) {
/* Stream expects unicode, convert using stream encoding */
php_stream_bucket_convert(bucket, IS_UNICODE, stream->input_encoding);
} else if (bucket->buf_type == IS_UNICODE && !stream->input_encoding) {
/* Stream expects binary, filter provided unicode, just take the buffer as is */
php_stream_bucket_convert_notranscode(bucket, IS_STRING);
if (bucket->buf_type != stream->readbuf_type) {
/* Stream expects different datatype than bucket has, convert sloppily */
php_stream_bucket_convert_notranscode(bucket, stream->readbuf_type);
}
/* Bucket type now matches stream type */
@ -496,9 +494,9 @@ static void php_stream_fill_read_buffer(php_stream *stream, size_t size TSRMLS_D
* TODO: this can fail for persistent streams */
if (stream->readbuflen - stream->writepos < bucket->buflen) {
stream->readbuflen += bucket->buflen;
stream->readbuf.v = perealloc(stream->readbuf.v, PS_ULEN(stream->input_encoding, stream->readbuflen), stream->is_persistent);
stream->readbuf.v = perealloc(stream->readbuf.v, PS_ULEN(stream->readbuf_type == IS_UNICODE, stream->readbuflen), stream->is_persistent);
}
memcpy(stream->readbuf.s + stream->writepos, bucket->buf.s, PS_ULEN(stream->input_encoding, bucket->buflen));
memcpy(stream->readbuf.s + stream->writepos, bucket->buf.s, PS_ULEN(stream->readbuf_type == IS_UNICODE, bucket->buflen));
stream->writepos += bucket->buflen;
php_stream_bucket_unlink(bucket TSRMLS_CC);
@ -530,46 +528,6 @@ static void php_stream_fill_read_buffer(php_stream *stream, size_t size TSRMLS_D
}
efree(chunk_buf);
} else if (stream->input_encoding) { /* Unfiltered Unicode stream */
/* is there enough data in the buffer ? */
if (stream->writepos - stream->readpos < (off_t)size) {
char *binbuf;
UChar *ubuf;
int binbuf_len, ubuf_len;
size_t toread = (size > stream->chunk_size) ? size : stream->chunk_size;
UErrorCode status = U_ZERO_ERROR;
/* Read stream data into temporary buffer, then convert to unicode
TODO: This can be improved */
binbuf = emalloc(toread + 1);
binbuf_len = stream->ops->read(stream, binbuf, toread TSRMLS_CC);
if (binbuf_len == (size_t)-1) {
/* Failure */
efree(binbuf);
return;
}
/* Convert to unicode */
zend_convert_to_unicode(stream->input_encoding, &ubuf, &ubuf_len, binbuf, binbuf_len, &status);
efree(binbuf);
/* reduce buffer memory consumption if possible, to avoid a realloc */
if (stream->readbuf.u && stream->readbuflen - stream->writepos < stream->chunk_size) {
memmove(stream->readbuf.u, stream->readbuf.u + stream->readpos, UBYTES(stream->readbuflen - stream->readpos));
stream->writepos -= stream->readpos;
stream->readpos = 0;
}
/* grow the buffer if required
* TODO: this can fail for persistent streams */
if (stream->readbuflen - stream->writepos < ubuf_len) {
stream->readbuflen += ((stream->chunk_size > ubuf_len) ? stream->chunk_size : ubuf_len);
stream->readbuf.u = (UChar*)perealloc(stream->readbuf.u, UBYTES(stream->readbuflen), stream->is_persistent);
}
memcpy(stream->readbuf.u + stream->writepos, ubuf, UBYTES(ubuf_len));
efree(ubuf);
stream->writepos += ubuf_len;
}
} else { /* Unfiltered Binary stream */
/* is there enough data in the buffer ? */
if (stream->writepos - stream->readpos < (off_t)size) {
@ -609,13 +567,13 @@ PHPAPI size_t _php_stream_read(php_stream *stream, char *buf, size_t size TSRMLS
* drain the remainder of the buffer before using the "raw" read mode for
* the excess */
if (stream->writepos - stream->readpos > 0) {
toread = PS_ULEN(stream->input_encoding, stream->writepos - stream->readpos);
toread = PS_ULEN(stream->readbuf_type == IS_UNICODE, stream->writepos - stream->readpos);
if (toread > size) {
toread = size;
}
if (stream->input_encoding) {
if (stream->readbuf_type == IS_UNICODE) {
/* Sloppy read, anyone using php_stream_read() on a unicode stream
* had better know what they're doing */
@ -647,7 +605,7 @@ PHPAPI size_t _php_stream_read(php_stream *stream, char *buf, size_t size TSRMLS
}
if (toread > 0) {
if (php_stream_reads_unicode(stream)) {
if (stream->readbuf_type == IS_UNICODE) {
/* Sloppy read, anyone using php_stream_read() on a unicode stream
* had better know what they're doing */
@ -685,7 +643,7 @@ PHPAPI size_t _php_stream_read_unicode(php_stream *stream, UChar *buf, int size,
{
size_t toread = 0, didread = 0, string_length = 0;
if (!stream->input_encoding) {
if (stream->readbuf_type != IS_UNICODE) {
return -1;
}
@ -763,7 +721,7 @@ PHPAPI UChar *_php_stream_read_unicode_chars(php_stream *stream, int *pchars TSR
int buflen = size;
size_t toread = 0, didread = 0, string_length = 0;
if (!stream->input_encoding) {
if (stream->readbuf_type != IS_UNICODE) {
return NULL;
}
@ -921,7 +879,7 @@ PHPAPI void *php_stream_locate_eol(php_stream *stream, zstr zbuf, int buf_len TS
char *readptr, *buf = zbuf.s;
if (!buf) {
readptr = stream->readbuf.s + PS_ULEN(stream->input_encoding, stream->readpos);
readptr = stream->readbuf.s + PS_ULEN(stream->readbuf_type == IS_UNICODE, stream->readpos);
avail = stream->writepos - stream->readpos;
} else {
readptr = zbuf.s;
@ -929,7 +887,7 @@ PHPAPI void *php_stream_locate_eol(php_stream *stream, zstr zbuf, int buf_len TS
}
if (stream->flags & PHP_STREAM_FLAG_DETECT_EOL) {
if (stream->input_encoding) {
if (stream->readbuf_type == IS_UNICODE) {
cr = (char*)u_memchr((UChar*)readptr, '\r', avail);
lf = (char*)u_memchr((UChar*)readptr, '\n', avail);
} else {
@ -948,10 +906,10 @@ PHPAPI void *php_stream_locate_eol(php_stream *stream, zstr zbuf, int buf_len TS
eol = lf;
}
} else if (stream->flags & PHP_STREAM_FLAG_EOL_MAC) {
eol = stream->input_encoding ? u_memchr((UChar*)readptr, '\r', avail) : memchr(readptr, '\r', avail);
eol = (stream->readbuf_type == IS_UNICODE) ? u_memchr((UChar*)readptr, '\r', avail) : memchr(readptr, '\r', avail);
} else {
/* unix (and dos) line endings */
eol = stream->input_encoding ? u_memchr((UChar*)readptr, '\n', avail) : memchr(readptr, '\n', avail);
eol = (stream->readbuf_type == IS_UNICODE) ? u_memchr((UChar*)readptr, '\n', avail) : memchr(readptr, '\n', avail);
}
return (void*)eol;
@ -967,7 +925,7 @@ PHPAPI void *_php_stream_get_line(php_stream *stream, int buf_type, zstr buf, si
size_t current_buf_size = 0;
size_t total_copied = 0;
int grow_mode = 0;
int is_unicode = php_stream_reads_unicode(stream);
int is_unicode = stream->readbuf_type == IS_UNICODE;
int split_surrogate = 0;
zstr bufstart = buf;
@ -1042,8 +1000,8 @@ PHPAPI void *_php_stream_get_line(php_stream *stream, int buf_type, zstr buf, si
* than 8K, we waste 1 byte per additional 8K or so.
* That seems acceptable to me, to avoid making this code
* hard to follow */
bufstart.s = erealloc(bufstart.s, PS_ULEN(stream->input_encoding, current_buf_size + cpysz + 1));
buf.s = bufstart.s + PS_ULEN(stream->input_encoding, total_copied);
bufstart.s = erealloc(bufstart.s, PS_ULEN(stream->readbuf_type == IS_UNICODE, current_buf_size + cpysz + 1));
buf.s = bufstart.s + PS_ULEN(stream->readbuf_type == IS_UNICODE, total_copied);
current_buf_size += cpysz + 1;
} else {
if (cpysz >= maxlen - 1) {
@ -1177,7 +1135,7 @@ PHPAPI UChar *php_stream_get_record_unicode(php_stream *stream, size_t maxlen, s
size_t toread;
int skip = 0;
if (!php_stream_reads_unicode(stream)) {
if (stream->readbuf_type != IS_UNICODE) {
return NULL;
}
@ -1241,8 +1199,7 @@ PHPAPI UChar *php_stream_get_record_unicode(php_stream *stream, size_t maxlen, s
/* Writes a buffer directly to a stream, using multiple of the chunk size */
static size_t _php_stream_write_buffer(php_stream *stream, int buf_type, zstr buf, int buflen TSRMLS_DC)
{
size_t didwrite = 0, towrite, justwrote, shouldwrite, buflen_orig = buflen;
zstr buf_orig = buf;
size_t didwrite = 0, towrite, justwrote, shouldwrite;
char *freeme = NULL;
/* if we have a seekable stream we need to ensure that data is written at the
@ -1254,25 +1211,10 @@ static size_t _php_stream_write_buffer(php_stream *stream, int buf_type, zstr bu
stream->ops->seek(stream, stream->position, SEEK_SET, &stream->position TSRMLS_CC);
}
if (stream->output_encoding && buf_type == IS_UNICODE) {
char *dest;
int destlen, num_conv;
UErrorCode status = U_ZERO_ERROR;
num_conv = zend_convert_from_unicode(stream->output_encoding, &dest, &destlen, buf.u, buflen, &status);
if (U_FAILURE(status)) {
int32_t offset = u_countChar32(buf.u, num_conv);
zend_raise_conversion_error_ex("Could not convert Unicode string to binary string", stream->output_encoding, ZEND_FROM_UNICODE, offset, (UG(from_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC);
}
freeme = buf.s = dest;
buflen = destlen;
} else {
/* Sloppy handling, make it a binary buffer */
if (buf_type != IS_STRING) {
if (buf_type == IS_UNICODE) {
buflen = UBYTES(buflen);
}
}
shouldwrite = buflen;
@ -1300,32 +1242,7 @@ static size_t _php_stream_write_buffer(php_stream *stream, int buf_type, zstr bu
}
}
if (stream->output_encoding) {
/* Map didwrite back to the original character count */
if (didwrite == shouldwrite) {
/* Everything wrote okay, no need to count */
didwrite = buflen_orig;
} else {
UErrorCode status = U_ZERO_ERROR;
char *t = freeme;
const UChar *p = buf_orig.u;
switch (ucnv_getType(stream->output_encoding)) {
case UCNV_SBCS:
case UCNV_LATIN_1:
case UCNV_US_ASCII:
/* 1:1 character->byte mapping, didwrite really does mean the number of characters written */
break;
default:
/* Reconvert into junk buffer to see where conversion stops in source string */
ucnv_resetFromUnicode(stream->output_encoding);
ucnv_fromUnicode(stream->output_encoding, &t, t + didwrite, &p, p + buflen_orig, NULL, TRUE, &status);
/* p stops at the first unconvertable UChar when t runs out of space */
didwrite = p - buf_orig.u;
}
}
} else if (buf_type == IS_UNICODE) {
if (buf_type == IS_UNICODE) {
/* Was slopily converted */
didwrite /= UBYTES(1);
}
@ -2274,50 +2191,15 @@ PHPAPI php_stream *_php_stream_open_wrapper_ex(char *path, char *mode, int optio
if (stream && strchr(implicit_mode, 't') && UG(unicode)) {
if (strchr(implicit_mode, 'w') || strchr(implicit_mode, 'a') || strchr(implicit_mode, '+')) {
char *encoding = (context && context->output_encoding) ? context->output_encoding : "utf8";
UErrorCode status = U_ZERO_ERROR;
stream->output_encoding = ucnv_open(encoding, &status);
if (U_FAILURE(status)) {
switch (status) {
case U_MEMORY_ALLOCATION_ERROR:
php_stream_wrapper_log_error(wrapper, options ^ REPORT_ERRORS TSRMLS_CC,
"Unable to allocate memory for unicode output converter: %s", encoding);
break;
case U_FILE_ACCESS_ERROR:
php_stream_wrapper_log_error(wrapper, options ^ REPORT_ERRORS TSRMLS_CC,
"Error loading unicode output converter: %s", encoding);
break;
default:
php_stream_wrapper_log_error(wrapper, options ^ REPORT_ERRORS TSRMLS_CC,
"Unknown error starting unicode output converter: %s", encoding);
}
} else {
/* UTODO: (Maybe?) Allow overriding the default error handlers on a per-stream basis via context params */
zend_set_converter_error_mode(stream->output_encoding, ZEND_FROM_UNICODE, UG(from_error_mode));
zend_set_converter_subst_char(stream->output_encoding, UG(from_subst_char));
}
php_stream_encoding_apply(stream, 1, encoding, UG(from_error_mode), UG(from_subst_char));
}
if (strchr(implicit_mode, 'r') || strchr(implicit_mode, '+')) {
char *encoding = (context && context->input_encoding) ? context->input_encoding : "utf8";
UErrorCode status = U_ZERO_ERROR;
stream->input_encoding = ucnv_open(encoding, &status);
if (U_FAILURE(status)) {
switch (status) {
case U_MEMORY_ALLOCATION_ERROR:
php_stream_wrapper_log_error(wrapper, options ^ REPORT_ERRORS TSRMLS_CC,
"Unable to allocate memory for unicode input converter: %s", encoding);
break;
case U_FILE_ACCESS_ERROR:
php_stream_wrapper_log_error(wrapper, options ^ REPORT_ERRORS TSRMLS_CC,
"Error loading unicode input converter: %s", encoding);
break;
default:
php_stream_wrapper_log_error(wrapper, options ^ REPORT_ERRORS TSRMLS_CC,
"Unknown error starting unicode input converter: %s", encoding);
}
}
/* UTODO: If/When Input error handling gets implemented, set the options on success */
/* UTODO: (Maybe?) Allow overriding the default error handlers on a per-stream basis via context params */
php_stream_encoding_apply(stream, 0, encoding, UG(to_error_mode), NULL);
}
}
@ -2334,6 +2216,7 @@ PHPAPI php_stream *_php_stream_open_wrapper_ex(char *path, char *mode, int optio
pefree(copy_of_path, persistent);
}
#endif
return stream;
}
/* }}} */

View file

@ -74,6 +74,7 @@ static php_stream_filter_status_t php_unicode_to_string_filter(
destp = destbuf = (char *)pemalloc(destlen, data->is_persistent);
ucnv_fromUnicode(data->conv, &destp, destbuf + destlen, (const UChar**)&src, src + remaining, NULL, FALSE, &errCode);
/* UTODO: Error catching */
new_bucket = php_stream_bucket_new(stream, destbuf, destp - destbuf, 1, data->is_persistent TSRMLS_CC);
php_stream_bucket_append(buckets_out, new_bucket TSRMLS_CC);
exit_status = PSFS_PASS_ON;
@ -88,6 +89,7 @@ static php_stream_filter_status_t php_unicode_to_string_filter(
/* Spit it out! */
ucnv_fromUnicode(data->conv, &dest, destp, NULL, NULL, NULL, TRUE, &errCode);
/* UTODO: Error catching */
if (dest > d) {
php_stream_bucket *bucket = php_stream_bucket_new(stream, d, dest - d, 0, 0 TSRMLS_CC);
php_stream_bucket_append(buckets_out, bucket TSRMLS_CC);
@ -145,6 +147,7 @@ static php_stream_filter_status_t php_unicode_from_string_filter(
destp = destbuf = (UChar *)pemalloc(destlen, data->is_persistent);
ucnv_toUnicode(data->conv, &destp, (UChar*)((char*)destbuf + destlen), (const char**)&src, src + remaining, NULL, FALSE, &errCode);
/* UTODO: Error catching */
new_bucket = php_stream_bucket_new_unicode(stream, destbuf, destp - destbuf, 1, data->is_persistent TSRMLS_CC);
php_stream_bucket_append(buckets_out, new_bucket TSRMLS_CC);
@ -160,6 +163,7 @@ static php_stream_filter_status_t php_unicode_from_string_filter(
/* Spit it out! */
ucnv_toUnicode(data->conv, &dest, destp, NULL, NULL, NULL, TRUE, &errCode);
/* UTODO: Error catching */
if (dest > d) {
php_stream_bucket *bucket = php_stream_bucket_new_unicode(stream, d, dest - d, 0, 0 TSRMLS_CC);
php_stream_bucket_append(buckets_out, bucket TSRMLS_CC);
@ -220,21 +224,21 @@ static void php_unicode_filter_dtor(php_stream_filter *thisfilter TSRMLS_DC)
}
}
static php_stream_filter_ops php_unicode_to_string_filter_ops = {
php_stream_filter_ops php_unicode_to_string_filter_ops = {
php_unicode_to_string_filter,
php_unicode_filter_dtor,
"unicode.to.*",
PSFO_FLAG_ACCEPTS_UNICODE | PSFO_FLAG_OUTPUTS_STRING
};
static php_stream_filter_ops php_unicode_from_string_filter_ops = {
php_stream_filter_ops php_unicode_from_string_filter_ops = {
php_unicode_from_string_filter,
php_unicode_filter_dtor,
"unicode.from.*",
PSFO_FLAG_ACCEPTS_STRING | PSFO_FLAG_OUTPUTS_UNICODE
};
static php_stream_filter_ops php_unicode_tidy_filter_ops = {
php_stream_filter_ops php_unicode_tidy_filter_ops = {
php_unicode_tidy_filter,
php_unicode_filter_dtor,
"unicode.tidy.*",
@ -251,7 +255,10 @@ static php_stream_filter *php_unicode_filter_create(const char *filtername, zval
const char *charset, *direction;
php_stream_filter_ops *fops;
UErrorCode ucnvError = U_ZERO_ERROR;
/* Note: from_error_mode applies when converting from unicode to a charset, whereas a "from" filter converts from a charset to unicode (and so uses to_error_mode) */
uint16_t err_mode = UG(from_error_mode);
char to_unicode = 0;
zval **tmpzval;
if (strncasecmp(filtername, "unicode.", sizeof("unicode.") - 1)) {
/* Never happens */
@ -264,8 +271,9 @@ static php_stream_filter *php_unicode_filter_create(const char *filtername, zval
charset = direction + sizeof("to.") - 1;
} else if (strncmp(direction, "from.", sizeof("from.") - 1) == 0) {
fops = &php_unicode_from_string_filter_ops;
to_unicode = 1;
charset = direction + sizeof("from.") - 1;
to_unicode = 1;
err_mode = UG(to_error_mode);
} else if (strncmp(direction, "tidy.", sizeof("tidy.") - 1) == 0) {
fops = &php_unicode_tidy_filter_ops;
charset = direction + sizeof("tidy.") - 1;
@ -303,6 +311,46 @@ static php_stream_filter *php_unicode_filter_create(const char *filtername, zval
return NULL;
}
if (filterparams &&
Z_TYPE_P(filterparams) == IS_ARRAY &&
zend_hash_find(Z_ARRVAL_P(filterparams), "error_mode", sizeof("error_mode"), (void**)&tmpzval) == SUCCESS &&
tmpzval && *tmpzval) {
if (Z_TYPE_PP(tmpzval) == IS_LONG) {
err_mode = Z_LVAL_PP(tmpzval);
} else {
zval copyval = **tmpzval;
zval_copy_ctor(&copyval);
convert_to_long(&copyval);
err_mode = Z_LVAL(copyval);
}
}
zend_set_converter_error_mode(data->conv, to_unicode ? ZEND_TO_UNICODE : ZEND_FROM_UNICODE, err_mode);
if (!to_unicode) {
UChar *freeme = NULL;
UChar *subst_char = UG(from_subst_char);
if (filterparams &&
Z_TYPE_P(filterparams) == IS_ARRAY &&
zend_hash_find(Z_ARRVAL_P(filterparams), "subst_char", sizeof("subst_char"), (void**)&tmpzval) == SUCCESS &&
tmpzval && *tmpzval) {
if (Z_TYPE_PP(tmpzval) == IS_UNICODE) {
subst_char = Z_USTRVAL_PP(tmpzval);
} else {
zval copyval = **tmpzval;
zval_copy_ctor(&copyval);
convert_to_unicode(&copyval);
subst_char = freeme = Z_USTRVAL(copyval);
}
}
zend_set_converter_subst_char(data->conv, subst_char);
if (freeme) {
efree(freeme);
}
}
return php_stream_filter_alloc(fops, data, persistent);
}

View file

@ -279,7 +279,7 @@ ADD_SOURCES("main", "main.c snprintf.c spprintf.c fopen_wrappers.c \
php_open_temporary_file.c php_logos.c output.c internal_functions.c php_sprintf.c");
ADD_SOURCES("main/streams", "streams.c cast.c memory.c filter.c plain_wrapper.c \
userspace.c transports.c xp_socket.c mmap.c");
userspace.c transports.c xp_socket.c mmap.c unicode_filter.c");
ADD_SOURCES("win32", "crypt_win32.c flock.c glob.c md5crypt.c pwd.c readdir.c \
registry.c select.c sendmail.c time.c wfile.c winutil.c wsyslog.c globals.c");