Added conversion support from script character encoding to internal character encoding. This feature is very useful for japanese who uses Shift_JIS encoding because some of characters in Shift_JIS are including '0x5c' and it causes some troubles on Zend parser. This patch is made by Masaki Fujimoto.

This commit is contained in:
Rui Hirokawa 2002-05-08 12:33:44 +00:00
parent 2b5beee5ad
commit f30b722f14
6 changed files with 295 additions and 4 deletions

View file

@ -685,12 +685,12 @@ static mbfl_encoding mbfl_encoding_2022jp = {
#if defined(HAVE_MBSTR_CN)
static const char *mbfl_encoding_euc_cn_aliases[] = {"EUC_CN", "eucCN", "x-euc-cn", NULL};
static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", NULL};
static mbfl_encoding mbfl_encoding_euc_cn = {
mbfl_no_encoding_euc_cn,
"EUC-CN",
"EUC-CN",
"CN-GB",
(const char *(*)[])&mbfl_encoding_euc_cn_aliases,
mblen_table_euccn,
MBFL_ENCTYPE_MBCS
@ -721,12 +721,12 @@ static mbfl_encoding mbfl_encoding_euc_tw = {
MBFL_ENCTYPE_MBCS
};
static const char *mbfl_encoding_big5_aliases[] = {"big5", "CP950", NULL};
static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
static mbfl_encoding mbfl_encoding_big5 = {
mbfl_no_encoding_big5,
"BIG-5",
"BIG-5",
"CN-BIG5",
(const char *(*)[])&mbfl_encoding_big5_aliases,
mblen_table_big5,
MBFL_ENCTYPE_MBCS
@ -6995,7 +6995,53 @@ mbfl_strlen(mbfl_string *string TSRMLS_DC)
return len;
}
#ifdef ZEND_MULTIBYTE
/*
* oddlen
*/
int
mbfl_oddlen(mbfl_string *string)
{
int len, n, m, k;
unsigned char *p;
const unsigned char *mbtab;
mbfl_encoding *encoding;
encoding = mbfl_no2encoding(string->no_encoding);
if (encoding == NULL || string == NULL) {
return -1;
}
len = 0;
if (encoding->flag & MBFL_ENCTYPE_SBCS) {
return 0;
} else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
return len % 2;
} else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
return len % 4;
} else if (encoding->mblen_table != NULL) {
mbtab = encoding->mblen_table;
n = 0;
p = string->val;
k = string->len;
/* count */
if (p != NULL) {
while (n < k) {
m = mbtab[*p];
n += m;
p += m;
};
}
return n-k;
} else {
/* how can i do ? */
return 0;
}
/* NOT REACHED */
}
#endif /* ZEND_MULTIBYTE */
/*
* strpos
*/

View file

@ -461,6 +461,14 @@ mbfl_identify_encoding_no(mbfl_string *string, enum mbfl_no_encoding *elist, int
int
mbfl_strlen(mbfl_string *string TSRMLS_DC);
#ifdef ZEND_MULTIBYTE
/*
* oddlen
*/
int
mbfl_oddlen(mbfl_string *string);
#endif /* ZEND_MULTIBYTE */
/*
* strpos
*/

View file

@ -65,6 +65,10 @@
#include "php_content_types.h"
#include "SAPI.h"
#ifdef ZEND_MULTIBYTE
#include "zend_multibyte.h"
#endif /* ZEND_MULTIBYTE */
#if HAVE_MBSTRING
#if HAVE_MBREGEX
@ -524,6 +528,25 @@ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
return SUCCESS;
}
#ifdef ZEND_MULTIBYTE
static PHP_INI_MH(OnUpdate_mbstring_script_encoding)
{
int *list, size;
if (php_mbstring_parse_encoding_list(new_value, new_value_length, &list, &size, 1)) {
if (MBSTRG(script_encoding_list) != NULL) {
free(MBSTRG(script_encoding_list));
}
MBSTRG(script_encoding_list) = list;
MBSTRG(script_encoding_list_size) = size;
} else {
return FAILURE;
}
return SUCCESS;
}
#endif /* ZEND_MULTIBYTE */
static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
{
if (new_value != NULL) {
@ -546,6 +569,9 @@ PHP_INI_BEGIN()
PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding)
#ifdef ZEND_MULTIBYTE
PHP_INI_ENTRY("mbstring.script_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_script_encoding)
#endif /* ZEND_MULTIBYTE */
PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
STD_PHP_INI_ENTRY("mbstring.func_overload", "0", PHP_INI_SYSTEM, OnUpdateInt, func_overload, zend_mbstring_globals, mbstring_globals)
PHP_INI_END()
@ -579,6 +605,10 @@ php_mbstring_init_globals(zend_mbstring_globals *pglobals TSRMLS_DC)
MBSTRG(internal_encoding) = mbfl_no_encoding_euc_jp;
MBSTRG(current_internal_encoding) = mbfl_no_encoding_euc_jp;
#endif
#ifdef ZEND_MULTIBYTE
MBSTRG(script_encoding_list) = NULL;
MBSTRG(script_encoding_list_size) = 0;
#endif /* ZEND_MULTIBYTE */
MBSTRG(http_output_encoding) = mbfl_no_encoding_pass;
MBSTRG(current_http_output_encoding) = mbfl_no_encoding_pass;
MBSTRG(http_input_identify) = mbfl_no_encoding_invalid;
@ -640,6 +670,11 @@ PHP_MSHUTDOWN_FUNCTION(mbstring)
if (MBSTRG(http_input_list)) {
free(MBSTRG(http_input_list));
}
#ifdef ZEND_MULTIBYTE
if (MBSTRG(script_encoding_list)) {
free(MBSTRG(script_encoding_list));
}
#endif /* ZEND_MULTIBYTE */
if (MBSTRG(detect_order_list)) {
free(MBSTRG(detect_order_list));
}
@ -858,6 +893,9 @@ PHP_FUNCTION(mb_internal_encoding)
RETURN_FALSE;
} else {
MBSTRG(current_internal_encoding) = no_encoding;
#ifdef ZEND_MULTIBYTE
zend_multibyte_set_internal_encoding(Z_STRVAL_PP(arg1), Z_STRLEN_PP(arg1) TSRMLS_CC);
#endif /* ZEND_MULTIBYTE */
RETURN_TRUE;
}
} else {
@ -3174,6 +3212,175 @@ PHP_FUNCTION(mb_get_info)
}
/* }}} */
#ifdef ZEND_MULTIBYTE
PHPAPI int php_mbstring_set_zend_encoding(TSRMLS_D)
{
/* 'd better use mbfl_memory_device? */
char *name, *list = NULL;
int n, *entry, list_size = 0;
zend_encoding_detector encoding_detector;
zend_encoding_converter encoding_converter;
zend_multibyte_oddlen multibyte_oddlen;
/* notify script encoding to Zend Engine */
entry = MBSTRG(script_encoding_list);
n = MBSTRG(script_encoding_list_size);
while (n > 0) {
name = (char *)mbfl_no_encoding2name(*entry);
if (name) {
list_size += strlen(name) + 1;
if (!list)
{
list = (char*)emalloc(list_size);
if (!list)
return -1;
*list = (char)NULL;
}
else
{
list = (char*)erealloc(list, list_size);
if (!list)
return -1;
strcat(list, ",");
}
strcat(list, name);
}
entry++;
n--;
}
zend_multibyte_set_script_encoding(list, (list ? strlen(list) : 0) TSRMLS_CC);
if (list)
efree(list);
encoding_detector = php_mbstring_encoding_detector;
encoding_converter = NULL;
multibyte_oddlen = php_mbstring_oddlen;
#if defined(MBSTR_ENC_TRANS)
/* notify internal encoding to Zend Engine */
name = (char*)mbfl_no_encoding2name(MBSTRG(current_internal_encoding));
zend_multibyte_set_internal_encoding(name, strlen(name) TSRMLS_CC);
encoding_converter = php_mbstring_encoding_converter;
#endif /* defined(MBSTR_ENC_TRANS) */
zend_multibyte_set_functions(encoding_detector, encoding_converter,
multibyte_oddlen TSRMLS_CC);
return 0;
}
/*
* mb_detect_encoding (interface for Zend Engine)
*/
char* php_mbstring_encoding_detector(char *arg_string, int arg_length, char *arg_list TSRMLS_DC)
{
mbfl_string string;
const char *ret;
enum mbfl_no_encoding *elist;
int size, *list;
/* make encoding list */
list = NULL;
size = 0;
php_mbstring_parse_encoding_list(arg_list, strlen(arg_list), &list, &size, 0);
if (size <= 0)
return NULL;
if (size > 0 && list != NULL) {
elist = list;
} else {
elist = MBSTRG(current_detect_order_list);
size = MBSTRG(current_detect_order_list_size);
}
mbfl_string_init(&string);
string.no_language = MBSTRG(current_language);
string.val = arg_string;
string.len = arg_length;
ret = mbfl_identify_encoding_name(&string, elist, size);
if (list != NULL) {
efree((void *)list);
}
if (ret != NULL) {
return estrdup(ret);
} else {
return NULL;
}
}
/*
* mb_convert_encoding (interface for Zend Engine)
*/
int php_mbstring_encoding_converter(char **to, int *to_length, char *from,
int from_length, const char *encoding_to, const char *encoding_from
TSRMLS_DC)
{
mbfl_string string, result, *ret;
enum mbfl_no_encoding from_encoding, to_encoding;
mbfl_buffer_converter *convd;
/* new encoding */
to_encoding = mbfl_name2no_encoding(encoding_to);
if (to_encoding == mbfl_no_encoding_invalid)
return -1;
/* old encoding */
from_encoding = mbfl_name2no_encoding(encoding_from);
if (from_encoding == mbfl_no_encoding_invalid)
return -1;
/* initialize string */
mbfl_string_init(&string);
mbfl_string_init(&result);
string.no_encoding = from_encoding;
string.no_language = MBSTRG(current_language);
string.val = from;
string.len = from_length;
/* initialize converter */
convd = mbfl_buffer_converter_new(from_encoding, to_encoding, string.len);
if (convd == NULL)
return -1;
mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
/* do it */
ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
if (ret != NULL) {
*to = ret->val;
*to_length = ret->len;
}
mbfl_buffer_converter_delete(convd);
return ret ? 0 : -1;
}
/*
* returns number of odd (e.g. appears only first byte of multibyte
* character) chars
*/
int php_mbstring_oddlen(char *string, int length, const char *encoding TSRMLS_DC)
{
mbfl_string mb_string;
mbfl_string_init(&mb_string);
mb_string.no_language = MBSTRG(current_language);
mb_string.no_encoding = mbfl_name2no_encoding(encoding);
mb_string.val = string;
mb_string.len = length;
if(mb_string.no_encoding == mbfl_no_encoding_invalid)
return 0;
return mbfl_oddlen(&mb_string);
}
#endif /* ZEND_MULTIBYTE */
#endif /* HAVE_MBSTRING */
/*

View file

@ -129,6 +129,10 @@ ZEND_BEGIN_MODULE_GLOBALS(mbstring)
int current_language;
int internal_encoding;
int current_internal_encoding;
#ifdef ZEND_MULTIBYTE
int *script_encoding_list;
int script_encoding_list_size;
#endif /* ZEND_MULTIBYTE */
int http_output_encoding;
int current_http_output_encoding;
int http_input_identify;
@ -177,6 +181,16 @@ struct mb_overload_def {
#define MBSTRG(v) (mbstring_globals.v)
#endif
#ifdef ZEND_MULTIBYTE
PHPAPI int php_mbstring_set_zend_encoding(TSRMLS_D);
char* php_mbstring_encoding_detector(char *string, int length, char *list
TSRMLS_DC);
int php_mbstring_encoding_converter(char **to, int *to_length, char *from,
int from_length, const char *encoding_to, const char *encoding_from
TSRMLS_DC);
int php_mbstring_oddlen(char *string, int length, const char *encoding TSRMLS_DC);
#endif /* ZEND_MULTIBYTE */
#else /* HAVE_MBSTRING */
#define mbstring_module_ptr NULL

View file

@ -75,6 +75,10 @@
#include "php_logos.h"
#include "php_streams.h"
#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
#include "ext/mbstring/mbstring.h"
#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
#include "SAPI.h"
/* }}} */
@ -1402,6 +1406,9 @@ PHPAPI int php_execute_script(zend_file_handle *primary_file TSRMLS_DC)
} else {
append_file_p = NULL;
}
#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
php_mbstring_set_zend_encoding(TSRMLS_C);
#endif /* ZEND_MULTIBYTE && HAVE_MBSTRING */
retval = (zend_execute_scripts(ZEND_REQUIRE TSRMLS_CC, NULL, 3, prepend_file_p, primary_file, append_file_p) == SUCCESS);
} zend_end_try();

View file

@ -21,6 +21,10 @@
#include "php_apache_http.h"
#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
#include "ext/mbstring/mbstring.h"
#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
#undef shutdown
/* {{{ Prototypes
@ -459,6 +463,11 @@ static int send_php(request_rec *r, int display_source_mode, char *filename)
fh.opened_path = NULL;
fh.free_filename = 0;
fh.type = ZEND_HANDLE_FILENAME;
#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
php_mbstring_set_zend_encoding(TSRMLS_C);
#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
zend_execute_scripts(ZEND_INCLUDE TSRMLS_CC, NULL, 1, &fh);
return OK;
}