Fix fgetcsv() to correctly support international characters

# note: mblen() is not a mbstring function, but is part of the ANSI standard
# which is even supported by Microsoft's libc.
This commit is contained in:
Moriyoshi Koizumi 2003-10-04 02:51:19 +00:00
parent 96e0010f13
commit 921e5b47c5
4 changed files with 279 additions and 101 deletions

View file

@ -24,6 +24,10 @@
#include <sys/stat.h> #include <sys/stat.h>
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#include "zend_highlight.h" #include "zend_highlight.h"
#include "url_scanner.h" #include "url_scanner.h"
@ -199,6 +203,11 @@ typedef struct _php_basic_globals {
#endif #endif
HashTable *user_filter_map; HashTable *user_filter_map;
/* file.c */
#if defined(_REENTRANT) && defined(HAVE_MBRLEN) && defined(HAVE_MBSTATE_T)
mbstate_t mblen_state;
#endif
} php_basic_globals; } php_basic_globals;
#ifdef ZTS #ifdef ZTS

View file

@ -297,6 +297,19 @@ PHP_CHECK_FUNC(res_nsend, resolv, bind, socket)
PHP_CHECK_FUNC(dn_expand, resolv, bind, socket) PHP_CHECK_FUNC(dn_expand, resolv, bind, socket)
dnl already done PHP_CHECK_FUNC(dn_skipname, resolv, bind, socket) dnl already done PHP_CHECK_FUNC(dn_skipname, resolv, bind, socket)
AC_CHECK_HEADERS([wchar.h])
AC_CHECK_FUNCS([mblen])
AC_CHECK_FUNCS([mbrlen mbsinit],,,[
#ifdef HAVE_WCHAR_H
# include <wchar.h>
#endif
])
AC_CHECK_TYPES([mbstate_t],,,[
#ifdef HAVE_WCHAR_H
# include <wchar.h>
#endif
])
PHP_NEW_EXTENSION(standard, array.c base64.c basic_functions.c browscap.c crc32.c crypt.c \ PHP_NEW_EXTENSION(standard, array.c base64.c basic_functions.c browscap.c crc32.c crypt.c \
cyr_convert.c datetime.c dir.c dl.c dns.c exec.c file.c filestat.c \ cyr_convert.c datetime.c dir.c dl.c dns.c exec.c file.c filestat.c \
flock_compat.c formatted_print.c fsock.c head.c html.c image.c \ flock_compat.c formatted_print.c fsock.c head.c html.c image.c \

View file

@ -114,6 +114,10 @@ php_file_globals file_globals;
#include <fnmatch.h> #include <fnmatch.h>
#endif #endif
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
/* }}} */ /* }}} */
/* {{{ ZTS-stuff / Globals / Prototypes */ /* {{{ ZTS-stuff / Globals / Prototypes */
@ -1706,18 +1710,59 @@ PHPAPI PHP_FUNCTION(fread)
} }
/* }}} */ /* }}} */
/*
 * _php_mblen(ptr, len)
 *
 * Yields the byte length of the multibyte character starting at `ptr`,
 * examining at most `len` bytes, following the return convention of
 * mblen()/mbrlen() (negative values signal an invalid or incomplete
 * sequence).
 *
 * Calling it as _php_mblen(NULL, 0) resets the conversion state so that the
 * next scan starts from the initial shift state.
 *
 * Three variants are selected at build time:
 *  - no mblen() available: every character is treated as one byte wide;
 *  - reentrant build with mbrlen()/mbstate_t: the state lives in
 *    BG(mblen_state). The reset is done by zero-filling that object;
 *    mbsinit() must not be used for this because it only *tests* whether a
 *    state is initial and never modifies it. In this variant the reset form
 *    evaluates to 0.
 *  - otherwise: plain mblen(), whose NULL form resets its internal state.
 */
#ifndef HAVE_MBLEN
# define _php_mblen(ptr, len) 1
#else
# if defined(_REENTRANT) && defined(HAVE_MBRLEN) && defined(HAVE_MBSTATE_T)
#  define _php_mblen(ptr, len) ((ptr) == NULL ? (memset(&BG(mblen_state), 0, sizeof(BG(mblen_state))), 0) : (int)mbrlen((ptr), (len), &BG(mblen_state)))
# else
#  define _php_mblen(ptr, len) mblen((ptr), (len))
# endif
#endif
/*
 * Locate the start of the trailing whitespace run of a buffer.
 *
 * Walks `ptr[0 .. len)` one (possibly multibyte) character at a time using
 * _php_mblen() and keeps a count of the consecutive single-byte whitespace
 * characters seen most recently. A byte equal to `delimiter` is never
 * counted as whitespace, even if isspace() would accept it.
 *
 * ptr        start of the buffer to scan
 * len        number of bytes available at ptr
 * delimiter  field delimiter; excluded from whitespace stripping
 *
 * Returns a pointer to the first byte of the trailing whitespace run, or to
 * the position where scanning stopped when there is no such run. Scanning
 * stops when `len` bytes have been consumed or when _php_mblen() reports 0.
 */
static const char *php_fgetcsv_lookup_trailing_spaces(const char *ptr, size_t len, const char delimiter TSRMLS_DC)
{
	int inc_len;
	size_t cnt = 0; /* length of the whitespace run ending at ptr */

	while (len > 0) {
		switch ((inc_len = _php_mblen(ptr, len))) {
			case -2:
			case -1:
				/* Invalid or truncated sequence: step over one raw byte.
				 * That byte is not whitespace, so it terminates any run
				 * counted so far; without clearing cnt the result would
				 * point into non-whitespace data. Also reset the
				 * conversion state, as the other scanning loops do. */
				inc_len = 1;
				cnt = 0;
				_php_mblen(NULL, 0);
				break;

			case 0:
				goto quit_loop;

			case 1:
				if (delimiter != *ptr && isspace((int)*(const unsigned char *)ptr)) {
					cnt++;
					break;
				}
				/* break is omitted intentionally */

			default:
				/* any non-whitespace character ends the current run */
				cnt = 0;
				break;
		}
		ptr += inc_len;
		len -= inc_len;
	}

quit_loop:
	return ptr - cnt;
}
/* {{{ proto array fgetcsv(resource fp, int length [, string delimiter [, string enclosure]]) /* {{{ proto array fgetcsv(resource fp, int length [, string delimiter [, string enclosure]])
Get line from file pointer and parse for CSV fields */ Get line from file pointer and parse for CSV fields */
PHP_FUNCTION(fgetcsv) PHP_FUNCTION(fgetcsv)
{ {
char *temp, *tptr, *bptr, *lineEnd; char *temp, *tptr, *bptr, *line_end, *limit;
char delimiter = ','; /* allow this to be set as parameter */ char delimiter = ','; /* allow this to be set as parameter */
char enclosure = '"'; /* allow this to be set as parameter */ char enclosure = '"'; /* allow this to be set as parameter */
const char escape_char = '\\';
/* first section exactly as php_fgetss */ /* first section exactly as php_fgetss */
zval **fd, **bytes, **p_delim, **p_enclosure; zval **fd, **bytes, **p_delim, **p_enclosure;
int len, temp_len; long len;
size_t buf_len, temp_len, line_end_len;
char *buf; char *buf;
php_stream *stream; php_stream *stream;
@ -1778,34 +1823,27 @@ PHP_FUNCTION(fgetcsv)
} }
buf = emalloc(len + 1); buf = emalloc(len + 1);
/* needed because recv/read/gzread doesnt set null char at end */
memset(buf, 0, len + 1);
if (php_stream_gets(stream, buf, len) == NULL) { if (php_stream_get_line(stream, buf, len, &buf_len) == NULL) {
efree(buf); efree(buf);
RETURN_FALSE; RETURN_FALSE;
} }
/* initialize internal state */
_php_mblen(NULL, 0);
/* Now into new section that parses buf for delimiter/enclosure fields */ /* Now into new section that parses buf for delimiter/enclosure fields */
/* Strip trailing space from buf, saving end of line in case required for enclosure field */ /* Strip trailing space from buf, saving end of line in case required for enclosure field */
lineEnd = emalloc(len + 1);
bptr = buf; bptr = buf;
tptr = buf + strlen(buf) -1; tptr = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len, delimiter TSRMLS_CC);
while ( isspace((int)*(unsigned char *)tptr) && (*tptr!=delimiter) && (tptr > bptr) ) tptr--; line_end_len = buf_len - (size_t)(tptr - buf);
tptr++; line_end = limit = tptr;
strcpy(lineEnd, tptr);
/* add single space - makes it easier to parse trailing null field */
*tptr++ = ' ';
*tptr = 0;
/* reserve workspace for building each individual field */ /* reserve workspace for building each individual field */
temp_len = buf_len;
temp_len = len; temp = emalloc(temp_len + line_end_len + 1);
temp = emalloc(temp_len + 1); /* unlikely but possible! */
tptr = temp;
/* Initialize return array */ /* Initialize return array */
array_init(return_value); array_init(return_value);
@ -1813,113 +1851,229 @@ PHP_FUNCTION(fgetcsv)
/* Main loop to read CSV fields */ /* Main loop to read CSV fields */
/* NB this routine will return a single null entry for a blank line */ /* NB this routine will return a single null entry for a blank line */
do { for (;;) {
int inc_len;
char *comp_end, *hunk_begin;
tptr = temp;
/* 1. Strip any leading space */ /* 1. Strip any leading space */
while(isspace((int)*(unsigned char *)bptr) && (*bptr!=delimiter)) bptr++; for (;;) {
inc_len = (bptr < limit ? _php_mblen(bptr, limit - bptr): 0);
switch (inc_len) {
case -2:
case -1:
inc_len = 1;
_php_mblen(NULL, 0);
break;
case 0:
goto quit_loop_0;
case 1:
if (!isspace((int)*(unsigned char *)bptr) || *bptr == delimiter) {
goto quit_loop_1;
}
break;
default:
goto quit_loop_1;
}
bptr += inc_len;
}
quit_loop_1:
/* 2. Read field, leaving bptr pointing at start of next field */ /* 2. Read field, leaving bptr pointing at start of next field */
if (enclosure && *bptr == enclosure) { if (*bptr == enclosure) {
int state = 0;
bptr++; /* move on to first character in field */ bptr++; /* move on to first character in field */
hunk_begin = bptr;
/* 2A. handle enclosure delimited field */ /* 2A. handle enclosure delimited field */
while (*bptr) { for (;;) {
/* we need to determine if the enclosure is 'real' or is it escaped */ inc_len = (bptr < limit ? _php_mblen(bptr, limit - bptr): 0);
if (*(bptr - 1) == '\\') { switch (inc_len) {
int escape_cnt = 0; case 0:
char *bptr_p = bptr - 2; switch (state) {
case 2:
while (bptr_p > buf && *bptr_p == '\\') { memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
escape_cnt++; tptr += (bptr - hunk_begin - 1);
bptr_p--; hunk_begin = bptr;
} goto quit_loop_2;
if (!(escape_cnt % 2)) {
goto normal_char;
continue;
}
}
if (*bptr == enclosure) {
/* handle the enclosure */
if ( *(bptr+1) == enclosure) {
/* embedded enclosure */
*tptr++ = *bptr; bptr +=2;
} else {
/* must be end of string - skip to start of next field or end */
while ( (*bptr != delimiter) && *bptr ) bptr++;
if (*bptr == delimiter) bptr++;
*tptr=0; /* terminate temporary string */
break; /* .. from handling this field - resumes at 3. */
}
} else {
normal_char:
/* normal character */
*tptr++ = *bptr++;
if (*bptr == 0) { /* embedded line end? */ case 1:
*(tptr-1)=0; /* remove space character added on reading line */ memcpy(tptr, hunk_begin, bptr - hunk_begin);
strcat(temp, lineEnd); /* add the embedded line end to the field */ tptr += (bptr - hunk_begin);
hunk_begin = bptr;
/* break is omitted intentionally */
/* read a new line from input, as at start of routine */ case 0: {
memset(buf, 0, len+1); char *new_buf;
size_t new_len;
char *new_temp;
if (php_stream_gets(stream, buf, len) == NULL) { memcpy(tptr, hunk_begin, bptr - hunk_begin);
/* we've got an unterminated enclosure, assign all the data tptr += (bptr - hunk_begin);
* from the start of the enclosure to end of data to the last element hunk_begin = bptr;
*/
if (temp_len > len) { /* add the embedded line end to the field */
memcpy(tptr, line_end, line_end_len);
tptr += line_end_len;
if ((new_buf = php_stream_get_line(stream, NULL, 0, &new_len)) == NULL) {
/* we've got an unterminated enclosure,
* assign all the data from the start of
* the enclosure to end of data to the
* last element */
if ((size_t)temp_len > (size_t)(limit - buf)) {
goto quit_loop_2;
}
zval_dtor(return_value);
RETVAL_FALSE;
goto out;
}
temp_len += new_len;
new_temp = erealloc(temp, temp_len);
tptr = new_temp + (size_t)(tptr - temp);
temp = new_temp;
efree(buf);
buf_len = new_len;
bptr = buf = new_buf;
hunk_begin = buf;
line_end = limit = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len, delimiter TSRMLS_CC);
line_end_len = buf_len - (size_t)(limit - buf);
state = 0;
} break;
}
break;
case -2:
case -1:
_php_mblen(NULL, 0);
/* break is omitted intentionally */
case 1:
/* we need to determine if the enclosure is
* 'real' or is it escaped */
switch (state) {
case 1: /* escaped */
bptr++;
state = 0;
break;
case 2: /* embedded enclosure ? let's check it */
if (*bptr != enclosure) {
/* real enclosure */
memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
tptr += (bptr - hunk_begin - 1);
goto quit_loop_2;
}
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
bptr++;
hunk_begin = bptr;
state = 0;
break;
default:
if (*bptr == escape_char) {
state = 1;
} else if (*bptr == enclosure) {
state = 2;
} else {
}
bptr++;
break; break;
}
efree(lineEnd);
efree(temp);
efree(buf);
zval_dtor(return_value);
RETURN_FALSE;
} }
break;
temp_len += len; default:
temp = erealloc(temp, temp_len+1); switch (state) {
bptr = buf; case 2:
tptr = buf + strlen(buf) -1; /* real enclosure */
while (isspace((int)*(unsigned char *)tptr) && (*tptr!=delimiter) && (tptr > bptr)) { memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
tptr--; tptr += (bptr - hunk_begin - 1);
goto quit_loop_2;
case 1:
bptr += inc_len;
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
hunk_begin = bptr;
break;
/* break is missing intentionally */
default:
bptr += inc_len;
break;
} }
tptr++; break;
strcpy(lineEnd, tptr);
*tptr++ = ' ';
*tptr = 0;
tptr = temp; /* reset temp pointer to end of field as read so far */
while (*tptr) {
tptr++;
}
}
} }
} }
quit_loop_2:
/* look up for a delimiter */
for (;;) {
switch (inc_len) {
case 0:
goto quit_loop_3;
case -2:
case -1:
inc_len = 1;
_php_mblen(NULL, 0);
/* break is omitted intentionally */
case 1:
if (*bptr == delimiter) {
goto quit_loop_3;
}
break;
default:
break;
}
bptr += inc_len;
inc_len = (bptr < limit ? _php_mblen(bptr, limit - bptr): 0);
}
quit_loop_3:
comp_end = tptr;
if (*bptr == delimiter) {
bptr++;
}
} else { } else {
/* 2B. Handle non-enclosure field */ /* 2B. Handle non-enclosure field */
while ((*bptr != delimiter) && *bptr) {
*tptr++ = *bptr++;
}
*tptr=0; /* terminate temporary string */
if (strlen(temp)) { hunk_begin = bptr;
tptr--;
while (isspace((int)*(unsigned char *)tptr) && (*tptr!=delimiter)) { for (;;) {
*tptr-- = 0; /* strip any trailing spaces */ inc_len = (bptr < limit ? _php_mblen(bptr, limit - bptr): 0);
switch (inc_len) {
case 0:
goto quit_loop_4;
case -2:
case -1:
inc_len = 1;
_php_mblen(NULL, 0);
/* break is omitted intentionally */
case 1:
if (*bptr == delimiter) {
goto quit_loop_4;
}
break;
default:
break;
} }
bptr += inc_len;
} }
quit_loop_4:
memcpy(tptr, hunk_begin, bptr - hunk_begin);
tptr += (bptr - hunk_begin);
comp_end = (char *)php_fgetcsv_lookup_trailing_spaces(temp, tptr - temp, delimiter TSRMLS_CC);
if (*bptr == delimiter) { if (*bptr == delimiter) {
bptr++; bptr++;
} }
} }
/* 3. Now pass our field back to php */ /* 3. Now pass our field back to php */
add_next_index_string(return_value, temp, 1); *comp_end = '\0';
tptr = temp; add_next_index_stringl(return_value, temp, comp_end - temp, 1);
} while (*bptr); }
quit_loop_0:
efree(lineEnd); out:
efree(temp); efree(temp);
efree(buf); efree(buf);
} }

View file

@ -192,3 +192,5 @@
/* Win32 support proc_open */ /* Win32 support proc_open */
#define PHP_CAN_SUPPORT_PROC_OPEN 1 #define PHP_CAN_SUPPORT_PROC_OPEN 1
#define HAVE_MBLEN