Implement character/word/line/sentence iterators and their reverse counterparts.
This commit is contained in:
Andrei Zmievski 2006-02-11 00:16:43 +00:00
parent 50bbedcec3
commit 5418ae7976

View file

@ -28,11 +28,16 @@
#include "php.h"
#include "zend_interfaces.h"
#include "zend_exceptions.h"
#include <unicode/ubrk.h>
/*
 * Granularity at which a TextIterator walks its text.
 *
 * The numeric values are spelled out because they are exported to userland
 * as TextIterator class constants and are used as indices into the ops
 * table. ITER_CHARACTER through ITER_SENTENCE are the UBreakIterator-backed
 * types. ITER_TYPE_LAST is a sentinel (one past the last valid type), not a
 * usable iterator type.
 */
typedef enum {
	ITER_CODE_UNIT     = 0,
	ITER_CODE_POINT    = 1,
	ITER_COMB_SEQUENCE = 2,
	ITER_CHARACTER     = 3,
	ITER_WORD          = 4,
	ITER_LINE          = 5,
	ITER_SENTENCE      = 6,
	ITER_TYPE_LAST     = 7
} text_iter_type;
@ -60,6 +65,12 @@ typedef struct {
int32_t start;
int32_t end;
} cs;
struct {
UBreakIterator *iter;
int32_t index;
int32_t start;
int32_t end;
} brk;
} u;
} text_iter_obj;
@ -76,6 +87,13 @@ typedef struct {
void (*rewind) (text_iter_obj* object TSRMLS_DC);
} text_iter_ops;
/*
 * Translates the break-iterator based iterator types into the ICU break
 * type handed to ubrk_open(). The table is indexed by
 * (type - ITER_CHARACTER), so its entries must stay in the same order as
 * ITER_CHARACTER, ITER_WORD, ITER_LINE, ITER_SENTENCE.
 *
 * Kept file-local and read-only: it is a lookup table, and a non-static
 * global with such a generic name would leak into the global symbol
 * namespace.
 */
static const enum UBreakIteratorType brk_type_map[] = {
	UBRK_CHARACTER,
	UBRK_WORD,
	UBRK_LINE,
	UBRK_SENTENCE,
};
/*
 * Exported class entry pointers for the text iterator classes.
 * text_iterator_ce is the entry the iterator type constants (CODE_UNIT,
 * CODE_POINT, ...) are declared on in php_register_unicode_iterators().
 */
PHPAPI zend_class_entry* text_iterator_aggregate_ce;
PHPAPI zend_class_entry* text_iterator_ce;
PHPAPI zend_class_entry* rev_text_iterator_ce;
@ -276,12 +294,95 @@ static text_iter_ops text_iter_cs_ops = {
};
/* UBreakIterator Character Ops */
/*
 * Reports whether the break iterator still points at a segment.
 *
 * The boundary that moves in the direction of travel is the one that runs
 * out: `start` when iterating in reverse, `end` when iterating forward.
 * Returns non-zero while that boundary is not UBRK_DONE.
 */
static int text_iter_brk_char_valid(text_iter_obj* object TSRMLS_DC)
{
	return ((object->flags & ITER_REVERSE)
			? object->u.brk.start
			: object->u.brk.end) != UBRK_DONE;
}
/*
 * Copies the text between the current pair of break boundaries into
 * object->current as a NUL-terminated string and records its length,
 * growing the buffer first when the segment does not fit.
 *
 * When the iterator is exhausted (or the text is empty) one of the
 * boundaries is still UBRK_DONE. In that case the current value becomes an
 * empty string instead of being computed from a negative length.
 */
static void text_iter_brk_char_current(text_iter_obj* object TSRMLS_DC)
{
	uint32_t length = 0;
	int32_t start = object->u.brk.start;
	int32_t end = object->u.brk.end;

	if (object->flags & ITER_REVERSE) {
		if (end == UBRK_DONE) {
			end = object->text_len;
		}
	} else {
		if (start == UBRK_DONE) {
			start = 0;
		}
	}

	/*
	 * UBRK_DONE is -1. Without this check, `end - start` on an exhausted
	 * iterator is negative and wraps to a huge unsigned length, which
	 * would drive an enormous reallocation and an out-of-bounds copy.
	 */
	if (start != UBRK_DONE && end != UBRK_DONE && end > start) {
		length = end - start;
	}

	if (length > object->current_alloc-1) {
		object->current_alloc = length+1;
		Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc);
	}
	if (length > 0) {
		/* only form text + start when start is a real offset */
		u_memcpy(Z_USTRVAL_P(object->current), object->text + start, length);
	}
	Z_USTRVAL_P(object->current)[length] = 0;
	Z_USTRLEN_P(object->current) = length;
}
/*
 * Returns the iterator key: the running segment counter, which rewind
 * resets to 0 and next increments on every successful step.
 */
static int text_iter_brk_char_key(text_iter_obj* object TSRMLS_DC)
{
	int position = object->u.brk.index;

	return position;
}
/*
 * Advances to the following segment in the direction of travel.
 *
 * The boundary reached last becomes the near edge of the new segment and
 * the break iterator supplies the far edge (ubrk_previous() in reverse,
 * ubrk_next() forward). Once the far edge is UBRK_DONE the iterator is
 * exhausted and further calls change nothing.
 */
static void text_iter_brk_char_next(text_iter_obj* object TSRMLS_DC)
{
	if (object->flags & ITER_REVERSE) {
		if (object->u.brk.start == UBRK_DONE) {
			return;
		}
		object->u.brk.end = object->u.brk.start;
		object->u.brk.start = ubrk_previous(object->u.brk.iter);
	} else {
		if (object->u.brk.end == UBRK_DONE) {
			return;
		}
		object->u.brk.start = object->u.brk.end;
		object->u.brk.end = ubrk_next(object->u.brk.iter);
	}

	/* a step was taken in either direction */
	object->u.brk.index++;
}
/*
 * Positions the iterator on its first segment and resets the key to 0.
 *
 * Forward iteration starts from ubrk_first() and takes the next boundary
 * as the segment end. Reverse iteration starts from ubrk_last() and takes
 * the previous boundary as the segment start. The two break-iterator calls
 * must stay in this order because the second one moves relative to the
 * position set by the first.
 */
static void text_iter_brk_char_rewind(text_iter_obj *object TSRMLS_DC)
{
	object->u.brk.index = 0;

	if (!(object->flags & ITER_REVERSE)) {
		object->u.brk.start = ubrk_first(object->u.brk.iter);
		object->u.brk.end = ubrk_next(object->u.brk.iter);
		return;
	}

	object->u.brk.end = ubrk_last(object->u.brk.iter);
	object->u.brk.start = ubrk_previous(object->u.brk.iter);
}
/*
 * Operations table for the UBreakIterator-backed iterators. The entries are
 * positional, so they must follow the member order of text_iter_ops.
 * iter_ops registers this one table for each of the break-based types.
 */
static text_iter_ops text_iter_brk_ops = {
text_iter_brk_char_valid,
text_iter_brk_char_current,
text_iter_brk_char_key,
text_iter_brk_char_next,
text_iter_brk_char_rewind,
};
/*
 * Ops array: dispatch table indexed by the iterator's text_iter_type value.
 * Entries are positional and must stay in enum order (code unit, code point,
 * combining sequence, then character/word/line/sentence). The last four
 * types all share the break-iterator ops.
 */
static text_iter_ops* iter_ops[] = {
&text_iter_cu_ops,
&text_iter_cp_ops,
&text_iter_cs_ops,
&text_iter_brk_ops,
&text_iter_brk_ops,
&text_iter_brk_ops,
&text_iter_brk_ops,
};
/* Iterator Funcs */
@ -376,6 +477,9 @@ static void text_iterator_free_storage(void *object TSRMLS_DC)
if (intern->text) {
efree(intern->text);
}
if (intern->type > ITER_CHARACTER && intern->u.brk.iter) {
ubrk_close(intern->u.brk.iter);
}
zval_ptr_dtor(&intern->current);
efree(object);
}
@ -399,6 +503,7 @@ static zend_object_value text_iterator_new(zend_class_entry *class_type TSRMLS_D
intern->current_alloc = 3;
Z_USTRVAL_P(intern->current) = eumalloc(3);
Z_USTRVAL_P(intern->current)[0] = 0;
Z_USTRLEN_P(intern->current) = 0;
Z_TYPE_P(intern->current) = IS_UNICODE;
retval.handle = zend_objects_store_put(intern, (zend_objects_store_dtor_t)zend_objects_destroy_object, (zend_objects_free_object_storage_t) text_iterator_free_storage, NULL TSRMLS_CC);
@ -426,11 +531,11 @@ PHP_METHOD(TextIterator, __construct)
intern->text_len = text_len;
if (ZEND_NUM_ARGS() > 1) {
ti_type = flags & ITER_TYPE_MASK;
if (ti_type < ITER_TYPE_LAST) {
intern->type = ti_type;
} else {
if (ti_type < 0 || ti_type >= ITER_TYPE_LAST) {
php_error(E_WARNING, "Invalid iterator type in TextIterator constructor");
ti_type = ITER_CODE_POINT;
}
intern->type = ti_type;
intern->flags = flags;
}
@ -438,6 +543,15 @@ PHP_METHOD(TextIterator, __construct)
intern->flags |= ITER_REVERSE;
}
if (ti_type >= ITER_CHARACTER && ti_type < ITER_TYPE_LAST) {
UErrorCode status = U_ZERO_ERROR;
intern->u.brk.iter = ubrk_open(brk_type_map[ti_type - ITER_CHARACTER], UG(default_locale), text, text_len, &status);
if (!U_SUCCESS(status)) {
php_error(E_RECOVERABLE_ERROR, "Could not create UBreakIterator: %s", u_errorName(status));
return;
}
}
iter_ops[intern->type]->rewind(intern TSRMLS_CC);
}
@ -513,6 +627,10 @@ void php_register_unicode_iterators(TSRMLS_D)
zend_declare_class_constant_long(text_iterator_ce, "CODE_UNIT", sizeof("CODE_UNIT")-1, ITER_CODE_UNIT TSRMLS_CC);
zend_declare_class_constant_long(text_iterator_ce, "CODE_POINT", sizeof("CODE_POINT")-1, ITER_CODE_POINT TSRMLS_CC);
zend_declare_class_constant_long(text_iterator_ce, "COMB_SEQUENCE", sizeof("COMB_SEQUENCE")-1, ITER_COMB_SEQUENCE TSRMLS_CC);
zend_declare_class_constant_long(text_iterator_ce, "CHARACTER", sizeof("CHARACTER")-1, ITER_CHARACTER TSRMLS_CC);
zend_declare_class_constant_long(text_iterator_ce, "WORD", sizeof("WORD")-1, ITER_WORD TSRMLS_CC);
zend_declare_class_constant_long(text_iterator_ce, "LINE", sizeof("LINE")-1, ITER_LINE TSRMLS_CC);
zend_declare_class_constant_long(text_iterator_ce, "SENTENCE", sizeof("SENTENCE")-1, ITER_SENTENCE TSRMLS_CC);
}
/*