php-src/ext/tokenizer/tokenizer.c
George Peter Banyard fa8d9b1183 Improve type declarations for Zend APIs
Voidification of Zend API which always succeeded
Use bool argument types instead of int for boolean arguments
Use bool return type for functions which return true/false (1/0)
Use zend_result return type for functions which return SUCCESS/FAILURE as they don't follow normal boolean semantics

Closes GH-6002
2020-08-28 15:41:27 +02:00

574 lines
15 KiB
C

/*
+----------------------------------------------------------------------+
| Copyright (c) The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Author: Andrei Zmievski <andrei@php.net> |
+----------------------------------------------------------------------+
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "php.h"
#include "php_ini.h"
#include "ext/standard/info.h"
#include "php_tokenizer.h"
#include "tokenizer_arginfo.h"
#include "zend.h"
#include "zend_exceptions.h"
#include "zend_language_scanner.h"
#include "zend_language_scanner_defs.h"
#include <zend_language_parser.h>
#include "zend_interfaces.h"
/* Shorthands for scanner globals. tokenize() passes zendtext/zendleng to
 * add_token() as the text and length of the token just returned by lex_scan(). */
#define zendtext LANG_SCNG(yy_text)
#define zendleng LANG_SCNG(yy_leng)
/* tokenize() emits the bytes between zendcursor and zendlimit as a single
 * T_INLINE_HTML token once the __halt_compiler sequence has been consumed. */
#define zendcursor LANG_SCNG(yy_cursor)
#define zendlimit LANG_SCNG(yy_limit)
/* Flag bit for the `flags` argument: when set, tokenize_common() dispatches to
 * tokenize_parse() instead of tokenize(). */
#define TOKEN_PARSE (1 << 0)
/* Class entry of PhpToken; assigned in PHP_MINIT_FUNCTION(tokenizer). */
zend_class_entry *php_token_ce;
/* Registers the TOKEN_PARSE flag as a PHP-level long constant, using the
 * CONST_CS and CONST_PERSISTENT registration flags. */
void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS)
{
	REGISTER_LONG_CONSTANT(
		"TOKEN_PARSE", TOKEN_PARSE, CONST_PERSISTENT | CONST_CS);
}
/* {{{ tokenizer_module_entry */
/* Module descriptor for the tokenizer extension. Only a module-init hook
 * (PHP_MINIT) and an info hook (PHP_MINFO) are provided. */
zend_module_entry tokenizer_module_entry = {
STANDARD_MODULE_HEADER,
"tokenizer",
ext_functions,
PHP_MINIT(tokenizer),
/* NOTE(review): the three NULL slots are presumably the module-shutdown,
 * request-startup and request-shutdown hooks — confirm against the
 * zend_module_entry layout. */
NULL,
NULL,
NULL,
PHP_MINFO(tokenizer),
PHP_TOKENIZER_VERSION,
STANDARD_MODULE_PROPERTIES
};
/* }}} */
/* Only emitted when COMPILE_DL_TOKENIZER is defined (presumably the
 * shared-library build of the extension — confirm in the build config). */
#ifdef COMPILE_DL_TOKENIZER
ZEND_GET_MODULE(tokenizer)
#endif
/* Returns the zval holding PhpToken::$id (property slot 0) of `obj`.
 * If the slot is still undefined, an Error is thrown and NULL is returned.
 * References are followed; the result is asserted to be an integer. */
static zval *php_token_get_id(zval *obj)
{
	zval *id_slot = OBJ_PROP_NUM(Z_OBJ_P(obj), 0);

	if (!Z_ISUNDEF_P(id_slot)) {
		ZVAL_DEREF(id_slot);
		ZEND_ASSERT(Z_TYPE_P(id_slot) == IS_LONG);
		return id_slot;
	}

	zend_throw_error(NULL,
		"Typed property PhpToken::$id must not be accessed before initialization");
	return NULL;
}
/* Returns the string stored in PhpToken::$text (property slot 1) of `obj`.
 * If the slot is still undefined, an Error is thrown and NULL is returned.
 * The returned string is borrowed: no reference is added here. */
static zend_string *php_token_get_text(zval *obj)
{
	zval *text_slot = OBJ_PROP_NUM(Z_OBJ_P(obj), 1);

	if (!Z_ISUNDEF_P(text_slot)) {
		ZVAL_DEREF(text_slot);
		ZEND_ASSERT(Z_TYPE_P(text_slot) == IS_STRING);
		return Z_STR_P(text_slot);
	}

	zend_throw_error(NULL,
		"Typed property PhpToken::$text must not be accessed before initialization");
	return NULL;
}
/* Forward declaration: PhpToken::getAll() calls tokenize_common() before its
 * definition further down in this file. */
static zend_bool tokenize_common(
zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class);
/* PhpToken::getAll(string $code, int $flags = 0)
 * Tokenizes $code and returns the tokens as instances of the class the static
 * method was called on. Throws if that class is abstract, if its constants
 * cannot be updated, or if tokenize_common() reports failure. */
PHP_METHOD(PhpToken, getAll)
{
	zend_string *code;
	zend_long mode = 0;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(code)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(mode)
	ZEND_PARSE_PARAMETERS_END();

	zend_class_entry *called_ce = zend_get_called_scope(execute_data);

	/* Verify the instantiation preconditions once up front rather than
	 * once per created token. */
	if (called_ce->ce_flags & ZEND_ACC_EXPLICIT_ABSTRACT_CLASS) {
		zend_throw_error(NULL, "Cannot instantiate abstract class %s", ZSTR_VAL(called_ce->name));
		RETURN_THROWS();
	}

	/* Short-circuit keeps the original order: constants first, then tokenize. */
	if (zend_update_class_constants(called_ce) == FAILURE
			|| !tokenize_common(return_value, code, mode, called_ce)) {
		RETURN_THROWS();
	}
}
PHP_METHOD(PhpToken, __construct)
{
zend_long id;
zend_string *text;
zend_long line = -1;
zend_long pos = -1;
zend_object *obj = Z_OBJ_P(ZEND_THIS);
ZEND_PARSE_PARAMETERS_START(2, 4)
Z_PARAM_LONG(id)
Z_PARAM_STR(text)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(line)
Z_PARAM_LONG(pos)
ZEND_PARSE_PARAMETERS_END();
ZVAL_LONG(OBJ_PROP_NUM(obj, 0), id);
zval_ptr_dtor(OBJ_PROP_NUM(obj, 1));
ZVAL_STR_COPY(OBJ_PROP_NUM(obj, 1), text);
ZVAL_LONG(OBJ_PROP_NUM(obj, 2), line);
ZVAL_LONG(OBJ_PROP_NUM(obj, 3), pos);
}
/* PhpToken::is(int|string|array $kind)
 * - int:    true when $kind equals the token id.
 * - string: true when $kind equals the token text.
 * - array:  true when any element matches by the rules above; elements must
 *           be int or string, otherwise a TypeError is raised.
 * Any other argument type raises a TypeError. The id/text properties are
 * fetched lazily, so an uninitialized property only throws when it is needed. */
PHP_METHOD(PhpToken, is)
{
	zval *kind;

	ZEND_PARSE_PARAMETERS_START(1, 1)
		Z_PARAM_ZVAL(kind)
	ZEND_PARSE_PARAMETERS_END();

	switch (Z_TYPE_P(kind)) {
		case IS_LONG: {
			zval *own_id = php_token_get_id(ZEND_THIS);
			if (own_id == NULL) {
				RETURN_THROWS();
			}
			RETURN_BOOL(Z_LVAL_P(own_id) == Z_LVAL_P(kind));
		}

		case IS_STRING: {
			zend_string *own_text = php_token_get_text(ZEND_THIS);
			if (own_text == NULL) {
				RETURN_THROWS();
			}
			RETURN_BOOL(zend_string_equals(own_text, Z_STR_P(kind)));
		}

		case IS_ARRAY: {
			zval *own_id = NULL;
			zend_string *own_text = NULL;
			zval *candidate;

			ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(kind), candidate) {
				ZVAL_DEREF(candidate);
				switch (Z_TYPE_P(candidate)) {
					case IS_LONG:
						/* Fetch the id only once, on the first integer element. */
						if (own_id == NULL) {
							own_id = php_token_get_id(ZEND_THIS);
							if (own_id == NULL) {
								RETURN_THROWS();
							}
						}
						if (Z_LVAL_P(own_id) == Z_LVAL_P(candidate)) {
							RETURN_TRUE;
						}
						break;

					case IS_STRING:
						/* Fetch the text only once, on the first string element. */
						if (own_text == NULL) {
							own_text = php_token_get_text(ZEND_THIS);
							if (own_text == NULL) {
								RETURN_THROWS();
							}
						}
						if (zend_string_equals(own_text, Z_STR_P(candidate))) {
							RETURN_TRUE;
						}
						break;

					default:
						zend_argument_type_error(1, "must only have elements of type string|int, %s given", zend_zval_type_name(candidate));
						RETURN_THROWS();
				}
			} ZEND_HASH_FOREACH_END();

			RETURN_FALSE;
		}

		default:
			zend_argument_type_error(1, "must be of type string|int|array, %s given", zend_zval_type_name(kind));
			RETURN_THROWS();
	}
}
PHP_METHOD(PhpToken, isIgnorable)
{
ZEND_PARSE_PARAMETERS_NONE();
zval *id_zval = php_token_get_id(ZEND_THIS);
if (!id_zval) {
RETURN_THROWS();
}
zend_long id = Z_LVAL_P(id_zval);
RETURN_BOOL(id == T_WHITESPACE || id == T_COMMENT || id == T_DOC_COMMENT || id == T_OPEN_TAG);
}
PHP_METHOD(PhpToken, getTokenName)
{
ZEND_PARSE_PARAMETERS_NONE();
zval *id_zval = php_token_get_id(ZEND_THIS);
if (!id_zval) {
RETURN_THROWS();
}
if (Z_LVAL_P(id_zval) < 256) {
RETURN_CHAR(Z_LVAL_P(id_zval));
} else {
const char *token_name = get_token_type_name(Z_LVAL_P(id_zval));
if (!token_name) {
RETURN_NULL();
}
RETURN_STRING(token_name);
}
}
PHP_METHOD(PhpToken, __toString)
{
ZEND_PARSE_PARAMETERS_NONE();
zend_string *text = php_token_get_text(ZEND_THIS);
if (!text) {
RETURN_THROWS();
}
RETURN_STR_COPY(text);
}
/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(tokenizer)
{
zend_class_entry ce;
zend_string *name;
zval default_val;
ZVAL_UNDEF(&default_val);
tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU);
tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU);
INIT_CLASS_ENTRY(ce, "PhpToken", class_PhpToken_methods);
php_token_ce = zend_register_internal_class(&ce);
zend_class_implements(php_token_ce, 1, zend_ce_stringable);
name = zend_string_init("id", sizeof("id") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
zend_string_release(name);
name = zend_string_init("text", sizeof("text") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_STRING));
zend_string_release(name);
name = zend_string_init("line", sizeof("line") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
zend_string_release(name);
name = zend_string_init("pos", sizeof("pos") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
zend_string_release(name);
return SUCCESS;
}
/* }}} */
/* {{{ PHP_MINFO_FUNCTION */
/* Prints the extension's info table: a single two-column row stating that
 * tokenizer support is enabled. */
PHP_MINFO_FUNCTION(tokenizer)
{
	const char *label = "Tokenizer Support";
	const char *state = "enabled";

	php_info_print_table_start();
	php_info_print_table_row(2, label, state);
	php_info_print_table_end();
}
/* }}} */
/* Builds the zend_string for a token's text.
 * - Length 1: returns the shared one-character string for that byte.
 * - interned_strings == NULL: always allocates a new string.
 * - Otherwise: looks the text up in `interned_strings`; on a hit returns the
 *   stored string with an added reference, on a miss allocates a new string,
 *   records it in the table (as both key and value) and returns it. */
static zend_string *make_str(unsigned char *text, size_t leng, HashTable *interned_strings)
{
	char *bytes = (char *) text;

	if (leng == 1) {
		return ZSTR_CHAR(text[0]);
	}

	if (interned_strings == NULL) {
		return zend_string_init(bytes, leng, 0);
	}

	zend_string *cached = zend_hash_str_find_ptr(interned_strings, bytes, leng);
	if (cached != NULL) {
		return zend_string_copy(cached);
	}

	zend_string *fresh = zend_string_init(bytes, leng, 0);
	zend_hash_add_new_ptr(interned_strings, fresh, fresh);
	return fresh;
}
/* Appends one token to the array in `return_value`.
 *
 * Representation:
 * - token_class != NULL: an object of that class with slots 0..3 set to the
 *   token type, text, line number and the offset of `text` from
 *   LANG_SCNG(yy_start); any further declared properties receive a copy of
 *   their class default.
 * - token_class == NULL and token_type >= 256: the array [type, text, line].
 * - token_class == NULL and token_type < 256: the bare text string.
 *
 * `interned_strings` is forwarded to make_str() and may be NULL. */
static void add_token(
		zval *return_value, int token_type, unsigned char *text, size_t leng, int lineno,
		zend_class_entry *token_class, HashTable *interned_strings)
{
	zval entry;

	if (token_class == NULL) {
		if (token_type < 256) {
			ZVAL_STR(&entry, make_str(text, leng, interned_strings));
		} else {
			array_init(&entry);
			add_next_index_long(&entry, token_type);
			add_next_index_str(&entry, make_str(text, leng, interned_strings));
			add_next_index_long(&entry, lineno);
		}
	} else {
		zend_object *instance = zend_objects_new(token_class);
		ZVAL_OBJ(&entry, instance);

		ZVAL_LONG(OBJ_PROP_NUM(instance, 0), token_type);
		ZVAL_STR(OBJ_PROP_NUM(instance, 1), make_str(text, leng, interned_strings));
		ZVAL_LONG(OBJ_PROP_NUM(instance, 2), lineno);
		ZVAL_LONG(OBJ_PROP_NUM(instance, 3), text - LANG_SCNG(yy_start));

		/* If the class is extended with additional properties, initialize
		 * them from the class defaults as well. */
		if (UNEXPECTED(token_class->default_properties_count > 4)) {
			int prop_count = token_class->default_properties_count;
			for (int i = 4; i < prop_count; i++) {
				ZVAL_COPY_PROP(
					OBJ_PROP_NUM(instance, i),
					&token_class->default_properties_table[i]);
			}
		}
	}

	zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &entry);
}
/* Tokenizes `source` by calling lex_scan() directly (the parser is not
 * involved) and fills `return_value` with one entry per token via add_token().
 * `token_class` is forwarded to add_token(): non-NULL produces objects of that
 * class, NULL produces the array/string representation.
 * The scanner state is saved on entry and restored before returning.
 * Always returns 1. */
static zend_bool tokenize(zval *return_value, zend_string *source, zend_class_entry *token_class)
{
zval source_zval;
zend_lex_state original_lex_state;
zval token;
int token_type;
/* Line number reported for the next token handed to add_token(). */
int token_line = 1;
int need_tokens = -1; /* for __halt_compiler lexing. -1 = disabled */
/* Per-call table that make_str() uses to reuse identical token strings. */
HashTable interned_strings;
/* Take a reference on the source for the scanner; released after the loop. */
ZVAL_STR_COPY(&source_zval, source);
zend_save_lexical_state(&original_lex_state);
zend_prepare_string_for_scanning(&source_zval, "");
LANG_SCNG(yy_state) = yycINITIAL;
zend_hash_init(&interned_strings, 0, NULL, NULL, 0);
array_init(return_value);
/* Scan until lex_scan() returns 0. */
while ((token_type = lex_scan(&token, NULL))) {
ZEND_ASSERT(token_type != T_ERROR);
add_token(
return_value, token_type, zendtext, zendleng, token_line,
token_class, &interned_strings);
/* Any value lex_scan() stored in `token` is not used here; release it. */
if (Z_TYPE(token) != IS_UNDEF) {
zval_ptr_dtor_nogc(&token);
ZVAL_UNDEF(&token);
}
/* after T_HALT_COMPILER collect the next three non-dropped tokens */
if (need_tokens != -1) {
/* Whitespace, open tags and comments do not count towards the three. */
if (token_type != T_WHITESPACE && token_type != T_OPEN_TAG
&& token_type != T_COMMENT && token_type != T_DOC_COMMENT
&& --need_tokens == 0
) {
/* fetch the rest into a T_INLINE_HTML */
if (zendcursor < zendlimit) {
add_token(
return_value, T_INLINE_HTML, zendcursor, zendlimit - zendcursor,
token_line, token_class, &interned_strings);
}
break;
}
} else if (token_type == T_HALT_COMPILER) {
need_tokens = 3;
}
/* Apply a pending line increment before sampling the line number that
 * the next token will be reported with. */
if (CG(increment_lineno)) {
CG(zend_lineno)++;
CG(increment_lineno) = 0;
}
token_line = CG(zend_lineno);
}
zval_ptr_dtor_str(&source_zval);
zend_restore_lexical_state(&original_lex_state);
zend_hash_destroy(&interned_strings);
return 1;
}
/* Context handed to on_event(); tokenize_parse() fills it in and installs it
 * as LANG_SCNG(on_event_context). */
struct event_context {
/* Array that collects the tokens; on_event() appends to it via add_token(). */
zval *tokens;
/* Forwarded to add_token(): class to instantiate per token, or NULL for the
 * array/string representation. */
zend_class_entry *token_class;
};
/* Given one already-collected token, returns a pointer to its id zval if the
 * token's text equals `text` (exactly `length` bytes); returns NULL otherwise.
 * Array tokens keep id/text at indexes 0/1, object tokens in property slots
 * 0/1. Tokens of any other type (bare strings) never match. */
static zval *extract_token_id_to_replace(zval *token_zv, const char *text, size_t length)
{
	zval *candidate_id, *candidate_text;

	ZEND_ASSERT(token_zv);

	switch (Z_TYPE_P(token_zv)) {
		case IS_ARRAY:
			candidate_id = zend_hash_index_find(Z_ARRVAL_P(token_zv), 0);
			candidate_text = zend_hash_index_find(Z_ARRVAL_P(token_zv), 1);
			break;
		case IS_OBJECT:
			candidate_id = OBJ_PROP_NUM(Z_OBJ_P(token_zv), 0);
			candidate_text = OBJ_PROP_NUM(Z_OBJ_P(token_zv), 1);
			break;
		default:
			return NULL;
	}

	/* Several collected tokens may be candidates for this feedback; comparing
	 * the text picks out the intended one. */
	ZEND_ASSERT(Z_TYPE_P(candidate_text) == IS_STRING);
	if (Z_STRLEN_P(candidate_text) != length
			|| memcmp(Z_STRVAL_P(candidate_text), text, length) != 0) {
		return NULL;
	}
	return candidate_id;
}
/* Scanner event callback installed by tokenize_parse(). `context` is the
 * struct event_context describing where tokens are collected.
 *
 * - ON_TOKEN:    appends the token (END is skipped), after remapping two
 *                special cases to T_CLOSE_TAG / T_OPEN_TAG_WITH_ECHO.
 * - ON_FEEDBACK: searches the collected tokens from the end for the first one
 *                whose text matches and overwrites its id with `token`.
 * - ON_STOP:     appends any input left between the scanner cursor and limit
 *                as one T_INLINE_HTML token. */
void on_event(
		zend_php_scanner_event event, int token, int line,
		const char *text, size_t length, void *context)
{
	struct event_context *ev = context;

	switch (event) {
		case ON_TOKEN:
			if (token != END) {
				/* Special cases */
				if (token == ';' && zendleng > 1) { /* ?> or ?>\n or ?>\r\n */
					token = T_CLOSE_TAG;
				} else if (token == T_ECHO && zendleng == sizeof("<?=") - 1) {
					token = T_OPEN_TAG_WITH_ECHO;
				}
				add_token(
					ev->tokens, token, (unsigned char *) text, length, line,
					ev->token_class, NULL);
			}
			break;

		case ON_FEEDBACK: {
			zval *candidate, *target_id = NULL;

			ZEND_HASH_REVERSE_FOREACH_VAL(Z_ARRVAL_P(ev->tokens), candidate) {
				target_id = extract_token_id_to_replace(candidate, text, length);
				if (target_id != NULL) {
					break;
				}
			} ZEND_HASH_FOREACH_END();

			ZEND_ASSERT(target_id);
			ZVAL_LONG(target_id, token);
			break;
		}

		case ON_STOP:
			if (zendcursor != zendlimit) {
				add_token(
					ev->tokens, T_INLINE_HTML, zendcursor, zendlimit - zendcursor,
					CG(zend_lineno), ev->token_class, NULL);
			}
			break;
	}
}
/* Tokenizes `source` by running the parser (zendparse()) with on_event()
 * installed as the scanner callback, so tokens are collected into a temporary
 * array while parsing runs.
 * Returns true and moves the collected array into `return_value` when
 * zendparse() returns SUCCESS; otherwise destroys the collected tokens and
 * returns false (return_value is left untouched in that case).
 * `token_class` is forwarded to add_token() through the event context. */
static zend_bool tokenize_parse(
zval *return_value, zend_string *source, zend_class_entry *token_class)
{
zval source_zval;
struct event_context ctx;
zval token_stream;
zend_lex_state original_lex_state;
zend_bool original_in_compilation;
zend_bool success;
/* Take a reference on the source for the scanner; released at the end. */
ZVAL_STR_COPY(&source_zval, source);
/* CG(in_compilation) is forced to 1 for the parse and restored afterwards. */
original_in_compilation = CG(in_compilation);
CG(in_compilation) = 1;
zend_save_lexical_state(&original_lex_state);
zend_prepare_string_for_scanning(&source_zval, "");
array_init(&token_stream);
ctx.tokens = &token_stream;
ctx.token_class = token_class;
/* Fresh AST root and arena for this parse; both are destroyed below.
 * NOTE(review): the previous CG(ast)/CG(ast_arena) values are not saved
 * here — presumably none are live when this runs; confirm. */
CG(ast) = NULL;
CG(ast_arena) = zend_arena_create(1024 * 32);
LANG_SCNG(yy_state) = yycINITIAL;
/* Route scanner events to on_event() with `ctx` as its context. */
LANG_SCNG(on_event) = on_event;
LANG_SCNG(on_event_context) = &ctx;
if((success = (zendparse() == SUCCESS))) {
ZVAL_COPY_VALUE(return_value, &token_stream);
} else {
zval_ptr_dtor(&token_stream);
}
/* Only the tokens are of interest; the AST produced by the parse is discarded. */
zend_ast_destroy(CG(ast));
zend_arena_destroy(CG(ast_arena));
/* restore compiler and scanner global states */
zend_restore_lexical_state(&original_lex_state);
CG(in_compilation) = original_in_compilation;
zval_ptr_dtor_str(&source_zval);
return success;
}
/* Shared entry point for token_get_all() and PhpToken::getAll().
 * With TOKEN_PARSE set in `flags`, tokenizes through the parser
 * (tokenize_parse()) and returns its result; in that mode a pending
 * exception is left in place for the caller.
 * Otherwise tokenizes with the bare lexer (tokenize()), clears any exception
 * raised along the way, and returns tokenize()'s result.
 * `token_class` is NULL for the array/string representation. */
static zend_bool tokenize_common(
		zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class)
{
	if (flags & TOKEN_PARSE) {
		return tokenize_parse(return_value, source, token_class);
	}

	/* Keep the result in the same boolean type that tokenize() and this
	 * function return, rather than widening it through an int. */
	zend_bool success = tokenize(return_value, source, token_class);
	/* Normal token_get_all() should not throw. */
	zend_clear_exception();
	return success;
}
/* }}} */
/* {{{ token_get_all(string $code, int $flags = 0)
 * Tokenizes $code into the array/string token representation (no token class
 * is passed to tokenize_common()). */
PHP_FUNCTION(token_get_all)
{
	zend_string *code;
	zend_long mode = 0;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(code)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(mode)
	ZEND_PARSE_PARAMETERS_END();

	if (tokenize_common(return_value, code, mode, /* token_class */ NULL)) {
		return;
	}
	RETURN_THROWS();
}
/* }}} */
/* {{{ */
PHP_FUNCTION(token_name)
{
zend_long type;
ZEND_PARSE_PARAMETERS_START(1, 1)
Z_PARAM_LONG(type)
ZEND_PARSE_PARAMETERS_END();
const char *token_name = get_token_type_name(type);
if (!token_name) {
token_name = "UNKNOWN";
}
RETURN_STRING(token_name);
}
/* }}} */