php-src/ext/tokenizer/tokenizer.c
Nikita Popov b03cafd19c Fix bug #77966: Cannot alias a method named "namespace"
This is a bit tricky: In this case we have "namespace as", which
means that we will only recognize "namespace" as an identifier when
the lookahead token is already at the "as". This means that
zend_lex_tstring picks up the wrong identifier.

We solve this by actually assigning the identifier as the semantic
value on the parser stack -- as in almost all cases we will not
actually need the identifier, this is just an (offset, size)
reference, not a copy of the string.

Additionally, we need to teach the lexer feedback mechanism used
by tokenizer TOKEN_PARSE mode to apply feedback to something
other than the very last token. To that purpose we pass through
the token text and check the tokens in reverse order to find the
right one.

Closes GH-5668.
2020-06-08 12:55:14 +02:00

583 lines
16 KiB
C

/*
+----------------------------------------------------------------------+
| Copyright (c) The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Author: Andrei Zmievski <andrei@php.net> |
+----------------------------------------------------------------------+
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "php.h"
#include "php_ini.h"
#include "ext/standard/info.h"
#include "php_tokenizer.h"
#include "tokenizer_arginfo.h"
#include "zend.h"
#include "zend_exceptions.h"
#include "zend_language_scanner.h"
#include "zend_language_scanner_defs.h"
#include <zend_language_parser.h>
#include "zend_interfaces.h"
/* Shorthands for the language scanner globals used throughout this file:
 * text and length of the token just scanned, the current scan position,
 * and the end of the input buffer. */
#define zendtext LANG_SCNG(yy_text)
#define zendleng LANG_SCNG(yy_leng)
#define zendcursor LANG_SCNG(yy_cursor)
#define zendlimit LANG_SCNG(yy_limit)
/* Flag bit accepted in the `flags` argument; when set, tokenize_common()
 * dispatches to tokenize_parse() instead of tokenize(). */
#define TOKEN_PARSE (1 << 0)
/* Class entry of PhpToken; assigned in PHP_MINIT_FUNCTION(tokenizer). */
zend_class_entry *php_token_ce;
/* Registers the TOKEN_PARSE flag as a persistent, case-sensitive
 * integer constant. Called from PHP_MINIT_FUNCTION(tokenizer). */
void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) {
REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT);
}
/* {{{ tokenizer_module_entry
 *
 * Module descriptor for the "tokenizer" extension. Only a module-init
 * handler (PHP_MINIT) and an info handler (PHP_MINFO) are installed.
 * NOTE(review): the three NULL slots between them are presumably the
 * module-shutdown, request-init and request-shutdown handlers of the
 * standard zend_module_entry layout -- confirm against zend_modules.h.
*/
zend_module_entry tokenizer_module_entry = {
STANDARD_MODULE_HEADER,
"tokenizer",
ext_functions,
PHP_MINIT(tokenizer),
NULL,
NULL,
NULL,
PHP_MINFO(tokenizer),
PHP_TOKENIZER_VERSION,
STANDARD_MODULE_PROPERTIES
};
/* }}} */
/* Only emitted when COMPILE_DL_TOKENIZER is defined.
 * NOTE(review): presumably the shared-object build, where ZEND_GET_MODULE
 * provides the entry point used to locate tokenizer_module_entry -- confirm. */
#ifdef COMPILE_DL_TOKENIZER
ZEND_GET_MODULE(tokenizer)
#endif
/* Fetch the $id property (slot 0) of a PhpToken object.
 *
 * Returns the dereferenced property zval, which is asserted to hold an
 * integer. If the property is still uninitialized, an Error is thrown
 * and NULL is returned. */
static zval *php_token_get_id(zval *obj) {
	zval *slot = OBJ_PROP_NUM(Z_OBJ_P(obj), 0);

	if (!Z_ISUNDEF_P(slot)) {
		ZVAL_DEREF(slot);
		ZEND_ASSERT(Z_TYPE_P(slot) == IS_LONG);
		return slot;
	}

	zend_throw_error(NULL,
		"Typed property PhpToken::$id must not be accessed before initialization");
	return NULL;
}
/* Fetch the $text property (slot 1) of a PhpToken object.
 *
 * Returns the string stored in the property (no reference is added).
 * If the property is still uninitialized, an Error is thrown and NULL
 * is returned. */
static zend_string *php_token_get_text(zval *obj) {
	zval *slot = OBJ_PROP_NUM(Z_OBJ_P(obj), 1);

	if (!Z_ISUNDEF_P(slot)) {
		ZVAL_DEREF(slot);
		ZEND_ASSERT(Z_TYPE_P(slot) == IS_STRING);
		return Z_STR_P(slot);
	}

	zend_throw_error(NULL,
		"Typed property PhpToken::$text must not be accessed before initialization");
	return NULL;
}
/* Forward declaration: PhpToken::getAll() below calls tokenize_common(),
 * which is defined further down in this file. */
static zend_bool tokenize_common(
zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class);
/* PhpToken::getAll(string $code, int $flags = 0)
 *
 * Tokenizes $code into instances of the class the method was called on
 * (the called scope). Throws if that class is abstract, if its class
 * constants cannot be updated, or if tokenization fails. */
PHP_METHOD(PhpToken, getAll)
{
	zend_string *code;
	zend_long mode = 0;
	zend_class_entry *called_ce;

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STR(code)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(mode)
	ZEND_PARSE_PARAMETERS_END();

	called_ce = zend_get_called_scope(execute_data);

	/* Validate what object construction would normally validate once up
	 * front, so it does not have to be repeated for every token. */
	if ((called_ce->ce_flags & ZEND_ACC_EXPLICIT_ABSTRACT_CLASS) != 0) {
		zend_throw_error(NULL, "Cannot instantiate abstract class %s", ZSTR_VAL(called_ce->name));
		RETURN_THROWS();
	}

	/* Short-circuit keeps the original order: constants first, then tokenize. */
	if (zend_update_class_constants(called_ce) == FAILURE
			|| !tokenize_common(return_value, code, mode, called_ce)) {
		RETURN_THROWS();
	}
}
PHP_METHOD(PhpToken, __construct)
{
zend_long id;
zend_string *text;
zend_long line = -1;
zend_long pos = -1;
zend_object *obj = Z_OBJ_P(ZEND_THIS);
ZEND_PARSE_PARAMETERS_START(2, 4)
Z_PARAM_LONG(id)
Z_PARAM_STR(text)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(line)
Z_PARAM_LONG(pos)
ZEND_PARSE_PARAMETERS_END();
ZVAL_LONG(OBJ_PROP_NUM(obj, 0), id);
zval_ptr_dtor(OBJ_PROP_NUM(obj, 1));
ZVAL_STR_COPY(OBJ_PROP_NUM(obj, 1), text);
ZVAL_LONG(OBJ_PROP_NUM(obj, 2), line);
ZVAL_LONG(OBJ_PROP_NUM(obj, 3), pos);
}
/* PhpToken::is(int|string|array $kind)
 *
 * - int:    true if $kind equals the token id.
 * - string: true if $kind equals the token text.
 * - array:  true if any element matches by the rules above; elements
 *           must be int or string, otherwise a TypeError is raised.
 * Any other argument type raises a TypeError. Accessing an
 * uninitialized $id / $text propagates the Error thrown by the getters. */
PHP_METHOD(PhpToken, is)
{
	zval *kind;

	ZEND_PARSE_PARAMETERS_START(1, 1)
		Z_PARAM_ZVAL(kind)
	ZEND_PARSE_PARAMETERS_END();

	switch (Z_TYPE_P(kind)) {
		case IS_LONG: {
			zval *own_id = php_token_get_id(ZEND_THIS);
			if (own_id == NULL) {
				RETURN_THROWS();
			}
			RETURN_BOOL(Z_LVAL_P(own_id) == Z_LVAL_P(kind));
		}

		case IS_STRING: {
			zend_string *own_text = php_token_get_text(ZEND_THIS);
			if (own_text == NULL) {
				RETURN_THROWS();
			}
			RETURN_BOOL(zend_string_equals(own_text, Z_STR_P(kind)));
		}

		case IS_ARRAY: {
			/* Both properties are fetched lazily, at most once, and only
			 * when an element of the matching type is encountered. */
			zval *own_id = NULL;
			zend_string *own_text = NULL;
			zval *candidate;

			ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(kind), candidate) {
				ZVAL_DEREF(candidate);

				if (Z_TYPE_P(candidate) == IS_LONG) {
					if (own_id == NULL) {
						own_id = php_token_get_id(ZEND_THIS);
						if (own_id == NULL) {
							RETURN_THROWS();
						}
					}
					if (Z_LVAL_P(own_id) == Z_LVAL_P(candidate)) {
						RETURN_TRUE;
					}
				} else if (Z_TYPE_P(candidate) == IS_STRING) {
					if (own_text == NULL) {
						own_text = php_token_get_text(ZEND_THIS);
						if (own_text == NULL) {
							RETURN_THROWS();
						}
					}
					if (zend_string_equals(own_text, Z_STR_P(candidate))) {
						RETURN_TRUE;
					}
				} else {
					zend_argument_type_error(1, "must only have elements of type string|int, %s given", zend_zval_type_name(candidate));
					RETURN_THROWS();
				}
			} ZEND_HASH_FOREACH_END();

			RETURN_FALSE;
		}

		default:
			zend_argument_type_error(1, "must be of type string|int|array, %s given", zend_zval_type_name(kind));
			RETURN_THROWS();
	}
}
PHP_METHOD(PhpToken, isIgnorable)
{
ZEND_PARSE_PARAMETERS_NONE();
zval *id_zval = php_token_get_id(ZEND_THIS);
if (!id_zval) {
RETURN_THROWS();
}
zend_long id = Z_LVAL_P(id_zval);
RETURN_BOOL(id == T_WHITESPACE || id == T_COMMENT || id == T_DOC_COMMENT || id == T_OPEN_TAG);
}
PHP_METHOD(PhpToken, getTokenName)
{
ZEND_PARSE_PARAMETERS_NONE();
zval *id_zval = php_token_get_id(ZEND_THIS);
if (!id_zval) {
RETURN_THROWS();
}
if (Z_LVAL_P(id_zval) < 256) {
RETURN_INTERNED_STR(ZSTR_CHAR(Z_LVAL_P(id_zval)));
} else {
const char *token_name = get_token_type_name(Z_LVAL_P(id_zval));
if (!token_name) {
RETURN_NULL();
}
RETURN_STRING(token_name);
}
}
PHP_METHOD(PhpToken, __toString)
{
ZEND_PARSE_PARAMETERS_NONE();
zend_string *text = php_token_get_text(ZEND_THIS);
if (!text) {
RETURN_THROWS();
}
RETURN_STR_COPY(text);
}
/* {{{ PHP_MINIT_FUNCTION
*/
PHP_MINIT_FUNCTION(tokenizer)
{
zend_class_entry ce;
zend_string *name;
zval default_val;
ZVAL_UNDEF(&default_val);
tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU);
tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU);
INIT_CLASS_ENTRY(ce, "PhpToken", class_PhpToken_methods);
php_token_ce = zend_register_internal_class(&ce);
zend_class_implements(php_token_ce, 1, zend_ce_stringable);
name = zend_string_init("id", sizeof("id") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
zend_string_release(name);
name = zend_string_init("text", sizeof("text") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_STRING));
zend_string_release(name);
name = zend_string_init("line", sizeof("line") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
zend_string_release(name);
name = zend_string_init("pos", sizeof("pos") - 1, 1);
zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
(zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
zend_string_release(name);
return SUCCESS;
}
/* }}} */
/* {{{ PHP_MINFO_FUNCTION
 *
 * Emits this extension's info table: a single two-column row
 * "Tokenizer Support" => "enabled".
*/
PHP_MINFO_FUNCTION(tokenizer)
{
php_info_print_table_start();
php_info_print_table_row(2, "Tokenizer Support", "enabled");
php_info_print_table_end();
}
/* }}} */
/* Build the zend_string for a token's text.
 *
 * - One-byte texts come from the shared single-character string table.
 * - When `interned_strings` is given, it serves as a per-call
 *   deduplication table: a text seen before yields a new reference to
 *   the existing string, a new text is created and recorded in the table.
 * - Otherwise a fresh string is allocated.
 */
static zend_string *make_str(unsigned char *text, size_t leng, HashTable *interned_strings) {
	zend_string *str;

	if (leng == 1) {
		return ZSTR_CHAR(text[0]);
	}

	if (!interned_strings) {
		return zend_string_init((char *) text, leng, 0);
	}

	str = zend_hash_str_find_ptr(interned_strings, (char *) text, leng);
	if (str) {
		return zend_string_copy(str);
	}

	str = zend_string_init((char *) text, leng, 0);
	zend_hash_add_new_ptr(interned_strings, str, str);
	return str;
}
/* Append one token to the result array `return_value`.
 *
 * The representation depends on the arguments:
 * - token_class != NULL: an object of that class with id, text, line and
 *   pos (offset of `text` from the start of the scanned buffer) filled in;
 * - token_class == NULL and token_type >= 256: the array [id, text, line];
 * - token_class == NULL and token_type < 256: the bare text string.
 */
static void add_token(
		zval *return_value, int token_type, unsigned char *text, size_t leng, int lineno,
		zend_class_entry *token_class, HashTable *interned_strings) {
	zval token;

	if (!token_class) {
		if (token_type < 256) {
			ZVAL_STR(&token, make_str(text, leng, interned_strings));
		} else {
			array_init(&token);
			add_next_index_long(&token, token_type);
			add_next_index_str(&token, make_str(text, leng, interned_strings));
			add_next_index_long(&token, lineno);
		}
	} else {
		zend_object *obj = zend_objects_new(token_class);
		int prop_idx;

		ZVAL_OBJ(&token, obj);
		ZVAL_LONG(OBJ_PROP_NUM(obj, 0), token_type);
		ZVAL_STR(OBJ_PROP_NUM(obj, 1), make_str(text, leng, interned_strings));
		ZVAL_LONG(OBJ_PROP_NUM(obj, 2), lineno);
		ZVAL_LONG(OBJ_PROP_NUM(obj, 3), text - LANG_SCNG(yy_start));

		/* If the class is extended with additional properties, initialize
		 * them from the class defaults as well. */
		if (UNEXPECTED(token_class->default_properties_count > 4)) {
			for (prop_idx = 4; prop_idx < token_class->default_properties_count; prop_idx++) {
				zval *dst = OBJ_PROP_NUM(obj, prop_idx);
				zval *src = &token_class->default_properties_table[prop_idx];
				ZVAL_COPY_PROP(dst, src);
			}
		}
	}

	zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &token);
}
/* Tokenize `source` with the lexer alone (no parser involved).
 *
 * On success `return_value` is initialized to an array with one entry
 * per token (see add_token() for the entry format) and 1 is returned.
 * If the scanner cannot be prepared, 0 is returned and `return_value`
 * is left untouched.
 *
 * After a T_HALT_COMPILER token, the next three tokens that are not
 * whitespace, comments or open tags are collected; everything that
 * follows is emitted as one T_INLINE_HTML token and scanning stops.
 *
 * The lexical state is saved on entry and restored on every exit path.
 */
static zend_bool tokenize(zval *return_value, zend_string *source, zend_class_entry *token_class)
{
	zval source_zval;
	zend_lex_state original_lex_state;
	zval token;
	int token_type;
	int token_line = 1;
	int need_tokens = -1; /* for __halt_compiler lexing. -1 = disabled */
	HashTable interned_strings;

	ZVAL_STR_COPY(&source_zval, source);
	zend_save_lexical_state(&original_lex_state);

	if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) {
		zend_restore_lexical_state(&original_lex_state);
		/* Release the reference taken by ZVAL_STR_COPY above; without this
		 * the failure path leaked it (tokenize_parse() releases it on every
		 * path as well). */
		zval_ptr_dtor_str(&source_zval);
		return 0;
	}

	LANG_SCNG(yy_state) = yycINITIAL;
	/* Per-call table used by make_str() to share identical token texts. */
	zend_hash_init(&interned_strings, 0, NULL, NULL, 0);
	array_init(return_value);

	while ((token_type = lex_scan(&token, NULL))) {
		ZEND_ASSERT(token_type != T_ERROR);

		add_token(
			return_value, token_type, zendtext, zendleng, token_line,
			token_class, &interned_strings);

		/* The semantic value is not needed here; only the raw text is used. */
		if (Z_TYPE(token) != IS_UNDEF) {
			zval_ptr_dtor_nogc(&token);
			ZVAL_UNDEF(&token);
		}

		/* after T_HALT_COMPILER collect the next three non-dropped tokens */
		if (need_tokens != -1) {
			if (token_type != T_WHITESPACE && token_type != T_OPEN_TAG
				&& token_type != T_COMMENT && token_type != T_DOC_COMMENT
				&& --need_tokens == 0
			) {
				/* fetch the rest into a T_INLINE_HTML */
				if (zendcursor < zendlimit) {
					add_token(
						return_value, T_INLINE_HTML, zendcursor, zendlimit - zendcursor,
						token_line, token_class, &interned_strings);
				}
				break;
			}
		} else if (token_type == T_HALT_COMPILER) {
			need_tokens = 3;
		}

		/* Apply a pending line increment before recording the line on
		 * which the next token starts. */
		if (CG(increment_lineno)) {
			CG(zend_lineno)++;
			CG(increment_lineno) = 0;
		}

		token_line = CG(zend_lineno);
	}

	zval_ptr_dtor_str(&source_zval);
	zend_restore_lexical_state(&original_lex_state);
	zend_hash_destroy(&interned_strings);

	return 1;
}
/* Context handed to on_event() through LANG_SCNG(on_event_context)
 * while tokenize_parse() runs the parser. */
struct event_context {
/* Array zval that receives the tokens. */
zval *tokens;
/* Class to instantiate for each token; passed straight to add_token(),
 * where NULL selects the array/string token format. */
zend_class_entry *token_class;
};
/* Decide whether a previously emitted token is the one that parser
 * feedback refers to.
 *
 * `token_zv` is an entry of the token array: an [id, text, line] array,
 * a token object, or a bare string. Returns a pointer to the token's id
 * zval when its text equals the `length` bytes at `text`; returns NULL
 * for bare-string tokens and for tokens whose text differs.
 */
static zval *extract_token_id_to_replace(zval *token_zv, const char *text, size_t length) {
	zval *id_slot, *text_slot;

	ZEND_ASSERT(token_zv);

	switch (Z_TYPE_P(token_zv)) {
		case IS_ARRAY:
			id_slot = zend_hash_index_find(Z_ARRVAL_P(token_zv), 0);
			text_slot = zend_hash_index_find(Z_ARRVAL_P(token_zv), 1);
			break;
		case IS_OBJECT:
			id_slot = OBJ_PROP_NUM(Z_OBJ_P(token_zv), 0);
			text_slot = OBJ_PROP_NUM(Z_OBJ_P(token_zv), 1);
			break;
		default:
			return NULL;
	}

	/* Several tokens are candidates for a given feedback event; the text
	 * comparison identifies the right one. */
	ZEND_ASSERT(Z_TYPE_P(text_slot) == IS_STRING);
	if (Z_STRLEN_P(text_slot) != length) {
		return NULL;
	}
	if (memcmp(Z_STRVAL_P(text_slot), text, length) != 0) {
		return NULL;
	}
	return id_slot;
}
/* Scanner event callback installed by tokenize_parse().
 *
 * - ON_TOKEN:    append the token to the result (END is skipped). Two ids
 *                are rewritten first, based on the scanned length:
 *                ';' longer than one byte becomes T_CLOSE_TAG and a
 *                T_ECHO as long as "<?=" becomes T_OPEN_TAG_WITH_ECHO.
 * - ON_FEEDBACK: overwrite the id of an already emitted token; the tokens
 *                are searched from the newest backwards for one whose
 *                text matches `text`/`length`.
 * - ON_STOP:     if input remains after the scan position, append it as
 *                one T_INLINE_HTML token.
 *
 * `context` must point to a struct event_context.
 */
void on_event(
		zend_php_scanner_event event, int token, int line,
		const char *text, size_t length, void *context)
{
	struct event_context *ctx = context;

	switch (event) {
		case ON_TOKEN:
			if (token != END) {
				/* Special cases */
				if (token == ';' && LANG_SCNG(yy_leng) > 1) {
					/* ?> or ?>\n or ?>\r\n */
					token = T_CLOSE_TAG;
				} else if (token == T_ECHO && LANG_SCNG(yy_leng) == sizeof("<?=") - 1) {
					token = T_OPEN_TAG_WITH_ECHO;
				}

				add_token(
					ctx->tokens, token, (unsigned char *) text, length, line,
					ctx->token_class, NULL);
			}
			break;

		case ON_FEEDBACK: {
			zval *candidate;
			zval *id_slot = NULL;

			/* The affected token is not necessarily the last one, so walk
			 * backwards until a token with matching text is found. */
			ZEND_HASH_REVERSE_FOREACH_VAL(Z_ARRVAL_P(ctx->tokens), candidate) {
				id_slot = extract_token_id_to_replace(candidate, text, length);
				if (id_slot != NULL) {
					break;
				}
			} ZEND_HASH_FOREACH_END();

			ZEND_ASSERT(id_slot);
			ZVAL_LONG(id_slot, token);
			break;
		}

		case ON_STOP:
			if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) {
				add_token(
					ctx->tokens, T_INLINE_HTML, LANG_SCNG(yy_cursor),
					LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor), CG(zend_lineno),
					ctx->token_class, NULL);
			}
			break;
	}
}
/* Tokenize `source` by running the full parser (TOKEN_PARSE mode).
 *
 * Tokens are collected by on_event() while zendparse() runs. When
 * preparing the scanner and parsing both succeed, the collected array is
 * moved into `return_value` and a true value is returned. Otherwise the
 * partial result is destroyed and false is returned.
 *
 * CG(in_compilation) and the lexical state are saved on entry and
 * restored before returning.
 */
static zend_bool tokenize_parse(
		zval *return_value, zend_string *source, zend_class_entry *token_class)
{
	zval source_zval;
	zend_lex_state saved_lex_state;
	zend_bool saved_in_compilation = CG(in_compilation);
	zend_bool parsed;

	ZVAL_STR_COPY(&source_zval, source);

	CG(in_compilation) = 1;
	zend_save_lexical_state(&saved_lex_state);

	parsed = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS);
	if (parsed) {
		zval collected;
		struct event_context ctx;

		array_init(&collected);
		ctx.tokens = &collected;
		ctx.token_class = token_class;

		CG(ast) = NULL;
		CG(ast_arena) = zend_arena_create(1024 * 32);
		LANG_SCNG(yy_state) = yycINITIAL;
		LANG_SCNG(on_event) = on_event;
		LANG_SCNG(on_event_context) = &ctx;

		parsed = (zendparse() == SUCCESS);
		if (parsed) {
			ZVAL_COPY_VALUE(return_value, &collected);
		} else {
			zval_ptr_dtor(&collected);
		}

		/* The AST was only needed to drive the parser; discard it. */
		zend_ast_destroy(CG(ast));
		zend_arena_destroy(CG(ast_arena));
	}

	/* restore compiler and scanner global states */
	zend_restore_lexical_state(&saved_lex_state);
	CG(in_compilation) = saved_in_compilation;

	zval_ptr_dtor_str(&source_zval);

	return parsed;
}
/* Shared entry point of token_get_all() and PhpToken::getAll().
 *
 * With TOKEN_PARSE set in `flags` the source is run through the parser;
 * otherwise only the lexer is used and any pending exception is cleared
 * afterwards. Returns the success indicator of the selected tokenizer.
 */
static zend_bool tokenize_common(
		zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class)
{
	zend_bool ok;

	if ((flags & TOKEN_PARSE) != 0) {
		return tokenize_parse(return_value, source, token_class);
	}

	ok = tokenize(return_value, source, token_class);
	/* Normal token_get_all() should not throw. */
	zend_clear_exception();
	return ok;
}
/* }}} */
/* {{{ proto array token_get_all(string source [, int flags])
*/
PHP_FUNCTION(token_get_all)
{
zend_string *source;
zend_long flags = 0;
ZEND_PARSE_PARAMETERS_START(1, 2)
Z_PARAM_STR(source)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(flags)
ZEND_PARSE_PARAMETERS_END();
if (!tokenize_common(return_value, source, flags, /* token_class */ NULL)) {
RETURN_THROWS();
}
}
/* }}} */
/* {{{ proto string token_name(int type)
*/
PHP_FUNCTION(token_name)
{
zend_long type;
ZEND_PARSE_PARAMETERS_START(1, 1)
Z_PARAM_LONG(type)
ZEND_PARSE_PARAMETERS_END();
const char *token_name = get_token_type_name(type);
if (!token_name) {
token_name = "UNKNOWN";
}
RETURN_STRING(token_name);
}
/* }}} */