MFH: Implemented manual scanning for strings/comments, plus misc. fixes

2025-08-16 05:58:45 +02:00 · 2009-05-05 01:35:44 +00:00 · 2009-05-05 01:35:44 +00:00 · 09034cf3f4
commit 09034cf3f4
parent af442c0bde
4 changed files with 386 additions and 311 deletions
--- a/2
+++ b/2
@ -59,6 +59,8 @@ PHP                                                                        NEWS
 - Fixed bug #47038 (Memory leak in include). (Dmitry)
 - Fixed bug #47021 (SoapClient stumbles over WSDL delivered with
  "Transfer-Encoding: chunked"). (Dmitry)
 - Fixed bug #46817 (tokenizer misses last single-line comment (PHP 5.3+, with
  re2c lexer)). (Matt, Shire)
 - Fixed bug #46108 (DateTime - Memory leak when unserializing). (Felipe)
 - Fixed bug #44861 (scrollable cursor don't work with pgsql). (Matteo)
 - Fixed bug #44409 (PDO::FETCH_SERIALIZE calls __construct()). (Matteo)
--- a/Zend/zend_highlight.c
+++ b/Zend/zend_highlight.c
@ -142,14 +142,8 @@ ZEND_API void zend_highlight(zend_syntax_highlighter_ini *syntax_highlighter_ini
 				zend_printf("<span style=\"color: %s\">", last_color);
 			}
 		}
-		switch (token_type) {
+
-			case T_END_HEREDOC:
+		zend_html_puts(LANG_SCNG(yy_text), LANG_SCNG(yy_leng) TSRMLS_CC);
 				zend_html_puts(token.value.str.val, token.value.str.len TSRMLS_CC);
 				break;
 			default:
 				zend_html_puts(LANG_SCNG(yy_text), LANG_SCNG(yy_leng) TSRMLS_CC);
 				break;
 		}
 		if (token.type == IS_STRING) {
 			switch (token_type) {
@ -170,19 +164,6 @@ ZEND_API void zend_highlight(zend_syntax_highlighter_ini *syntax_highlighter_ini
 		token.type = 0;
 	}
 	/* handler for trailing comments, see bug #42767 */
 	if (LANG_SCNG(yy_leng) && LANG_SCNG(yy_text) < LANG_SCNG(yy_limit)) {
 		if (last_color != syntax_highlighter_ini->highlight_comment) {
 			if (last_color != syntax_highlighter_ini->highlight_html) {
 				zend_printf("</span>");
 			}
 			if (syntax_highlighter_ini->highlight_comment != syntax_highlighter_ini->highlight_html) {
 				zend_printf("<span style=\"color: %s\">", syntax_highlighter_ini->highlight_comment);
 			}
 		}
 		zend_html_puts(LANG_SCNG(yy_text), (LANG_SCNG(yy_limit) - LANG_SCNG(yy_text)) TSRMLS_CC);
 	}
 	if (last_color != syntax_highlighter_ini->highlight_html) {
 		zend_printf("</span>\n");
 	}
--- a/Zend/zend_language_scanner.l
+++ b/Zend/zend_language_scanner.l
@ -109,6 +109,12 @@ do {																			\
 	} \
 }
 /* To save initial string length after scanning to first variable, CG(doc_comment_len) can be reused */
 #define SET_DOUBLE_QUOTES_SCANNED_LENGTH(len) CG(doc_comment_len) = (len)
 #define GET_DOUBLE_QUOTES_SCANNED_LENGTH()    CG(doc_comment_len)
 /* Non-zero when c may begin a label: an ASCII letter, an underscore or any
  * value >= 0x7F. c is expanded several times, so the argument should be an
  * expression without side effects. */
 #define IS_LABEL_START(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || (c) == '_' || (c) >= 0x7F)
 /* Non-zero when c is an octal digit (0-7) */
 #define ZEND_IS_OCT(c)  ((c)>='0' && (c)<='7')
 /* Non-zero when c is a hexadecimal digit (0-9, a-f, A-F) */
 #define ZEND_IS_HEX(c)  (((c)>='0' && (c)<='9') || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F'))
@ -835,63 +841,8 @@ LABEL	[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
 WHITESPACE [ \n\r\t]+
 TABS_AND_SPACES [ \t]*
 TOKENS [;:,.\[\]()|^&+-/*=%!~$<>?@]
-ANY_CHAR [^\x00]
+ANY_CHAR [^]
 NEWLINE ("\r"|"\n"|"\r\n")
 NULL [\x00]{1}
 /*
 * LITERAL_DOLLAR matches unescaped $ that aren't followed by a label character
 * or a { and therefore will be taken literally. The case of literal $ before
 * a variable or "${" is handled in a rule for each string type
 */
 DOUBLE_QUOTES_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$"\\{]|("\\"{ANY_CHAR})))
 BACKQUOTE_LITERAL_DOLLAR     ("$"+([^a-zA-Z_\x7f-\xff$`\\{]|("\\"{ANY_CHAR})))
 HEREDOC_LITERAL_DOLLAR       ("$"+([^a-zA-Z_\x7f-\xff$\n\r\\{]|("\\"[^\n\r])))
 /*
 * Usually, HEREDOC_NEWLINE will just function like a simple NEWLINE, but some
 * special cases need to be handled. HEREDOC_CHARS doesn't allow a line to
 * match when { or $, and/or \ is at the end. (("{"*|"$"*)"\\"?) handles that,
 * along with cases where { or $, and/or \ is the ONLY thing on a line
 *
 * The other case is when a line contains a label, followed by ONLY
 * { or $, and/or \  Handled by ({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\"))
 */
 HEREDOC_NEWLINE ((({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\"))|(("{"*|"$"*)"\\"?)){NEWLINE})
 /*
 * This pattern is just used in the next 2 for matching { or literal $, and/or
 * \ escape sequence immediately at the beginning of a line or after a label
 */
 HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR (("{"+[^$\n\r\\{])|("{"*"\\"[^\n\r])|{HEREDOC_LITERAL_DOLLAR})
 /*
 * These 2 label-related patterns allow HEREDOC_CHARS to continue "regular"
 * matching after a newline that starts with either a non-label character or a
 * label that isn't followed by a newline. Like HEREDOC_CHARS, they won't match
 * a variable or "{$"  Matching a newline, and possibly label, up TO a variable
 * or "{$", is handled in the heredoc rules
 *
 * The HEREDOC_LABEL_NO_NEWLINE pattern (";"[^$\n\r\\{]) handles cases where ;
 * follows a label. [^a-zA-Z0-9_\x7f-\xff;$\n\r\\{] is needed to prevent a label
 * character or ; from matching on a possible (real) ending label
 */
 HEREDOC_NON_LABEL ([^a-zA-Z_\x7f-\xff$\n\r\\{]|{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR})
 HEREDOC_LABEL_NO_NEWLINE ({LABEL}([^a-zA-Z0-9_\x7f-\xff;$\n\r\\{]|(";"[^$\n\r\\{])|(";"?{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR})))
 /*
 * CHARS matches everything up to a variable or "{$"
 * {'s are matched as long as they aren't followed by a $
 * The case of { before "{$" is handled in a rule for each string type
 *
 * For heredocs, matching continues across/after newlines if/when it's known
 * that the next line doesn't contain a possible ending label
 */
 DOUBLE_QUOTES_CHARS ("{"*([^$"\\{]|("\\"{ANY_CHAR}))|{DOUBLE_QUOTES_LITERAL_DOLLAR})
 BACKQUOTE_CHARS     ("{"*([^$`\\{]|("\\"{ANY_CHAR}))|{BACKQUOTE_LITERAL_DOLLAR})
 HEREDOC_CHARS       ("{"*([^$\n\r\\{]|("\\"[^\n\r]))|{HEREDOC_LITERAL_DOLLAR}|({HEREDOC_NEWLINE}+({HEREDOC_NON_LABEL}|{HEREDOC_LABEL_NO_NEWLINE})))
 NOWDOC_CHARS        ([^\n\r]|{NEWLINE}+([^a-zA-Z_\x7f-\xff\n\r]|({LABEL}([^a-zA-Z0-9_\x7f-\xff;\n\r]|(";"[^\n\r])))))
 /* compute yyleng before each rule */
 <!*> := yyleng = YYCURSOR - SCNG(yy_text);
@ -1530,6 +1481,14 @@ NOWDOC_CHARS        ([^\n\r]|{NEWLINE}+([^a-zA-Z_\x7f-\xff\n\r]|({LABEL}([^a-zA-
 }
 <INITIAL>"<script"{WHITESPACE}+"language"{WHITESPACE}*"="{WHITESPACE}*("php"|"\"php\""|"'php'"){WHITESPACE}*">" {
 	YYCTYPE *bracket = zend_memrchr(yytext, '<', yyleng - (sizeof("script language=php>") - 1));
 	if (bracket != SCNG(yy_text)) {
 		/* Handle previously scanned HTML, as possible <script> tags found are assumed to not be PHP's */
 		YYCURSOR = bracket;
 		goto inline_html;
 	}
 	HANDLE_NEWLINES(yytext, yyleng);
 	zendlval->value.str.val = yytext; /* no copying - intentional */
 	zendlval->value.str.len = yyleng;
@ -1601,29 +1560,48 @@ NOWDOC_CHARS        ([^\n\r]|{NEWLINE}+([^a-zA-Z_\x7f-\xff\n\r]|({LABEL}([^a-zA-
 }
 <INITIAL>{ANY_CHAR} {
 	if (YYCURSOR > YYLIMIT) {
 		return 0;
 	}
 inline_char_handler:
 	while (1) {
 		YYCTYPE *ptr = memchr(YYCURSOR, '<', YYLIMIT - YYCURSOR);
-		if (ptr == NULL) {
+		YYCURSOR = ptr ? ptr + 1 : YYLIMIT;
 			YYCURSOR = YYLIMIT;
 			yyleng   = YYCURSOR - SCNG(yy_text);
 			break;
-		} else {
+		if (YYCURSOR < YYLIMIT) {
-			YYCURSOR = ptr + 1;
+			switch (*YYCURSOR) {
-
+				case '?':
-			/* stop if it may be an opening tag (<?, <%, <script>). this condition is not optimal though */
+					if (CG(short_tags) || !strncasecmp(YYCURSOR + 1, "php", 3)) { /* Assume [ \t\n\r] follows "php" */
-			if (YYCURSOR < YYLIMIT && (*YYCURSOR == '?' || *YYCURSOR == '%' || *YYCURSOR == 's')) {
+						break;
-				--YYCURSOR;
+					}
-				yyleng = YYCURSOR - SCNG(yy_text);
+					continue;
-				break;
+				case '%':
 					if (CG(asp_tags)) {
 						break;
 					}
 					continue;
 				case 's':
 				case 'S':
 					/* Probably NOT an opening PHP <script> tag, so don't end the HTML chunk yet
 					 * If it is, the PHP <script> tag rule checks for any HTML scanned before it */
 					YYCURSOR--;
 					yymore();
 				default:
 					continue;
 			}
 			YYCURSOR--;
 		}
 		break;
 	}
 inline_html:
 	yyleng = YYCURSOR - SCNG(yy_text);
 #ifdef ZEND_MULTIBYTE
 	if (SCNG(output_filter)) {
 		int readsize;
@ -1688,7 +1666,6 @@ inline_char_handler:
 	/* Invalid rule to return a more explicit parse error with proper line number */
 	yyless(0);
 	yy_pop_state(TSRMLS_C);
 	ZVAL_EMPTY_STRING(zendlval); /* Empty since it won't be used */
 	return T_ENCAPSED_AND_WHITESPACE;
 }
@ -1700,93 +1677,73 @@ inline_char_handler:
 <ST_IN_SCRIPTING>"#"|"//" {
-	BEGIN(ST_ONE_LINE_COMMENT);
+	while (YYCURSOR < YYLIMIT) {
-	yymore();
+		switch (*YYCURSOR++) {
-}
+			case '\r':
 				if (*YYCURSOR == '\n') {
 					YYCURSOR++;
 				}
 				/* fall through */
 			case '\n':
 				CG(zend_lineno)++;
 				break;
 			case '%':
 				if (!CG(asp_tags)) {
 					continue;
 				}
 				/* fall through */
 			case '?':
 				if (*YYCURSOR == '>') {
 					YYCURSOR--;
 					break;
 				}
 				/* fall through */
 			default:
 				continue;
 		}
-<ST_ONE_LINE_COMMENT>"?"|"%"|">" {
+		break;
 	yymore();
 }
 <ST_ONE_LINE_COMMENT>[^\n\r?%>]*{ANY_CHAR} {
 	switch (yytext[yyleng-1]) {
 		case '?': case '%': case '>':
 			yyless(yyleng-1);
 			yymore();
 			break;
 		case '\n':
 			CG(zend_lineno)++;
 			/* intentional fall through */
 		default:
 			zendlval->value.str.val = yytext; /* no copying - intentional */
 			zendlval->value.str.len = yyleng;
 			zendlval->type = IS_STRING;
 			BEGIN(ST_IN_SCRIPTING);
 			return T_COMMENT;
 	}
 }
-<ST_ONE_LINE_COMMENT>{NEWLINE} {
+	yyleng = YYCURSOR - SCNG(yy_text);
-	zendlval->value.str.val = yytext; /* no copying - intentional */
+
 	zendlval->value.str.len = yyleng;
 	zendlval->type = IS_STRING;
 	BEGIN(ST_IN_SCRIPTING);
 	CG(zend_lineno)++;
 	return T_COMMENT;
 }
-<ST_ONE_LINE_COMMENT>"?>"|"%>" {
+<ST_IN_SCRIPTING>"/*"|"/**"{WHITESPACE} {
-	if (CG(asp_tags) || yytext[yyleng-2] != '%') { /* asp comment? */
+	int doc_com;
-		zendlval->value.str.val = yytext; /* no copying - intentional */
+
-		zendlval->value.str.len = yyleng-2;
+	if (yyleng > 2) {
-		zendlval->type = IS_STRING;
+		doc_com = 1;
-		yyless(yyleng - 2);
+		RESET_DOC_COMMENT();
 		BEGIN(ST_IN_SCRIPTING);
 		return T_COMMENT;
 	} else {
-		yymore();
+		doc_com = 0;
 	}
 }
-<ST_IN_SCRIPTING>"/**"{WHITESPACE} {
+	while (YYCURSOR < YYLIMIT) {
-	RESET_DOC_COMMENT();
+		if (*YYCURSOR++ == '*' && *YYCURSOR == '/') {
-	BEGIN(ST_DOC_COMMENT);
+			break;
-	yymore();
+		}
-}
+	}
-<ST_COMMENT,ST_DOC_COMMENT>{NULL} {
+	if (YYCURSOR < YYLIMIT) {
-	zend_error(E_COMPILE_WARNING, "Unterminated comment starting line %d", CG(zend_lineno));
+		YYCURSOR++;
-	return 0;
+	} else {
-}
+		zend_error(E_COMPILE_WARNING, "Unterminated comment starting line %d", CG(zend_lineno));
 	}
-<ST_IN_SCRIPTING>"/*" {
+	yyleng = YYCURSOR - SCNG(yy_text);
 	BEGIN(ST_COMMENT);
 	yymore();
 }
 <ST_COMMENT,ST_DOC_COMMENT>[^*]+ {
 	yymore();
 }
 <ST_DOC_COMMENT>"*/" {
 	CG(doc_comment) = estrndup(yytext, yyleng);
 	CG(doc_comment_len) = yyleng;
 	HANDLE_NEWLINES(yytext, yyleng);
 	BEGIN(ST_IN_SCRIPTING);
 	return T_DOC_COMMENT;
 }
-<ST_COMMENT>"*/" {
+	if (doc_com) {
-	HANDLE_NEWLINES(yytext, yyleng);
+		CG(doc_comment) = estrndup(yytext, yyleng);
-	BEGIN(ST_IN_SCRIPTING);
+		CG(doc_comment_len) = yyleng;
 		return T_DOC_COMMENT;
 	}
 	return T_COMMENT;
 }
 <ST_COMMENT,ST_DOC_COMMENT>"*" {
 	yymore();
 }
 <ST_IN_SCRIPTING>("?>"|"</script"{WHITESPACE}*">"){NEWLINE}? {
 	zendlval->value.str.val = yytext; /* no copying - intentional */
 	zendlval->value.str.len = yyleng;
@ -1810,21 +1767,31 @@ inline_char_handler:
 }
-/* ("{"*|"$"*) handles { or $ at the end of a string (or the entire contents)
+<ST_IN_SCRIPTING>b?['] {
 */
 <ST_IN_SCRIPTING>(b?["]{DOUBLE_QUOTES_CHARS}*("{"*|"$"*)["]) {
 	int bprefix = (yytext[0] != '"') ? 1 : 0;
 	zend_scan_escape_string(zendlval, yytext+bprefix+1, yyleng-bprefix-2, '"' TSRMLS_CC);
 	return T_CONSTANT_ENCAPSED_STRING;
 }
 <ST_IN_SCRIPTING>(b?[']([^'\\]|("\\"{ANY_CHAR}))*[']) {
 	register char *s, *t;
 	char *end;
 	int bprefix = (yytext[0] != '\'') ? 1 : 0;
 	while (1) {
 		if (YYCURSOR < YYLIMIT) {
 			if (*YYCURSOR == '\'') {
 				YYCURSOR++;
 				yyleng = YYCURSOR - SCNG(yy_text);
 				break;
 			} else if (*YYCURSOR++ == '\\' && YYCURSOR < YYLIMIT) {
 				YYCURSOR++;
 			}
 		} else {
 			yyleng = YYLIMIT - SCNG(yy_text);
 			/* Unclosed single quotes; treat similar to double quotes, but without a separate token
 			 * for ' (unrecognized by parser), instead of old flex fallback to "Unexpected character..."
 			 * rule, which continued in ST_IN_SCRIPTING state after the quote */
 			return T_ENCAPSED_AND_WHITESPACE;
 		}
 	}
 	zendlval->value.str.val = estrndup(yytext+bprefix+1, yyleng-bprefix-2);
 	zendlval->value.str.len = yyleng-bprefix-2;
 	zendlval->type = IS_STRING;
@ -1872,6 +1839,42 @@ inline_char_handler:
 /* Opening double quote, optionally prefixed with b. The rest of the literal is
  * scanned by hand. If the closing quote is found before any $ that is followed
  * by a label start character or {, and before any { that is followed by $, the
  * whole literal is returned as T_CONSTANT_ENCAPSED_STRING. Otherwise (also when
  * YYLIMIT is reached first) only the opening quote is returned as a token and
  * the scanner switches to ST_DOUBLE_QUOTES. */
 <ST_IN_SCRIPTING>b?["] {
 	/* 1 when a prefix character precedes the quote, 0 otherwise */
 	int bprefix = (yytext[0] != '"') ? 1 : 0;
 	while (YYCURSOR < YYLIMIT) {
 		switch (*YYCURSOR++) {
 			case '"':
 				/* Closing quote reached first: hand over the text between the quotes */
 				yyleng = YYCURSOR - SCNG(yy_text);
 				zend_scan_escape_string(zendlval, yytext+bprefix+1, yyleng-bprefix-2, '"' TSRMLS_CC);
 				return T_CONSTANT_ENCAPSED_STRING;
 			case '$':
 				/* Stops the scan only when a label start character or { follows */
 				if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') {
 					break;
 				}
 				continue;
 			case '{':
 				/* Stops the scan only when $ follows */
 				if (*YYCURSOR == '$') {
 					break;
 				}
 				continue;
 			case '\\':
 				/* Skip the character after the backslash so it cannot end the scan */
 				if (YYCURSOR < YYLIMIT) {
 					YYCURSOR++;
 				}
 				/* fall through */
 			default:
 				continue;
 		}
 		/* Reached only through a break in the switch: step back onto the $ or {
 		 * that stopped the scan */
 		YYCURSOR--;
 		break;
 	}
 	/* Remember how much was scanned to save rescanning */
 	/* yyleng is not reassigned on this path, so the stored value is the distance
 	 * from the end of the first yyleng characters to the point where the scan stopped */
 	SET_DOUBLE_QUOTES_SCANNED_LENGTH(YYCURSOR - SCNG(yy_text) - yyleng);
 	/* Rewind so this token consumes only its first yyleng characters */
 	YYCURSOR = SCNG(yy_text) + yyleng;
 	BEGIN(ST_DOUBLE_QUOTES);
 	return '"';
 }
@ -1911,7 +1914,7 @@ inline_char_handler:
 	/* Check for ending label on the next line */
 	if (CG(heredoc_len) < YYLIMIT - YYCURSOR && !memcmp(YYCURSOR, s, CG(heredoc_len))) {
-		unsigned char *end = YYCURSOR + CG(heredoc_len);
+		YYCTYPE *end = YYCURSOR + CG(heredoc_len);
 		if (*end == ';') {
 			end++;
@ -1932,49 +1935,6 @@ inline_char_handler:
 }
 /* Match everything up to and including a possible ending label, so if the label
 * doesn't match, it's kept with the rest of the string
 *
 * {HEREDOC_NEWLINE}+ handles the case of more than one newline sequence that
 * couldn't be matched with HEREDOC_CHARS, because of the following label
 */
 <ST_HEREDOC>{HEREDOC_CHARS}*{HEREDOC_NEWLINE}+{LABEL}";"?[\n\r] {
 	/* Last matched character, i.e. the newline that follows the label */
 	char *end = yytext + yyleng - 1;
 	/* Leave an optional ; out of the comparison; afterwards end marks the
 	 * position just past the label candidate */
 	if (end[-1] == ';') {
 		end--;
 		yyleng--;
 	}
 	/* Compare the heredoc_len bytes in front of end with CG(heredoc) */
 	if (yyleng > CG(heredoc_len) && !memcmp(end - CG(heredoc_len), CG(heredoc), CG(heredoc_len))) {
 		int len = yyleng - CG(heredoc_len) - 2; /* 2 for newline before and after label */
 		/* May have matched fooLABEL; make sure there's a newline before it */
 		if (yytext[len] != '\n') {
 			if (yytext[len] != '\r') {
 				yyless(yyleng - 1);
 				yymore();
 			}
 		} else if (len > 0 && yytext[len - 1] == '\r') {
 			len--; /* Windows newline */
 		}
 		/* Go back before label, to match in ST_END_HEREDOC state. yytext will include
 		 * newline before label, for zend_highlight/strip, tokenizer, etc. */
 		yyless(yyleng - CG(heredoc_len) - 1); /* 1 for newline after label */
 		CG(increment_lineno) = 1; /* For newline before label */
 		BEGIN(ST_END_HEREDOC);
 		/* Only the first len characters (text before the newline and label) are passed on */
 		zend_scan_escape_string(zendlval, yytext, len, 0 TSRMLS_CC);
 		return T_ENCAPSED_AND_WHITESPACE;
 	} else {
 		/* Go back to end of label, so the next match works correctly in case of
 		 * a variable or another label at the beginning of the next line */
 		yyless(yyleng - 1);
 		yymore();
 	}
 }
 <ST_END_HEREDOC>{ANY_CHAR} {
 	YYCURSOR += CG(heredoc_len) - 1;
 	yyleng = CG(heredoc_len);
@ -1988,118 +1948,250 @@ inline_char_handler:
 }
-/* Will only match when $ follows: "{$" */
+<ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>"{$" {
 <ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>"{" {
 	zendlval->value.lval = (long) '{';
 	yy_push_state(ST_IN_SCRIPTING TSRMLS_CC);
 	yyless(1);
 	return T_CURLY_OPEN;
 }
 /* A run of DOUBLE_QUOTES_CHARS inside a double quoted string, passed to
  * zend_scan_escape_string() and returned as T_ENCAPSED_AND_WHITESPACE */
 <ST_DOUBLE_QUOTES>{DOUBLE_QUOTES_CHARS}+ {
 	zend_scan_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* "{"{2,}|"$"{2,} handles { before "{$" or literal $ before a variable or "${"
 * (("{"+|"$"+)["]) handles { or $ at the end of a string
 *
 * Same for backquotes and heredocs, except the second case doesn't apply to
 * heredocs. yyless(yyleng - 1) is used to correct taking one character too many
 */
 <ST_DOUBLE_QUOTES>{DOUBLE_QUOTES_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)["])) {
 	/* The pattern takes one character more than belongs to this token: give it back */
 	yyless(yyleng - 1);
 	zend_scan_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* A run of BACKQUOTE_CHARS inside a backquoted string, passed to
  * zend_scan_escape_string() and returned as T_ENCAPSED_AND_WHITESPACE */
 <ST_BACKQUOTE>{BACKQUOTE_CHARS}+ {
 	zend_scan_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 <ST_BACKQUOTE>{BACKQUOTE_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)[`])) {
 	/* The pattern takes one character more than belongs to this token: give it back */
 	yyless(yyleng - 1);
 	zend_scan_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* ({HEREDOC_NEWLINE}+({LABEL}";"?)?)? handles the possible case of newline
 * sequences, possibly followed by a label, that couldn't be matched with
 * HEREDOC_CHARS because of a following variable or "{$"
 *
 * This doesn't affect real ending labels, as they are followed by a newline,
 * which will result in a longer match for the correct rule if present
 */
 <ST_HEREDOC>{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)? {
 	/* 0 is passed where the quoted string rules pass their quote character */
 	zend_scan_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 <ST_HEREDOC>{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)?("{"{2,}|"$"{2,}) {
 	/* The pattern takes one character more than belongs to this token: give it back */
 	yyless(yyleng - 1);
 	zend_scan_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* Nowdoc text up to and including a label candidate followed by a newline. If
  * the candidate equals CG(heredoc), the text before it is returned as
  * T_ENCAPSED_AND_WHITESPACE and the state changes to ST_END_HEREDOC; otherwise
  * the trailing newline is given back and matching goes on. */
 <ST_NOWDOC>({NOWDOC_CHARS}+{NEWLINE}+|{NEWLINE}+){LABEL}";"?[\n\r] {
 	/* Last matched character, i.e. the newline that follows the label */
 	char *end = yytext + yyleng - 1;
 	/* Leave an optional ; out of the comparison; afterwards end marks the
 	 * position just past the label candidate */
 	if (end[-1] == ';') {
 		end--;
 		yyleng--;
 	}
 	/* Compare the heredoc_len bytes in front of end with CG(heredoc) */
 	if (yyleng > CG(heredoc_len) && !memcmp(end - CG(heredoc_len), CG(heredoc), CG(heredoc_len))) {
 		int len = yyleng - CG(heredoc_len) - 2; /* 2 for newline before and after label */
 		/* May have matched fooLABEL; make sure there's a newline before it */
 		if (yytext[len] != '\n') {
 			if (yytext[len] != '\r') {
 				yyless(yyleng - 1);
 				yymore();
 			}
 		} else if (len > 0 && yytext[len - 1] == '\r') {
 			len--; /* Windows newline */
 		}
 		/* Go back before label, to match in ST_END_HEREDOC state. yytext will include
 		 * newline before label, for zend_highlight/strip, tokenizer, etc. */
 		yyless(yyleng - CG(heredoc_len) - 1); /* 1 for newline after label */
 		CG(increment_lineno) = 1; /* For newline before label */
 		BEGIN(ST_END_HEREDOC);
 		/* The first len characters are handed over through zend_copy_value(),
 		 * not through zend_scan_escape_string() as in the heredoc rules */
 		zend_copy_value(zendlval, yytext, len);
 		zendlval->type = IS_STRING;
 		/* NOTE(review): HANDLE_NEWLINES is defined outside this view; presumably it
 		 * accounts for the line breaks inside the returned text - confirm */
 		HANDLE_NEWLINES(yytext, len);
 		return T_ENCAPSED_AND_WHITESPACE;
 	} else {
 		/* Go back to end of label, so the next match works correctly in case of
 		 * another label at the beginning of the next line */
 		yyless(yyleng - 1);
 		yymore();
 	}
 }
 /* Closing double quote: switch back to ST_IN_SCRIPTING and return the quote itself as the token */
 <ST_DOUBLE_QUOTES>["] {
 	BEGIN(ST_IN_SCRIPTING);
 	return '"';
 }
 /* Closing backquote: switch back to ST_IN_SCRIPTING and return the backquote itself as the token */
 <ST_BACKQUOTE>[`] {
 	BEGIN(ST_IN_SCRIPTING);
 	return '`';
 }
-<*>{NULL} { return 0; } /* EOF */
+
 /* Text inside a double quoted string, scanned by hand. Consumes input up to,
  * but not including, the closing quote, a $ followed by a label start
  * character or {, or a { followed by $ (or up to YYLIMIT), and returns it as
  * T_ENCAPSED_AND_WHITESPACE. Returns 0 when no input is left. */
 <ST_DOUBLE_QUOTES>{ANY_CHAR} {
 	/* A length stored with SET_DOUBLE_QUOTES_SCANNED_LENGTH() is reused instead of
 	 * scanning again; 1 is subtracted for the character this pattern already
 	 * matched, and the stored length is cleared so it is used only once */
 	if (GET_DOUBLE_QUOTES_SCANNED_LENGTH()) {
 		YYCURSOR += GET_DOUBLE_QUOTES_SCANNED_LENGTH() - 1;
 		SET_DOUBLE_QUOTES_SCANNED_LENGTH(0);
 		goto double_quotes_scan_done;
 	}
 	/* The single matched character was not in front of YYLIMIT: end of input */
 	if (YYCURSOR > YYLIMIT) {
 		return 0;
 	}
 	/* The matched character is a backslash: also take the character after it */
 	if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) {
 		YYCURSOR++;
 	}
 	while (YYCURSOR < YYLIMIT) {
 		switch (*YYCURSOR++) {
 			case '"':
 				/* Closing quote: stop here */
 				break;
 			case '$':
 				/* Stops the scan only when a label start character or { follows */
 				if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') {
 					break;
 				}
 				continue;
 			case '{':
 				/* Stops the scan only when $ follows */
 				if (*YYCURSOR == '$') {
 					break;
 				}
 				continue;
 			case '\\':
 				/* Skip the character after the backslash so it cannot end the scan */
 				if (YYCURSOR < YYLIMIT) {
 					YYCURSOR++;
 				}
 				/* fall through */
 			default:
 				continue;
 		}
 		/* Reached only through a break in the switch: leave the character that
 		 * stopped the scan out of this token */
 		YYCURSOR--;
 		break;
 	}
 double_quotes_scan_done:
 	yyleng = YYCURSOR - SCNG(yy_text);
 	zend_scan_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* Text inside a backquoted string, scanned by hand. Consumes input up to, but
  * not including, the closing backquote, a $ followed by a label start
  * character or {, or a { followed by $ (or up to YYLIMIT), and returns it as
  * T_ENCAPSED_AND_WHITESPACE. Returns 0 when no input is left. */
 <ST_BACKQUOTE>{ANY_CHAR} {
 	/* The single matched character was not in front of YYLIMIT: end of input */
 	if (YYCURSOR > YYLIMIT) {
 		return 0;
 	}
 	/* The matched character is a backslash: also take the character after it */
 	if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) {
 		YYCURSOR++;
 	}
 	while (YYCURSOR < YYLIMIT) {
 		switch (*YYCURSOR++) {
 			case '`':
 				/* Closing backquote: stop here */
 				break;
 			case '$':
 				/* Stops the scan only when a label start character or { follows */
 				if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') {
 					break;
 				}
 				continue;
 			case '{':
 				/* Stops the scan only when $ follows */
 				if (*YYCURSOR == '$') {
 					break;
 				}
 				continue;
 			case '\\':
 				/* Skip the character after the backslash so it cannot end the scan */
 				if (YYCURSOR < YYLIMIT) {
 					YYCURSOR++;
 				}
 				/* fall through */
 			default:
 				continue;
 		}
 		/* Reached only through a break in the switch: leave the character that
 		 * stopped the scan out of this token */
 		YYCURSOR--;
 		break;
 	}
 	yyleng = YYCURSOR - SCNG(yy_text);
 	zend_scan_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* Heredoc text, scanned by hand. Consumes input up to a $ followed by a label
  * start character or {, a { followed by $, the newline in front of the ending
  * label, or YYLIMIT, and returns it as T_ENCAPSED_AND_WHITESPACE. When the
  * ending label is found the state changes to ST_END_HEREDOC. Returns 0 when no
  * input is left. */
 <ST_HEREDOC>{ANY_CHAR} {
 	/* Length of the newline in front of the ending label (0 when none was found);
 	 * it stays part of yyleng but is left out of the returned text */
 	int newline = 0;
 	/* The single matched character was not in front of YYLIMIT: end of input */
 	if (YYCURSOR > YYLIMIT) {
 		return 0;
 	}
 	/* Step back over the matched character so the loop below examines it as well */
 	YYCURSOR--;
 	while (YYCURSOR < YYLIMIT) {
 		switch (*YYCURSOR++) {
 			case '\r':
 				/* Treat \r\n as a single line break */
 				if (*YYCURSOR == '\n') {
 					YYCURSOR++;
 				}
 				/* fall through */
 			case '\n':
 				/* Check for ending label on the next line */
 				if (IS_LABEL_START(*YYCURSOR) && CG(heredoc_len) < YYLIMIT - YYCURSOR && !memcmp(YYCURSOR, CG(heredoc), CG(heredoc_len))) {
 					YYCTYPE *end = YYCURSOR + CG(heredoc_len);
 					/* An optional ; may directly follow the label */
 					if (*end == ';') {
 						end++;
 					}
 					/* It only counts as the ending label when a line break comes next */
 					if (*end == '\n' || *end == '\r') {
 						/* newline before label will be subtracted from returned text, but
 						 * yyleng/yytext will include it, for zend_highlight/strip, tokenizer, etc. */
 						if (YYCURSOR[-2] == '\r' && YYCURSOR[-1] == '\n') {
 							newline = 2; /* Windows newline */
 						} else {
 							newline = 1;
 						}
 						CG(increment_lineno) = 1; /* For newline before label */
 						BEGIN(ST_END_HEREDOC);
 						goto heredoc_scan_done;
 					}
 				}
 				continue;
 			case '$':
 				/* Stops the scan only when a label start character or { follows */
 				if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') {
 					break;
 				}
 				continue;
 			case '{':
 				/* Stops the scan only when $ follows */
 				if (*YYCURSOR == '$') {
 					break;
 				}
 				continue;
 			case '\\':
 				/* Skip the character after the backslash, except a line break, which
 				 * is left for the next iteration so the ending label check still runs */
 				if (YYCURSOR < YYLIMIT && *YYCURSOR != '\n' && *YYCURSOR != '\r') {
 					YYCURSOR++;
 				}
 				/* fall through */
 			default:
 				continue;
 		}
 		/* Reached only through a break in the switch: leave the $ or { that
 		 * stopped the scan out of this token */
 		YYCURSOR--;
 		break;
 	}
 heredoc_scan_done:
 	yyleng = YYCURSOR - SCNG(yy_text);
 	zend_scan_escape_string(zendlval, yytext, yyleng - newline, 0 TSRMLS_CC);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* Nowdoc text, scanned by hand. Consumes input up to the newline in front of
  * the ending label, or up to YYLIMIT when no ending label is found, and
  * returns it as T_ENCAPSED_AND_WHITESPACE. No $, { or backslash handling is
  * done here. When the ending label is found the state changes to
  * ST_END_HEREDOC. Returns 0 when no input is left. */
 <ST_NOWDOC>{ANY_CHAR} {
 	/* Length of the newline in front of the ending label (0 when none was found);
 	 * it stays part of yyleng but is left out of the returned text */
 	int newline = 0;
 	/* The single matched character was not in front of YYLIMIT: end of input */
 	if (YYCURSOR > YYLIMIT) {
 		return 0;
 	}
 	/* Step back over the matched character so the loop below examines it as well */
 	YYCURSOR--;
 	while (YYCURSOR < YYLIMIT) {
 		switch (*YYCURSOR++) {
 			case '\r':
 				/* Treat \r\n as a single line break */
 				if (*YYCURSOR == '\n') {
 					YYCURSOR++;
 				}
 				/* fall through */
 			case '\n':
 				/* Check for ending label on the next line */
 				if (IS_LABEL_START(*YYCURSOR) && CG(heredoc_len) < YYLIMIT - YYCURSOR && !memcmp(YYCURSOR, CG(heredoc), CG(heredoc_len))) {
 					YYCTYPE *end = YYCURSOR + CG(heredoc_len);
 					/* An optional ; may directly follow the label */
 					if (*end == ';') {
 						end++;
 					}
 					/* It only counts as the ending label when a line break comes next */
 					if (*end == '\n' || *end == '\r') {
 						/* newline before label will be subtracted from returned text, but
 						 * yyleng/yytext will include it, for zend_highlight/strip, tokenizer, etc. */
 						if (YYCURSOR[-2] == '\r' && YYCURSOR[-1] == '\n') {
 							newline = 2; /* Windows newline */
 						} else {
 							newline = 1;
 						}
 						CG(increment_lineno) = 1; /* For newline before label */
 						BEGIN(ST_END_HEREDOC);
 						goto nowdoc_scan_done;
 					}
 				}
 				/* fall through */
 			default:
 				continue;
 		}
 	}
 nowdoc_scan_done:
 	yyleng = YYCURSOR - SCNG(yy_text);
 	/* The text is handed over through zend_copy_value(), not zend_scan_escape_string() */
 	zend_copy_value(zendlval, yytext, yyleng - newline);
 	zendlval->type = IS_STRING;
 	/* NOTE(review): HANDLE_NEWLINES is defined outside this view; presumably it
 	 * accounts for the line breaks inside the returned text - confirm */
 	HANDLE_NEWLINES(yytext, yyleng - newline);
 	return T_ENCAPSED_AND_WHITESPACE;
 }
 /* Catch-all for a single character in ST_IN_SCRIPTING or ST_VAR_OFFSET.
  * Returns 0 at end of input; otherwise raises an E_COMPILE_WARNING naming the
  * character and the current state, then jumps to the restart label (defined
  * outside this view). */
 <ST_IN_SCRIPTING,ST_VAR_OFFSET>{ANY_CHAR} {
 	/* The single matched character was not in front of YYLIMIT: end of input */
 	if (YYCURSOR > YYLIMIT) {
 		return 0;
 	}
 	zend_error(E_COMPILE_WARNING,"Unexpected character in input:  '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE);
 	goto restart;
 }
--- a/ext/standard/tests/strings/highlight_file.phpt
+++ b/ext/standard/tests/strings/highlight_file.phpt
@ -50,7 +50,7 @@ bool(false)
 </span>
 </code>bool(true)
 <code><span style="color: #000000">
-<span style="color: #0000BB">&lt;?php&nbsp;</span><span style="color: #007700">echo&nbsp;</span><span style="color: #FF9900">"test&nbsp;?&gt;</span>
+<span style="color: #0000BB">&lt;?php&nbsp;</span><span style="color: #007700">echo&nbsp;</span><span style="color: #DD0000">"test&nbsp;?&gt;</span>
 </span>
 </code>bool(true)
 <code><span style="color: #000000">