Merge branch 'PHP-8.2'

* PHP-8.2:
  Fix GH-11300: license issue: restricted unicode license headers
This commit is contained in:
Niels Dossche 2023-07-01 22:03:08 +02:00
commit b2a54bc6af
5 changed files with 153 additions and 117 deletions

View file

@ -1,32 +1,23 @@
# BIG5.TXT
# Date: 2015-12-02 23:52:00 GMT [KW]
# © 2015 Unicode®, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# #
# Name: BIG5 to Unicode table (complete) # Name: BIG5 to Unicode table (complete)
# Unicode version: 1.1 # Unicode version: 1.1
# Table version: 0.0d3 # Table version: 2.0
# Table format: Format A # Table format: Format A
# Date: 11 February 1994 # Date: 2011 October 14 (header updated: 2015 December 02)
# Authors: Glenn Adams <glenn@metis.com>
# John H. Jenkins <John_Jenkins@taligent.com>
#
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on magnetic media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Recipient is granted the right to make copies in any form for
# internal distribution and to freely use the information supplied
# in the creation of products supporting Unicode. Unicode, Inc.
# specifically excludes the right to re-distribute this file directly
# to third parties or other organizations whether for profit or not.
# #
# General notes: # General notes:
# #
# This table contains the data Metis and Taligent currently have on how #
# BIG5 characters map into Unicode. # This table contains one set of mappings from BIG5 into Unicode.
# Note that these data are *possible* mappings only and may not be the
# same as those used by actual products, nor may they be the best suited
# for all uses. For more information on the mappings between various code
# pages incorporating the repertoire of BIG5 and Unicode, consult the
# VENDORS mapping data.
# #
# WARNING! It is currently impossible to provide round-trip compatibility # WARNING! It is currently impossible to provide round-trip compatibility
# between BIG5 and Unicode. # between BIG5 and Unicode.
@ -63,10 +54,8 @@
# MACRON (Mandarin Chinese first tone) to reflect this semantic. # MACRON (Mandarin Chinese first tone) to reflect this semantic.
# However, because bopomofo uses the absence of a tone mark to indicate # However, because bopomofo uses the absence of a tone mark to indicate
# the first Mandarin tone, most implementations of Big Five represent # the first Mandarin tone, most implementations of Big Five represent
# this character with a blank space, and so a mapping such as U+2003 EM SPACE # this character with a blank space, and so a mapping such as U+2003 EM
# might be preferred. # SPACE might be preferred.
#
#
# #
# Format: Three tab-separated columns # Format: Three tab-separated columns
# Column #1 is the BIG5 code (in hex as 0xXXXX) # Column #1 is the BIG5 code (in hex as 0xXXXX)
@ -82,9 +71,24 @@
# #
# The entries are in BIG5 order # The entries are in BIG5 order
# #
# Any comments or problems, contact <John_Jenkins@taligent.com> # Revision History:
# #
# [v2.0, 2015 December 02]
# updates to copyright notice and terms of use
# no changes to character mappings
# #
# [v1.0, 2011 October 14]
# Updated terms of use to current wording.
# Updated contact information.
# No changes to the mapping data.
#
# [v0.0d3, 11 February 1994]
# First release.
#
# Use the Unicode reporting form <http://www.unicode.org/reporting.html>
# for any questions or comments or to report errors in the data.
#
# Manually added mapping of lower ASCII characters
0x0 0x0 0x0 0x0
0x1 0x1 0x1 0x1
0x2 0x2 0x2 0x2
@ -239,6 +243,7 @@
0xA157 0xFE31 # PRESENTATION FORM FOR VERTICAL EM DASH 0xA157 0xFE31 # PRESENTATION FORM FOR VERTICAL EM DASH
0xA158 0x2014 # EM DASH 0xA158 0x2014 # EM DASH
0xA159 0xFE33 # PRESENTATION FORM FOR VERTICAL LOW LINE 0xA159 0xFE33 # PRESENTATION FORM FOR VERTICAL LOW LINE
0xA15A 0xFFFD # *** NO MAPPING ***
0xA15B 0xFE34 # PRESENTATION FORM FOR VERTICAL WAVY LOW LINE 0xA15B 0xFE34 # PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
0xA15C 0xFE4F # WAVY LOW LINE 0xA15C 0xFE4F # WAVY LOW LINE
0xA15D 0xFF08 # FULLWIDTH LEFT PARENTHESIS 0xA15D 0xFF08 # FULLWIDTH LEFT PARENTHESIS
@ -309,7 +314,9 @@
0xA1C0 0x32A3 # CIRCLED IDEOGRAPH CORRECT 0xA1C0 0x32A3 # CIRCLED IDEOGRAPH CORRECT
0xA1C1 0x2105 # CARE OF 0xA1C1 0x2105 # CARE OF
0xA1C2 0x203E # OVERLINE 0xA1C2 0x203E # OVERLINE
0xA1C3 0xFFFD # *** NO MAPPING ***
0xA1C4 0xFF3F # FULLWIDTH LOW LINE 0xA1C4 0xFF3F # FULLWIDTH LOW LINE
0xA1C5 0xFFFD # *** NO MAPPING ***
0xA1C6 0xFE49 # DASHED OVERLINE 0xA1C6 0xFE49 # DASHED OVERLINE
0xA1C7 0xFE4A # CENTRELINE OVERLINE 0xA1C7 0xFE4A # CENTRELINE OVERLINE
0xA1C8 0xFE4D # DASHED LOW LINE 0xA1C8 0xFE4D # DASHED LOW LINE
@ -366,6 +373,8 @@
0xA1FB 0x2198 # SOUTH EAST ARROW 0xA1FB 0x2198 # SOUTH EAST ARROW
0xA1FC 0x2225 # PARALLEL TO 0xA1FC 0x2225 # PARALLEL TO
0xA1FD 0x2223 # DIVIDES 0xA1FD 0x2223 # DIVIDES
0xA1FE 0xFFFD # *** NO MAPPING ***
0xA240 0xFFFD # *** NO MAPPING ***
0xA241 0xFF0F # FULLWIDTH SOLIDUS 0xA241 0xFF0F # FULLWIDTH SOLIDUS
0xA242 0xFF3C # FULLWIDTH REVERSE SOLIDUS 0xA242 0xFF3C # FULLWIDTH REVERSE SOLIDUS
0xA243 0xFF04 # FULLWIDTH DOLLAR SIGN 0xA243 0xFF04 # FULLWIDTH DOLLAR SIGN
@ -471,7 +480,9 @@
0xA2C9 0x3027 # HANGZHOU NUMERAL SEVEN 0xA2C9 0x3027 # HANGZHOU NUMERAL SEVEN
0xA2CA 0x3028 # HANGZHOU NUMERAL EIGHT 0xA2CA 0x3028 # HANGZHOU NUMERAL EIGHT
0xA2CB 0x3029 # HANGZHOU NUMERAL NINE 0xA2CB 0x3029 # HANGZHOU NUMERAL NINE
0xA2CC 0xFFFD # *** NO MAPPING ***
0xA2CD 0x5344 # <CJK> 0xA2CD 0x5344 # <CJK>
0xA2CE 0xFFFD # *** NO MAPPING ***
0xA2CF 0xFF21 # FULLWIDTH LATIN CAPITAL LETTER A 0xA2CF 0xFF21 # FULLWIDTH LATIN CAPITAL LETTER A
0xA2D0 0xFF22 # FULLWIDTH LATIN CAPITAL LETTER B 0xA2D0 0xFF22 # FULLWIDTH LATIN CAPITAL LETTER B
0xA2D1 0xFF23 # FULLWIDTH LATIN CAPITAL LETTER C 0xA2D1 0xFF23 # FULLWIDTH LATIN CAPITAL LETTER C
@ -13916,7 +13927,7 @@
0xF9D3 0x9F7E # <CJK> 0xF9D3 0x9F7E # <CJK>
0xF9D4 0x9F49 # <CJK> 0xF9D4 0x9F49 # <CJK>
0xF9D5 0x9F98 # <CJK> 0xF9D5 0x9F98 # <CJK>
# The following ETEN extensions are copied from CP950.txt: # The following ETEN extensions are copied from CP950.txt (https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT):
0xF9D6 0x7881 #CJK UNIFIED IDEOGRAPH 0xF9D6 0x7881 #CJK UNIFIED IDEOGRAPH
0xF9D7 0x92B9 #CJK UNIFIED IDEOGRAPH 0xF9D7 0x92B9 #CJK UNIFIED IDEOGRAPH
0xF9D8 0x88CF #CJK UNIFIED IDEOGRAPH 0xF9D8 0x88CF #CJK UNIFIED IDEOGRAPH

View file

@ -1,33 +1,24 @@
# JIS0201.TXT
# Date: 2015-12-02 23:49:00 GMT [KW]
# © 2015 Unicode®, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# #
# Name: JIS X 0201 (1976) to Unicode 1.1 Table # Name: JIS X 0201 (1976) to Unicode 1.1 Table
# Unicode version: 1.1 # Unicode version: 1.1
# Table version: 0.9 # Table version: 2.0
# Table format: Format A # Table format: Format A
# Date: 8 March 1994 # Date: 2011 October 14 (header updated: 2015 December 02)
# Authors: Glenn Adams <glenn@metis.com>
# John H. Jenkins <John_Jenkins@taligent.com>
#
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on magnetic media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Recipient is granted the right to make copies in any form for
# internal distribution and to freely use the information supplied
# in the creation of products supporting Unicode. Unicode, Inc.
# specifically excludes the right to re-distribute this file directly
# to third parties or other organizations whether for profit or not.
# #
# General notes: # General notes:
# #
# This table contains the data the Unicode Consortium has on how #
# single-byte JIS X 0201 characters map into Unicode 1.1 # This table contains one set of mappings from JIS X 0201 into Unicode.
# (ISO/IEC 10646:1-1993 UCS-2). # Note that these data are *possible* mappings only and may not be the
# same as those used by actual products, nor may they be the best suited
# for all uses. For more information on the mappings between various code
# pages incorporating the repertoire of JIS X 0201 and Unicode, consult the
# VENDORS mapping data.
#
# #
# Format: Three tab-separated columns # Format: Three tab-separated columns
# Column #1 is the shift JIS code (in hex as 0xXX) # Column #1 is the shift JIS code (in hex as 0xXX)
@ -36,11 +27,22 @@
# #
# The entries are in JIS order # The entries are in JIS order
# #
# These mappings are provisional, pending definition of # Revision History:
# official mappings by Japanese standards bodies.
# #
# Any comments or problems, contact <John_Jenkins@taligent.com> # [v2.0, 2015 December 02]
# updates to copyright notice and terms of use
# no changes to character mappings
# #
# [v1.0, 2011 October 14]
# Updated terms of use to current wording.
# Updated contact information.
# No changes to the mapping data.
#
# [v0.9, 8 March 1994]
# First release.
#
# Use the Unicode reporting form <http://www.unicode.org/reporting.html>
# for any questions or comments or to report errors in the data.
# #
0x20 0x0020 # SPACE 0x20 0x0020 # SPACE
0x21 0x0021 # EXCLAMATION MARK 0x21 0x0021 # EXCLAMATION MARK

View file

@ -1,32 +1,24 @@
# JIS0212.TXT
# Date: 2015-12-02 23:51:00 GMT [KW]
# © 2015 Unicode®, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# #
# Name: JIS X 0212 (1990) to Unicode # Name: JIS X 0212 (1990) to Unicode
# Unicode version: 1.1 # Unicode version: 1.1
# Table version: 0.9 # Table version: 2.0
# Table format: Format A # Table format: Format A
# Date: 8 March 1994 # Date: 2011 October 14 (header updated: 2015 December 02)
# Authors: Glenn Adams <glenn@metis.com>
# John H. Jenkins <John_Jenkins@taligent.com>
#
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on magnetic media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Recipient is granted the right to make copies in any form for
# internal distribution and to freely use the information supplied
# in the creation of products supporting Unicode. Unicode, Inc.
# specifically excludes the right to re-distribute this file directly
# to third parties or other organizations whether for profit or not.
# #
# General notes: # General notes:
# #
# This table contains the data the Unicode Consortium has on how #
# JIS X 0212 (1983) characters map into Unicode. # This table contains one set of mappings from JIS X 0212 into Unicode.
# Note that these data are *possible* mappings only and may not be the
# same as those used by actual products, nor may they be the best suited
# for all uses. For more information on the mappings between various code
# pages incorporating the repertoire of JIS X 0212 and Unicode, consult the
# VENDORS mapping data.
#
# #
# Format: Three tab-separated columns # Format: Three tab-separated columns
# Column #1 is the JIS X 0212 code (in hex as 0xXXXX) # Column #1 is the JIS X 0212 code (in hex as 0xXXXX)
@ -51,12 +43,6 @@
# the kuten form. For example, 0x2121 -> 0x0101 -> 0101; # the kuten form. For example, 0x2121 -> 0x0101 -> 0101;
# 0x6D63 -> 0x4D43 -> 7767 # 0x6D63 -> 0x4D43 -> 7767
# #
# The kanji mappings are a normative part of ISO/IEC 10646. The
# non-kanji mappings are provisional, pending definition of
# official mappings by Japanese standards bodies
#
# Any comments or problems, contact <John_Jenkins@taligent.com>
#
# Notes: # Notes:
# #
# 1. JIS X 0212 apparently unified the following two symbols # 1. JIS X 0212 apparently unified the following two symbols
@ -72,6 +58,23 @@
# Consequently, in the Unicode mapping, 0x2922 is treated as # Consequently, in the Unicode mapping, 0x2922 is treated as
# LATIN CAPITAL LETTER D WITH STROKE. # LATIN CAPITAL LETTER D WITH STROKE.
# #
# Revision History:
#
# [v2.0, 2015 December 02]
# updates to copyright notice and terms of use
# no changes to character mappings
#
# [v1.0, 2011 October 14]
# Updated terms of use to current wording.
# Updated contact information.
# No changes to the mapping data.
#
# [v0.9, 8 March 1994]
# First release.
#
# Use the Unicode reporting form <http://www.unicode.org/reporting.html>
# for any questions or comments or to report errors in the data.
#
0x222F 0x02D8 # BREVE 0x222F 0x02D8 # BREVE
0x2230 0x02C7 # CARON (Mandarin Chinese third tone) 0x2230 0x02C7 # CARON (Mandarin Chinese third tone)
0x2231 0x00B8 # CEDILLA 0x2231 0x00B8 # CEDILLA

View file

@ -1,11 +1,12 @@
# #
# Name: Unified Hangul (KS X 1001) to Unicode table # Name: Unified Hangul (KS X 1001) to Unicode table
# Unicode version: 2.0 # Unicode version: 2.0
# Table version: 1.0 # Table version: 1.1
# Table format: Format A # Table format: Format A
# Date: 08/16/99 # Date: 2011 October 14
# Authors: Jungshik Shin at jshin@pantheon.yale.edu # Authors: Jungshik Shin at jshin@pantheon.yale.edu
# General notes: none #
# Copyright (c) 1999-2011 Unicode, Inc. All Rights reserved.
# #
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). # This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No # No claims are made as to fitness for any particular purpose. No
@ -15,11 +16,13 @@
# remedy for any claim will be exchange of defective media within 90 # remedy for any claim will be exchange of defective media within 90
# days of receipt. # days of receipt.
# #
# Recipient is granted the right to make copies in any form for # Unicode, Inc. hereby grants the right to freely use the information
# internal distribution and to freely use the information supplied # supplied in this file in the creation of products supporting the
# in the creation of products supporting Unicode. Unicode, Inc. # Unicode Standard, and to make copies of this file in any form for
# specifically excludes the right to re-distribute this file directly # internal or external distribution as long as this notice remains
# to third parties or other organizations whether for profit or not. # attached.
#
# General notes:
# #
# What is enclosed below is the mapping between KS X 1001 (KS C 5601-1987) # What is enclosed below is the mapping between KS X 1001 (KS C 5601-1987)
# and Unicode 2.0. It's automatically generated from KSC5601.TXT # and Unicode 2.0. It's automatically generated from KSC5601.TXT
@ -64,6 +67,19 @@
# the high and low bytes correspond to the row(Hang) and the column(Yol), # the high and low bytes correspond to the row(Hang) and the column(Yol),
# respectively # respectively
# #
# Revision History:
#
# [v1.1, 2011 October 14]
# Updated terms of use to current wording.
# Updated contact information.
# No changes to the mapping data.
#
# [v1.0, 08/16/99]
# First release.
#
# Use the Unicode reporting form <http://www.unicode.org/reporting.html>
# for any questions or comments or to report errors in the data.
#
0x2121 0x3000 # IDEOGRAPHIC SPACE 0x2121 0x3000 # IDEOGRAPHIC SPACE
0x2122 0x3001 # IDEOGRAPHIC COMMA 0x2122 0x3001 # IDEOGRAPHIC COMMA
0x2123 0x3002 # IDEOGRAPHIC FULL STOP 0x2123 0x3002 # IDEOGRAPHIC FULL STOP

View file

@ -34,6 +34,10 @@ function readConversionTable($path, &$from, &$to, $utf32 = false) {
if ($line[0] == '#') if ($line[0] == '#')
continue; continue;
if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) { if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) {
// Skip codepoints that do not have a mapping (e.g. in BIG5.txt)
if ($codepoint === 0xFFFD) {
continue;
}
$codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint); $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
if ($char == PHP_INT_MAX) { if ($char == PHP_INT_MAX) {
// We may be on a 32-bit machine and testing a text encoding with 4-byte codes // We may be on a 32-bit machine and testing a text encoding with 4-byte codes