Module:Unicode data: Difference between revisions
From All Skies Encyclopaedia
| imported>Erutuon  (copied from wikt:Module:Unicode data) | imported>Pppery   (Add KCantonese lookup per request) | ||
| (63 intermediate revisions by 6 users not shown) | |||
| Line 1: | Line 1: | ||
| local  | local p = {} | ||
| local floor = math.floor | local floor = math.floor | ||
| local function errorf(level, ...) | |||
| -- http://www.unicode.org/Public/UNIDATA/Jamo.txt | |||
| 	if type(level) == "number" then | |||
| -- For the algorithm used here, see Hangul Syllable Name Generation | |||
| 		return error(string.format(...), level + 1) | |||
| -- in section 3.12 of the Unicode Specification. | |||
| 	else -- level is actually the format string. | |||
| -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | |||
| 		return error(string.format(level, ...), 2) | |||
| local hangul_leads = { | |||
| 	end | |||
| 	[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", | |||
| end | |||
| 	"", "J", "JJ", "C", "K", "T", "P", "H" | |||
| } | |||
| local function binary_range_search(codepoint, ranges) | |||
| local hangul_vowels = { | |||
| 	local low, mid, high | |||
| 	[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", | |||
| 	low, high = 1, ranges.length or require "Module:TableTools".length(ranges) | |||
| 	"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", | |||
| 	while low <= high do | |||
| 	"I" | |||
| 		mid = floor((low + high) / 2) | |||
| } | |||
| 		local range = ranges[mid] | |||
| 		if codepoint < range[1] then | |||
| 			high = mid - 1 | |||
| 		elseif codepoint <= range[2] then | |||
| 			return range, mid | |||
| 		else | |||
| 			low = mid + 1 | |||
| 		end | |||
| 	end | |||
| 	return nil, mid | |||
| end | |||
| p.binary_range_search = binary_range_search | |||
| --[[ | |||
| local hangul_trails = { | |||
| local function linear_range_search(codepoint, ranges) | |||
| 	[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", | |||
| 	for i, range in ipairs(ranges) do | |||
| 	"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", | |||
| 		if range[1] <= codepoint and codepoint <= range[2] then | |||
| 	"T", "P", "H" | |||
| 			return range | |||
| } | |||
| 		end | |||
| 	end | |||
| end | |||
| --]] | |||
| -- Load a module by indexing "loader" with the name of the module minus the | |||
| -- "Module:Unicode data/" part. For instance, loader.blocks returns | |||
| -- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be | |||
| -- returned. | |||
| local loader = setmetatable({}, { | |||
| 	__index = function (self, key) | |||
| 		local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key) | |||
| 		if not success then | |||
| 			data = false | |||
| 		end | |||
| 		self[key] = data | |||
| 		return data | |||
| 	end | |||
| }) | |||
| -- For the algorithm used to generate Hangul Syllable names, | |||
| -- see "Hangul Syllable Name Generation" in section 3.12 of the | |||
| -- Unicode Specification: | |||
| -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | |||
| local name_hooks = { | local name_hooks = { | ||
| 	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters | 	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters | ||
| 	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters | 	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters | ||
| 	{   0x3400,    | 	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A | ||
| 	{   0x4E00,    | 	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph | ||
| 	{   0xAC00,   0xD7A3, function (codepoint) | 	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables | ||
| 		local  | 		local Hangul_data = loader.Hangul | ||
| 		local syllable_index = codepoint - 0xAC00 | |||
| 		-- lead index, vowel index, trail index | |||
| 		local li, vi, ti = m_hangul.syllableIndex2JamoIndices( | |||
| 			codepoint - 0xAC00 | |||
| 		) | |||
| 		return ("HANGUL SYLLABLE %s%s%s"):format( | 		return ("HANGUL SYLLABLE %s%s%s"):format( | ||
| 			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)], | |||
| 			hangul_leads[li], -- I hate one-based indexing | |||
| 			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count) | |||
| 			hangul_vowels[vi], | |||
| 				/ Hangul_data.trail_count)], | |||
| 			hangul_trails[ti]  -- never mind, I can live with it | |||
| 			Hangul_data.trails[syllable_index % Hangul_data.trail_count] | |||
| 		) | 		) | ||
| 	end }, | 	end }, | ||
| 	-- High Surrogates, High Private Use Surrogates, Low Surrogates | |||
| 	{   0xD800,   0xDB7F, "<surrogate-%04X>" }, -- Non Private Use High Surrogate | |||
| 	{    | 	{   0xD800,   0xDFFF, "<surrogate-%04X>" }, | ||
| 	{   0xDC00,   0xDFFF, "<surrogate-%04X>" }, -- Low Surrogate | |||
| 	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use | 	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use | ||
| 	-- CJK Compatibility Ideographs | |||
| 	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut | |||
| 	{   | 	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
| 	{   | 	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
| 	{   | 	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph | ||
| 	{  0x18800,  0x18AFF, function (codepoint) | |||
| 	{  0x2A740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D | |||
| 		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | |||
| 	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E | |||
| 	end }, | |||
| 	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F --add v10 | |||
| 	{   | 	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement | ||
| 	{   | 	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | ||
| 	{  | 	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | ||
| 	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C | |||
| 	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D | |||
| 	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | |||
| 	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | |||
| 	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane) | |||
| 	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | |||
| 	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement | |||
| 		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | |||
| 	end}, | |||
| 	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G | |||
| 	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H | |||
| 	{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I | |||
| 	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | |||
| 	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use | |||
| } | } | ||
| name_hooks.length = #name_hooks | |||
| local name_range_cache | local name_range_cache | ||
| Line 68: | Line 114: | ||
| end | end | ||
| --[[ | |||
| -- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | |||
| -- Checks that the code point is a number and in range. | |||
| function export.lookup_name(codepoint) | |||
| -- Does not check whether code point is an integer. | |||
| 	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are noncharacters: | |||
| -- Not used | |||
| local function check_codepoint(funcName, argIdx, val) | |||
| 	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number') | |||
| 	if codepoint < 0 or 0x10FFFF < codepoint then | |||
| 		errorf("Codepoint %04X out of range", codepoint) | |||
| 	end | |||
| end | |||
| --]] | |||
| function p.is_noncharacter(codepoint) | |||
| 	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned | |||
| 	-- (Cn) and specifically noncharacters: | |||
| 	-- https://www.unicode.org/faq/private_use.html#nonchar4 | 	-- https://www.unicode.org/faq/private_use.html#nonchar4 | ||
| 	return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF | |||
| 			or  | 			or floor(codepoint % 0x10000) >= 0xFFFE) | ||
| end | |||
| -- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | |||
| function p.lookup_name(codepoint) | |||
| 	if p.is_noncharacter(codepoint) then | |||
| 		return ("<noncharacter-%04X>"):format(codepoint) | 		return ("<noncharacter-%04X>"):format(codepoint) | ||
| 	end | 	end | ||
| 	if name_range_cache  | 	if name_range_cache -- Check if previously used "name hook" applies to this code point. | ||
| 			and codepoint >= name_range_cache[1] | |||
| 			and codepoint <= name_range_cache[2] then | |||
| 		return generate_name(name_range_cache[3], codepoint) | |||
| 		end | |||
| 	end | 	end | ||
| 	local range = binary_range_search(codepoint, name_hooks) | |||
| 	if range then | |||
| 		name_range_cache = range | |||
| 			break | |||
| 		return generate_name(range[3], codepoint) | |||
| 		elseif codepoint <= item[2] then | |||
| 			name_range_cache = item | |||
| 			return generate_name(item[3], codepoint) | |||
| 		end | |||
| 	end | 	end | ||
| 	local | 	local data = loader[('names/%03X'):format(codepoint / 0x1000)] | ||
| 		('Module:Unicode data/names/%03X'):format(codepoint / 0x1000)) | |||
| 	if  | 	if data and data[codepoint] then | ||
| 		return data[codepoint] | 		return data[codepoint] | ||
| 	-- Unassigned (Cn)  | 	-- Unassigned (Cn) consists of noncharacters and reserved characters. | ||
| 	--  | 	-- The character has been established not to be a noncharacter, | ||
| 	--  | 	-- and if it were assigned, its name would already have been retrieved, | ||
| 	-- so it must be reserved. | |||
| 	elseif not export.is_assigned(codepoint) then | |||
| 		return ("<reserved-%04X>"):format(codepoint) | |||
| 	else | 	else | ||
| 		return ("<reserved-%04X>"):format(codepoint) | |||
| 		require("Module:debug").track("Unicode data/no name or label") | |||
| 		return ("<U-%04X>"):format(codepoint) -- This point should not be reached. | |||
| 	end | 	end | ||
| end | end | ||
| function  | function p.lookup_image(codepoint) | ||
| 	local | 	local data = loader[('images/%03X'):format(codepoint / 0x1000)] | ||
| 		('Module:Unicode data/images/%03X'):format(codepoint / 0x1000) | |||
| 	) | |||
| 	if  | 	if data then | ||
| 		return data[codepoint] | 		return data[codepoint] | ||
| 	end | 	end | ||
| end | |||
| function export.template_lookup_name(frame) | |||
| 	local codepoint = tonumber(frame.args[1] or frame:getParent().args[1]) | |||
| 	local name = export.lookup_name(codepoint) | |||
| 	return name:gsub("<", "<") | |||
| end | end | ||
| Line 129: | Line 179: | ||
| 	[ 1] = "Supplementary Multilingual Plane"; | 	[ 1] = "Supplementary Multilingual Plane"; | ||
| 	[ 2] = "Supplementary Ideographic Plane"; | 	[ 2] = "Supplementary Ideographic Plane"; | ||
| 	[ | 	[ 3] = "Tertiary Ideographic Plane"; | ||
| 	[14] = "Supplementary  | 	[14] = "Supplementary Special-purpose Plane"; | ||
| 	[15] = "Supplementary Private Use Area- | 	[15] = "Supplementary Private Use Area-A"; | ||
| 	[16] = "Supplementary Private Use Area-B"; | |||
| } | } | ||
| -- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable. | |||
| -- http://www.unicode.org/Public/UNIDATA/Blocks.txt | |||
| local blocks | |||
| -- This should be kept synchronized with [[Module:category tree/scriptcatboiler/blocks]]. | |||
| local blocks = { | |||
| 	{ "Basic Latin",                                     0x000000, 0x00007F }, | |||
| 	{ "Latin-1 Supplement",                              0x000080, 0x0000FF }, | |||
| 	{ "Latin Extended-A",                                0x000100, 0x00017F }, | |||
| 	{ "Latin Extended-B",                                0x000180, 0x00024F }, | |||
| 	{ "IPA Extensions",                                  0x000250, 0x0002AF }, | |||
| 	{ "Spacing Modifier Letters",                        0x0002B0, 0x0002FF }, | |||
| 	{ "Combining Diacritical Marks",                     0x000300, 0x00036F }, | |||
| 	{ "Greek and Coptic",                                0x000370, 0x0003FF }, | |||
| 	{ "Cyrillic",                                        0x000400, 0x0004FF }, | |||
| 	{ "Cyrillic Supplement",                             0x000500, 0x00052F }, | |||
| 	{ "Armenian",                                        0x000530, 0x00058F }, | |||
| 	{ "Hebrew",                                          0x000590, 0x0005FF }, | |||
| 	{ "Arabic",                                          0x000600, 0x0006FF }, | |||
| 	{ "Syriac",                                          0x000700, 0x00074F }, | |||
| 	{ "Arabic Supplement",                               0x000750, 0x00077F }, | |||
| 	{ "Thaana",                                          0x000780, 0x0007BF }, | |||
| 	{ "NKo",                                             0x0007C0, 0x0007FF }, | |||
| 	{ "Samaritan",                                       0x000800, 0x00083F }, | |||
| 	{ "Mandaic",                                         0x000840, 0x00085F }, | |||
| 	{ "Syriac Supplement",                               0x000860, 0x00086F }, | |||
| 	{ "Arabic Extended-A",                               0x0008A0, 0x0008FF }, | |||
| 	{ "Devanagari",                                      0x000900, 0x00097F }, | |||
| 	{ "Bengali",                                         0x000980, 0x0009FF }, | |||
| 	{ "Gurmukhi",                                        0x000A00, 0x000A7F }, | |||
| 	{ "Gujarati",                                        0x000A80, 0x000AFF }, | |||
| 	{ "Oriya",                                           0x000B00, 0x000B7F }, | |||
| 	{ "Tamil",                                           0x000B80, 0x000BFF }, | |||
| 	{ "Telugu",                                          0x000C00, 0x000C7F }, | |||
| 	{ "Kannada",                                         0x000C80, 0x000CFF }, | |||
| 	{ "Malayalam",                                       0x000D00, 0x000D7F }, | |||
| 	{ "Sinhala",                                         0x000D80, 0x000DFF }, | |||
| 	{ "Thai",                                            0x000E00, 0x000E7F }, | |||
| 	{ "Lao",                                             0x000E80, 0x000EFF }, | |||
| 	{ "Tibetan",                                         0x000F00, 0x000FFF }, | |||
| 	{ "Myanmar",                                         0x001000, 0x00109F }, | |||
| 	{ "Georgian",                                        0x0010A0, 0x0010FF }, | |||
| 	{ "Hangul Jamo",                                     0x001100, 0x0011FF }, | |||
| 	{ "Ethiopic",                                        0x001200, 0x00137F }, | |||
| 	{ "Ethiopic Supplement",                             0x001380, 0x00139F }, | |||
| 	{ "Cherokee",                                        0x0013A0, 0x0013FF }, | |||
| 	{ "Unified Canadian Aboriginal Syllabics",           0x001400, 0x00167F }, | |||
| 	{ "Ogham",                                           0x001680, 0x00169F }, | |||
| 	{ "Runic",                                           0x0016A0, 0x0016FF }, | |||
| 	{ "Tagalog",                                         0x001700, 0x00171F }, | |||
| 	{ "Hanunoo",                                         0x001720, 0x00173F }, | |||
| 	{ "Buhid",                                           0x001740, 0x00175F }, | |||
| 	{ "Tagbanwa",                                        0x001760, 0x00177F }, | |||
| 	{ "Khmer",                                           0x001780, 0x0017FF }, | |||
| 	{ "Mongolian",                                       0x001800, 0x0018AF }, | |||
| 	{ "Unified Canadian Aboriginal Syllabics Extended",  0x0018B0, 0x0018FF }, | |||
| 	{ "Limbu",                                           0x001900, 0x00194F }, | |||
| 	{ "Tai Le",                                          0x001950, 0x00197F }, | |||
| 	{ "New Tai Lue",                                     0x001980, 0x0019DF }, | |||
| 	{ "Khmer Symbols",                                   0x0019E0, 0x0019FF }, | |||
| 	{ "Buginese",                                        0x001A00, 0x001A1F }, | |||
| 	{ "Tai Tham",                                        0x001A20, 0x001AAF }, | |||
| 	{ "Combining Diacritical Marks Extended",            0x001AB0, 0x001AFF }, | |||
| 	{ "Balinese",                                        0x001B00, 0x001B7F }, | |||
| 	{ "Sundanese",                                       0x001B80, 0x001BBF }, | |||
| 	{ "Batak",                                           0x001BC0, 0x001BFF }, | |||
| 	{ "Lepcha",                                          0x001C00, 0x001C4F }, | |||
| 	{ "Ol Chiki",                                        0x001C50, 0x001C7F }, | |||
| 	{ "Cyrillic Extended-C",                             0x001C80, 0x001C8F }, | |||
| 	{ "Georgian Extended",                               0x001C90, 0x001CBF }, | |||
| 	{ "Sundanese Supplement",                            0x001CC0, 0x001CCF }, | |||
| 	{ "Vedic Extensions",                                0x001CD0, 0x001CFF }, | |||
| 	{ "Phonetic Extensions",                             0x001D00, 0x001D7F }, | |||
| 	{ "Phonetic Extensions Supplement",                  0x001D80, 0x001DBF }, | |||
| 	{ "Combining Diacritical Marks Supplement",          0x001DC0, 0x001DFF }, | |||
| 	{ "Latin Extended Additional",                       0x001E00, 0x001EFF }, | |||
| 	{ "Greek Extended",                                  0x001F00, 0x001FFF }, | |||
| 	{ "General Punctuation",                             0x002000, 0x00206F }, | |||
| 	{ "Superscripts and Subscripts",                     0x002070, 0x00209F }, | |||
| 	{ "Currency Symbols",                                0x0020A0, 0x0020CF }, | |||
| 	{ "Combining Diacritical Marks for Symbols",         0x0020D0, 0x0020FF }, | |||
| 	{ "Letterlike Symbols",                              0x002100, 0x00214F }, | |||
| 	{ "Number Forms",                                    0x002150, 0x00218F }, | |||
| 	{ "Arrows",                                          0x002190, 0x0021FF }, | |||
| 	{ "Mathematical Operators",                          0x002200, 0x0022FF }, | |||
| 	{ "Miscellaneous Technical",                         0x002300, 0x0023FF }, | |||
| 	{ "Control Pictures",                                0x002400, 0x00243F }, | |||
| 	{ "Optical Character Recognition",                   0x002440, 0x00245F }, | |||
| 	{ "Enclosed Alphanumerics",                          0x002460, 0x0024FF }, | |||
| 	{ "Box Drawing",                                     0x002500, 0x00257F }, | |||
| 	{ "Block Elements",                                  0x002580, 0x00259F }, | |||
| 	{ "Geometric Shapes",                                0x0025A0, 0x0025FF }, | |||
| 	{ "Miscellaneous Symbols",                           0x002600, 0x0026FF }, | |||
| 	{ "Dingbats",                                        0x002700, 0x0027BF }, | |||
| 	{ "Miscellaneous Mathematical Symbols-A",            0x0027C0, 0x0027EF }, | |||
| 	{ "Supplemental Arrows-A",                           0x0027F0, 0x0027FF }, | |||
| 	{ "Braille Patterns",                                0x002800, 0x0028FF }, | |||
| 	{ "Supplemental Arrows-B",                           0x002900, 0x00297F }, | |||
| 	{ "Miscellaneous Mathematical Symbols-B",            0x002980, 0x0029FF }, | |||
| 	{ "Supplemental Mathematical Operators",             0x002A00, 0x002AFF }, | |||
| 	{ "Miscellaneous Symbols and Arrows",                0x002B00, 0x002BFF }, | |||
| 	{ "Glagolitic",                                      0x002C00, 0x002C5F }, | |||
| 	{ "Latin Extended-C",                                0x002C60, 0x002C7F }, | |||
| 	{ "Coptic",                                          0x002C80, 0x002CFF }, | |||
| 	{ "Georgian Supplement",                             0x002D00, 0x002D2F }, | |||
| 	{ "Tifinagh",                                        0x002D30, 0x002D7F }, | |||
| 	{ "Ethiopic Extended",                               0x002D80, 0x002DDF }, | |||
| 	{ "Cyrillic Extended-A",                             0x002DE0, 0x002DFF }, | |||
| 	{ "Supplemental Punctuation",                        0x002E00, 0x002E7F }, | |||
| 	{ "CJK Radicals Supplement",                         0x002E80, 0x002EFF }, | |||
| 	{ "Kangxi Radicals",                                 0x002F00, 0x002FDF }, | |||
| 	{ "Ideographic Description Characters",              0x002FF0, 0x002FFF }, | |||
| 	{ "CJK Symbols and Punctuation",                     0x003000, 0x00303F }, | |||
| 	{ "Hiragana",                                        0x003040, 0x00309F }, | |||
| 	{ "Katakana",                                        0x0030A0, 0x0030FF }, | |||
| 	{ "Bopomofo",                                        0x003100, 0x00312F }, | |||
| 	{ "Hangul Compatibility Jamo",                       0x003130, 0x00318F }, | |||
| 	{ "Kanbun",                                          0x003190, 0x00319F }, | |||
| 	{ "Bopomofo Extended",                               0x0031A0, 0x0031BF }, | |||
| 	{ "CJK Strokes",                                     0x0031C0, 0x0031EF }, | |||
| 	{ "Katakana Phonetic Extensions",                    0x0031F0, 0x0031FF }, | |||
| 	{ "Enclosed CJK Letters and Months",                 0x003200, 0x0032FF }, | |||
| 	{ "CJK Compatibility",                               0x003300, 0x0033FF }, | |||
| 	{ "CJK Unified Ideographs Extension A",              0x003400, 0x004DBF }, | |||
| 	{ "Yijing Hexagram Symbols",                         0x004DC0, 0x004DFF }, | |||
| 	{ "CJK Unified Ideographs",                          0x004E00, 0x009FFF }, | |||
| 	{ "Yi Syllables",                                    0x00A000, 0x00A48F }, | |||
| 	{ "Yi Radicals",                                     0x00A490, 0x00A4CF }, | |||
| 	{ "Lisu",                                            0x00A4D0, 0x00A4FF }, | |||
| 	{ "Vai",                                             0x00A500, 0x00A63F }, | |||
| 	{ "Cyrillic Extended-B",                             0x00A640, 0x00A69F }, | |||
| 	{ "Bamum",                                           0x00A6A0, 0x00A6FF }, | |||
| 	{ "Modifier Tone Letters",                           0x00A700, 0x00A71F }, | |||
| 	{ "Latin Extended-D",                                0x00A720, 0x00A7FF }, | |||
| 	{ "Syloti Nagri",                                    0x00A800, 0x00A82F }, | |||
| 	{ "Common Indic Number Forms",                       0x00A830, 0x00A83F }, | |||
| 	{ "Phags-pa",                                        0x00A840, 0x00A87F }, | |||
| 	{ "Saurashtra",                                      0x00A880, 0x00A8DF }, | |||
| 	{ "Devanagari Extended",                             0x00A8E0, 0x00A8FF }, | |||
| 	{ "Kayah Li",                                        0x00A900, 0x00A92F }, | |||
| 	{ "Rejang",                                          0x00A930, 0x00A95F }, | |||
| 	{ "Hangul Jamo Extended-A",                          0x00A960, 0x00A97F }, | |||
| 	{ "Javanese",                                        0x00A980, 0x00A9DF }, | |||
| 	{ "Myanmar Extended-B",                              0x00A9E0, 0x00A9FF }, | |||
| 	{ "Cham",                                            0x00AA00, 0x00AA5F }, | |||
| 	{ "Myanmar Extended-A",                              0x00AA60, 0x00AA7F }, | |||
| 	{ "Tai Viet",                                        0x00AA80, 0x00AADF }, | |||
| 	{ "Meetei Mayek Extensions",                         0x00AAE0, 0x00AAFF }, | |||
| 	{ "Ethiopic Extended-A",                             0x00AB00, 0x00AB2F }, | |||
| 	{ "Latin Extended-E",                                0x00AB30, 0x00AB6F }, | |||
| 	{ "Cherokee Supplement",                             0x00AB70, 0x00ABBF }, | |||
| 	{ "Meetei Mayek",                                    0x00ABC0, 0x00ABFF }, | |||
| 	{ "Hangul Syllables",                                0x00AC00, 0x00D7AF }, | |||
| 	{ "Hangul Jamo Extended-B",                          0x00D7B0, 0x00D7FF }, | |||
| 	{ "High Surrogates",                                 0x00D800, 0x00DB7F }, | |||
| 	{ "High Private Use Surrogates",                     0x00DB80, 0x00DBFF }, | |||
| 	{ "Low Surrogates",                                  0x00DC00, 0x00DFFF }, | |||
| 	{ "Private Use Area",                                0x00E000, 0x00F8FF }, | |||
| 	{ "CJK Compatibility Ideographs",                    0x00F900, 0x00FAFF }, | |||
| 	{ "Alphabetic Presentation Forms",                   0x00FB00, 0x00FB4F }, | |||
| 	{ "Arabic Presentation Forms-A",                     0x00FB50, 0x00FDFF }, | |||
| 	{ "Variation Selectors",                             0x00FE00, 0x00FE0F }, | |||
| 	{ "Vertical Forms",                                  0x00FE10, 0x00FE1F }, | |||
| 	{ "Combining Half Marks",                            0x00FE20, 0x00FE2F }, | |||
| 	{ "CJK Compatibility Forms",                         0x00FE30, 0x00FE4F }, | |||
| 	{ "Small Form Variants",                             0x00FE50, 0x00FE6F }, | |||
| 	{ "Arabic Presentation Forms-B",                     0x00FE70, 0x00FEFF }, | |||
| 	{ "Halfwidth and Fullwidth Forms",                   0x00FF00, 0x00FFEF }, | |||
| 	{ "Specials",                                        0x00FFF0, 0x00FFFF }, | |||
| 	{ "Linear B Syllabary",                              0x010000, 0x01007F }, | |||
| 	{ "Linear B Ideograms",                              0x010080, 0x0100FF }, | |||
| 	{ "Aegean Numbers",                                  0x010100, 0x01013F }, | |||
| 	{ "Ancient Greek Numbers",                           0x010140, 0x01018F }, | |||
| 	{ "Ancient Symbols",                                 0x010190, 0x0101CF }, | |||
| 	{ "Phaistos Disc",                                   0x0101D0, 0x0101FF }, | |||
| 	{ "Lycian",                                          0x010280, 0x01029F }, | |||
| 	{ "Carian",                                          0x0102A0, 0x0102DF }, | |||
| 	{ "Coptic Epact Numbers",                            0x0102E0, 0x0102FF }, | |||
| 	{ "Old Italic",                                      0x010300, 0x01032F }, | |||
| 	{ "Gothic",                                          0x010330, 0x01034F }, | |||
| 	{ "Old Permic",                                      0x010350, 0x01037F }, | |||
| 	{ "Ugaritic",                                        0x010380, 0x01039F }, | |||
| 	{ "Old Persian",                                     0x0103A0, 0x0103DF }, | |||
| 	{ "Deseret",                                         0x010400, 0x01044F }, | |||
| 	{ "Shavian",                                         0x010450, 0x01047F }, | |||
| 	{ "Osmanya",                                         0x010480, 0x0104AF }, | |||
| 	{ "Osage",                                           0x0104B0, 0x0104FF }, | |||
| 	{ "Elbasan",                                         0x010500, 0x01052F }, | |||
| 	{ "Caucasian Albanian",                              0x010530, 0x01056F }, | |||
| 	{ "Linear A",                                        0x010600, 0x01077F }, | |||
| 	{ "Cypriot Syllabary",                               0x010800, 0x01083F }, | |||
| 	{ "Imperial Aramaic",                                0x010840, 0x01085F }, | |||
| 	{ "Palmyrene",                                       0x010860, 0x01087F }, | |||
| 	{ "Nabataean",                                       0x010880, 0x0108AF }, | |||
| 	{ "Hatran",                                          0x0108E0, 0x0108FF }, | |||
| 	{ "Phoenician",                                      0x010900, 0x01091F }, | |||
| 	{ "Lydian",                                          0x010920, 0x01093F }, | |||
| 	{ "Meroitic Hieroglyphs",                            0x010980, 0x01099F }, | |||
| 	{ "Meroitic Cursive",                                0x0109A0, 0x0109FF }, | |||
| 	{ "Kharoshthi",                                      0x010A00, 0x010A5F }, | |||
| 	{ "Old South Arabian",                               0x010A60, 0x010A7F }, | |||
| 	{ "Old North Arabian",                               0x010A80, 0x010A9F }, | |||
| 	{ "Manichaean",                                      0x010AC0, 0x010AFF }, | |||
| 	{ "Avestan",                                         0x010B00, 0x010B3F }, | |||
| 	{ "Inscriptional Parthian",                          0x010B40, 0x010B5F }, | |||
| 	{ "Inscriptional Pahlavi",                           0x010B60, 0x010B7F }, | |||
| 	{ "Psalter Pahlavi",                                 0x010B80, 0x010BAF }, | |||
| 	{ "Old Turkic",                                      0x010C00, 0x010C4F }, | |||
| 	{ "Old Hungarian",                                   0x010C80, 0x010CFF }, | |||
| 	{ "Hanifi Rohingya",                                 0x010D00, 0x010D3F }, | |||
| 	{ "Rumi Numeral Symbols",                            0x010E60, 0x010E7F }, | |||
| 	{ "Old Sogdian",                                     0x010F00, 0x010F2F }, | |||
| 	{ "Sogdian",                                         0x010F30, 0x010F6F }, | |||
| 	{ "Brahmi",                                          0x011000, 0x01107F }, | |||
| 	{ "Kaithi",                                          0x011080, 0x0110CF }, | |||
| 	{ "Sora Sompeng",                                    0x0110D0, 0x0110FF }, | |||
| 	{ "Chakma",                                          0x011100, 0x01114F }, | |||
| 	{ "Mahajani",                                        0x011150, 0x01117F }, | |||
| 	{ "Sharada",                                         0x011180, 0x0111DF }, | |||
| 	{ "Sinhala Archaic Numbers",                         0x0111E0, 0x0111FF }, | |||
| 	{ "Khojki",                                          0x011200, 0x01124F }, | |||
| 	{ "Multani",                                         0x011280, 0x0112AF }, | |||
| 	{ "Khudawadi",                                       0x0112B0, 0x0112FF }, | |||
| 	{ "Grantha",                                         0x011300, 0x01137F }, | |||
| 	{ "Newa",                                            0x011400, 0x01147F }, | |||
| 	{ "Tirhuta",                                         0x011480, 0x0114DF }, | |||
| 	{ "Siddham",                                         0x011580, 0x0115FF }, | |||
| 	{ "Modi",                                            0x011600, 0x01165F }, | |||
| 	{ "Mongolian Supplement",                            0x011660, 0x01167F }, | |||
| 	{ "Takri",                                           0x011680, 0x0116CF }, | |||
| 	{ "Ahom",                                            0x011700, 0x01173F }, | |||
| 	{ "Dogra",                                           0x011800, 0x01184F }, | |||
| 	{ "Warang Citi",                                     0x0118A0, 0x0118FF }, | |||
| 	{ "Zanabazar Square",                                0x011A00, 0x011A4F }, | |||
| 	{ "Soyombo",                                         0x011A50, 0x011AAF }, | |||
| 	{ "Pau Cin Hau",                                     0x011AC0, 0x011AFF }, | |||
| 	{ "Bhaiksuki",                                       0x011C00, 0x011C6F }, | |||
| 	{ "Marchen",                                         0x011C70, 0x011CBF }, | |||
| 	{ "Masaram Gondi",                                   0x011D00, 0x011D5F }, | |||
| 	{ "Gunjala Gondi",                                   0x011D60, 0x011DAF }, | |||
| 	{ "Makasar",                                         0x011EE0, 0x011EFF }, | |||
| 	{ "Cuneiform",                                       0x012000, 0x0123FF }, | |||
| 	{ "Cuneiform Numbers and Punctuation",               0x012400, 0x01247F }, | |||
| 	{ "Early Dynastic Cuneiform",                        0x012480, 0x01254F }, | |||
| 	{ "Egyptian Hieroglyphs",                            0x013000, 0x01342F }, | |||
| 	{ "Anatolian Hieroglyphs",                           0x014400, 0x01467F }, | |||
| 	{ "Bamum Supplement",                                0x016800, 0x016A3F }, | |||
| 	{ "Mro",                                             0x016A40, 0x016A6F }, | |||
| 	{ "Bassa Vah",                                       0x016AD0, 0x016AFF }, | |||
| 	{ "Pahawh Hmong",                                    0x016B00, 0x016B8F }, | |||
| 	{ "Medefaidrin",                                     0x016E40, 0x016E9F }, | |||
| 	{ "Miao",                                            0x016F00, 0x016F9F }, | |||
| 	{ "Ideographic Symbols and Punctuation",             0x016FE0, 0x016FFF }, | |||
| 	{ "Tangut",                                          0x017000, 0x0187FF }, | |||
| 	{ "Tangut Components",                               0x018800, 0x018AFF }, | |||
| 	{ "Kana Supplement",                                 0x01B000, 0x01B0FF }, | |||
| 	{ "Kana Extended-A",                                 0x01B100, 0x01B12F }, | |||
| 	{ "Nushu",                                           0x01B170, 0x01B2FF }, | |||
| 	{ "Duployan",                                        0x01BC00, 0x01BC9F }, | |||
| 	{ "Shorthand Format Controls",                       0x01BCA0, 0x01BCAF }, | |||
| 	{ "Byzantine Musical Symbols",                       0x01D000, 0x01D0FF }, | |||
| 	{ "Musical Symbols",                                 0x01D100, 0x01D1FF }, | |||
| 	{ "Ancient Greek Musical Notation",                  0x01D200, 0x01D24F }, | |||
| 	{ "Mayan Numerals",                                  0x01D2E0, 0x01D2FF }, | |||
| 	{ "Tai Xuan Jing Symbols",                           0x01D300, 0x01D35F }, | |||
| 	{ "Counting Rod Numerals",                           0x01D360, 0x01D37F }, | |||
| 	{ "Mathematical Alphanumeric Symbols",               0x01D400, 0x01D7FF }, | |||
| 	{ "Sutton SignWriting",                              0x01D800, 0x01DAAF }, | |||
| 	{ "Glagolitic Supplement",                           0x01E000, 0x01E02F }, | |||
| 	{ "Mende Kikakui",                                   0x01E800, 0x01E8DF }, | |||
| 	{ "Adlam",                                           0x01E900, 0x01E95F }, | |||
| 	{ "Indic Siyaq Numbers",                             0x01EC70, 0x01ECBF }, | |||
| 	{ "Arabic Mathematical Alphabetic Symbols",          0x01EE00, 0x01EEFF }, | |||
| 	{ "Mahjong Tiles",                                   0x01F000, 0x01F02F }, | |||
| 	{ "Domino Tiles",                                    0x01F030, 0x01F09F }, | |||
| 	{ "Playing Cards",                                   0x01F0A0, 0x01F0FF }, | |||
| 	{ "Enclosed Alphanumeric Supplement",                0x01F100, 0x01F1FF }, | |||
| 	{ "Enclosed Ideographic Supplement",                 0x01F200, 0x01F2FF }, | |||
| 	{ "Miscellaneous Symbols and Pictographs",           0x01F300, 0x01F5FF }, | |||
| 	{ "Emoticons",                                       0x01F600, 0x01F64F }, | |||
| 	{ "Ornamental Dingbats",                             0x01F650, 0x01F67F }, | |||
| 	{ "Transport and Map Symbols",                       0x01F680, 0x01F6FF }, | |||
| 	{ "Alchemical Symbols",                              0x01F700, 0x01F77F }, | |||
| 	{ "Geometric Shapes Extended",                       0x01F780, 0x01F7FF }, | |||
| 	{ "Supplemental Arrows-C",                           0x01F800, 0x01F8FF }, | |||
| 	{ "Supplemental Symbols and Pictographs",            0x01F900, 0x01F9FF }, | |||
| 	{ "Chess Symbols",                                   0x01FA00, 0x01FA6F }, | |||
| 	{ "CJK Unified Ideographs Extension B",              0x020000, 0x02A6DF }, | |||
| 	{ "CJK Unified Ideographs Extension C",              0x02A700, 0x02B73F }, | |||
| 	{ "CJK Unified Ideographs Extension D",              0x02B740, 0x02B81F }, | |||
| 	{ "CJK Unified Ideographs Extension E",              0x02B820, 0x02CEAF }, | |||
| 	{ "CJK Unified Ideographs Extension F",              0x02CEB0, 0x02EBEF }, | |||
| 	{ "CJK Compatibility Ideographs Supplement",         0x02F800, 0x02FA1F }, | |||
| 	{ "Tags",                                            0x0E0000, 0x0E007F }, | |||
| 	{ "Variation Selectors Supplement",                  0x0E0100, 0x0E01EF }, | |||
| 	{ "Supplementary Private Use Area-A",                0x0F0000, 0x0FFFFF }, | |||
| 	{ "Supplementary Private Use Area-B",                0x100000, 0x10FFFF }, | |||
| } | |||
| blocks.length = #blocks | |||
| function  | local function block_iter(blocks, i) | ||
| 	i = i + 1 | |||
| 	return function (blocks, i) | |||
| 	local data = blocks[i] | |||
| 		i = i + 1 | |||
| 	if data then | |||
| 		 -- Unpack doesn't work on tables loaded with mw.loadData. | |||
| 		if not data then | |||
| 		return i, data[1], data[2], data[3] | |||
| 	end | |||
| 		return i, unpack(data) | |||
| 	end, blocks, 0 | |||
| end | end | ||
| -- An ipairs-type iterator generator for the list of blocks. | |||
| function export.lookup_plane(codepoint) | |||
| function p.enum_blocks() | |||
| 	local blocks = loader.blocks | |||
| 	return block_iter, blocks, 0 | |||
| end | |||
| function p.lookup_plane(codepoint) | |||
| 	local i = floor(codepoint / 0x10000) | 	local i = floor(codepoint / 0x10000) | ||
| 	return planes[i] or ("Plane %u"):format(i) | 	return planes[i] or ("Plane %u"):format(i) | ||
| end | end | ||
| function p.lookup_block(codepoint) | |||
| -- Binary search, to avoid iterating over entire table in order to look up the | |||
| 	local blocks = loader.blocks | |||
| -- higher codepoints. | |||
| 	local range = binary_range_search(codepoint, blocks) | |||
| 	if range then | |||
| 	local iStart, iEnd = 1, blocks.length or #blocks | |||
| 		return range[3] | |||
| 	while iStart <= iEnd do | |||
| 	else | |||
| 		local iMid = floor((iStart + iEnd) / 2) | |||
| 		return "No Block" | |||
| 		local range = blocks[iMid] | |||
| 		if codepoint < range[2] then | |||
| 			iEnd = iMid - 1 | |||
| 		elseif codepoint <= range[3] then | |||
| 			return range[1] | |||
| 		else | |||
| 			iStart = iMid + 1 | |||
| 		end | |||
| 	end | 	end | ||
| 	error(string.format("No block found for codepoint U+%04X.", codepoint)) | |||
| end | end | ||
| function  | function p.get_block_info(name) | ||
| 	for i, block in ipairs(loader.blocks) do | |||
| 	local range | |||
| 		if block[3] == name then | |||
| 			return block | |||
| 	for i, block in ipairs(blocks) do | |||
| 		if block[1] == name then | |||
| 			range = block | |||
| 		end | 		end | ||
| 	end | |||
| 	if range then | |||
| 		return range[2], range[3] | |||
| 	end | 	end | ||
| end | end | ||
| function  | function p.is_valid_pagename(pagename) | ||
| 	local has_nonws = false | 	local has_nonws = false | ||
| Line 496: | Line 243: | ||
| 		end | 		end | ||
| 		local printable, result =  | 		local printable, result = p.is_printable(cp) | ||
| 		if not printable then | 		if not printable then | ||
| 			return false | 			return false | ||
| Line 510: | Line 257: | ||
| local function manual_unpack(what, from) | local function manual_unpack(what, from) | ||
| 	if what[from + 1] == nil then | |||
| 		return what[from] | |||
| 	end | |||
| 	local result = {} | 	local result = {} | ||
| 	from = from or 1 | 	from = from or 1 | ||
| Line 520: | Line 271: | ||
| end | end | ||
| local function  | local function compare_ranges(range1, range2) | ||
| 	return range1[1] < range2[1] | |||
| end | |||
| -- Creates a function to look up data in a module that contains "singles" (a | |||
| -- code point-to-data map) and "ranges" (an array containing arrays that contain | |||
| -- the low and high code points of a range and the data associated with that | |||
| -- range). | |||
| -- "loader" loads and returns the "singles" and "ranges" tables. | |||
| -- "match_func" is passed the code point and either the data or the "dots", and | |||
| -- generates the final result of the function. | |||
| -- The varargs ("dots") describes the default data to be returned if there wasn't | |||
| -- a match. | |||
| -- In case the function is used more than once, "cache" saves ranges that have | |||
| -- already been found to match, or a range whose data is the default if there | |||
| -- was no match. | |||
| local function memo_lookup(data_module_subpage, match_func, ...) | |||
| 	local dots = { ... } | 	local dots = { ... } | ||
| 	local cache = {} | 	local cache = {} | ||
| Line 527: | Line 294: | ||
| 	return function (codepoint) | 	return function (codepoint) | ||
| 		if not singles then | 		if not singles then | ||
| 			local data_module = loader[data_module_subpage] | |||
| 			singles, ranges = data_module.singles, data_module.ranges | |||
| 		end | 		end | ||
| Line 534: | Line 302: | ||
| 		end | 		end | ||
| 		local range = binary_range_search(codepoint, cache) | |||
| 		local lastlast = -1 | |||
| 		if range then | |||
| 			return match_func(codepoint, manual_unpack(range, 3)) | |||
| 				return match_func(codepoint, unpack(range, 3)) | |||
| 			end | |||
| 		end | 		end | ||
| 		local range, index = binary_range_search(codepoint, ranges) | |||
| 		if range then | |||
| 			table.insert(cache, range) | |||
| 			table.sort(cache, compare_ranges) | |||
| 				return match_func(codepoint, unpack(dots)) | |||
| 			return match_func(codepoint, manual_unpack(range, 3)) | |||
| 		end | |||
| 				table.insert(cache, { manual_unpack(range) }) | |||
| 				return match_func(codepoint, manual_unpack(range, 3)) | |||
| 		if ranges[index] then | |||
| 			else | |||
| 			local dots_range | |||
| 				lastlast = range[2] | |||
| 			if codepoint > ranges[index][2] then | |||
| 				dots_range = { | |||
| 					ranges[index][2] + 1, | |||
| 					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF, | |||
| 					unpack(dots) | |||
| 				} | |||
| 			else -- codepoint < range[index][1] | |||
| 				dots_range = { | |||
| 					ranges[index - 1] and ranges[index - 1][2] + 1 or 0, | |||
| 					ranges[index][1] - 1, | |||
| 					unpack(dots) | |||
| 				} | |||
| 			end | 			end | ||
| 			table.sort(cache, compare_ranges) | |||
| 		end | 		end | ||
| 		return match_func(codepoint) | 		return match_func(codepoint) | ||
| 	end | 	end | ||
| end | end | ||
| -- Get a  | -- Get a code point's combining class value in [[Module:Unicode data/combining]], | ||
| -- and return whether this value is not zero. Zero is assigned as the default | -- and return whether this value is not zero. Zero is assigned as the default | ||
| -- if the combining class value is not found in this data module. | -- if the combining class value is not found in this data module. | ||
| -- That is, return true if character is combining, or false if it is not. | -- That is, return true if character is combining, or false if it is not. | ||
| -- See  | -- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for | ||
| -- more information. | -- more information. | ||
| p.is_combining = memo_lookup( | |||
| 	"combining", | |||
| 	local m_comb = mw.loadData('Module:Unicode data/combining') | |||
| 	function (codepoint, combining_class) | |||
| 	return m_comb.single, m_comb.ranges | |||
| 		return combining_class and combining_class ~= 0 or false | |||
| 	end, | |||
| 	return combining_class and combining_class ~= 0 | |||
| 	0) | |||
| 		or false | |||
| end, 0) | |||
| function  | function p.add_dotted_circle(str) | ||
| 	return (mw.ustring.gsub(str, ".", | 	return (mw.ustring.gsub(str, ".", | ||
| 		function(char) | 		function(char) | ||
| 			if  | 			if p.is_combining(mw.ustring.codepoint(char)) then | ||
| 				return '◌' .. char | 				return '◌' .. char | ||
| 			end | 			end | ||
| Line 580: | Line 358: | ||
| end | end | ||
| local lookup_control = memo_lookup( | local lookup_control = memo_lookup( | ||
| 	"control", | |||
| 	local m_cc = mw.loadData('Module:Unicode data/control') | |||
| 	function (codepoint, ccc) | |||
| 	return m_cc.single, m_cc.ranges | |||
| 		return ccc or "assigned" | |||
| end, function (codepoint, ccc) | |||
| 	end, | |||
| 	return ccc or "assigned" | |||
| 	"assigned") | |||
| p.lookup_control = lookup_control | |||
| function  | function p.is_assigned(codepoint) | ||
| 	return lookup_control(codepoint) ~= "unassigned" | 	return lookup_control(codepoint) ~= "unassigned" | ||
| end | end | ||
| function  | function p.is_printable(codepoint) | ||
| 	local result = lookup_control(codepoint) | 	local result = lookup_control(codepoint) | ||
| 	return (result == "assigned") or (result == "space-separator"), result | 	return (result == "assigned") or (result == "space-separator"), result | ||
| end | end | ||
| function  | function p.is_whitespace(codepoint) | ||
| 	local result = lookup_control(codepoint) | 	local result = lookup_control(codepoint) | ||
| 	return (result == "space-separator"), result | 	return (result == "space-separator"), result | ||
| end | end | ||
| p.lookup_category = memo_lookup( | |||
| -- to be used in language-neutral context only (e.g. character lists) | |||
| 	"category", | |||
| 	function (codepoint, category) | |||
| 		return category | |||
| 	end, | |||
| 	"Cn") | |||
| local lookup_script = memo_lookup( | |||
| local script_pats | |||
| 	"scripts", | |||
| 	function (codepoint, script_code) | |||
| 		return script_code or 'Zzzz' | |||
| 	end, | |||
| 	"Zzzz") | |||
| p.lookup_script = lookup_script | |||
| function p.get_best_script(str) | |||
| -- Scripts that consist entirely of characters from another script. | |||
| 	-- Check type of argument, because mw.text.decode coerces numbers to strings! | |||
| local script_blacklist = { | |||
| 	require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
| 	["Latf"]		= true; | |||
| 	["Hans"]		= true; | |||
| 	-- Convert HTML character references (including named character references, | |||
| 	["Hant"]		= true; | |||
| 	-- or character entities) to characters. | |||
| 	["Kore"]		= true; | |||
| 	str = mw.text.decode(str, true) | |||
| 	["Jpan"]		= true; | |||
| 	["fa-Arab"] 	= true; | |||
| 	local scripts = {} | |||
| 	["kk-Arab"] 	= true; | |||
| 	for codepoint in mw.ustring.gcodepoint(str) do | |||
| 	["ks-Arab"] 	= true; | |||
| 		local script = lookup_script(codepoint) | |||
| 	["ku-Arab"]		= true; | |||
| 	["mzn-Arab"]	= true; | |||
| 		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts. | |||
| 	["ota-Arab"]	= true; | |||
| 		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then | |||
| 	["pa-Arab"]		= true; | |||
| 			scripts[script] = true | |||
| 	["sd-Arab"]		= true; | |||
| 	["tt-Arab"]		= true; | |||
| 	["ug-Arab"]		= true; | |||
| 	["ur-Arab"]		= true; | |||
| 	["nv-Latn"]		= true; | |||
| 	["pjt-Latn"]	= true; | |||
| 	["Zyyy"]		= true; | |||
| } | |||
| --[[ | |||
| 	Problem scripts: Grek and polytonic, Cyrl and Cyrs, Latn and Latinx. | |||
| 	In each key-value pair, the value should take precedence over the key. | |||
| ]] | |||
| local overridden_by = { | |||
| 	["Cyrs"] = "Cyrl", | |||
| 	["polytonic"] = "Grek", | |||
| 	["Latinx"] = "Latn", | |||
| } | |||
| local script_cache = {} | |||
| function export.get_script(codepoint) | |||
| 	local text | |||
| 	if type(codepoint) == "number" then | |||
| 		text = mw.ustring.char(codepoint) | |||
| 	elseif type(codepoint) == "string" then | |||
| 		text = codepoint | |||
| 	else | |||
| 		error("Argument to get_script should be a number (codepoint) or string.") | |||
| 	end | |||
| 	for pat, sc in pairs(script_cache) do | |||
| 		if mw.ustring.match(text, pat) and not overridden_by[sc] then | |||
| 			return sc | |||
| 		end | 		end | ||
| 	end | 	end | ||
| 	-- If scripts does not contain two or more keys, | |||
| 	-- return first and only key (script code) in table. | |||
| 	if not next(scripts, next(scripts)) then | |||
| 		return next(scripts) | |||
| 	end -- else return majority script, or else "Zzzz"? | |||
| end | |||
| function p.is_Latin(str) | |||
| 	if not script_pats then | |||
| 	require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
| 		local m_scripts = mw.loadData("Module:scripts/data") | |||
| 	str = mw.text.decode(str, true) | |||
| 		script_pats = {} | |||
| 		for sc, info in pairs(m_scripts) do | |||
| 	-- Search for the leading bytes that introduce the UTF-8 encoding of the | |||
| 			if info.characters and not script_blacklist[sc] then | |||
| 	-- code points U+0340-U+10FFFF. If they are not found and there is at least | |||
| 				script_pats[sc] = "[" .. info.characters .. "]" | |||
| 	-- one Latin-script character, the string counts as Latin, because the rest | |||
| 	-- of the characters can only be Zyyy, Zinh, and Zzzz. | |||
| 	-- The only scripts found below U+0370 (the first code point of the Greek | |||
| 	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz. | |||
| 	-- See the codepage in the [[UTF-8]] article. | |||
| 	if not str:find "[\205-\244]" then | |||
| 		for codepoint in mw.ustring.gcodepoint(str) do | |||
| 			if lookup_script(codepoint) == "Latn" then | |||
| 				return true | |||
| 			end | 			end | ||
| 		end | 		end | ||
| 	end | 	end | ||
| 	local Latn = false | |||
| 	for sc, pat in pairs(script_pats) do | |||
| 	local i = 0;																-- indexer for use in error messages | |||
| 		if mw.ustring.match(text, pat) then | |||
| 			local overriding = overridden_by[sc] | |||
| 	for codepoint in mw.ustring.gcodepoint(str) do | |||
| 			if overriding and script_pats[overriding] and mw.ustring.match(text, script_pats[overriding]) then | |||
| 		i = i + 1;																-- bump the indexer | |||
| 				script_cache[script_pats[overriding]] = overriding | |||
| 		local script = lookup_script(codepoint) | |||
| 				return overriding | |||
| 		if script == "Latn" then | |||
| 				script_cache[pat] = sc | |||
| 			Latn = true | |||
| 		elseif not (script == "Zyyy" or script == "Zinh" | |||
| 			end | |||
| 				or script == "Zzzz") then | |||
| 			return false, i														-- abandon as not Latn; identify the offending character's position | |||
| 		end | 		end | ||
| 	end | 	end | ||
	return Latn, (not Latn and i) or nil										-- when <Latn> false, return offending character's position as second return value; nil else
| 	return "None" | |||
| end | end | ||
| -- Checks that a string contains only characters belonging to right-to-left | |||
| local function sortRange(range1, range2) | |||
| -- scripts, or characters of ignorable scripts. | |||
| 	return range1[1] < range2[1] | |||
| function p.is_rtl(str) | |||
| end | |||
| 	require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
| 	str = mw.text.decode(str, true) | |||
| --[[ | |||
| 	Binary search: more efficient for the longer lists of codepoint ranges than | |||
| 	-- Search for the leading bytes that introduce the UTF-8 encoding of the | |||
| 	for the shorter ones. | |||
| 	-- code points U+0580-U+10FFFF. If they are not found, the string can only | |||
| ]] | |||
| 	-- have characters from a left-to-right script, because the first code point | |||
| local function binary_search(ranges, value) | |||
| 	-- in a right-to-left script is U+0591, in the Hebrew block. | |||
| 	if not ranges then | |||
| 	if not str:find "[\214-\244]" then | |||
| 		return nil | |||
| 		return false | |||
| 	end | 	end | ||
| 	--	Initialize numbers. | |||
| 	local  | 	local result = false | ||
| 	local rtl = loader.scripts.rtl | |||
| 	-- Can't use # because table is loaded by mw.loadData. | |||
| 	for codepoint in mw.ustring.gcodepoint(str) do | |||
| 	local iEnd = ranges.length or require("Module:table").size(ranges) | |||
| 		local script = lookup_script(codepoint) | |||
| 	if iEnd == 0 then | |||
| 		if rtl[script] then | |||
| 		return nil | |||
| 			result = true | |||
| 		elseif not (script == "Zyyy" or script == "Zinh" | |||
| 				or script == "Zzzz") then | |||
| 			return false | |||
| 		end | |||
| 	end | 	end | ||
| 	return result | |||
| end | |||
| 	local iterations = 0 | |||
| --[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------ | |||
| 	-- Do search. | |||
| 	while iStart <= iEnd do | |||
| 		iterations = iterations + 1 | |||
| external entry from an {{#invoke:}} to determine if a string of text is rtl.  Strips html and html-like tags so | |||
| 		-- Calculate middle. | |||
| that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text | |||
| 		iMid = floor((iStart + iEnd) / 2) | |||
| has <br /> tags. | |||
| ]] | |||
| 		-- Get compare value. | |||
| 		local range = ranges[iMid] | |||
| function p.is_rtl_frame (frame) | |||
| 		if range[1] > value then | |||
| 	local str = frame.args[1];													-- get the string from the {{#invoke:}} frame | |||
| 			iEnd = iMid - 1 | |||
| 	str = str:gsub ('%b<>', '');												-- strip any html and html-like tags | |||
| 	return p.is_rtl (str);														-- return if whatever remains rtl; false else | |||
| end | |||
| 		-- Return matching index. Assumes there are no duplicates. | |||
| 		elseif value <= range[2] then | |||
| 			return range | |||
| local function get_codepoint(args, arg) | |||
| 		-- Keep searching. | |||
| 	local codepoint_string = args[arg] | |||
| 		else | |||
| 		or errorf(2, "Parameter %s is required", tostring(arg)) | |||
| 			iStart = iMid + 1 | |||
| 	local codepoint = tonumber(codepoint_string, 16) | |||
| 		end | |||
| 		or errorf(2, "Parameter %s is not a code point in hexadecimal base", | |||
| 			tostring(arg)) | |||
| 	if not (0 <= codepoint and codepoint <= 0x10FFFF) then | |||
| 		errorf(2, "code point in parameter %s out of range", tostring(arg)) | |||
| 	end | 	end | ||
| 	return  | 	return codepoint | ||
| end | end | ||
| local function  | local function get_func(args, arg, prefix) | ||
| 	local suffix = args[arg] | |||
| 	for i, range in ipairs(ranges) do | |||
| 		or errorf(2, "Parameter %s is required", tostring(arg)) | |||
| 		if number < range[1] then | |||
| 	suffix = mw.text.trim(suffix) | |||
| 			return nil | |||
| 	local func_name = prefix .. suffix | |||
| 		elseif number <= range[2] then | |||
| 	local func = p[func_name] | |||
| 			return range[3] | |||
| 		or errorf(2, "There is no function '%s'", func_name) | |||
| 		end | |||
| 	return func | |||
| 	end | |||
| end | end | ||
| -- This function allows any of the "lookup" functions to be invoked. The first | |||
| -- Save previously used codepoint ranges in case another character is in the | |||
| -- parameter is the word after "lookup_"; the second parameter is the code point | |||
| -- same range. | |||
| -- in hexadecimal base. | |||
| local ranges_cache = {} | |||
| function p.lookup(frame) | |||
| 	local func = get_func(frame.args, 1, "lookup_") | |||
| --[=[ | |||
| 	local codepoint = get_codepoint(frame.args, 2) | |||
| 	Takes a codepoint or a character and finds the script code (if any) that is | |||
| 	local result = func(codepoint) | |||
| 	appropriate for it based on the codepoint, using the data module | |||
| 	if func == p.lookup_name then | |||
| 	[[Module:Unicode data/scripts]]. The data module was generated from the | |||
| 		-- Prevent code point labels such as <control-0000> from being | |||
| 	patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]]. | |||
| 		-- interpreted as HTML tags. | |||
		result = result:gsub("<", "&lt;")
| 	Converts the character to a codepoint. Returns a script code if the codepoint | |||
| 	is in the list of individual characters, or if it is in one of the defined | |||
| 	ranges in the 4096-character block that it belongs to, else returns "None". | |||
| ]=] | |||
| function export.char_to_script(char) | |||
| 	local lookup = mw.loadData("Module:Unicode data/scripts") | |||
| 	local t = type(char) | |||
| 	local codepoint | |||
| 	if t == "string" then | |||
| 		local etc | |||
| 		codepoint, etc = mw.ustring.codepoint(char) | |||
| 		if etc then | |||
| 			error("Argument to char_to_script should be a single character.") | |||
| 		end | |||
| 	elseif t == "number" then | |||
| 		codepoint = char | |||
| 	else | |||
| 		error("Argument to char_to_script should be a string or a number, but its type is " .. t .. ".") | |||
| 	end | 	end | ||
| 	return result | |||
| 	local individual_match = lookup.individual[codepoint] | |||
| 	if individual_match then | |||
| 		return individual_match | |||
| 	else | |||
| 		local script = look_up_in_order(codepoint, ranges_cache) | |||
| 		if script then | |||
| 			return script | |||
| 		end | |||
| 		local index = floor(codepoint / 0x1000) | |||
| 		script = look_up_in_order(index, lookup.blocks) | |||
| 		if script then | |||
| 			return script | |||
| 		end | |||
| 		local range = binary_search(lookup[index], codepoint) | |||
| 		if range then | |||
| 			table.insert(ranges_cache, range) | |||
| 			table.sort(ranges_cache, sortRange) | |||
| 			return range[3] | |||
| 		end | |||
| 	end | |||
| 	return "None" | |||
| end | end | ||
| function  | function p.is(frame) | ||
| 	local  | 	local func = get_func(frame.args, 1, "is_") | ||
| 	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do | |||
| 		local script = export.char_to_script(character) | |||
| 		scripts[script] = (scripts[script] or 0) + 1 | |||
| 	end | |||
| 	-- is_Latin and is_valid_pagename take strings. | |||
| 	local best_script | |||
| 	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then | |||
| 	local greatest_count = 0 | |||
| 		return (func(frame.args[2])) | |||
| 	for script, count in pairs(scripts) do | |||
| 	else -- The rest take code points. | |||
| 		if count > greatest_count then | |||
| 		local codepoint = get_codepoint(frame.args, 2) | |||
| 			best_script = script | |||
| 		return (func(codepoint)) -- Adjust to one result. | |||
| 			greatest_count = count | |||
| 		end | |||
| 	end | 	end | ||
| 	return best_script | |||
| end | end | ||
| function p.lookup_kCantonese(codepoint) | |||
| local unsupported_title = { | |||
| 	local data = loader[('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))] | |||
| 	[0x0020] = "Unsupported titles/Space"; | |||
| 	if data then | |||
| 	[0x0023] = "Unsupported titles/Number sign"; | |||
| 		return data[codepoint] | |||
| 	[0x002E] = "Unsupported titles/Full stop"; | |||
| 	[0x003A] = "Unsupported titles/Colon"; | |||
| 	[0x003C] = "Unsupported titles/Less than"; | |||
| 	[0x003E] = "Unsupported titles/Greater than"; | |||
| 	[0x005B] = "Unsupported titles/Left square bracket"; | |||
| 	[0x005D] = "Unsupported titles/Right square bracket"; | |||
| 	[0x005F] = "Unsupported titles/Low line"; | |||
| 	[0x007B] = "Unsupported titles/Left curly bracket"; | |||
| 	[0x007C] = "Unsupported titles/Vertical line"; | |||
| 	[0x007D] = "Unsupported titles/Right curly bracket"; | |||
| 	[0x1680] = "Unsupported titles/Ogham space"; | |||
| 	[0xFFFD] = "Unsupported titles/Replacement character"; | |||
| } | |||
| function export.get_entry_title(codepoint) | |||
| 	if unsupported_title[codepoint] then | |||
| 		return unsupported_title[codepoint] | |||
| 	end | |||
| 	if lookup_control(codepoint) ~= "assigned" then | |||
| 		return nil | |||
| 	end | 	end | ||
| 	return mw.ustring.char(codepoint) | |||
| end | end | ||
| return  | return p | ||
Latest revision as of 23:05, 13 January 2025
Documentation for this module may be created at Module:Unicode data/doc
local p = {}
local floor = math.floor
-- Raises an error whose message is built with string.format.
-- Called as errorf(level, fmt, ...), the error is raised at stack level
-- "level" counted from the caller of errorf (hence the + 1 below).
-- Called as errorf(fmt, ...), the first argument is not a number and is
-- treated as the format string; the error is then raised at level 2.
local function errorf(level, ...)
	if type(level) ~= "number" then
		-- No explicit level: "level" holds the format string.
		return error(string.format(level, ...), 2)
	end
	return error(string.format(...), level + 1)
end
-- Binary search for the range that contains "codepoint".
-- "ranges" is an array of arrays whose first two elements are the low and
-- high code points of a range; it must be sorted in ascending order for the
-- search to work. The number of ranges is read from ranges.length when that
-- field is present, otherwise it is computed by Module:TableTools.
-- Returns the matching range and its index, or nil and the last index that
-- was examined (nil if no index was examined because the array is empty).
local function binary_range_search(codepoint, ranges)
	local first = 1
	local last = ranges.length or require "Module:TableTools".length(ranges)
	local probe
	while first <= last do
		probe = floor((first + last) / 2)
		local candidate = ranges[probe]
		if codepoint < candidate[1] then
			-- Code point lies below this range: continue in the lower half.
			last = probe - 1
		elseif codepoint > candidate[2] then
			-- Code point lies above this range: continue in the upper half.
			first = probe + 1
		else
			return candidate, probe
		end
	end
	return nil, probe
end
p.binary_range_search = binary_range_search
--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]
-- Lazy loader for the data modules under "Module:Unicode data/".
-- Index it with the subpage name: for instance, loader.blocks returns
-- [[Module:Unicode data/blocks]]. The outcome of the first lookup is stored
-- in the table itself, so each subpage is passed to mw.loadData at most once.
-- If loading raises an error, false is stored and returned instead.
local loader = setmetatable({}, {
	__index = function (cache, subpage)
		local ok, result = pcall(mw.loadData, "Module:Unicode data/" .. subpage)
		if ok then
			cache[subpage] = result
			return result
		end
		-- Remember the failure so that the load is not retried.
		cache[subpage] = false
		return false
	end
})
-- Code point ranges whose names or code point labels are generated rather
-- than looked up in a data module. Each entry is
-- { first code point, last code point, generator }, where the generator is
-- either a format string that is applied to the code point or a function
-- that receives the code point and returns the name.
--
-- The entries MUST be kept in ascending order and must not overlap: the table
-- is searched with binary_range_search, which silently misses entries that
-- are out of order.
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00
		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{  0x18800,  0x18AFF, function (codepoint)
		-- Components are numbered from 001, starting at U+18800.
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		-- U+E0100 is VARIATION SELECTOR-17.
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end},
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
-- Stored explicitly so that binary_range_search does not have to compute it.
name_hooks.length = #name_hooks
local name_range_cache
-- Produces a name for "codepoint" from a name generator: a string is used as
-- a format string applied to the code point; anything else is called with
-- the code point and its result is returned.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end
--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]
-- Returns true if the code point is a noncharacter, false otherwise.
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- (Cn) and specifically noncharacters:
-- https://www.unicode.org/faq/private_use.html#nonchar4
function p.is_noncharacter(codepoint)
	-- Nothing below U+FDD0 is a noncharacter.
	if not (0xFDD0 <= codepoint) then
		return false
	end
	-- The contiguous run U+FDD0-U+FDEF.
	if codepoint <= 0xFDEF then
		return true
	end
	-- The last two code points of a plane.
	local offset_in_plane = floor(codepoint % 0x10000)
	return offset_in_plane >= 0xFFFE
end
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
-- Returns the name of a code point, or a code point label in angle brackets
-- (<noncharacter-XXXX>, <reserved-XXXX>, ...) if it has no name.
-- The sources are tried in this order: the noncharacter test, the name hook
-- matched by the previous call, a search of all name hooks, and finally the
-- "names/XXX" entry of "loader" (XXX being the code point divided by 0x1000,
-- in hexadecimal).
function p.lookup_name(codepoint)
	if p.is_noncharacter(codepoint) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end
	
	-- Check if previously used "name hook" applies to this code point.
	if name_range_cache
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end
	
	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end
	
	-- Floor the quotient explicitly, as p.lookup_kCantonese does, rather than
	-- relying on string.format to truncate a fractional number for %X.
	local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]
	
	if data and data[codepoint] then
		return data[codepoint]
	
	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end
-- Returns the entry for the code point in the "images/XXX" data obtained from
-- "loader" (XXX being the code point divided by 0x1000, in hexadecimal).
-- Returns nothing if that data is unavailable.
function p.lookup_image(codepoint)
	-- Floor the quotient explicitly, as p.lookup_kCantonese does, rather than
	-- relying on string.format to truncate a fractional number for %X.
	local data = loader[('images/%03X'):format(floor(codepoint / 0x1000))]
	
	if data then
		return data[codepoint]
	end
end
-- Names of the planes, keyed by plane number (code point divided by 0x10000).
-- Planes that are absent here have no entry; see p.lookup_plane for the
-- fallback.
local planes = {
	[0] = "Basic Multilingual Plane",
	[1] = "Supplementary Multilingual Plane",
	[2] = "Supplementary Ideographic Plane",
	[3] = "Tertiary Ideographic Plane",
	[14] = "Supplementary Special-purpose Plane",
	[15] = "Supplementary Private Use Area-A",
	[16] = "Supplementary Private Use Area-B",
}
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
-- NOTE(review): nothing in the code that follows assigns or reads this
-- file-level local. p.enum_blocks and p.lookup_block declare their own local
-- "blocks" taken from loader.blocks, and the "blocks" parameter of block_iter
-- shadows it. It may be vestigial; confirm before removing.
local blocks
-- Stateless iterator step for p.enum_blocks: advances the index by one and
-- returns it followed by the first three fields of that entry. Returns nothing
-- once the list is exhausted, which ends a generic "for" loop.
local function block_iter(blocks, i)
	local index = i + 1
	local entry = blocks[index]
	if not entry then
		return
	end
	-- The fields are listed one by one because unpack doesn't work on tables
	-- loaded with mw.loadData.
	return index, entry[1], entry[2], entry[3]
end
-- An ipairs-type iterator generator for the list of blocks: returns the
-- iterator function, the block list from "loader" and the initial index, so
-- that it can be used as "for i, low, high, name in p.enum_blocks() do".
function p.enum_blocks()
	return block_iter, loader.blocks, 0
end
-- Returns the name of the plane containing the code point. Planes without an
-- entry in "planes" are reported as "Plane " followed by the plane number.
function p.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end
-- Returns the name (third field) of the block whose range contains the code
-- point, or "No Block" if no block range contains it.
function p.lookup_block(codepoint)
	local block = binary_range_search(codepoint, loader.blocks)
	if not block then
		return "No Block"
	end
	return block[3]
end
-- Returns the first entry of the block list whose third field (the block name)
-- equals "name". Returns nothing if there is no such block.
function p.get_block_info(name)
	for _, entry in ipairs(loader.blocks) do
		local block_name = entry[3]
		if block_name == name then
			return entry
		end
	end
end
-- Returns true if every code point of "pagename" is acceptable and at least one
-- of them is not a space separator; returns false otherwise (including for the
-- empty string). A code point is unacceptable if it is one of the characters
-- listed below, one of U+2000-U+200A, or not printable according to
-- p.is_printable.
do
	-- Individual code points that are rejected outright.
	local forbidden = {
		[0x0023] = true, -- #
		[0x005B] = true, -- [
		[0x005D] = true, -- ]
		[0x007B] = true, -- {
		[0x007C] = true, -- |
		[0x007D] = true, -- }
		[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xFFFD] = true, -- REPLACEMENT CHARACTER
	}
	
	function p.is_valid_pagename(pagename)
		local seen_non_space = false
		for cp in mw.ustring.gcodepoint(pagename) do
			-- U+2000-U+200A are the spaces in the General Punctuation block.
			if forbidden[cp] or (0x2000 <= cp and cp <= 0x200A) then
				return false
			end
			local printable, status = p.is_printable(cp)
			if not printable then
				return false
			end
			if status ~= "space-separator" then
				seen_non_space = true
			end
		end
		return seen_non_space
	end
end
-- Returns the items of the array "what" from index "from" onward as multiple
-- values. "from" defaults to 1.
-- This stands in for unpack(what, from), because unpack doesn't work on tables
-- loaded with mw.loadData: the items are copied into an ordinary table first.
local function manual_unpack(what, from)
	-- The default has to be applied before "from" is used in arithmetic;
	-- otherwise omitting it raises an error instead of starting at 1.
	from = from or 1
	
	-- Shortcut for the common case of at most one item to return.
	if what[from + 1] == nil then
		return what[from]
	end
	
	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end
-- Ordering function for table.sort: a range sorts before another if its first
-- field (the low code point) is smaller.
local function compare_ranges(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end
-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range). "ranges" must be in ascending order, because it is searched with
-- binary_range_search.
-- "data_module_subpage" is the key under which the module is fetched from
-- "loader"; this happens once, on the first lookup.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges
	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end
		
		-- Singles take precedence over everything else, including the cache.
		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end
		
		local cached = binary_range_search(codepoint, cache)
		if cached then
			return match_func(codepoint, manual_unpack(cached, 3))
		end
		
		local range, index = binary_range_search(codepoint, ranges)
		if range then
			table.insert(cache, range)
			table.sort(cache, compare_ranges)
			return match_func(codepoint, manual_unpack(range, 3))
		end
		
		-- No match. "index" is the last position probed by the search, so the
		-- code point lies in the gap next to ranges[index]. Cache that gap with
		-- the default data, so that later lookups in it are answered from the
		-- cache.
		if ranges[index] then
			local dots_range
			if codepoint > ranges[index][2] then
				dots_range = {
					ranges[index][2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < ranges[index][1]
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					ranges[index][1] - 1,
					unpack(dots)
				}
			end
			-- Previously the gap was built but never stored, which left the
			-- sort below with nothing new to order.
			table.insert(cache, dots_range)
			table.sort(cache, compare_ranges)
		end
		
		-- Pass the default data, exactly as a later cache hit on the gap would,
		-- so that the first and the following lookups agree.
		return match_func(codepoint, unpack(dots))
	end
end
-- Looks up a code point's combining class value in the "combining" data
-- ([[Module:Unicode data/combining]]) and reports whether it is non-zero:
-- true if the character is combining, false if it is not. Zero is the default
-- given to memo_lookup for code points that are not found in the data.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	"combining",
	function (_, combining_class)
		if combining_class and combining_class ~= 0 then
			return true
		end
		return false
	end,
	0)
-- Returns "str" with a dotted circle inserted before every character for which
-- p.is_combining is true. Other characters are left as they are.
function p.add_dotted_circle(str)
	-- Returning nothing from the callback makes gsub keep the character.
	local function prefix_if_combining(char)
		if p.is_combining(mw.ustring.codepoint(char)) then
			return '◌' .. char
		end
	end
	
	-- Keep only the string; gsub also returns the number of matches.
	local marked = mw.ustring.gsub(str, ".", prefix_if_combining)
	return marked
end
-- Looks up the status string of a code point in the "control" data. Code
-- points without an entry are reported as "assigned".
local lookup_control = memo_lookup(
	"control",
	function (_, status)
		if status then
			return status
		end
		return "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control
-- Returns true unless lookup_control reports the code point as "unassigned".
function p.is_assigned(codepoint)
	local status = lookup_control(codepoint)
	return status ~= "unassigned"
end
-- Returns two values: whether lookup_control reports the code point as
-- "assigned" or "space-separator", and the status string itself.
function p.is_printable(codepoint)
	local status = lookup_control(codepoint)
	local printable = status == "assigned" or status == "space-separator"
	return printable, status
end
-- Returns two values: whether lookup_control reports the code point as
-- "space-separator", and the status string itself.
function p.is_whitespace(codepoint)
	local status = lookup_control(codepoint)
	local is_space = status == "space-separator"
	return is_space, status
end
-- Looks up a code point in the "category" data and returns whatever data
-- memo_lookup hands over, unchanged. "Cn" is the default given to memo_lookup.
p.lookup_category = memo_lookup(
	"category",
	function (_, category)
		return category
	end,
	"Cn"
)
-- Looks up the script code of a code point in the "scripts" data. Code points
-- without an entry are reported as 'Zzzz'.
local lookup_script = memo_lookup(
	"scripts",
	function (_, script_code)
		if script_code then
			return script_code
		end
		return 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script
-- Determines the script of a string. Characters whose script code is "Zyyy",
-- "Zinh" or "Zzzz" are ignored. If exactly one other script occurs, the result
-- of next() on the set of scripts is returned, i.e. the script code followed by
-- true. If no script occurs, nil is returned; if two or more occur, nothing is
-- returned.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")
	
	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)
	
	-- Set of the script codes found, excluding the ignored ones.
	local found = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		local ignored = script == "Zyyy" or script == "Zinh" or script == "Zzzz"
		if not ignored then
			found[script] = true
		end
	end
	
	local first = next(found)
	if first ~= nil and next(found, first) ~= nil then
		-- Two or more scripts: no answer.
		-- (Return majority script, or else "Zzzz"?)
		return
	end
	
	-- Zero or one key: return the first and only key (script code), if any.
	return next(found)
end
-- Checks whether a string is in the Latin script: it must contain at least one
-- "Latn" character and otherwise only characters of the ignorable scripts
-- "Zyyy", "Zinh" and "Zzzz".
-- Returns true if so. Otherwise returns false and a number: the position (in
-- code points) of the first character of another script, or the total number of
-- code points if no character of another script was found.
function p.is_Latin(str)
	-- Report type errors under this function's own name.
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
	end
	
	local Latn = false
	local i = 0 -- position counter, returned alongside a false result
	
	for codepoint in mw.ustring.gcodepoint(str) do
		i = i + 1
		local script = lookup_script(codepoint)
		
		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			-- Abandon as not Latn; identify the offending character's position.
			return false, i
		end
	end
	
	-- When Latn is false, the counter is the second return value; nil otherwise.
	return Latn, (not Latn and i) or nil
end
-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts ("Zyyy", "Zinh", "Zzzz").
-- Returns true if that holds and at least one character of a right-to-left
-- script is present; returns false otherwise. Which scripts are right-to-left
-- is read from loader.scripts.rtl.
function p.is_rtl(str)
	-- Report type errors under this function's own name.
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end
	
	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		
		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			-- A character of a left-to-right script settles the question.
			return false
		end
	end
	
	return result
end
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------
Entry point for {{#invoke:}} that reports whether a string of text is right-to-left.  The text is taken from the
first parameter of the frame.  Anything enclosed in angle brackets (html and html-like tags such as <br />) is
removed first, so that tags do not influence the answer given by p.is_rtl.
]]
function p.is_rtl_frame (frame)
	local raw = frame.args[1]
	-- Only the resulting string is kept; the match count returned by gsub is dropped.
	local without_tags = raw:gsub ('%b<>', '')
	return p.is_rtl (without_tags)
end
-- Reads parameter "arg" from "args" as a code point written in hexadecimal and
-- returns it as a number. Raises an error (through errorf) if the parameter is
-- missing, is not a hexadecimal number, or lies outside 0-0x10FFFF.
local function get_codepoint(args, arg)
	local raw = args[arg]
	if not raw then
		errorf(2, "Parameter %s is required", tostring(arg))
	end
	
	local codepoint = tonumber(raw, 16)
	if not codepoint then
		errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	end
	
	local in_range = 0 <= codepoint and codepoint <= 0x10FFFF
	if not in_range then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end
-- Returns the function of this module whose name is "prefix" followed by the
-- trimmed value of parameter "arg". Raises an error (through errorf) if the
-- parameter is missing or if the module has no function of that name.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
	if not suffix then
		errorf(2, "Parameter %s is required", tostring(arg))
	end
	
	local func_name = prefix .. mw.text.trim(suffix)
	local func = p[func_name]
	if not func then
		errorf(2, "There is no function '%s'", func_name)
	end
	return func
end
-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
-- Returns the result of the chosen function. For lookup_name, every less-than
-- sign in the result is replaced with its HTML entity.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags. The replacement must be the four-character
		-- entity (ampersand, "lt", semicolon); replacing the less-than sign
		-- with itself would do nothing.
		result = result:gsub("<", "&lt;")
	end
	return result
end
-- Allows any of the "is" functions to be invoked. The first parameter is the
-- word after "is_"; the second parameter is the value to test. It is handed
-- over as a string to is_Latin, is_valid_pagename and is_rtl, and read as a
-- code point in hexadecimal base for all other functions. Only the first
-- result of the chosen function is returned.
function p.is(frame)
	local args = frame.args
	local func = get_func(args, 1, "is_")
	
	local takes_string = func == p.is_Latin
		or func == p.is_valid_pagename
		or func == p.is_rtl
	
	local argument
	if takes_string then
		argument = args[2]
	else -- The rest take code points.
		argument = get_codepoint(args, 2)
	end
	
	-- Adjust to one result.
	local answer = func(argument)
	return answer
end
-- Returns the entry for the code point in the "Unihan/kCantonese/XX" data
-- obtained from "loader" (XX being the code point divided by 0x1000, rounded
-- down, in hexadecimal). Returns nothing if that data is unavailable.
function p.lookup_kCantonese(codepoint)
	local subpage = ('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))
	local data = loader[subpage]
	if not data then
		return
	end
	return data[codepoint]
end
return p







