Module:Unicode data: Difference between revisions
From All Skies Encyclopaedia
| imported>Erutuon  (unified binary range search function) | imported>Pppery   (Add KCantonese lookup per request) | ||
| (41 intermediate revisions by 6 users not shown) | |||
| Line 4: | Line 4: | ||
| local function errorf(level, ...) | local function errorf(level, ...) | ||
| 	if type(level) == number then | 	if type(level) == "number" then | ||
| 		return error(string.format(...), level + 1) | 		return error(string.format(...), level + 1) | ||
| 	else -- level is actually the format string. | 	else -- level is actually the format string. | ||
| Line 10: | Line 10: | ||
| 	end | 	end | ||
| end | end | ||
| local function binary_range_search(codepoint, ranges) | |||
| 	local low, mid, high | |||
| 	low, high = 1, ranges.length or require "Module:TableTools".length(ranges) | |||
| 	while low <= high do | |||
| 		mid = floor((low + high) / 2) | |||
| 		local range = ranges[mid] | |||
| 		if codepoint < range[1] then | |||
| 			high = mid - 1 | |||
| 		elseif codepoint <= range[2] then | |||
| 			return range, mid | |||
| 		else | |||
| 			low = mid + 1 | |||
| 		end | |||
| 	end | |||
| 	return nil, mid | |||
| end | |||
| p.binary_range_search = binary_range_search | |||
| --[[ | |||
| local function linear_range_search(codepoint, ranges) | |||
| 	for i, range in ipairs(ranges) do | |||
| 		if range[1] <= codepoint and codepoint <= range[2] then | |||
| 			return range | |||
| 		end | |||
| 	end | |||
| end | |||
| --]] | |||
| -- Load a module by indexing "loader" with the name of the module minus the | |||
| -- "Module:Unicode data/" part. For instance, loader.blocks returns | |||
| -- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be | |||
| -- returned. | |||
| local loader = setmetatable({}, { | |||
| 	__index = function (self, key) | |||
| 		local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key) | |||
| 		if not success then | |||
| 			data = false | |||
| 		end | |||
| 		self[key] = data | |||
| 		return data | |||
| 	end | |||
| }) | |||
| -- For the algorithm used to generate Hangul Syllable names, | -- For the algorithm used to generate Hangul Syllable names, | ||
| Line 15: | Line 58: | ||
| -- Unicode Specification: | -- Unicode Specification: | ||
| -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | ||
| local Hangul_data -- loaded if needed | |||
| local name_hooks = { | local name_hooks = { | ||
| 	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters | 	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters | ||
| 	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters | 	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters | ||
| 	{   0x3400,    | 	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A | ||
| 	{   0x4E00,    | 	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph | ||
| 	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables | 	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables | ||
| 		Hangul_data =  | 		local Hangul_data = loader.Hangul | ||
| 		local syllable_index = codepoint - 0xAC00 | 		local syllable_index = codepoint - 0xAC00 | ||
| Line 36: | Line 77: | ||
| 	{   0xD800,   0xDFFF, "<surrogate-%04X>" }, | 	{   0xD800,   0xDFFF, "<surrogate-%04X>" }, | ||
| 	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use | 	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use | ||
| 	-- CJK Compatibility Ideographs | |||
| 	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | |||
| 	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | 	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
| 	{  0x17000,   | 	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph | ||
| 	{  0x18800,   | 	{  0x18800,  0x18AFF, function (codepoint) | ||
| 		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | 		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | ||
| 	end }, | 	end }, | ||
| 	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement | |||
| 	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | 	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | ||
| 	{  0x20000,   | 	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | ||
| 	{  0x2A700,   | 	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C | ||
| 	{   | 	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D | ||
| 	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | 	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | ||
| 	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | 	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | ||
| Line 53: | Line 96: | ||
| 		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | 		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | ||
| 	end}, | 	end}, | ||
| 	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G | |||
| 	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H | |||
| 	{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I | |||
| 	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | 	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | ||
| 	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use | 	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use | ||
| } | } | ||
| name_hooks.length = #name_hooks | name_hooks.length = #name_hooks | ||
| local function binary_range_search(codepoint, ranges) | |||
| 	local low, mid, high | |||
| 	low, high = 1, ranges.length | |||
| 	while low <= high do | |||
| 		mid = floor((low + high) / 2) | |||
| 		local range = ranges[mid] | |||
| 		if codepoint < range[1] then | |||
| 			high = mid - 1 | |||
| 		elseif codepoint <= range[2] then | |||
| 			-- Save "name hook" in case another character | |||
| 			-- from the same range will be looked up in the same module invocation. | |||
| 			name_range_cache = range | |||
| 			return range, mid | |||
| 		else | |||
| 			low = mid + 1 | |||
| 		end | |||
| 	end | |||
| 	-- return nil, nil | |||
| end | |||
| local name_range_cache | local name_range_cache | ||
| Line 89: | Line 114: | ||
| end | end | ||
| --[[ | |||
| -- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | |||
| -- Checks that the code point is a number and in range. | |||
| function p.lookup_name(codepoint) | |||
| -- Does not check whether code point is an integer. | |||
| 	require 'libraryUtil'.checkType('lookup_name', 1, codepoint, 'number') | |||
| -- Not used | |||
| local function check_codepoint(funcName, argIdx, val) | |||
| 	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number') | |||
| 	if codepoint < 0 or 0x10FFFF < codepoint then | 	if codepoint < 0 or 0x10FFFF < codepoint then | ||
| 		errorf("Codepoint %04X out of range", codepoint) | 		errorf("Codepoint %04X out of range", codepoint) | ||
| 	end | 	end | ||
| end | |||
| --]] | |||
| 	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are Unassigned | |||
| function p.is_noncharacter(codepoint) | |||
| 	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned | |||
| 	-- (Cn) and specifically noncharacters: | 	-- (Cn) and specifically noncharacters: | ||
| 	-- https://www.unicode.org/faq/private_use.html#nonchar4 | 	-- https://www.unicode.org/faq/private_use.html#nonchar4 | ||
| 	return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF | |||
| 			or floor(codepoint % 0x10000) >= 0xFFFE) | 			or floor(codepoint % 0x10000) >= 0xFFFE) | ||
| end | |||
| -- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | |||
| function p.lookup_name(codepoint) | |||
| 	if p.is_noncharacter(codepoint) then | |||
| 		return ("<noncharacter-%04X>"):format(codepoint) | 		return ("<noncharacter-%04X>"):format(codepoint) | ||
| 	end | 	end | ||
| 	if name_range_cache -- Check if previously used "name hook" applies to this  | 	if name_range_cache -- Check if previously used "name hook" applies to this code point. | ||
| 			and codepoint >= name_range_cache[1] | 			and codepoint >= name_range_cache[1] | ||
| 			and codepoint <= name_range_cache[2] then | 			and codepoint <= name_range_cache[2] then | ||
| Line 116: | Line 153: | ||
| 	end | 	end | ||
| 	local | 	local data = loader[('names/%03X'):format(codepoint / 0x1000)] | ||
| 		('Module:Unicode data/names/%03X'):format(codepoint / 0x1000)) | |||
| 	if  | 	if data and data[codepoint] then | ||
| 		return data[codepoint] | 		return data[codepoint] | ||
| Line 132: | Line 168: | ||
| function p.lookup_image(codepoint) | function p.lookup_image(codepoint) | ||
| 	local | 	local data = loader[('images/%03X'):format(codepoint / 0x1000)] | ||
| 		('Module:Unicode data/images/%03X'):format(codepoint / 0x1000) | |||
| 	) | |||
| 	if  | 	if data then | ||
| 		return data[codepoint] | 		return data[codepoint] | ||
| 	end | 	end | ||
| end | |||
| function p.template_lookup_name(frame) | |||
| 	local param = frame.args[1] or frame:getParent().args[1] | |||
| 	local codepoint = tonumber(param, 16) | |||
| 	if not codepoint then | |||
| 		errorf("Expected a codepoint in hexadecimal base, got '%s'", param) | |||
| 	end | |||
| 	local name = p.lookup_name(codepoint):gsub("<", "<") | |||
| 	return name | |||
| end | end | ||
| Line 157: | Line 179: | ||
| 	[ 1] = "Supplementary Multilingual Plane"; | 	[ 1] = "Supplementary Multilingual Plane"; | ||
| 	[ 2] = "Supplementary Ideographic Plane"; | 	[ 2] = "Supplementary Ideographic Plane"; | ||
| 	[ | 	[ 3] = "Tertiary Ideographic Plane"; | ||
| 	[14] = "Supplementary  | 	[14] = "Supplementary Special-purpose Plane"; | ||
| 	[15] = "Supplementary Private Use Area- | 	[15] = "Supplementary Private Use Area-A"; | ||
| 	[16] = "Supplementary Private Use Area-B"; | |||
| } | } | ||
| Line 169: | Line 192: | ||
| 	local data = blocks[i] | 	local data = blocks[i] | ||
| 	if data then | 	if data then | ||
| 		 -- Unpack doesn't work on tables loaded with mw.loadData. | |||
| 		return i, unpack(data) | |||
| 		return i, data[1], data[2], data[3] | |||
| 	end | 	end | ||
| end | end | ||
| Line 175: | Line 199: | ||
| -- An ipairs-type iterator generator for the list of blocks. | -- An ipairs-type iterator generator for the list of blocks. | ||
| function p.enum_blocks() | function p.enum_blocks() | ||
| 	blocks =  | 	local blocks = loader.blocks | ||
| 	return block_iter, blocks, 0 | 	return block_iter, blocks, 0 | ||
| end | end | ||
| Line 185: | Line 209: | ||
| function p.lookup_block(codepoint) | function p.lookup_block(codepoint) | ||
| 	blocks =  | 	local blocks = loader.blocks | ||
| 	local range = binary_range_search(codepoint, blocks) | 	local range = binary_range_search(codepoint, blocks) | ||
| 	if range then | |||
| 		or errorf("No block found for codepoint U+%04X", codepoint) | |||
| 		return range[3] | |||
| 	else | |||
| 		return "No Block" | |||
| 	end | |||
| end | end | ||
| function p. | function p.get_block_info(name) | ||
| 	for i, block in ipairs(loader.blocks) do | |||
| 	local range | |||
| 	blocks = blocks or mw.loadData("Module:Unicode data/blocks") | |||
| 	for i, block in ipairs(blocks) do | |||
| 		if block[3] == name then | 		if block[3] == name then | ||
| 			return block | |||
| 		end | 		end | ||
| 	end | |||
| 	if range then | |||
| 		return range[1], range[2] | |||
| 	end | 	end | ||
| end | end | ||
| Line 237: | Line 257: | ||
| local function manual_unpack(what, from) | local function manual_unpack(what, from) | ||
| 	if what[from + 1] == nil then | |||
| 		return what[from] | |||
| 	end | |||
| 	local result = {} | 	local result = {} | ||
| 	from = from or 1 | 	from = from or 1 | ||
| Line 245: | Line 269: | ||
| 	end | 	end | ||
| 	return unpack(result) | 	return unpack(result) | ||
| end | |||
| local function compare_ranges(range1, range2) | |||
| 	return range1[1] < range2[1] | |||
| end | end | ||
| -- Creates a function to look up data in a module that contains "singles" (a | -- Creates a function to look up data in a module that contains "singles" (a | ||
| --  | -- code point-to-data map) and "ranges" (an array containing arrays that contain | ||
| -- the low and high  | -- the low and high code points of a range and the data associated with that | ||
| -- range). | -- range). | ||
| -- "loader" loads and returns the "singles" and "ranges" tables. | |||
| -- "match_func" is passed the code point and either the data or the "dots", and | |||
| -- generates the final result of the function. | |||
| -- The varargs ("dots") describes the default data to be returned if there wasn't | -- The varargs ("dots") describes the default data to be returned if there wasn't | ||
| -- a match. | -- a match. | ||
| Line 256: | Line 287: | ||
| -- already been found to match, or a range whose data is the default if there | -- already been found to match, or a range whose data is the default if there | ||
| -- was no match. | -- was no match. | ||
| local function memo_lookup( | local function memo_lookup(data_module_subpage, match_func, ...) | ||
| 	local dots = { ... } | 	local dots = { ... } | ||
| 	local cache = {} | 	local cache = {} | ||
| Line 263: | Line 294: | ||
| 	return function (codepoint) | 	return function (codepoint) | ||
| 		if not singles then | 		if not singles then | ||
| 			local data_module = loader[data_module_subpage] | |||
| 			singles, ranges = data_module.singles, data_module.ranges | |||
| 		end | 		end | ||
| Line 270: | Line 302: | ||
| 		end | 		end | ||
| 		local range = binary_range_search(codepoint, cache) | |||
| 		local lastlast = -1 | |||
| 		if range then | |||
| 			return match_func(codepoint, manual_unpack(range, 3)) | |||
| 				return match_func(codepoint, unpack(range, 3)) | |||
| 			end | |||
| 		end | 		end | ||
| 		local range, index = binary_range_search(codepoint, ranges) | 		local range, index = binary_range_search(codepoint, ranges) | ||
| 		if range then | 		if range then | ||
| 			table.insert(cache,  | 			table.insert(cache, range) | ||
| 			table.sort(cache, compare_ranges) | |||
| 			return match_func(codepoint, manual_unpack(range, 3)) | 			return match_func(codepoint, manual_unpack(range, 3)) | ||
| 		end | 		end | ||
| Line 298: | Line 329: | ||
| 				} | 				} | ||
| 			end | 			end | ||
| 			table. | 			table.sort(cache, compare_ranges) | ||
| 		end | 		end | ||
| 		--[[ | |||
| 		mw.log(unpack(require "Module:Utility".map( | |||
| 			function (range) | |||
| 				return ("U+%04X-U+%04X: %s"):format(unpack(range)) | |||
| 			end, | |||
| 			cache))) | |||
| 		--]] | |||
| 		return match_func(codepoint) | 		return match_func(codepoint) | ||
| Line 313: | Line 336: | ||
| end | end | ||
| -- Get a  | -- Get a code point's combining class value in [[Module:Unicode data/combining]], | ||
| -- and return whether this value is not zero. Zero is assigned as the default | -- and return whether this value is not zero. Zero is assigned as the default | ||
| -- if the combining class value is not found in this data module. | -- if the combining class value is not found in this data module. | ||
| -- That is, return true if character is combining, or false if it is not. | -- That is, return true if character is combining, or false if it is not. | ||
| -- See  | -- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for | ||
| -- more information. | -- more information. | ||
| p.is_combining = memo_lookup( | p.is_combining = memo_lookup( | ||
| 	"combining", | |||
| 	local m_comb = mw.loadData('Module:Unicode data/combining') | |||
| 	function (codepoint, combining_class) | |||
| 	return m_comb.single, m_comb.ranges | |||
| 		return combining_class and combining_class ~= 0 or false | |||
| 	end, | |||
| 	return combining_class and combining_class ~= 0 | |||
| 	0) | |||
| 		or false | |||
| end, 0) | |||
| function p.add_dotted_circle(str) | function p.add_dotted_circle(str) | ||
| Line 336: | Line 358: | ||
| end | end | ||
| local lookup_control = memo_lookup( | local lookup_control = memo_lookup( | ||
| 	"control", | |||
| 	local m_cc = mw.loadData('Module:Unicode data/control') | |||
| 	function (codepoint, ccc) | |||
| 	return m_cc.single, m_cc.ranges | |||
| 		return ccc or "assigned" | |||
| end, function (codepoint, ccc) | |||
| 	end, | |||
| 	return ccc or "assigned" | |||
| 	"assigned") | |||
| p.lookup_control = lookup_control | |||
| function p.is_assigned(codepoint) | function p.is_assigned(codepoint) | ||
| Line 357: | Line 380: | ||
| end | end | ||
| p.lookup_category = memo_lookup( | |||
| local unsupported_title = { | |||
| 	"category", | |||
| 	[0x0020] = "Unsupported titles/Space"; | |||
| 	function (codepoint, category) | |||
| 	[0x0023] = "Unsupported titles/Number sign"; | |||
| 		return category | |||
| 	[0x002E] = "Unsupported titles/Full stop"; | |||
| 	end, | |||
| 	[0x003A] = "Unsupported titles/Colon"; | |||
| 	"Cn") | |||
| 	[0x003C] = "Unsupported titles/Less than"; | |||
| 	[0x003E] = "Unsupported titles/Greater than"; | |||
| 	[0x005B] = "Unsupported titles/Left square bracket"; | |||
| 	[0x005D] = "Unsupported titles/Right square bracket"; | |||
| 	[0x005F] = "Unsupported titles/Low line"; | |||
| 	[0x007B] = "Unsupported titles/Left curly bracket"; | |||
| 	[0x007C] = "Unsupported titles/Vertical line"; | |||
| 	[0x007D] = "Unsupported titles/Right curly bracket"; | |||
| 	[0x1680] = "Unsupported titles/Ogham space"; | |||
| 	[0xFFFD] = "Unsupported titles/Replacement character"; | |||
| } | |||
| local lookup_script = memo_lookup( | |||
| function p.get_entry_title(codepoint) | |||
| 	"scripts", | |||
| 	if unsupported_title[codepoint] then | |||
| 	function (codepoint, script_code) | |||
| 		return unsupported_title[codepoint] | |||
| 		return script_code or 'Zzzz' | |||
| 	end, | |||
| 	"Zzzz") | |||
| p.lookup_script = lookup_script | |||
| function p.get_best_script(str) | |||
| 	-- Check type of argument, because mw.text.decode coerces numbers to strings! | |||
| 	require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
| 	-- Convert HTML character references (including named character references, | |||
| 	-- or character entities) to characters. | |||
| 	str = mw.text.decode(str, true) | |||
| 	local scripts = {} | |||
| 	for codepoint in mw.ustring.gcodepoint(str) do | |||
| 		local script = lookup_script(codepoint) | |||
| 		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts. | |||
| 		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then | |||
| 			scripts[script] = true | |||
| 		end | |||
| 	end | 	end | ||
| 	if lookup_control(codepoint) ~= "assigned" then | |||
| 	-- If scripts does not contain two or more keys, | |||
| 		return nil | |||
| 	-- return first and only key (script code) in table. | |||
| 	if not next(scripts, next(scripts)) then | |||
| 		return next(scripts) | |||
| 	end -- else return majority script, or else "Zzzz"? | |||
| end | |||
| function p.is_Latin(str) | |||
| 	require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
| 	str = mw.text.decode(str, true) | |||
| 	-- Search for the leading bytes that introduce the UTF-8 encoding of the | |||
| 	-- code points U+0340-U+10FFFF. If they are not found and there is at least | |||
| 	-- one Latin-script character, the string counts as Latin, because the rest | |||
| 	-- of the characters can only be Zyyy, Zinh, and Zzzz. | |||
| 	-- The only scripts found below U+0370 (the first code point of the Greek | |||
| 	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz. | |||
| 	-- See the codepage in the [[UTF-8]] article. | |||
| 	if not str:find "[\205-\244]" then | |||
| 		for codepoint in mw.ustring.gcodepoint(str) do | |||
| 			if lookup_script(codepoint) == "Latn" then | |||
| 				return true | |||
| 			end | |||
| 		end | |||
| 	end | |||
| 	local Latn = false | |||
| 	local i = 0;																-- indexer for use in error messages | |||
| 	for codepoint in mw.ustring.gcodepoint(str) do | |||
| 		i = i + 1;																-- bump the indexer | |||
| 		local script = lookup_script(codepoint) | |||
| 		if script == "Latn" then | |||
| 			Latn = true | |||
| 		elseif not (script == "Zyyy" or script == "Zinh" | |||
| 				or script == "Zzzz") then | |||
| 			return false, i														-- abandon as not Latn; identify the offending character's position | |||
| 		end | |||
| 	end | |||
| 	return Latn, (not Latn and i) or nil										-- when <Latn> false, return offending charactor's position as second return value; nil else | |||
| end | |||
| -- Checks that a string contains only characters belonging to right-to-left | |||
| -- scripts, or characters of ignorable scripts. | |||
| function p.is_rtl(str) | |||
| 	require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
| 	str = mw.text.decode(str, true) | |||
| 	-- Search for the leading bytes that introduce the UTF-8 encoding of the | |||
| 	-- code points U+0580-U+10FFFF. If they are not found, the string can only | |||
| 	-- have characters from a left-to-right script, because the first code point | |||
| 	-- in a right-to-left script is U+0591, in the Hebrew block. | |||
| 	if not str:find "[\214-\244]" then | |||
| 		return false | |||
| 	end | |||
| 	local result = false | |||
| 	local rtl = loader.scripts.rtl | |||
| 	for codepoint in mw.ustring.gcodepoint(str) do | |||
| 		local script = lookup_script(codepoint) | |||
| 		if rtl[script] then | |||
| 			result = true | |||
| 		elseif not (script == "Zyyy" or script == "Zinh" | |||
| 				or script == "Zzzz") then | |||
| 			return false | |||
| 		end | |||
| 	end | |||
| 	return result | |||
| end | |||
| --[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------ | |||
| external entry from an {{#invoke:}} to determine if a string of text is rtl.  Strips html and html-like tags so | |||
| that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text | |||
| has <br /> tags. | |||
| ]] | |||
| function p.is_rtl_frame (frame) | |||
| 	local str = frame.args[1];													-- get the string from the {{#invoke:}} frame | |||
| 	str = str:gsub ('%b<>', '');												-- strip any html and html-like tags | |||
| 	return p.is_rtl (str);														-- return if whatever remains rtl; false else | |||
| end | |||
| local function get_codepoint(args, arg) | |||
| 	local codepoint_string = args[arg] | |||
| 		or errorf(2, "Parameter %s is required", tostring(arg)) | |||
| 	local codepoint = tonumber(codepoint_string, 16) | |||
| 		or errorf(2, "Parameter %s is not a code point in hexadecimal base", | |||
| 			tostring(arg)) | |||
| 	if not (0 <= codepoint and codepoint <= 0x10FFFF) then | |||
| 		errorf(2, "code point in parameter %s out of range", tostring(arg)) | |||
| 	end | |||
| 	return codepoint | |||
| end | |||
| local function get_func(args, arg, prefix) | |||
| 	local suffix = args[arg] | |||
| 		or errorf(2, "Parameter %s is required", tostring(arg)) | |||
| 	suffix = mw.text.trim(suffix) | |||
| 	local func_name = prefix .. suffix | |||
| 	local func = p[func_name] | |||
| 		or errorf(2, "There is no function '%s'", func_name) | |||
| 	return func | |||
| end | |||
| -- This function allows any of the "lookup" functions to be invoked. The first | |||
| -- parameter is the word after "lookup_"; the second parameter is the code point | |||
| -- in hexadecimal base. | |||
| function p.lookup(frame) | |||
| 	local func = get_func(frame.args, 1, "lookup_") | |||
| 	local codepoint = get_codepoint(frame.args, 2) | |||
| 	local result = func(codepoint) | |||
| 	if func == p.lookup_name then | |||
| 		-- Prevent code point labels such as <control-0000> from being | |||
| 		-- interpreted as HTML tags. | |||
| 		result = result:gsub("<", "<") | |||
| 	end | |||
| 	return result | |||
| end | |||
| function p.is(frame) | |||
| 	local func = get_func(frame.args, 1, "is_") | |||
| 	-- is_Latin and is_valid_pagename take strings. | |||
| 	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then | |||
| 		return (func(frame.args[2])) | |||
| 	else -- The rest take code points. | |||
| 		local codepoint = get_codepoint(frame.args, 2) | |||
| 		return (func(codepoint)) -- Adjust to one result. | |||
| 	end | |||
| end | |||
| function p.lookup_kCantonese(codepoint) | |||
| 	local data = loader[('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))] | |||
| 	if data then | |||
| 		return data[codepoint] | |||
| 	end | 	end | ||
| 	return mw.ustring.char(codepoint) | |||
| end | end | ||
Latest revision as of 23:05, 13 January 2025
Documentation for this module may be created at Module:Unicode data/doc
local p = {}
local floor = math.floor
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end
local function binary_range_search(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require "Module:TableTools".length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			low = mid + 1
		end
	end
	return nil, mid
end
p.binary_range_search = binary_range_search
--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]
-- Load a module by indexing "loader" with the name of the module minus the
-- "Module:Unicode data/" part. For instance, loader.blocks returns
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be
-- returned.
local loader = setmetatable({}, {
	__index = function (self, key)
		local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
		if not success then
			data = false
		end
		self[key] = data
		return data
	end
})
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00
		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{  0x18800,  0x18AFF, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end},
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
name_hooks.length = #name_hooks
local name_range_cache
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end
--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]
function p.is_noncharacter(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE)
end
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
	if p.is_noncharacter(codepoint) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end
	if name_range_cache -- Check if previously used "name hook" applies to this code point.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end
	
	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end
	local data = loader[('names/%03X'):format(codepoint / 0x1000)]
	
	if data and data[codepoint] then
		return data[codepoint]
	
	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end
function p.lookup_image(codepoint)
	local data = loader[('images/%03X'):format(codepoint / 0x1000)]
	
	if data then
		return data[codepoint]
	end
end
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[ 3] = "Tertiary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks
local function block_iter(blocks, i)
	i = i + 1
	local data = blocks[i]
	if data then
		 -- Unpack doesn't work on tables loaded with mw.loadData.
		return i, data[1], data[2], data[3]
	end
end
-- An ipairs-type iterator generator for the list of blocks.
function p.enum_blocks()
	local blocks = loader.blocks
	return block_iter, blocks, 0
end
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end
function p.lookup_block(codepoint)
	local blocks = loader.blocks
	local range = binary_range_search(codepoint, blocks)
	if range then
		return range[3]
	else
		return "No Block"
	end
end
function p.get_block_info(name)
	for i, block in ipairs(loader.blocks) do
		if block[3] == name then
			return block
		end
	end
end
function p.is_valid_pagename(pagename)
	local has_nonws = false
	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end
		local printable, result = p.is_printable(cp)
		if not printable then
			return false
		end
		if result ~= "space-separator" then
			has_nonws = true
		end
	end
	return has_nonws
end
local function manual_unpack(what, from)
	if what[from + 1] == nil then
		return what[from]
	end
	
	local result = {}
	from = from or 1
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end
local function compare_ranges(range1, range2)
	return range1[1] < range2[1]
end
-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "loader" loads and returns the "singles" and "ranges" tables.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges
	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end
		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end
		local range = binary_range_search(codepoint, cache)
		if range then
			return match_func(codepoint, manual_unpack(range, 3))
		end
		
		local range, index = binary_range_search(codepoint, ranges)
		if range then
			table.insert(cache, range)
			table.sort(cache, compare_ranges)
			return match_func(codepoint, manual_unpack(range, 3))
		end
		
		if ranges[index] then
			local dots_range
			if codepoint > ranges[index][2] then
				dots_range = {
					ranges[index][2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < range[index][1]
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					ranges[index][1] - 1,
					unpack(dots)
				}
			end
			table.sort(cache, compare_ranges)
		end
		
		return match_func(codepoint)
	end
end
-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	"combining",
	function (codepoint, combining_class)
		return combining_class and combining_class ~= 0 or false
	end,
	0)
function p.add_dotted_circle(str)
	return (mw.ustring.gsub(str, ".",
		function(char)
			if p.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
		end))
end
local lookup_control = memo_lookup(
	"control",
	function (codepoint, ccc)
		return ccc or "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end
p.lookup_category = memo_lookup(
	"category",
	function (codepoint, category)
		return category
	end,
	"Cn")
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		return script_code or 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")
	
	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)
	
	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		
		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end
	
	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	if not next(scripts, next(scripts)) then
		return next(scripts)
	end -- else return majority script, or else "Zzzz"?
end
function p.is_Latin(str)
	require "libraryUtil".checkType("get_best_script", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
	end
	
	local Latn = false
	local i = 0;																-- indexer for use in error messages
	
	for codepoint in mw.ustring.gcodepoint(str) do
		i = i + 1;																-- bump the indexer
		local script = lookup_script(codepoint)
		
		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false, i														-- abandon as not Latn; identify the offending character's position
		end
	end
	
	return Latn, (not Latn and i) or nil										-- when <Latn> false, return offending charactor's position as second return value; nil else
end
-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts.
function p.is_rtl(str)
	require "libraryUtil".checkType("get_best_script", 1, str, "string")
	str = mw.text.decode(str, true)
	
	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end
	
	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		
		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end
	
	return result
end
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------
external entry from an {{#invoke:}} to determine if a string of text is rtl.  Strips html and html-like tags so
that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text
has <br /> tags.
]]
function p.is_rtl_frame (frame)
	local str = frame.args[1];													-- get the string from the {{#invoke:}} frame
	str = str:gsub ('%b<>', '');												-- strip any html and html-like tags
	return p.is_rtl (str);														-- return if whatever remains rtl; false else
end
local function get_codepoint(args, arg)
	local codepoint_string = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local codepoint = tonumber(codepoint_string, 16)
		or errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	if not (0 <= codepoint and codepoint <= 0x10FFFF) then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end
local function get_func(args, arg, prefix)
	local suffix = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	suffix = mw.text.trim(suffix)
	local func_name = prefix .. suffix
	local func = p[func_name]
		or errorf(2, "There is no function '%s'", func_name)
	return func
end
-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags.
		result = result:gsub("<", "<")
	end
	return result
end
function p.is(frame)
	local func = get_func(frame.args, 1, "is_")
	
	-- is_Latin and is_valid_pagename take strings.
	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
		return (func(frame.args[2]))
	else -- The rest take code points.
		local codepoint = get_codepoint(frame.args, 2)
		return (func(codepoint)) -- Adjust to one result.
	end
end
function p.lookup_kCantonese(codepoint)
	local data = loader[('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))]
	if data then
		return data[codepoint]
	end
end
return p







