Module:Unicode data: Difference between revisions
From All Skies Encyclopaedia
| imported>Erutuon  (describe memo_lookup) | imported>Erutuon   (binary search in memo_lookup) | ||
| Line 264: | Line 264: | ||
| 			end | 			end | ||
| 		end | 		end | ||
| 		local low, mid, high | |||
| ⚫ | |||
| 		low, high = 1, ranges.length | |||
| ⚫ | |||
| 				table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) }) | |||
| 		while low <= high do | |||
| 				return match_func(codepoint, unpack(dots)) | |||
| 			mid = floor((low + high) / 2) | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| 				high = mid - 1 | |||
| ⚫ | |||
| 				table.insert(cache, { manual_unpack(range) }) | 				table.insert(cache, { manual_unpack(range) }) | ||
| 				return match_func(codepoint, manual_unpack(range, 3)) | 				return match_func(codepoint, manual_unpack(range, 3)) | ||
| 			else | |||
| 			else -- save last codepoint of previously encountered range | |||
| 				low = mid + 1 | |||
| 			end | 			end | ||
| 		end | 		end | ||
| 		if ranges[mid] then | |||
| 			local dots_range | |||
| 			if codepoint > ranges[mid][2] then | |||
| 				dots_range = { | |||
| 					ranges[mid][2] + 1, | |||
| 					ranges[mid + 1] and ranges[mid + 1][1] - 1 or 0x10FFFF, | |||
| 					unpack(dots) | |||
| 				} | |||
| 			else -- codepoint < range[mid][1] | |||
| 				dots_range = { | |||
| 					ranges[mid - 1] and ranges[mid - 1][2] + 1 or 0, | |||
| 					ranges[mid][1] - 1, | |||
| 					unpack(dots) | |||
| 				} | |||
| 			end | |||
| 			table.insert(cache, dots_range)	 | |||
| 		end | |||
| 		--[[ | |||
| 		mw.log(unpack(require "Module:Utility".map( | |||
| 			function (range) | |||
| 				return ("U+%04X-U+%04X: %s"):format(unpack(range)) | |||
| 			end, | |||
| 			cache))) | |||
| 		--]] | |||
| 		return match_func(codepoint) | 		return match_func(codepoint) | ||
| 	end | 	end | ||
Revision as of 04:37, 29 June 2018
Documentation for this module may be created at Module:Unicode data/doc
local p = {}
local floor = math.floor
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local Hangul_data -- loaded if needed

-- "Name hooks": codepoint ranges whose character names are derived by rule
-- rather than stored in the name data modules. Each entry is
-- { first codepoint, last codepoint, generator }, where the generator is either
-- a format string that receives the codepoint or a function called with the
-- codepoint.
-- The entries must stay sorted by first codepoint and must not overlap:
-- the lookup loop stops at the first entry whose start lies beyond the
-- codepoint, and an overlapping entry would also swallow the unassigned
-- codepoints lying between two blocks.
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		Hangul_data = Hangul_data or mw.loadData("Module:Unicode data/Hangul")
		local syllable_index = codepoint - 0xAC00
		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, -- CJK Compatibility Ideographs
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%04X" }, -- Tangut
	{  0x18800,  0x18AF2, function (codepoint)
		-- Tangut components are numbered from 001 in decimal.
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740 (not U+2A740): U+2B735..U+2B73F are
	-- unassigned and must not receive an ideograph name.
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		-- The first selector in this block is VARIATION SELECTOR-17.
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end},
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
-- The most recently matched entry of name_hooks, kept so that consecutive
-- lookups in the same range can skip the scan of the whole list.
local name_range_cache

-- Produce a name from a "name hook" generator: a function generator is called
-- with the codepoint, anything else is treated as a format string that
-- receives the codepoint.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end
-- Returns the name of a codepoint, or a bracketed label such as
-- "<reserved-XXXX>" for codepoints that have no name of their own.
-- Raises an error if the argument is not a number or lies outside
-- U+0000..U+10FFFF.
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
	require 'libraryUtil'.checkType('lookup_name', 1, codepoint, 'number')
	if codepoint < 0 or codepoint > 0x10FFFF then
		error(("Codepoint %04X out of range"):format(codepoint))
	end
	
	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if codepoint >= 0xFDD0 then
		local low_sixteen_bits = floor(codepoint % 0x10000)
		if codepoint <= 0xFDEF or low_sixteen_bits >= 0xFFFE then
			return ("<noncharacter-%04X>"):format(codepoint)
		end
	end
	
	-- Try the "name hook" that matched last time before scanning the list.
	local cached = name_range_cache
	if cached and codepoint >= cached[1] and codepoint <= cached[2] then
		return generate_name(cached[3], codepoint)
	end
	
	-- name_hooks is sorted, so the scan can stop at the first range that
	-- starts beyond the codepoint.
	for index = 1, #name_hooks do
		local hook = name_hooks[index]
		if codepoint < hook[1] then
			break
		end
		if codepoint <= hook[2] then
			-- Remember the hook in case another character from the same
			-- range is looked up in the same module invocation.
			name_range_cache = hook
			return generate_name(hook[3], codepoint)
		end
	end
	
	-- Names are stored in one data module per block of 0x1000 codepoints.
	local page = ('Module:Unicode data/names/%03X'):format(codepoint / 0x1000)
	local success, data = pcall(mw.loadData, page)
	local name = success and data[codepoint]
	if name then
		return name
	end
	
	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	return ("<reserved-%04X>"):format(codepoint)
end
-- Looks up the entry stored for a codepoint in the images data module that
-- covers its block of 0x1000 codepoints. Returns nothing if that data module
-- cannot be loaded.
function p.lookup_image(codepoint)
	local page = ('Module:Unicode data/images/%03X'):format(codepoint / 0x1000)
	local success, data = pcall(mw.loadData, page)
	
	if not success then
		return
	end
	return data[codepoint]
end
-- Template entry point: takes a hexadecimal codepoint from the first positional
-- argument of the invocation (or, failing that, of the parent frame) and
-- returns the character name with "<" escaped.
-- Raises an error if the argument is missing or is not a hexadecimal number.
function p.template_lookup_name(frame)
	local param = frame.args[1] or frame:getParent().args[1]
	
	-- Guard against a missing argument: tonumber(nil, 16) would raise its own,
	-- less helpful, error.
	local codepoint = param and tonumber(param, 16)
	if not codepoint then
		error(("Expected a codepoint in hexadecimal base, got '%s'"):format(tostring(param)))
	end
	
	-- Labels such as "<control-0000>" would otherwise be parsed as HTML tags
	-- in wikitext, so the opening angle bracket is replaced by its entity.
	-- The assignment to a single local drops gsub's second result (the count).
	local name = p.lookup_name(codepoint):gsub("<", "&lt;")
	return name
end
-- Names of the Unicode planes that have one, keyed by plane number
-- (codepoint divided by 0x10000, rounded down).
-- Plane 14 (U+E0000..U+EFFFF) is the Supplementary Special-purpose Plane;
-- planes 15 and 16 are the two supplementary private use areas.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks
-- Stateless iterator step for the list of blocks: given the list and the
-- previous index, returns the next index followed by the first three fields
-- of that block's entry, or nothing once the list is exhausted.
local function block_iter(blocks, i)
	i = i + 1
	local data = blocks[i]
	if data then
		-- The fields are read by index instead of with unpack(), because
		-- unpack does not work on tables loaded with mw.loadData.
		return i, data[1], data[2], data[3]
	end
end
-- Iterator generator for the list of blocks, for use in a generic "for" in the
-- manner of ipairs. The block data module is loaded on first use.
function p.enum_blocks()
	if not blocks then
		blocks = mw.loadData("Module:Unicode data/blocks")
	end
	return block_iter, blocks, 0
end
-- Returns the name of the plane containing the codepoint, falling back to
-- "Plane N" for planes that have no entry in the planes table.
function p.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end
-- Returns the third field (the name) of the block whose range contains the
-- codepoint, and raises an error if no block does.
-- A binary search is used so that looking up the higher codepoints does not
-- require walking the entire table.
function p.lookup_block(codepoint)
	if not blocks then
		blocks = mw.loadData("Module:Unicode data/blocks")
	end
	
	local lower = 1
	local upper = blocks.length or #blocks
	while lower <= upper do
		local middle = floor((lower + upper) / 2)
		local block = blocks[middle]
		if codepoint < block[1] then
			-- Codepoint lies before this block: continue in the lower half.
			upper = middle - 1
		elseif codepoint <= block[2] then
			return block[3]
		else
			-- Codepoint lies after this block: continue in the upper half.
			lower = middle + 1
		end
	end
	
	error(string.format("No block found for codepoint U+%04X.", codepoint))
end
-- Returns the first and last codepoint of the block with the given name, or
-- nothing if there is no such block. If several entries carry the name, the
-- last one in the list is used.
function p.get_block_range(name)
	if not blocks then
		blocks = mw.loadData("Module:Unicode data/blocks")
	end
	
	local found
	for _, entry in ipairs(blocks) do
		if entry[3] == name then
			found = entry
		end
	end
	
	if not found then
		return
	end
	return found[1], found[2]
end
-- Returns true if the pagename contains none of the characters rejected below,
-- consists only of characters that p.is_printable accepts, and contains at
-- least one character that is not classed as "space-separator".
function p.is_valid_pagename(pagename)
	-- Individual codepoints that are rejected outright.
	local rejected = {
		[0x0023] = true, -- #
		[0x005B] = true, -- [
		[0x005D] = true, -- ]
		[0x007B] = true, -- {
		[0x007C] = true, -- |
		[0x007D] = true, -- }
		[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xFFFD] = true, -- REPLACEMENT CHARACTER
	}
	
	local found_non_space = false
	for cp in mw.ustring.gcodepoint(pagename) do
		-- U+2000..U+200A are the spaces in the General Punctuation block.
		if rejected[cp] or (0x2000 <= cp and cp <= 0x200A) then
			return false
		end
		
		local printable, category = p.is_printable(cp)
		if not printable then
			return false
		end
		
		if category ~= "space-separator" then
			found_non_space = true
		end
	end
	
	return found_non_space
end
-- Returns the items of the array "what" as multiple values, starting at index
-- "from" (default 1). The items are gathered with ipairs into a fresh table
-- first, and that copy is what gets passed to unpack.
local function manual_unpack(what, from)
	from = from or 1
	
	local copied, count = {}, 0
	for index, item in ipairs(what) do
		if index >= from then
			count = count + 1
			copied[count] = item
		end
	end
	
	return unpack(copied)
end
-- Creates a function to look up data in a module that contains "singles" (a
-- codepoint-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high codepoints of a range and the data associated with that
-- range).
-- "loader" is called once, on the first lookup, and must return the singles
-- map and the ranges array, in that order. The ranges array is searched with a
-- binary search, so it has to be sorted by codepoint.
-- "match_func" receives the codepoint followed by the data that was found, and
-- its result is the result of the lookup.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges
	
	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end
		
		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end
		
		-- Ranges found by earlier calls, including gaps that carry the
		-- default data.
		for _, range in pairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end
		
		local low, mid, high
		-- Same fallback as in lookup_block, in case the data module does not
		-- supply an explicit length field.
		low, high = 1, ranges.length or #ranges
		
		while low <= high do
			mid = floor((low + high) / 2)
			local range = ranges[mid]
			if codepoint < range[1] then
				high = mid - 1
			elseif codepoint <= range[2] then
				-- Copy with manual_unpack (ipairs-based) rather than unpack.
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			else
				low = mid + 1
			end
		end
		
		-- No range matched. "mid" is the last range that was examined, so the
		-- codepoint lies in the gap directly after or directly before it;
		-- cache that gap together with the default data.
		if ranges[mid] then
			local dots_range
			if codepoint > ranges[mid][2] then
				dots_range = {
					ranges[mid][2] + 1,
					ranges[mid + 1] and ranges[mid + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < range[mid][1]
				dots_range = {
					ranges[mid - 1] and ranges[mid - 1][2] + 1 or 0,
					ranges[mid][1] - 1,
					unpack(dots)
				}
			end
			table.insert(cache, dots_range)
		end
		
		--[[
		mw.log(unpack(require "Module:Utility".map(
			function (range)
				return ("U+%04X-U+%04X: %s"):format(unpack(range))
			end,
			cache)))
		--]]
		
		-- Pass the default data, exactly as a later hit on the cached gap
		-- does, so that the first and the repeated lookup agree.
		return match_func(codepoint, unpack(dots))
	end
end
-- Look up a codepoint's combining class value in
-- [[Module:Unicode data/combining]] and report whether it is non-zero. Zero is
-- the default that is used when the data module has no value for the
-- codepoint.
-- In other words: true if the character is combining, false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	function ()
		local combining_data = mw.loadData('Module:Unicode data/combining')
		return combining_data.single, combining_data.ranges
	end,
	function (codepoint, combining_class)
		if combining_class then
			return combining_class ~= 0
		end
		return false
	end,
	0
)
-- Returns the string with a dotted circle inserted in front of every
-- character for which p.is_combining is true.
function p.add_dotted_circle(str)
	local function prefix_if_combining(char)
		if p.is_combining(mw.ustring.codepoint(char)) then
			return '◌' .. char
		end
		-- Returning nothing leaves the character unchanged.
	end
	
	-- Only the resulting string is returned, not the substitution count.
	local result = mw.ustring.gsub(str, ".", prefix_if_combining)
	return result
end
-- Looks up the value recorded for a codepoint in
-- [[Module:Unicode data/control]]; codepoints without a value there are
-- reported as "assigned".
local lookup_control = memo_lookup(
	function ()
		local control_data = mw.loadData('Module:Unicode data/control')
		return control_data.single, control_data.ranges
	end,
	function (codepoint, ccc)
		if ccc then
			return ccc
		end
		return "assigned"
	end,
	"assigned"
)
-- True unless the control data marks the codepoint as "unassigned".
function p.is_assigned(codepoint)
	local category = lookup_control(codepoint)
	return not (category == "unassigned")
end
-- Returns two values: whether the codepoint's control-data value is "assigned"
-- or "space-separator", and that value itself.
function p.is_printable(codepoint)
	local category = lookup_control(codepoint)
	local printable = category == "assigned" or category == "space-separator"
	return printable, category
end
-- Returns two values: whether the codepoint's control-data value is
-- "space-separator", and that value itself.
function p.is_whitespace(codepoint)
	local category = lookup_control(codepoint)
	local is_space = category == "space-separator"
	return is_space, category
end
-- Entry titles to use for individual codepoints in place of the character
-- itself, keyed by codepoint.
local unsupported_title = {
	[0x0020] = "Unsupported titles/Space",
	[0x0023] = "Unsupported titles/Number sign",          -- #
	[0x002E] = "Unsupported titles/Full stop",            -- .
	[0x003A] = "Unsupported titles/Colon",                -- :
	[0x003C] = "Unsupported titles/Less than",            -- <
	[0x003E] = "Unsupported titles/Greater than",         -- >
	[0x005B] = "Unsupported titles/Left square bracket",  -- [
	[0x005D] = "Unsupported titles/Right square bracket", -- ]
	[0x005F] = "Unsupported titles/Low line",             -- _
	[0x007B] = "Unsupported titles/Left curly bracket",   -- {
	[0x007C] = "Unsupported titles/Vertical line",        -- |
	[0x007D] = "Unsupported titles/Right curly bracket",  -- }
	[0x1680] = "Unsupported titles/Ogham space",
	[0xFFFD] = "Unsupported titles/Replacement character",
}
-- Returns the entry title for a codepoint: the special title from
-- unsupported_title if there is one, otherwise the character itself when the
-- control data reports the codepoint as "assigned", and nil in every other
-- case.
function p.get_entry_title(codepoint)
	local special_title = unsupported_title[codepoint]
	if special_title then
		return special_title
	end
	
	if lookup_control(codepoint) == "assigned" then
		return mw.ustring.char(codepoint)
	end
	return nil
end
return p







