Module:Unicode data: Difference between revisions
From All Skies Encyclopaedia
imported>Erutuon (update) |
imported>Pppery (Add KCantonese lookup per request) |
||
(62 intermediate revisions by 6 users not shown) | |||
Line 1: | Line 1: | ||
local |
local p = {} |
||
local floor = math.floor |
local floor = math.floor |
||
local function errorf(level, ...) |
|||
-- http://www.unicode.org/Public/UNIDATA/Jamo.txt |
|||
if type(level) == "number" then |
|||
-- For the algorithm used here, see Hangul Syllable Name Generation |
|||
return error(string.format(...), level + 1) |
|||
-- in section 3.12 of the Unicode Specification. |
|||
else -- level is actually the format string. |
|||
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf |
|||
return error(string.format(level, ...), 2) |
|||
local hangul_leads = { |
|||
end |
|||
[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", |
|||
end |
|||
"", "J", "JJ", "C", "K", "T", "P", "H" |
|||
} |
|||
local function binary_range_search(codepoint, ranges) |
|||
local hangul_vowels = { |
|||
local low, mid, high |
|||
[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", |
|||
low, high = 1, ranges.length or require "Module:TableTools".length(ranges) |
|||
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", |
|||
while low <= high do |
|||
"I" |
|||
mid = floor((low + high) / 2) |
|||
} |
|||
local range = ranges[mid] |
|||
if codepoint < range[1] then |
|||
high = mid - 1 |
|||
elseif codepoint <= range[2] then |
|||
return range, mid |
|||
else |
|||
low = mid + 1 |
|||
end |
|||
end |
|||
return nil, mid |
|||
end |
|||
p.binary_range_search = binary_range_search |
|||
--[[ |
|||
local hangul_trails = { |
|||
local function linear_range_search(codepoint, ranges) |
|||
[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", |
|||
for i, range in ipairs(ranges) do |
|||
"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", |
|||
if range[1] <= codepoint and codepoint <= range[2] then |
|||
"T", "P", "H" |
|||
return range |
|||
} |
|||
end |
|||
end |
|||
end |
|||
--]] |
|||
-- Load a module by indexing "loader" with the name of the module minus the |
|||
-- "Module:Unicode data/" part. For instance, loader.blocks returns |
|||
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be |
|||
-- returned. |
|||
local loader = setmetatable({}, { |
|||
__index = function (self, key) |
|||
local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key) |
|||
if not success then |
|||
data = false |
|||
end |
|||
self[key] = data |
|||
return data |
|||
end |
|||
}) |
|||
-- For the algorithm used to generate Hangul Syllable names, |
|||
-- see "Hangul Syllable Name Generation" in section 3.12 of the |
|||
-- Unicode Specification: |
|||
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf |
|||
local name_hooks = { |
local name_hooks = { |
||
{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters |
{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters |
||
{ 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters |
{ 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters |
||
{ 0x3400, |
{ 0x3400, 0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A |
||
{ 0x4E00, |
{ 0x4E00, 0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph |
||
{ 0xAC00, 0xD7A3, function (codepoint) |
{ 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables |
||
local |
local Hangul_data = loader.Hangul |
||
local syllable_index = codepoint - 0xAC00 |
|||
-- lead index, vowel index, trail index |
|||
local li, vi, ti = m_hangul.syllableIndex2JamoIndices( |
|||
codepoint - 0xAC00 |
|||
) |
|||
return ("HANGUL SYLLABLE %s%s%s"):format( |
return ("HANGUL SYLLABLE %s%s%s"):format( |
||
Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)], |
|||
hangul_leads[li], -- I hate one-based indexing |
|||
Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count) |
|||
hangul_vowels[vi], |
|||
/ Hangul_data.trail_count)], |
|||
hangul_trails[ti] -- never mind, I can live with it |
|||
Hangul_data.trails[syllable_index % Hangul_data.trail_count] |
|||
) |
) |
||
end }, |
end }, |
||
-- High Surrogates, High Private Use Surrogates, Low Surrogates |
|||
{ 0xD800, 0xDB7F, "<surrogate-%04X>" }, -- Non Private Use High Surrogate |
|||
{ |
{ 0xD800, 0xDFFF, "<surrogate-%04X>" }, |
||
{ 0xDC00, 0xDFFF, "<surrogate-%04X>" }, -- Low Surrogate |
|||
{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use |
{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use |
||
-- CJK Compatibility Ideographs |
|||
{ 0x17000, 0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut |
|||
{ |
{ 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, |
||
{ |
{ 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, |
||
{ |
{ 0x17000, 0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph |
||
{ 0x18800, 0x18AFF, function (codepoint) |
|||
{ 0x2A740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D |
|||
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) |
|||
{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E |
|||
end }, |
|||
{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F --add v10 |
|||
{ |
{ 0x18D00, 0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement |
||
{ |
{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu |
||
{ |
{ 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B |
||
{ 0x2A700, 0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C |
|||
{ 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D |
|||
{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E |
|||
{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F |
|||
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane) |
|||
{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, |
|||
{ 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement |
|||
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) |
|||
end}, |
|||
{ 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G |
|||
{ 0x31350, 0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H |
|||
{ 0x2EBF0, 0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I |
|||
{ 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use |
|||
{ 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use |
|||
} |
} |
||
name_hooks.length = #name_hooks |
|||
local name_range_cache |
local name_range_cache |
||
Line 68: | Line 114: | ||
end |
end |
||
--[[ |
|||
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 |
|||
-- Checks that the code point is a number and in range. |
|||
function export.lookup_name(codepoint) |
|||
-- Does not check whether code point is an integer. |
|||
-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are noncharacters: |
|||
-- Not used |
|||
local function check_codepoint(funcName, argIdx, val) |
|||
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number') |
|||
if codepoint < 0 or 0x10FFFF < codepoint then |
|||
errorf("Codepoint %04X out of range", codepoint) |
|||
end |
|||
end |
|||
--]] |
|||
function p.is_noncharacter(codepoint) |
|||
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned |
|||
-- (Cn) and specifically noncharacters: |
|||
-- https://www.unicode.org/faq/private_use.html#nonchar4 |
-- https://www.unicode.org/faq/private_use.html#nonchar4 |
||
return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF |
|||
or |
or floor(codepoint % 0x10000) >= 0xFFFE) |
||
end |
|||
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 |
|||
function p.lookup_name(codepoint) |
|||
if p.is_noncharacter(codepoint) then |
|||
return ("<noncharacter-%04X>"):format(codepoint) |
return ("<noncharacter-%04X>"):format(codepoint) |
||
end |
end |
||
if name_range_cache |
if name_range_cache -- Check if previously used "name hook" applies to this code point. |
||
and codepoint >= name_range_cache[1] |
|||
and codepoint <= name_range_cache[2] then |
|||
return generate_name(name_range_cache[3], codepoint) |
|||
end |
|||
end |
end |
||
local range = binary_range_search(codepoint, name_hooks) |
|||
if range then |
|||
name_range_cache = range |
|||
break |
|||
return generate_name(range[3], codepoint) |
|||
elseif codepoint <= item[2] then |
|||
name_range_cache = item |
|||
return generate_name(item[3], codepoint) |
|||
end |
|||
end |
end |
||
local |
local data = loader[('names/%03X'):format(codepoint / 0x1000)] |
||
('Module:Unicode data/names/%03X'):format(codepoint / 0x1000)) |
|||
if |
if data and data[codepoint] then |
||
return data[codepoint] |
return data[codepoint] |
||
-- Unassigned (Cn) |
-- Unassigned (Cn) consists of noncharacters and reserved characters. |
||
-- The character |
-- The character has been established not to be a noncharacter, |
||
-- and if it were assigned, its name would already have been retrieved, |
|||
-- would already have been retrieved, so it must be reserved. |
|||
-- so it must be reserved. |
|||
else |
else |
||
return ("<reserved-%04X>"):format(codepoint) |
return ("<reserved-%04X>"):format(codepoint) |
||
Line 106: | Line 167: | ||
end |
end |
||
function |
function p.lookup_image(codepoint) |
||
local |
local data = loader[('images/%03X'):format(codepoint / 0x1000)] |
||
('Module:Unicode data/images/%03X'):format(codepoint / 0x1000) |
|||
) |
|||
if |
if data then |
||
return data[codepoint] |
return data[codepoint] |
||
end |
end |
||
end |
|||
function export.template_lookup_name(frame) |
|||
local codepoint = tonumber(frame.args[1] or frame:getParent().args[1]) |
|||
local name = export.lookup_name(codepoint) |
|||
return name:gsub("<", "<") |
|||
end |
end |
||
Line 126: | Line 179: | ||
[ 1] = "Supplementary Multilingual Plane"; |
[ 1] = "Supplementary Multilingual Plane"; |
||
[ 2] = "Supplementary Ideographic Plane"; |
[ 2] = "Supplementary Ideographic Plane"; |
||
[ |
[ 3] = "Tertiary Ideographic Plane"; |
||
[14] = "Supplementary |
[14] = "Supplementary Special-purpose Plane"; |
||
[15] = "Supplementary Private Use Area- |
[15] = "Supplementary Private Use Area-A"; |
||
[16] = "Supplementary Private Use Area-B"; |
|||
} |
} |
||
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable. |
|||
-- http://www.unicode.org/Public/UNIDATA/Blocks.txt |
|||
local blocks |
|||
-- This should be kept synchronized with [[Module:category tree/scriptcatboiler/blocks]]. |
|||
local blocks = { |
|||
{ "Basic Latin", 0x000000, 0x00007F }, |
|||
{ "Latin-1 Supplement", 0x000080, 0x0000FF }, |
|||
{ "Latin Extended-A", 0x000100, 0x00017F }, |
|||
{ "Latin Extended-B", 0x000180, 0x00024F }, |
|||
{ "IPA Extensions", 0x000250, 0x0002AF }, |
|||
{ "Spacing Modifier Letters", 0x0002B0, 0x0002FF }, |
|||
{ "Combining Diacritical Marks", 0x000300, 0x00036F }, |
|||
{ "Greek and Coptic", 0x000370, 0x0003FF }, |
|||
{ "Cyrillic", 0x000400, 0x0004FF }, |
|||
{ "Cyrillic Supplement", 0x000500, 0x00052F }, |
|||
{ "Armenian", 0x000530, 0x00058F }, |
|||
{ "Hebrew", 0x000590, 0x0005FF }, |
|||
{ "Arabic", 0x000600, 0x0006FF }, |
|||
{ "Syriac", 0x000700, 0x00074F }, |
|||
{ "Arabic Supplement", 0x000750, 0x00077F }, |
|||
{ "Thaana", 0x000780, 0x0007BF }, |
|||
{ "NKo", 0x0007C0, 0x0007FF }, |
|||
{ "Samaritan", 0x000800, 0x00083F }, |
|||
{ "Mandaic", 0x000840, 0x00085F }, |
|||
{ "Syriac Supplement", 0x000860, 0x00086F }, |
|||
{ "Arabic Extended-A", 0x0008A0, 0x0008FF }, |
|||
{ "Devanagari", 0x000900, 0x00097F }, |
|||
{ "Bengali", 0x000980, 0x0009FF }, |
|||
{ "Gurmukhi", 0x000A00, 0x000A7F }, |
|||
{ "Gujarati", 0x000A80, 0x000AFF }, |
|||
{ "Oriya", 0x000B00, 0x000B7F }, |
|||
{ "Tamil", 0x000B80, 0x000BFF }, |
|||
{ "Telugu", 0x000C00, 0x000C7F }, |
|||
{ "Kannada", 0x000C80, 0x000CFF }, |
|||
{ "Malayalam", 0x000D00, 0x000D7F }, |
|||
{ "Sinhala", 0x000D80, 0x000DFF }, |
|||
{ "Thai", 0x000E00, 0x000E7F }, |
|||
{ "Lao", 0x000E80, 0x000EFF }, |
|||
{ "Tibetan", 0x000F00, 0x000FFF }, |
|||
{ "Myanmar", 0x001000, 0x00109F }, |
|||
{ "Georgian", 0x0010A0, 0x0010FF }, |
|||
{ "Hangul Jamo", 0x001100, 0x0011FF }, |
|||
{ "Ethiopic", 0x001200, 0x00137F }, |
|||
{ "Ethiopic Supplement", 0x001380, 0x00139F }, |
|||
{ "Cherokee", 0x0013A0, 0x0013FF }, |
|||
{ "Unified Canadian Aboriginal Syllabics", 0x001400, 0x00167F }, |
|||
{ "Ogham", 0x001680, 0x00169F }, |
|||
{ "Runic", 0x0016A0, 0x0016FF }, |
|||
{ "Tagalog", 0x001700, 0x00171F }, |
|||
{ "Hanunoo", 0x001720, 0x00173F }, |
|||
{ "Buhid", 0x001740, 0x00175F }, |
|||
{ "Tagbanwa", 0x001760, 0x00177F }, |
|||
{ "Khmer", 0x001780, 0x0017FF }, |
|||
{ "Mongolian", 0x001800, 0x0018AF }, |
|||
{ "Unified Canadian Aboriginal Syllabics Extended", 0x0018B0, 0x0018FF }, |
|||
{ "Limbu", 0x001900, 0x00194F }, |
|||
{ "Tai Le", 0x001950, 0x00197F }, |
|||
{ "New Tai Lue", 0x001980, 0x0019DF }, |
|||
{ "Khmer Symbols", 0x0019E0, 0x0019FF }, |
|||
{ "Buginese", 0x001A00, 0x001A1F }, |
|||
{ "Tai Tham", 0x001A20, 0x001AAF }, |
|||
{ "Combining Diacritical Marks Extended", 0x001AB0, 0x001AFF }, |
|||
{ "Balinese", 0x001B00, 0x001B7F }, |
|||
{ "Sundanese", 0x001B80, 0x001BBF }, |
|||
{ "Batak", 0x001BC0, 0x001BFF }, |
|||
{ "Lepcha", 0x001C00, 0x001C4F }, |
|||
{ "Ol Chiki", 0x001C50, 0x001C7F }, |
|||
{ "Cyrillic Extended-C", 0x001C80, 0x001C8F }, |
|||
{ "Georgian Extended", 0x001C90, 0x001CBF }, |
|||
{ "Sundanese Supplement", 0x001CC0, 0x001CCF }, |
|||
{ "Vedic Extensions", 0x001CD0, 0x001CFF }, |
|||
{ "Phonetic Extensions", 0x001D00, 0x001D7F }, |
|||
{ "Phonetic Extensions Supplement", 0x001D80, 0x001DBF }, |
|||
{ "Combining Diacritical Marks Supplement", 0x001DC0, 0x001DFF }, |
|||
{ "Latin Extended Additional", 0x001E00, 0x001EFF }, |
|||
{ "Greek Extended", 0x001F00, 0x001FFF }, |
|||
{ "General Punctuation", 0x002000, 0x00206F }, |
|||
{ "Superscripts and Subscripts", 0x002070, 0x00209F }, |
|||
{ "Currency Symbols", 0x0020A0, 0x0020CF }, |
|||
{ "Combining Diacritical Marks for Symbols", 0x0020D0, 0x0020FF }, |
|||
{ "Letterlike Symbols", 0x002100, 0x00214F }, |
|||
{ "Number Forms", 0x002150, 0x00218F }, |
|||
{ "Arrows", 0x002190, 0x0021FF }, |
|||
{ "Mathematical Operators", 0x002200, 0x0022FF }, |
|||
{ "Miscellaneous Technical", 0x002300, 0x0023FF }, |
|||
{ "Control Pictures", 0x002400, 0x00243F }, |
|||
{ "Optical Character Recognition", 0x002440, 0x00245F }, |
|||
{ "Enclosed Alphanumerics", 0x002460, 0x0024FF }, |
|||
{ "Box Drawing", 0x002500, 0x00257F }, |
|||
{ "Block Elements", 0x002580, 0x00259F }, |
|||
{ "Geometric Shapes", 0x0025A0, 0x0025FF }, |
|||
{ "Miscellaneous Symbols", 0x002600, 0x0026FF }, |
|||
{ "Dingbats", 0x002700, 0x0027BF }, |
|||
{ "Miscellaneous Mathematical Symbols-A", 0x0027C0, 0x0027EF }, |
|||
{ "Supplemental Arrows-A", 0x0027F0, 0x0027FF }, |
|||
{ "Braille Patterns", 0x002800, 0x0028FF }, |
|||
{ "Supplemental Arrows-B", 0x002900, 0x00297F }, |
|||
{ "Miscellaneous Mathematical Symbols-B", 0x002980, 0x0029FF }, |
|||
{ "Supplemental Mathematical Operators", 0x002A00, 0x002AFF }, |
|||
{ "Miscellaneous Symbols and Arrows", 0x002B00, 0x002BFF }, |
|||
{ "Glagolitic", 0x002C00, 0x002C5F }, |
|||
{ "Latin Extended-C", 0x002C60, 0x002C7F }, |
|||
{ "Coptic", 0x002C80, 0x002CFF }, |
|||
{ "Georgian Supplement", 0x002D00, 0x002D2F }, |
|||
{ "Tifinagh", 0x002D30, 0x002D7F }, |
|||
{ "Ethiopic Extended", 0x002D80, 0x002DDF }, |
|||
{ "Cyrillic Extended-A", 0x002DE0, 0x002DFF }, |
|||
{ "Supplemental Punctuation", 0x002E00, 0x002E7F }, |
|||
{ "CJK Radicals Supplement", 0x002E80, 0x002EFF }, |
|||
{ "Kangxi Radicals", 0x002F00, 0x002FDF }, |
|||
{ "Ideographic Description Characters", 0x002FF0, 0x002FFF }, |
|||
{ "CJK Symbols and Punctuation", 0x003000, 0x00303F }, |
|||
{ "Hiragana", 0x003040, 0x00309F }, |
|||
{ "Katakana", 0x0030A0, 0x0030FF }, |
|||
{ "Bopomofo", 0x003100, 0x00312F }, |
|||
{ "Hangul Compatibility Jamo", 0x003130, 0x00318F }, |
|||
{ "Kanbun", 0x003190, 0x00319F }, |
|||
{ "Bopomofo Extended", 0x0031A0, 0x0031BF }, |
|||
{ "CJK Strokes", 0x0031C0, 0x0031EF }, |
|||
{ "Katakana Phonetic Extensions", 0x0031F0, 0x0031FF }, |
|||
{ "Enclosed CJK Letters and Months", 0x003200, 0x0032FF }, |
|||
{ "CJK Compatibility", 0x003300, 0x0033FF }, |
|||
{ "CJK Unified Ideographs Extension A", 0x003400, 0x004DBF }, |
|||
{ "Yijing Hexagram Symbols", 0x004DC0, 0x004DFF }, |
|||
{ "CJK Unified Ideographs", 0x004E00, 0x009FFF }, |
|||
{ "Yi Syllables", 0x00A000, 0x00A48F }, |
|||
{ "Yi Radicals", 0x00A490, 0x00A4CF }, |
|||
{ "Lisu", 0x00A4D0, 0x00A4FF }, |
|||
{ "Vai", 0x00A500, 0x00A63F }, |
|||
{ "Cyrillic Extended-B", 0x00A640, 0x00A69F }, |
|||
{ "Bamum", 0x00A6A0, 0x00A6FF }, |
|||
{ "Modifier Tone Letters", 0x00A700, 0x00A71F }, |
|||
{ "Latin Extended-D", 0x00A720, 0x00A7FF }, |
|||
{ "Syloti Nagri", 0x00A800, 0x00A82F }, |
|||
{ "Common Indic Number Forms", 0x00A830, 0x00A83F }, |
|||
{ "Phags-pa", 0x00A840, 0x00A87F }, |
|||
{ "Saurashtra", 0x00A880, 0x00A8DF }, |
|||
{ "Devanagari Extended", 0x00A8E0, 0x00A8FF }, |
|||
{ "Kayah Li", 0x00A900, 0x00A92F }, |
|||
{ "Rejang", 0x00A930, 0x00A95F }, |
|||
{ "Hangul Jamo Extended-A", 0x00A960, 0x00A97F }, |
|||
{ "Javanese", 0x00A980, 0x00A9DF }, |
|||
{ "Myanmar Extended-B", 0x00A9E0, 0x00A9FF }, |
|||
{ "Cham", 0x00AA00, 0x00AA5F }, |
|||
{ "Myanmar Extended-A", 0x00AA60, 0x00AA7F }, |
|||
{ "Tai Viet", 0x00AA80, 0x00AADF }, |
|||
{ "Meetei Mayek Extensions", 0x00AAE0, 0x00AAFF }, |
|||
{ "Ethiopic Extended-A", 0x00AB00, 0x00AB2F }, |
|||
{ "Latin Extended-E", 0x00AB30, 0x00AB6F }, |
|||
{ "Cherokee Supplement", 0x00AB70, 0x00ABBF }, |
|||
{ "Meetei Mayek", 0x00ABC0, 0x00ABFF }, |
|||
{ "Hangul Syllables", 0x00AC00, 0x00D7AF }, |
|||
{ "Hangul Jamo Extended-B", 0x00D7B0, 0x00D7FF }, |
|||
{ "High Surrogates", 0x00D800, 0x00DB7F }, |
|||
{ "High Private Use Surrogates", 0x00DB80, 0x00DBFF }, |
|||
{ "Low Surrogates", 0x00DC00, 0x00DFFF }, |
|||
{ "Private Use Area", 0x00E000, 0x00F8FF }, |
|||
{ "CJK Compatibility Ideographs", 0x00F900, 0x00FAFF }, |
|||
{ "Alphabetic Presentation Forms", 0x00FB00, 0x00FB4F }, |
|||
{ "Arabic Presentation Forms-A", 0x00FB50, 0x00FDFF }, |
|||
{ "Variation Selectors", 0x00FE00, 0x00FE0F }, |
|||
{ "Vertical Forms", 0x00FE10, 0x00FE1F }, |
|||
{ "Combining Half Marks", 0x00FE20, 0x00FE2F }, |
|||
{ "CJK Compatibility Forms", 0x00FE30, 0x00FE4F }, |
|||
{ "Small Form Variants", 0x00FE50, 0x00FE6F }, |
|||
{ "Arabic Presentation Forms-B", 0x00FE70, 0x00FEFF }, |
|||
{ "Halfwidth and Fullwidth Forms", 0x00FF00, 0x00FFEF }, |
|||
{ "Specials", 0x00FFF0, 0x00FFFF }, |
|||
{ "Linear B Syllabary", 0x010000, 0x01007F }, |
|||
{ "Linear B Ideograms", 0x010080, 0x0100FF }, |
|||
{ "Aegean Numbers", 0x010100, 0x01013F }, |
|||
{ "Ancient Greek Numbers", 0x010140, 0x01018F }, |
|||
{ "Ancient Symbols", 0x010190, 0x0101CF }, |
|||
{ "Phaistos Disc", 0x0101D0, 0x0101FF }, |
|||
{ "Lycian", 0x010280, 0x01029F }, |
|||
{ "Carian", 0x0102A0, 0x0102DF }, |
|||
{ "Coptic Epact Numbers", 0x0102E0, 0x0102FF }, |
|||
{ "Old Italic", 0x010300, 0x01032F }, |
|||
{ "Gothic", 0x010330, 0x01034F }, |
|||
{ "Old Permic", 0x010350, 0x01037F }, |
|||
{ "Ugaritic", 0x010380, 0x01039F }, |
|||
{ "Old Persian", 0x0103A0, 0x0103DF }, |
|||
{ "Deseret", 0x010400, 0x01044F }, |
|||
{ "Shavian", 0x010450, 0x01047F }, |
|||
{ "Osmanya", 0x010480, 0x0104AF }, |
|||
{ "Osage", 0x0104B0, 0x0104FF }, |
|||
{ "Elbasan", 0x010500, 0x01052F }, |
|||
{ "Caucasian Albanian", 0x010530, 0x01056F }, |
|||
{ "Linear A", 0x010600, 0x01077F }, |
|||
{ "Cypriot Syllabary", 0x010800, 0x01083F }, |
|||
{ "Imperial Aramaic", 0x010840, 0x01085F }, |
|||
{ "Palmyrene", 0x010860, 0x01087F }, |
|||
{ "Nabataean", 0x010880, 0x0108AF }, |
|||
{ "Hatran", 0x0108E0, 0x0108FF }, |
|||
{ "Phoenician", 0x010900, 0x01091F }, |
|||
{ "Lydian", 0x010920, 0x01093F }, |
|||
{ "Meroitic Hieroglyphs", 0x010980, 0x01099F }, |
|||
{ "Meroitic Cursive", 0x0109A0, 0x0109FF }, |
|||
{ "Kharoshthi", 0x010A00, 0x010A5F }, |
|||
{ "Old South Arabian", 0x010A60, 0x010A7F }, |
|||
{ "Old North Arabian", 0x010A80, 0x010A9F }, |
|||
{ "Manichaean", 0x010AC0, 0x010AFF }, |
|||
{ "Avestan", 0x010B00, 0x010B3F }, |
|||
{ "Inscriptional Parthian", 0x010B40, 0x010B5F }, |
|||
{ "Inscriptional Pahlavi", 0x010B60, 0x010B7F }, |
|||
{ "Psalter Pahlavi", 0x010B80, 0x010BAF }, |
|||
{ "Old Turkic", 0x010C00, 0x010C4F }, |
|||
{ "Old Hungarian", 0x010C80, 0x010CFF }, |
|||
{ "Hanifi Rohingya", 0x010D00, 0x010D3F }, |
|||
{ "Rumi Numeral Symbols", 0x010E60, 0x010E7F }, |
|||
{ "Old Sogdian", 0x010F00, 0x010F2F }, |
|||
{ "Sogdian", 0x010F30, 0x010F6F }, |
|||
{ "Brahmi", 0x011000, 0x01107F }, |
|||
{ "Kaithi", 0x011080, 0x0110CF }, |
|||
{ "Sora Sompeng", 0x0110D0, 0x0110FF }, |
|||
{ "Chakma", 0x011100, 0x01114F }, |
|||
{ "Mahajani", 0x011150, 0x01117F }, |
|||
{ "Sharada", 0x011180, 0x0111DF }, |
|||
{ "Sinhala Archaic Numbers", 0x0111E0, 0x0111FF }, |
|||
{ "Khojki", 0x011200, 0x01124F }, |
|||
{ "Multani", 0x011280, 0x0112AF }, |
|||
{ "Khudawadi", 0x0112B0, 0x0112FF }, |
|||
{ "Grantha", 0x011300, 0x01137F }, |
|||
{ "Newa", 0x011400, 0x01147F }, |
|||
{ "Tirhuta", 0x011480, 0x0114DF }, |
|||
{ "Siddham", 0x011580, 0x0115FF }, |
|||
{ "Modi", 0x011600, 0x01165F }, |
|||
{ "Mongolian Supplement", 0x011660, 0x01167F }, |
|||
{ "Takri", 0x011680, 0x0116CF }, |
|||
{ "Ahom", 0x011700, 0x01173F }, |
|||
{ "Dogra", 0x011800, 0x01184F }, |
|||
{ "Warang Citi", 0x0118A0, 0x0118FF }, |
|||
{ "Zanabazar Square", 0x011A00, 0x011A4F }, |
|||
{ "Soyombo", 0x011A50, 0x011AAF }, |
|||
{ "Pau Cin Hau", 0x011AC0, 0x011AFF }, |
|||
{ "Bhaiksuki", 0x011C00, 0x011C6F }, |
|||
{ "Marchen", 0x011C70, 0x011CBF }, |
|||
{ "Masaram Gondi", 0x011D00, 0x011D5F }, |
|||
{ "Gunjala Gondi", 0x011D60, 0x011DAF }, |
|||
{ "Makasar", 0x011EE0, 0x011EFF }, |
|||
{ "Cuneiform", 0x012000, 0x0123FF }, |
|||
{ "Cuneiform Numbers and Punctuation", 0x012400, 0x01247F }, |
|||
{ "Early Dynastic Cuneiform", 0x012480, 0x01254F }, |
|||
{ "Egyptian Hieroglyphs", 0x013000, 0x01342F }, |
|||
{ "Anatolian Hieroglyphs", 0x014400, 0x01467F }, |
|||
{ "Bamum Supplement", 0x016800, 0x016A3F }, |
|||
{ "Mro", 0x016A40, 0x016A6F }, |
|||
{ "Bassa Vah", 0x016AD0, 0x016AFF }, |
|||
{ "Pahawh Hmong", 0x016B00, 0x016B8F }, |
|||
{ "Medefaidrin", 0x016E40, 0x016E9F }, |
|||
{ "Miao", 0x016F00, 0x016F9F }, |
|||
{ "Ideographic Symbols and Punctuation", 0x016FE0, 0x016FFF }, |
|||
{ "Tangut", 0x017000, 0x0187FF }, |
|||
{ "Tangut Components", 0x018800, 0x018AFF }, |
|||
{ "Kana Supplement", 0x01B000, 0x01B0FF }, |
|||
{ "Kana Extended-A", 0x01B100, 0x01B12F }, |
|||
{ "Nushu", 0x01B170, 0x01B2FF }, |
|||
{ "Duployan", 0x01BC00, 0x01BC9F }, |
|||
{ "Shorthand Format Controls", 0x01BCA0, 0x01BCAF }, |
|||
{ "Byzantine Musical Symbols", 0x01D000, 0x01D0FF }, |
|||
{ "Musical Symbols", 0x01D100, 0x01D1FF }, |
|||
{ "Ancient Greek Musical Notation", 0x01D200, 0x01D24F }, |
|||
{ "Mayan Numerals", 0x01D2E0, 0x01D2FF }, |
|||
{ "Tai Xuan Jing Symbols", 0x01D300, 0x01D35F }, |
|||
{ "Counting Rod Numerals", 0x01D360, 0x01D37F }, |
|||
{ "Mathematical Alphanumeric Symbols", 0x01D400, 0x01D7FF }, |
|||
{ "Sutton SignWriting", 0x01D800, 0x01DAAF }, |
|||
{ "Glagolitic Supplement", 0x01E000, 0x01E02F }, |
|||
{ "Mende Kikakui", 0x01E800, 0x01E8DF }, |
|||
{ "Adlam", 0x01E900, 0x01E95F }, |
|||
{ "Indic Siyaq Numbers", 0x01EC70, 0x01ECBF }, |
|||
{ "Arabic Mathematical Alphabetic Symbols", 0x01EE00, 0x01EEFF }, |
|||
{ "Mahjong Tiles", 0x01F000, 0x01F02F }, |
|||
{ "Domino Tiles", 0x01F030, 0x01F09F }, |
|||
{ "Playing Cards", 0x01F0A0, 0x01F0FF }, |
|||
{ "Enclosed Alphanumeric Supplement", 0x01F100, 0x01F1FF }, |
|||
{ "Enclosed Ideographic Supplement", 0x01F200, 0x01F2FF }, |
|||
{ "Miscellaneous Symbols and Pictographs", 0x01F300, 0x01F5FF }, |
|||
{ "Emoticons", 0x01F600, 0x01F64F }, |
|||
{ "Ornamental Dingbats", 0x01F650, 0x01F67F }, |
|||
{ "Transport and Map Symbols", 0x01F680, 0x01F6FF }, |
|||
{ "Alchemical Symbols", 0x01F700, 0x01F77F }, |
|||
{ "Geometric Shapes Extended", 0x01F780, 0x01F7FF }, |
|||
{ "Supplemental Arrows-C", 0x01F800, 0x01F8FF }, |
|||
{ "Supplemental Symbols and Pictographs", 0x01F900, 0x01F9FF }, |
|||
{ "Chess Symbols", 0x01FA00, 0x01FA6F }, |
|||
{ "CJK Unified Ideographs Extension B", 0x020000, 0x02A6DF }, |
|||
{ "CJK Unified Ideographs Extension C", 0x02A700, 0x02B73F }, |
|||
{ "CJK Unified Ideographs Extension D", 0x02B740, 0x02B81F }, |
|||
{ "CJK Unified Ideographs Extension E", 0x02B820, 0x02CEAF }, |
|||
{ "CJK Unified Ideographs Extension F", 0x02CEB0, 0x02EBEF }, |
|||
{ "CJK Compatibility Ideographs Supplement", 0x02F800, 0x02FA1F }, |
|||
{ "Tags", 0x0E0000, 0x0E007F }, |
|||
{ "Variation Selectors Supplement", 0x0E0100, 0x0E01EF }, |
|||
{ "Supplementary Private Use Area-A", 0x0F0000, 0x0FFFFF }, |
|||
{ "Supplementary Private Use Area-B", 0x100000, 0x10FFFF }, |
|||
} |
|||
blocks.length = #blocks |
|||
function |
local function block_iter(blocks, i) |
||
i = i + 1 |
|||
return function (blocks, i) |
|||
local data = blocks[i] |
|||
i = i + 1 |
|||
if data then |
|||
-- Unpack doesn't work on tables loaded with mw.loadData. |
|||
if not data then |
|||
return i, data[1], data[2], data[3] |
|||
end |
|||
return i, unpack(data) |
|||
end, blocks, 0 |
|||
end |
end |
||
-- An ipairs-type iterator generator for the list of blocks. |
|||
function export.lookup_plane(codepoint) |
|||
function p.enum_blocks() |
|||
local blocks = loader.blocks |
|||
return block_iter, blocks, 0 |
|||
end |
|||
function p.lookup_plane(codepoint) |
|||
local i = floor(codepoint / 0x10000) |
local i = floor(codepoint / 0x10000) |
||
return planes[i] or ("Plane %u"):format(i) |
return planes[i] or ("Plane %u"):format(i) |
||
end |
end |
||
function p.lookup_block(codepoint) |
|||
-- Binary search, to avoid iterating over entire table in order to look up the |
|||
local blocks = loader.blocks |
|||
-- higher codepoints. |
|||
local range = binary_range_search(codepoint, blocks) |
|||
if range then |
|||
local iStart, iEnd = 1, blocks.length or #blocks |
|||
return range[3] |
|||
while iStart <= iEnd do |
|||
else |
|||
local iMid = floor((iStart + iEnd) / 2) |
|||
return "No Block" |
|||
local range = blocks[iMid] |
|||
if codepoint < range[2] then |
|||
iEnd = iMid - 1 |
|||
elseif codepoint <= range[3] then |
|||
return range[1] |
|||
else |
|||
iStart = iMid + 1 |
|||
end |
|||
end |
end |
||
error(string.format("No block found for codepoint U+%04X.", codepoint)) |
|||
end |
end |
||
function |
function p.get_block_info(name) |
||
for i, block in ipairs(loader.blocks) do |
|||
local range |
|||
if block[3] == name then |
|||
return block |
|||
for i, block in ipairs(blocks) do |
|||
if block[1] == name then |
|||
range = block |
|||
end |
end |
||
end |
|||
if range then |
|||
return range[2], range[3] |
|||
end |
end |
||
end |
end |
||
function |
function p.is_valid_pagename(pagename) |
||
local has_nonws = false |
local has_nonws = false |
||
Line 493: | Line 243: | ||
end |
end |
||
local printable, result = |
local printable, result = p.is_printable(cp) |
||
if not printable then |
if not printable then |
||
return false |
return false |
||
Line 507: | Line 257: | ||
local function manual_unpack(what, from) |
local function manual_unpack(what, from) |
||
if what[from + 1] == nil then |
|||
return what[from] |
|||
end |
|||
local result = {} |
local result = {} |
||
from = from or 1 |
from = from or 1 |
||
Line 517: | Line 271: | ||
end |
end |
||
local function |
local function compare_ranges(range1, range2) |
||
return range1[1] < range2[1] |
|||
end |
|||
-- Creates a function to look up data in a module that contains "singles" (a |
|||
-- code point-to-data map) and "ranges" (an array containing arrays that contain |
|||
-- the low and high code points of a range and the data associated with that |
|||
-- range). |
|||
-- "loader" loads and returns the "singles" and "ranges" tables. |
|||
-- "match_func" is passed the code point and either the data or the "dots", and |
|||
-- generates the final result of the function. |
|||
-- The varargs ("dots") describes the default data to be returned if there wasn't |
|||
-- a match. |
|||
-- In case the function is used more than once, "cache" saves ranges that have |
|||
-- already been found to match, or a range whose data is the default if there |
|||
-- was no match. |
|||
local function memo_lookup(data_module_subpage, match_func, ...) |
|||
local dots = { ... } |
local dots = { ... } |
||
local cache = {} |
local cache = {} |
||
Line 524: | Line 294: | ||
return function (codepoint) |
return function (codepoint) |
||
if not singles then |
if not singles then |
||
local data_module = loader[data_module_subpage] |
|||
singles, ranges = data_module.singles, data_module.ranges |
|||
end |
end |
||
Line 531: | Line 302: | ||
end |
end |
||
local range = binary_range_search(codepoint, cache) |
|||
local lastlast = -1 |
|||
if range then |
|||
return match_func(codepoint, manual_unpack(range, 3)) |
|||
return match_func(codepoint, unpack(range, 3)) |
|||
end |
|||
end |
end |
||
local range, index = binary_range_search(codepoint, ranges) |
|||
if range then |
|||
table.insert(cache, range) |
|||
table.sort(cache, compare_ranges) |
|||
return match_func(codepoint, unpack(dots)) |
|||
return match_func(codepoint, manual_unpack(range, 3)) |
|||
end |
|||
table.insert(cache, { manual_unpack(range) }) |
|||
return match_func(codepoint, manual_unpack(range, 3)) |
|||
if ranges[index] then |
|||
else |
|||
local dots_range |
|||
lastlast = range[2] |
|||
if codepoint > ranges[index][2] then |
|||
dots_range = { |
|||
ranges[index][2] + 1, |
|||
ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF, |
|||
unpack(dots) |
|||
} |
|||
else -- codepoint < range[index][1] |
|||
dots_range = { |
|||
ranges[index - 1] and ranges[index - 1][2] + 1 or 0, |
|||
ranges[index][1] - 1, |
|||
unpack(dots) |
|||
} |
|||
end |
end |
||
table.sort(cache, compare_ranges) |
|||
end |
end |
||
return match_func(codepoint) |
return match_func(codepoint) |
||
end |
end |
||
end |
end |
||
-- Get a |
-- Get a code point's combining class value in [[Module:Unicode data/combining]], |
||
-- and return whether this value is not zero. Zero is assigned as the default |
-- and return whether this value is not zero. Zero is assigned as the default |
||
-- if the combining class value is not found in this data module. |
-- if the combining class value is not found in this data module. |
||
-- That is, return true if character is combining, or false if it is not. |
-- That is, return true if character is combining, or false if it is not. |
||
-- See |
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for |
||
-- more information. |
-- more information. |
||
p.is_combining = memo_lookup( |
|||
"combining", |
|||
local m_comb = mw.loadData('Module:Unicode data/combining') |
|||
function (codepoint, combining_class) |
|||
return m_comb.single, m_comb.ranges |
|||
return combining_class and combining_class ~= 0 or false |
|||
end, |
|||
return combining_class and combining_class ~= 0 |
|||
0) |
|||
or false |
|||
end, 0) |
|||
function |
function p.add_dotted_circle(str) |
||
return (mw.ustring.gsub(str, ".", |
return (mw.ustring.gsub(str, ".", |
||
function(char) |
function(char) |
||
if |
if p.is_combining(mw.ustring.codepoint(char)) then |
||
return '◌' .. char |
return '◌' .. char |
||
end |
end |
||
Line 577: | Line 358: | ||
end |
end |
||
local lookup_control = memo_lookup( |
local lookup_control = memo_lookup( |
||
"control", |
|||
local m_cc = mw.loadData('Module:Unicode data/control') |
|||
function (codepoint, ccc) |
|||
return m_cc.single, m_cc.ranges |
|||
return ccc or "assigned" |
|||
end, function (codepoint, ccc) |
|||
end, |
|||
return ccc or "assigned" |
|||
"assigned") |
|||
p.lookup_control = lookup_control |
|||
function |
function p.is_assigned(codepoint) |
||
return lookup_control(codepoint) ~= "unassigned" |
return lookup_control(codepoint) ~= "unassigned" |
||
end |
end |
||
function |
function p.is_printable(codepoint) |
||
local result = lookup_control(codepoint) |
local result = lookup_control(codepoint) |
||
return (result == "assigned") or (result == "space-separator"), result |
return (result == "assigned") or (result == "space-separator"), result |
||
end |
end |
||
function |
function p.is_whitespace(codepoint) |
||
local result = lookup_control(codepoint) |
local result = lookup_control(codepoint) |
||
return (result == "space-separator"), result |
return (result == "space-separator"), result |
||
end |
end |
||
p.lookup_category = memo_lookup( |
|||
-- to be used in language-neutral context only (e.g. character lists) |
|||
"category", |
|||
function (codepoint, category) |
|||
return category |
|||
end, |
|||
"Cn") |
|||
local lookup_script = memo_lookup( |
|||
local script_pats |
|||
"scripts", |
|||
function (codepoint, script_code) |
|||
return script_code or 'Zzzz' |
|||
end, |
|||
"Zzzz") |
|||
p.lookup_script = lookup_script |
|||
function p.get_best_script(str) |
|||
-- Scripts that consist entirely of characters from another script. |
|||
-- Check type of argument, because mw.text.decode coerces numbers to strings! |
|||
local script_blacklist = { |
|||
require "libraryUtil".checkType("get_best_script", 1, str, "string") |
|||
["Latf"] = true; |
|||
["Hans"] = true; |
|||
-- Convert HTML character references (including named character references, |
|||
["Hant"] = true; |
|||
-- or character entities) to characters. |
|||
["Kore"] = true; |
|||
str = mw.text.decode(str, true) |
|||
["Jpan"] = true; |
|||
["fa-Arab"] = true; |
|||
local scripts = {} |
|||
["kk-Arab"] = true; |
|||
for codepoint in mw.ustring.gcodepoint(str) do |
|||
["ks-Arab"] = true; |
|||
local script = lookup_script(codepoint) |
|||
["ku-Arab"] = true; |
|||
["mzn-Arab"] = true; |
|||
-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts. |
|||
["ota-Arab"] = true; |
|||
if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then |
|||
["pa-Arab"] = true; |
|||
scripts[script] = true |
|||
["sd-Arab"] = true; |
|||
["tt-Arab"] = true; |
|||
["ug-Arab"] = true; |
|||
["ur-Arab"] = true; |
|||
["nv-Latn"] = true; |
|||
["pjt-Latn"] = true; |
|||
["Zyyy"] = true; |
|||
} |
|||
--[[ |
|||
Problem scripts: Grek and polytonic, Cyrl and Cyrs, Latn and Latinx. |
|||
In each key-value pair, the value should take precedence over the key. |
|||
]] |
|||
local overridden_by = { |
|||
["Cyrs"] = "Cyrl", |
|||
["polytonic"] = "Grek", |
|||
["Latinx"] = "Latn", |
|||
} |
|||
local script_cache = {} |
|||
function export.get_script(codepoint) |
|||
local text |
|||
if type(codepoint) == "number" then |
|||
text = mw.ustring.char(codepoint) |
|||
elseif type(codepoint) == "string" then |
|||
text = codepoint |
|||
else |
|||
error("Argument to get_script should be a number (codepoint) or string.") |
|||
end |
|||
for pat, sc in pairs(script_cache) do |
|||
if mw.ustring.match(text, pat) and not overridden_by[sc] then |
|||
return sc |
|||
end |
end |
||
end |
end |
||
-- If scripts does not contain two or more keys, |
|||
-- return first and only key (script code) in table. |
|||
if not next(scripts, next(scripts)) then |
|||
return next(scripts) |
|||
end -- else return majority script, or else "Zzzz"? |
|||
end |
|||
function p.is_Latin(str) |
|||
if not script_pats then |
|||
require "libraryUtil".checkType("get_best_script", 1, str, "string") |
|||
local m_scripts = mw.loadData("Module:scripts/data") |
|||
str = mw.text.decode(str, true) |
|||
script_pats = {} |
|||
for sc, info in pairs(m_scripts) do |
|||
-- Search for the leading bytes that introduce the UTF-8 encoding of the |
|||
if info.characters and not script_blacklist[sc] then |
|||
-- code points U+0340-U+10FFFF. If they are not found and there is at least |
|||
script_pats[sc] = "[" .. info.characters .. "]" |
|||
-- one Latin-script character, the string counts as Latin, because the rest |
|||
-- of the characters can only be Zyyy, Zinh, and Zzzz. |
|||
-- The only scripts found below U+0370 (the first code point of the Greek |
|||
-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz. |
|||
-- See the codepage in the [[UTF-8]] article. |
|||
if not str:find "[\205-\244]" then |
|||
for codepoint in mw.ustring.gcodepoint(str) do |
|||
if lookup_script(codepoint) == "Latn" then |
|||
return true |
|||
end |
end |
||
end |
end |
||
end |
end |
||
local Latn = false |
|||
for sc, pat in pairs(script_pats) do |
|||
local i = 0; -- indexer for use in error messages |
|||
if mw.ustring.match(text, pat) then |
|||
local overriding = overridden_by[sc] |
|||
for codepoint in mw.ustring.gcodepoint(str) do |
|||
if overriding and script_pats[overriding] and mw.ustring.match(text, script_pats[overriding]) then |
|||
i = i + 1; -- bump the indexer |
|||
script_cache[script_pats[overriding]] = overriding |
|||
local script = lookup_script(codepoint) |
|||
return overriding |
|||
if script == "Latn" then |
|||
script_cache[pat] = sc |
|||
Latn = true |
|||
elseif not (script == "Zyyy" or script == "Zinh" |
|||
end |
|||
or script == "Zzzz") then |
|||
return false, i -- abandon as not Latn; identify the offending character's position |
|||
end |
end |
||
end |
end |
||
return Latn, (not Latn and i) or nil -- when <Latn> false, return offending charactor's position as second return value; nil else |
|||
return "None" |
|||
end |
end |
||
-- Checks that a string contains only characters belonging to right-to-left |
|||
local function sortRange(range1, range2) |
|||
-- scripts, or characters of ignorable scripts. |
|||
return range1[1] < range2[1] |
|||
function p.is_rtl(str) |
|||
end |
|||
require "libraryUtil".checkType("get_best_script", 1, str, "string") |
|||
str = mw.text.decode(str, true) |
|||
--[[ |
|||
Binary search: more efficient for the longer lists of codepoint ranges than |
|||
-- Search for the leading bytes that introduce the UTF-8 encoding of the |
|||
for the shorter ones. |
|||
-- code points U+0580-U+10FFFF. If they are not found, the string can only |
|||
]] |
|||
-- have characters from a left-to-right script, because the first code point |
|||
local function binary_search(ranges, value) |
|||
-- in a right-to-left script is U+0591, in the Hebrew block. |
|||
if not ranges then |
|||
if not str:find "[\214-\244]" then |
|||
return nil |
|||
return false |
|||
end |
end |
||
-- Initialize numbers. |
|||
local |
local result = false |
||
local rtl = loader.scripts.rtl |
|||
-- Can't use # because table is loaded by mw.loadData. |
|||
for codepoint in mw.ustring.gcodepoint(str) do |
|||
local iEnd = ranges.length or require("Module:table").size(ranges) |
|||
local script = lookup_script(codepoint) |
|||
if iEnd == 0 then |
|||
if rtl[script] then |
|||
return nil |
|||
result = true |
|||
elseif not (script == "Zyyy" or script == "Zinh" |
|||
or script == "Zzzz") then |
|||
return false |
|||
end |
|||
end |
end |
||
return result |
|||
end |
|||
local iterations = 0 |
|||
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------ |
|||
-- Do search. |
|||
while iStart <= iEnd do |
|||
iterations = iterations + 1 |
|||
external entry from an {{#invoke:}} to determine if a string of text is rtl. Strips html and html-like tags so |
|||
-- Calculate middle. |
|||
that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text |
|||
iMid = floor((iStart + iEnd) / 2) |
|||
has <br /> tags. |
|||
]] |
|||
-- Get compare value. |
|||
local range = ranges[iMid] |
|||
function p.is_rtl_frame (frame) |
|||
if range[1] > value then |
|||
local str = frame.args[1]; -- get the string from the {{#invoke:}} frame |
|||
iEnd = iMid - 1 |
|||
str = str:gsub ('%b<>', ''); -- strip any html and html-like tags |
|||
return p.is_rtl (str); -- return if whatever remains rtl; false else |
|||
end |
|||
-- Return matching index. Assumes there are no duplicates. |
|||
elseif value <= range[2] then |
|||
return range |
|||
local function get_codepoint(args, arg) |
|||
-- Keep searching. |
|||
local codepoint_string = args[arg] |
|||
else |
|||
or errorf(2, "Parameter %s is required", tostring(arg)) |
|||
iStart = iMid + 1 |
|||
local codepoint = tonumber(codepoint_string, 16) |
|||
end |
|||
or errorf(2, "Parameter %s is not a code point in hexadecimal base", |
|||
tostring(arg)) |
|||
if not (0 <= codepoint and codepoint <= 0x10FFFF) then |
|||
errorf(2, "code point in parameter %s out of range", tostring(arg)) |
|||
end |
end |
||
return |
return codepoint |
||
end |
end |
||
local function |
local function get_func(args, arg, prefix) |
||
local suffix = args[arg] |
|||
for i, range in ipairs(ranges) do |
|||
or errorf(2, "Parameter %s is required", tostring(arg)) |
|||
if number < range[1] then |
|||
suffix = mw.text.trim(suffix) |
|||
return nil |
|||
local func_name = prefix .. suffix |
|||
elseif number <= range[2] then |
|||
local func = p[func_name] |
|||
return range[3] |
|||
or errorf(2, "There is no function '%s'", func_name) |
|||
end |
|||
return func |
|||
end |
|||
end |
end |
||
-- This function allows any of the "lookup" functions to be invoked. The first |
|||
-- Save previously used codepoint ranges in case another character is in the |
|||
-- parameter is the word after "lookup_"; the second parameter is the code point |
|||
-- same range. |
|||
-- in hexadecimal base. |
|||
local ranges_cache = {} |
|||
function p.lookup(frame) |
|||
local func = get_func(frame.args, 1, "lookup_") |
|||
--[=[ |
|||
local codepoint = get_codepoint(frame.args, 2) |
|||
Takes a codepoint or a character and finds the script code (if any) that is |
|||
local result = func(codepoint) |
|||
appropriate for it based on the codepoint, using the data module |
|||
if func == p.lookup_name then |
|||
[[Module:Unicode data/scripts]]. The data module was generated from the |
|||
-- Prevent code point labels such as <control-0000> from being |
|||
patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]]. |
|||
-- interpreted as HTML tags. |
|||
result = result:gsub("<", "<") |
|||
Converts the character to a codepoint. Returns a script code if the codepoint |
|||
is in the list of individual characters, or if it is in one of the defined |
|||
ranges in the 4096-character block that it belongs to, else returns "None". |
|||
]=] |
|||
function export.char_to_script(char) |
|||
local lookup = mw.loadData("Module:Unicode data/scripts") |
|||
local t = type(char) |
|||
local codepoint |
|||
if t == "string" then |
|||
local etc |
|||
codepoint, etc = mw.ustring.codepoint(char) |
|||
if etc then |
|||
error("Argument to char_to_script should be a single character.") |
|||
end |
|||
elseif t == "number" then |
|||
codepoint = char |
|||
else |
|||
error("Argument to char_to_script should be a string or a number, but its type is " .. t .. ".") |
|||
end |
end |
||
return result |
|||
local individual_match = lookup.individual[codepoint] |
|||
if individual_match then |
|||
return individual_match |
|||
else |
|||
local script = look_up_in_order(codepoint, ranges_cache) |
|||
if script then |
|||
return script |
|||
end |
|||
local index = floor(codepoint / 0x1000) |
|||
script = look_up_in_order(index, lookup.blocks) |
|||
if script then |
|||
return script |
|||
end |
|||
local range = binary_search(lookup[index], codepoint) |
|||
if range then |
|||
table.insert(ranges_cache, range) |
|||
table.sort(ranges_cache, sortRange) |
|||
return range[3] |
|||
end |
|||
end |
|||
return "None" |
|||
end |
end |
||
function |
function p.is(frame) |
||
local |
local func = get_func(frame.args, 1, "is_") |
||
for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do |
|||
local script = export.char_to_script(character) |
|||
scripts[script] = (scripts[script] or 0) + 1 |
|||
end |
|||
-- is_Latin and is_valid_pagename take strings. |
|||
local best_script |
|||
if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then |
|||
local greatest_count = 0 |
|||
return (func(frame.args[2])) |
|||
for script, count in pairs(scripts) do |
|||
else -- The rest take code points. |
|||
if count > greatest_count then |
|||
local codepoint = get_codepoint(frame.args, 2) |
|||
best_script = script |
|||
return (func(codepoint)) -- Adjust to one result. |
|||
greatest_count = count |
|||
end |
|||
end |
end |
||
return best_script |
|||
end |
end |
||
function p.lookup_kCantonese(codepoint) |
|||
local unsupported_title = { |
|||
local data = loader[('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))] |
|||
[0x0020] = "Unsupported titles/Space"; |
|||
if data then |
|||
[0x0023] = "Unsupported titles/Number sign"; |
|||
return data[codepoint] |
|||
[0x002E] = "Unsupported titles/Full stop"; |
|||
[0x003A] = "Unsupported titles/Colon"; |
|||
[0x003C] = "Unsupported titles/Less than"; |
|||
[0x003E] = "Unsupported titles/Greater than"; |
|||
[0x005B] = "Unsupported titles/Left square bracket"; |
|||
[0x005D] = "Unsupported titles/Right square bracket"; |
|||
[0x005F] = "Unsupported titles/Low line"; |
|||
[0x007B] = "Unsupported titles/Left curly bracket"; |
|||
[0x007C] = "Unsupported titles/Vertical line"; |
|||
[0x007D] = "Unsupported titles/Right curly bracket"; |
|||
[0x1680] = "Unsupported titles/Ogham space"; |
|||
[0xFFFD] = "Unsupported titles/Replacement character"; |
|||
} |
|||
function export.get_entry_title(codepoint) |
|||
if unsupported_title[codepoint] then |
|||
return unsupported_title[codepoint] |
|||
end |
|||
if lookup_control(codepoint) ~= "assigned" then |
|||
return nil |
|||
end |
end |
||
return mw.ustring.char(codepoint) |
|||
end |
end |
||
return |
return p |
Latest revision as of 23:05, 13 January 2025
Documentation for this module may be created at Module:Unicode data/doc
local p = {}
local floor = math.floor
-- Raises an error whose message is produced by string.format.
-- Call either as errorf(level, fmt, ...) or as errorf(fmt, ...).
-- When a numeric level is given it is increased by one before being handed
-- to error(); without a level, level 2 is used.
local function errorf(level, ...)
	local message, error_level
	if type(level) ~= "number" then
		-- No level was supplied: "level" is really the format string.
		message, error_level = string.format(level, ...), 2
	else
		message, error_level = string.format(...), level + 1
	end
	return error(message, error_level)
end
-- Binary search over an array of ranges, where each range is an array whose
-- first two items are the low and high code points.
-- The number of ranges is read from ranges.length when present; otherwise it
-- is measured with Module:TableTools.
-- Returns the matching range and its index, or nil and the index of the last
-- range that was examined (nil if no range was examined at all).
local function binary_range_search(codepoint, ranges)
	local first = 1
	local last = ranges.length or require "Module:TableTools".length(ranges)
	local probe
	while first <= last do
		probe = floor((first + last) / 2)
		local candidate = ranges[probe]
		if codepoint >= candidate[1] and codepoint <= candidate[2] then
			return candidate, probe
		elseif codepoint < candidate[1] then
			last = probe - 1
		else
			first = probe + 1
		end
	end
	return nil, probe
end
p.binary_range_search = binary_range_search
--[[
local function linear_range_search(codepoint, ranges)
for i, range in ipairs(ranges) do
if range[1] <= codepoint and codepoint <= range[2] then
return range
end
end
end
--]]
-- Lazily loads data modules. Indexing "loader" with a subpage name (the part
-- of the title after "Module:Unicode data/") loads that module with
-- mw.loadData; for example, loader.blocks yields [[Module:Unicode data/blocks]].
-- The outcome is stored in the loader table itself, so each subpage is
-- requested only once. A module that fails to load is recorded as false.
local function load_data_subpage(store, subpage)
	local ok, result = pcall(mw.loadData, "Module:Unicode data/" .. subpage)
	local data = false
	if ok then
		data = result
	end
	store[subpage] = data
	return data
end
local loader = setmetatable({}, { __index = load_data_subpage })
-- Ranges of code points whose names are generated by rule rather than stored
-- in the names data modules. Each entry is { low, high, hook }, where the hook
-- is either a format string applied to the code point or a function that
-- receives the code point and returns the name.
--
-- IMPORTANT: this list is searched with binary_range_search, so the entries
-- MUST be kept sorted by code point and must not overlap. When adding a
-- range, insert it at its sorted position rather than appending it.
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{  0x18800,  0x18AFF, function (codepoint) -- Tangut Components
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end },
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
-- Stored explicitly so that binary_range_search does not have to measure the table.
name_hooks.length = #name_hooks
local name_range_cache
-- Produces a name from a "name hook": a string hook is used as a format
-- string for the code point; any other hook is called with the code point.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end
--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
if codepoint < 0 or 0x10FFFF < codepoint then
errorf("Codepoint %04X out of range", codepoint)
end
end
--]]
-- Returns true for noncharacters: U+FDD0-U+FDEF and every code point whose
-- last four hexadecimal digits are FFFE or FFFF. These are Unassigned (Cn):
-- https://www.unicode.org/faq/private_use.html#nonchar4
function p.is_noncharacter(codepoint)
	if codepoint < 0xFDD0 then
		return false
	end
	if codepoint <= 0xFDEF then
		return true
	end
	return floor(codepoint % 0x10000) >= 0xFFFE
end
-- Returns the Unicode name of a code point, or a code point label such as
-- <control-0000>, <noncharacter-FFFE> or <reserved-0378> when it has no name.
-- See https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8.
function p.lookup_name(codepoint)
	if p.is_noncharacter(codepoint) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	-- Check whether the previously used "name hook" applies to this code point.
	if name_range_cache
	and codepoint >= name_range_cache[1]
	and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end

	-- Names are split across data modules of 0x1000 code points each. The
	-- quotient is floored so that %X always receives an integer, as in
	-- lookup_kCantonese.
	local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]
	if data and data[codepoint] then
		return data[codepoint]
	end

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	return ("<reserved-%04X>"):format(codepoint)
end
-- Returns the entry stored for the code point in the matching
-- "images/XXX" data module, or nothing if that module could not be loaded.
function p.lookup_image(codepoint)
	-- Image data is split across data modules of 0x1000 code points each. The
	-- quotient is floored so that %X always receives an integer, as in
	-- lookup_kCantonese.
	local data = loader[('images/%03X'):format(floor(codepoint / 0x1000))]
	if data then
		return data[codepoint]
	end
end
-- Names of the planes that have one, keyed by plane number
-- (the code point divided by 0x10000, see lookup_plane).
local planes = {
	[0]  = "Basic Multilingual Plane",
	[1]  = "Supplementary Multilingual Plane",
	[2]  = "Supplementary Ideographic Plane",
	[3]  = "Tertiary Ideographic Plane",
	[14] = "Supplementary Special-purpose Plane",
	[15] = "Supplementary Private Use Area-A",
	[16] = "Supplementary Private Use Area-B",
}
-- Leftover declaration: nothing is ever assigned to this variable. The functions below read [[Module:Unicode data/blocks]] through loader.blocks and their own locals.
local blocks
-- Stateless iterator step for the list of blocks: given the list and the
-- previous index, returns the next index followed by the first three items
-- of that entry, or nothing once the list is exhausted.
local function block_iter(blocks, i)
	local next_index = i + 1
	local entry = blocks[next_index]
	if not entry then
		return
	end
	-- The items are listed by hand because unpack doesn't work on tables
	-- loaded with mw.loadData.
	return next_index, entry[1], entry[2], entry[3]
end
-- Iterator generator for the list of blocks, for use in a generic "for" in
-- the manner of ipairs. Each step yields what block_iter returns: the index
-- and the first three items of the block entry.
function p.enum_blocks()
	return block_iter, loader.blocks, 0
end
-- Returns the name of the plane containing the code point, or "Plane N" for
-- a plane that has no entry in the "planes" table.
function p.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end
-- Returns the third item (the name) of the block entry whose range contains
-- the code point, or "No Block" if no block range contains it.
function p.lookup_block(codepoint)
	local block = binary_range_search(codepoint, loader.blocks)
	if not block then
		return "No Block"
	end
	return block[3]
end
-- Returns the first block entry whose third item equals the given name,
-- or nothing if there is no such block.
function p.get_block_info(name)
	for _, info in ipairs(loader.blocks) do
		if info[3] == name then
			return info
		end
	end
end
-- Checks whether a string could serve as a page name: it must not contain
-- any of the explicitly forbidden code points listed below, every code point
-- must be printable according to is_printable, and at least one code point
-- must be something other than a space separator.
function p.is_valid_pagename(pagename)
	local seen_visible = false
	for cp in mw.ustring.gcodepoint(pagename) do
		local forbidden = cp == 0x0023 -- #
			or cp == 0x005B -- [
			or cp == 0x005D -- ]
			or cp == 0x007B -- {
			or cp == 0x007C -- |
			or cp == 0x007D -- }
			or cp == 0x180E -- MONGOLIAN VOWEL SEPARATOR
			or (0x2000 <= cp and cp <= 0x200A) -- spaces in General Punctuation block
			or cp == 0xFFFD -- REPLACEMENT CHARACTER
		if forbidden then
			return false
		end

		local printable, kind = p.is_printable(cp)
		if not printable then
			return false
		end

		seen_visible = seen_visible or kind ~= "space-separator"
	end
	return seen_visible
end
-- Returns the items of the array "what" from index "from" (default 1) up to,
-- but not including, the first nil item, as multiple values.
-- The items are copied into a fresh table before unpacking because unpack
-- doesn't work on tables loaded with mw.loadData.
local function manual_unpack(what, from)
	-- The default has to be applied before "from" is used in arithmetic.
	from = from or 1

	-- Common case: at most one item to return, so no table is needed.
	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	local count = 0
	local index = from
	local item = what[index]
	while item ~= nil do
		count = count + 1
		result[count] = item
		index = index + 1
		item = what[index]
	end
	return unpack(result, 1, count)
end
-- Ordering function for table.sort: a range sorts before another when its
-- low code point (first item) is smaller.
local function compare_ranges(range1, range2)
	return range2[1] > range1[1]
end
-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (a sorted array containing arrays that
-- contain the low and high code points of a range and the data associated
-- with that range).
-- "data_module_subpage" is the name of the data module minus the
-- "Module:Unicode data/" part; it is loaded through "loader" on first use.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be passed to "match_func"
-- if there wasn't a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, as well as the gaps between ranges, whose data
-- is the default.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	-- "length" is kept up to date by hand so that binary_range_search can
	-- read the size of the cache directly instead of measuring the table.
	local cache = { length = 0 }
	local singles, ranges

	-- Adds a range to the cache, keeping the cache sorted for binary search.
	local function remember(range)
		table.insert(cache, range)
		cache.length = #cache
		table.sort(cache, compare_ranges)
	end

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		local cached = binary_range_search(codepoint, cache)
		if cached then
			return match_func(codepoint, manual_unpack(cached, 3))
		end

		local range, index = binary_range_search(codepoint, ranges)
		if range then
			remember(range)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		-- No match. "index" is the last range examined by the search, which
		-- is directly below or directly above the code point, so the gap that
		-- contains the code point lies on one side of it. Cache that gap with
		-- the default data so later code points in it are resolved quickly.
		local nearest = ranges[index]
		if nearest then
			local gap_start, gap_end
			if codepoint > nearest[2] then
				gap_start = nearest[2] + 1
				gap_end = ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF
			else -- codepoint < nearest[1]
				gap_start = ranges[index - 1] and ranges[index - 1][2] + 1 or 0
				gap_end = nearest[1] - 1
			end
			remember({ gap_start, gap_end, unpack(dots) })
		end

		-- Hand over the default data, exactly as a later hit on the cached
		-- gap would.
		return match_func(codepoint, unpack(dots))
	end
end
-- Looks up a code point's combining class value in
-- [[Module:Unicode data/combining]] and reports whether it is non-zero:
-- true if the character is combining, false if it is not. Zero is the
-- default when the data module has no value for the code point.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
-- for more information.
p.is_combining = memo_lookup(
	"combining",
	function (codepoint, combining_class)
		if combining_class and combining_class ~= 0 then
			return true
		end
		return false
	end,
	0)
-- Returns a copy of the string in which every character that is_combining
-- reports as combining is preceded by a dotted circle (◌).
function p.add_dotted_circle(str)
	local function prefix_if_combining(char)
		if p.is_combining(mw.ustring.codepoint(char)) then
			return '◌' .. char
		end
		-- Returning nothing leaves the character unchanged.
	end

	-- Only the resulting string is returned, not the substitution count.
	local with_circles = mw.ustring.gsub(str, ".", prefix_if_combining)
	return with_circles
end
-- Looks up a code point in the "control" data module. The result is the
-- value stored there, or "assigned" when there is none.
local lookup_control = memo_lookup(
	"control",
	function (codepoint, ccc)
		if ccc then
			return ccc
		end
		return "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control
-- True unless lookup_control classifies the code point as "unassigned".
function p.is_assigned(codepoint)
	local status = lookup_control(codepoint)
	return not (status == "unassigned")
end
-- Returns two values: whether lookup_control classifies the code point as
-- "assigned" or "space-separator", and the classification itself.
function p.is_printable(codepoint)
	local status = lookup_control(codepoint)
	local printable = status == "assigned" or status == "space-separator"
	return printable, status
end
-- Returns two values: whether lookup_control classifies the code point as
-- "space-separator", and the classification itself.
function p.is_whitespace(codepoint)
	local status = lookup_control(codepoint)
	local is_space = status == "space-separator"
	return is_space, status
end
-- Looks up a code point in the "category" data module. The result is the
-- value stored there, or "Cn" when there is none.
p.lookup_category = memo_lookup(
	"category",
	function (codepoint, category)
		-- Fall back to the default explicitly, as lookup_control and
		-- lookup_script do, so that a miss never yields nil.
		return category or "Cn"
	end,
	"Cn")
-- Looks up a code point in the "scripts" data module. The result is the
-- script code stored there, or "Zzzz" when there is none.
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		if script_code then
			return script_code
		end
		return 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script
-- Determines the script of a string. Characters whose script is "Zyyy",
-- "Zinh" or "Zzzz" are ignored. If the remaining characters belong to at
-- most one script, the result of next() on the set of scripts found is
-- returned (nil when no script was found); if they belong to two or more
-- scripts, nothing is returned.
function p.get_best_script(str)
	-- The type is checked explicitly because mw.text.decode coerces numbers
	-- to strings.
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Turn HTML character references (numeric and named) into characters.
	str = mw.text.decode(str, true)

	local ignorable = { Zyyy = true, Zinh = true, Zzzz = true }
	local found = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		if not ignorable[script] then
			found[script] = true
		end
	end

	-- Zero or one script found: hand back the first (and only) key.
	local first = next(found)
	if first == nil or next(found, first) == nil then
		return next(found)
	end -- else return majority script, or else "Zzzz"?
end
-- Checks that a string contains at least one Latin-script character and
-- otherwise only characters whose script is "Zyyy", "Zinh" or "Zzzz".
-- Returns true if so. Otherwise returns false and a number: the position
-- (counted in code points) of the first character from another script, or,
-- if there is no such character, the number of code points examined.
function p.is_Latin(str)
	-- Type errors are reported under this function's own name.
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	-- Turn HTML character references into characters.
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
	end

	local Latn = false
	local i = 0 -- position of the current character, counted in code points

	for codepoint in mw.ustring.gcodepoint(str) do
		i = i + 1
		local script = lookup_script(codepoint)

		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			-- Not Latin: report the position of the offending character.
			return false, i
		end
	end

	-- When no Latin character was found, the second value is the number of
	-- code points examined; otherwise it is nil.
	return Latn, (not Latn and i) or nil
end
-- Checks that a string contains at least one character belonging to a
-- right-to-left script and otherwise only characters of ignorable scripts
-- ("Zyyy", "Zinh" or "Zzzz"). Returns true or false.
function p.is_rtl(str)
	-- Type errors are reported under this function's own name.
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	-- Turn HTML character references into characters.
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	-- Set of right-to-left script codes from the "scripts" data module.
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			-- A character from a left-to-right script settles the matter.
			return false
		end
	end

	return result
end
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------

Entry point for {{#invoke:}}: reports whether the first frame argument is right-to-left text according to
is_rtl. Anything of the form <...> (html and html-like tags, for example <br />) is removed first so that the
tags do not influence the determination.

]]
function p.is_rtl_frame (frame)
	local text = frame.args[1]
	-- Only the stripped string is kept; the substitution count is discarded.
	local stripped = text:gsub ('%b<>', '')
	return p.is_rtl (stripped)
end
-- Reads parameter "arg" from "args", parses it as a hexadecimal number and
-- returns it. Raises an error if the parameter is missing, cannot be parsed
-- in base 16, or lies outside 0-0x10FFFF.
local function get_codepoint(args, arg)
	local raw = args[arg]
	if not raw then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local codepoint = tonumber(raw, 16)
	if not codepoint then
		errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	end

	if codepoint < 0 or codepoint > 0x10FFFF then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end
-- Reads parameter "arg" from "args", trims it, appends it to "prefix" and
-- returns the member of this module with the resulting name. Raises an error
-- if the parameter is missing or the module has no such member.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
	if not suffix then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local func_name = prefix .. mw.text.trim(suffix)
	local func = p[func_name]
	if not func then
		errorf(2, "There is no function '%s'", func_name)
	end
	return func
end
-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags by escaping the opening angle bracket.
		-- The assignment keeps only the string, not the substitution count.
		result = result:gsub("<", "&lt;")
	end
	return result
end
-- This function allows any of the "is" functions to be invoked. The first
-- parameter is the word after "is_"; the second parameter is either a string
-- (for is_Latin, is_valid_pagename and is_rtl) or a code point in hexadecimal
-- base (for the rest). Only the first result of the function is returned.
function p.is(frame)
	local func = get_func(frame.args, 1, "is_")

	local takes_string = func == p.is_Latin
		or func == p.is_valid_pagename
		or func == p.is_rtl

	local argument
	if takes_string then
		argument = frame.args[2]
	else
		argument = get_codepoint(frame.args, 2)
	end

	-- Assigning to a single local adjusts the call to one result.
	local verdict = func(argument)
	return verdict
end
-- Returns the entry stored for the code point in the matching
-- "Unihan/kCantonese/XX" data module (one module per 0x1000 code points),
-- or nothing if that module could not be loaded.
function p.lookup_kCantonese(codepoint)
	local subpage = ('Unihan/kCantonese/%02X'):format(floor(codepoint / 0x1000))
	local readings = loader[subpage]
	if not readings then
		return
	end
	return readings[codepoint]
end
return p