Jump to content

Module:Wikt-lang/data/sandbox: Difference between revisions

From Wikipedia, the free encyclopedia
Content deleted Content added
No edit summary
not used even in live module
Line 18: Line 18:


--[[
--[[
This is a table of Wiktionary language codes with data belonging to them.
This is a table of Wiktionary language codes with data belonging to them.
Name is the "canonical name" used on Wiktionary.
Name is the "canonical name" used on Wiktionary.
Article is the Wikipedia article.
Script is the ISO 15924 code.
Script is the ISO 15924 code.
]]
]]
Line 34: Line 32:
["ang"] = {
["ang"] = {
["name"] = "Old English",
["name"] = "Old English",
["article"] = {"Old English"},
-- Remove macrons, acutes, and overdots
-- Remove macrons, acutes, and overdots
["replacements"] = {
["replacements"] = {
Line 43: Line 40:
["ar"] = {
["ar"] = {
["name"] = "Arabic",
["name"] = "Arabic",
["article"] = "Arabic language",
["direction"] = "rtl", -- Should be in the script data module.
["direction"] = "rtl", -- Should be in the script data module.
["replacements"] = {
["replacements"] = {
Line 58: Line 54:
["arb"] = {
["arb"] = {
["name"] = "Modern Standard Arabic",
["name"] = "Modern Standard Arabic",
["article"] = "Modern Standard Arabic",
["direction"] = "rtl", -- Should be in the script data module.
["direction"] = "rtl", -- Should be in the script data module.
["replacements"] = {
["replacements"] = {
Line 73: Line 68:
["apc"] = {
["apc"] = {
["name"] = "North Levantine Arabic",
["name"] = "North Levantine Arabic",
["article"] = "North Levantine Arabic",
["direction"] = "rtl", -- Should be in the script data module.
["direction"] = "rtl", -- Should be in the script data module.
["replacements"] = {
["replacements"] = {
Line 88: Line 82:
["ajp"] = {
["ajp"] = {
["name"] = "South Levantine Arabic",
["name"] = "South Levantine Arabic",
["article"] = "South Levantine Arabic",
["direction"] = "rtl", -- Should be in the script data module.
["direction"] = "rtl", -- Should be in the script data module.
["replacements"] = {
["replacements"] = {
Line 103: Line 96:
["arz"] = {
["arz"] = {
["name"] = "Egyptian Arabic",
["name"] = "Egyptian Arabic",
["article"] = "Egyptian Arabic",
["direction"] = "rtl", -- Should be in the script data module.
["direction"] = "rtl", -- Should be in the script data module.
["replacements"] = {
["replacements"] = {
Line 117: Line 109:
},
},
["be"] = {
["be"] = {
["article"] = "Belarusian language",
["replacements"] = {
["replacements"] = { [acute] = "", },
[acute] = "",
},
},
},
["cel-x-bryproto"] = {
["cel-x-bryproto"] = {
["name"] = "Proto-Brythonic",
["name"] = "Proto-Brythonic",
["article"] = "Common Brittonic",
["type"] = "reconstructed",
["type"] = "reconstructed",
},
},
["cu"] = {
["cu"] = {
["name"] = "Old Church Slavonic",
["name"] = "Old Church Slavonic",
["article"] = "Old Church Slavonic",
},
},
["egy"] = {
["egy"] = {
Line 137: Line 128:
["gem-x-proto"] = {
["gem-x-proto"] = {
["name"] = "Proto-Germanic",
["name"] = "Proto-Germanic",
["article"] = "Proto-Germanic language",
["type"] = "reconstructed",
["type"] = "reconstructed",
},
},
["gmw-x-proto"] = {
["gmw-x-proto"] = {
["name"] = "Proto-West Germanic",
["name"] = "Proto-West Germanic",
["article"] = "Proto-West Germanic language",
["type"] = "reconstructed",
["type"] = "reconstructed",
},
},
["gmq-x-gut"] = {
["gmq-x-gut"] = {
["name"] = "Gutnish",
["name"] = "Gutnish",
["article"] = "Gutnish",
},
},
["goh"] = {
["goh"] = {
Line 159: Line 147:
["got"] = {
["got"] = {
["name"] = "Gothic",
["name"] = "Gothic",
["article"] = "Gothic language",
["replacements"] = {
["replacements"] = {
-- Latin to Gothic since people will not want to have to copy
-- Latin to Gothic since people will not want to have to copy
Line 192: Line 179:
["grc"] = {
["grc"] = {
["name"] = "Ancient Greek",
["name"] = "Ancient Greek",
["article"] = "Ancient Greek",
["replacements"] = {
["replacements"] = {
decompose = true,
decompose = true,
Line 209: Line 195:
["name"] = "Proto-Hellenic",
["name"] = "Proto-Hellenic",
["Wikipedia_name"] = "Proto-Greek",
["Wikipedia_name"] = "Proto-Greek",
["article"] = "Proto-Greek language",
["type"] = "reconstructed",
["type"] = "reconstructed",
["replacements"] = {},
["replacements"] = {},
Line 223: Line 208:
["ine-x-bsproto"] = {
["ine-x-bsproto"] = {
["name"] = "Proto-Balto-Slavic",
["name"] = "Proto-Balto-Slavic",
["article"] = "Proto-Balto-Slavic language",
["type"] = "reconstructed",
["type"] = "reconstructed",
},
},
["ine-x-proto"] = {
["ine-x-proto"] = {
["name"] = "Proto-Indo-European",
["name"] = "Proto-Indo-European",
["article"] = "Proto-Indo-European language",
["type"] = "reconstructed",
["type"] = "reconstructed",
},
},
Line 236: Line 219:
["la"] = {
["la"] = {
["name"] = "Latin",
["name"] = "Latin",
["article"] = "Latin",
["replacements"] = {
["replacements"] = {
decompose = true,
decompose = true,
Line 255: Line 237:
["mul"] = {
["mul"] = {
["name"] = "Translingual",
["name"] = "Translingual",
["article"] = "",
},
},
["nci"] = {
["nci"] = {
["name"] = "Classical Nahuatl",
["name"] = "Classical Nahuatl",
["article"] = "Classical Nahuatl",
-- Remove macrons, acutes, circumflexes and graves
-- Remove macrons, acutes, circumflexes and graves
["replacements"] = {
["replacements"] = {
Line 276: Line 256:
["orv"] = {
["orv"] = {
["name"] = "Old East Slavic",
["name"] = "Old East Slavic",
["article"] = "Old East Slavic",
["replacements"] = {
["replacements"] = {
[U(0x484)] = "",
[U(0x484)] = "",
Line 283: Line 262:
["poz-x-polproto"] = { -- is this even in use?
["poz-x-polproto"] = { -- is this even in use?
["name"] = "Proto-Nuclear Polynesian",
["name"] = "Proto-Nuclear Polynesian",
["article"] = "Proto-Polynesian language",
["type"] = "reconstructed",
["type"] = "reconstructed",
},
},
["ru"] = {
["ru"] = {
["name"] = "Russian",
["name"] = "Russian",
["article"] = "Russian language",
["replacements"] = {
["replacements"] = { [acute] = "", },
[acute] = "",
},
},
},
["rw"] = {
["rw"] = {
Line 301: Line 280:
["sem-x-proto"] = {
["sem-x-proto"] = {
["name"] = "Proto-Semitic",
["name"] = "Proto-Semitic",
["article"] = "Proto-Semitic",
["type"] = "reconstructed",
["type"] = "reconstructed",
},
},
["sh"] = {
["sh"] = {
["article"] = "Serbo-Croatian language",
["replacements"] = {
["replacements"] = {
decompose = true,
decompose = true,
Line 347: Line 324:
["tts"] = {
["tts"] = {
["name"] = "Isan", -- also "Northeastern Thai"
["name"] = "Isan", -- also "Northeastern Thai"
["article"] = "Isan language",
},
},
["uk"] = {
["uk"] = {
["article"] = "Ukrainian language",
["replacements"] = { [acute] = "", }
["replacements"] = { [acute] = "", }
},
},
["xcl"] = {
["xcl"] = {
["name"] = "Old Armenian",
["name"] = "Old Armenian",
["article"] = "Classical Armenian",
["replacements"] = {
["replacements"] = {
["[՞՜՛՟]"] = "",
["[՞՜՛՟]"] = "",
Line 363: Line 337:
["xgf"] = {
["xgf"] = {
["name"] = "Tongva", -- not ISO name "Gabrielino-Fernandeño"
["name"] = "Tongva", -- not ISO name "Gabrielino-Fernandeño"
["article"] = "Tongva language",
["replacements"] = {
["replacements"] = {
["['`ʔ]"] = "ʼ",
["['`ʔ]"] = "ʼ",
Line 370: Line 343:
["xlu"] = {
["xlu"] = {
["name"] = "Luwian", -- not ISO name "Cuneiform Luwian"
["name"] = "Luwian", -- not ISO name "Cuneiform Luwian"
["article"] = "Cuneiform Luwian"
},
},
["zle-x-ort"] = {
["zle-x-ort"] = {
["name"] = "Old Ruthenian",
["name"] = "Old Ruthenian",
["article"] = "Old Ruthenian",
["replacements"] = {
["replacements"] = { [acute] = "", },
[acute] = "",
},
},
},
},
},

Revision as of 14:56, 3 December 2024

-- Shorthand for turning Unicode code points into a string.
local U = mw.ustring.char

-- Combining marks from the [[Combining Diacritical Marks]] block
-- (U+0300–U+036F). They are named here so that the replacement patterns in
-- the language table stay readable.
local grave        = U(0x0300) -- COMBINING GRAVE ACCENT
local acute        = U(0x0301) -- COMBINING ACUTE ACCENT
local circumflex   = U(0x0302) -- COMBINING CIRCUMFLEX ACCENT
local tilde        = U(0x0303) -- COMBINING TILDE
local macron       = U(0x0304) -- COMBINING MACRON
local breve        = U(0x0306) -- COMBINING BREVE
local dot          = U(0x0307) -- COMBINING DOT ABOVE
local diaeresis    = U(0x0308) -- COMBINING DIAERESIS
local double_acute = U(0x030B) -- COMBINING DOUBLE ACUTE ACCENT
local caron        = U(0x030C) -- COMBINING CARON
local double_grave = U(0x030F) -- COMBINING DOUBLE GRAVE ACCENT
local invbreve     = U(0x0311) -- COMBINING INVERTED BREVE
local dot_below    = U(0x0323) -- COMBINING DOT BELOW
local undertie     = U(0x035C) -- COMBINING DOUBLE BREVE BELOW

-- Builds the replacement rules shared by all Arabic-script entries
-- ("ar", "arb", "apc", "ajp", "arz").
-- A fresh table is returned on every call so that the language entries stay
-- independent of one another, exactly as when each one spelled the rules out.
local function arabic_replacements()
	return {
		-- ālif with wasla is replaced by ālif;
		[U(0x0671)] = U(0x0627),
		-- taṭwīl, fatḥatan, ḍammatan, kasratan,
		-- fatḥa, ḍamma, kasra,
		-- shadda, sukūn, and superscript (dagger) ālif are removed.
		["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
			..U(0x064E)..U(0x064F)..U(0x0650)
			..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
	}
end

--[[
	This is a table of Wiktionary language codes with data belonging to them.
	Name is the "canonical name" used on Wiktionary.
	Script is the ISO 15924 code (no entry in this table currently sets one).

	Fields that appear on the entries below:
	  name            canonical Wiktionary name of the language
	  Wikipedia_name  alternative name; presumably the one used on
	                  Wikipedia -- confirm against the consuming module
	  type            "reconstructed" or "appendix"
	  direction       "rtl" for right-to-left languages
	  replacements    rules for turning display text into an entry name,
	                  written in one of two shapes:
	                    * pattern = replacement pairs, or
	                    * decompose = true together with parallel arrays
	                      `from` and `to`; a `from` pattern without a
	                      counterpart in `to` is one that gets removed.
	                  How `decompose` is applied is decided by the consuming
	                  module (presumably canonical decomposition first --
	                  TODO confirm).
]]
local data = {
	["languages"] = {
		["aaq"] = {
			["name"] = "Penobscot",
		},
		["abe"] = {
			["name"] = "Abenaki",
		},
		["ang"] = {
			["name"] = "Old English",
			-- Remove macrons, acutes, and overdots
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. acute .. dot .. "]" },
			},
		},
		["ar"] = {
			["name"] = "Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["arb"] = {
			["name"] = "Modern Standard Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["apc"] = {
			["name"] = "North Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["ajp"] = {
			["name"] = "South Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["arz"] = {
			["name"] = "Egyptian Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["be"] = {
			-- Remove acutes (stress marks)
			["replacements"] = {
				[acute] = "",
			},
		},
		["cel-x-bryproto"] = {
			["name"] = "Proto-Brythonic",
			["type"] = "reconstructed",
		},
		["cu"] = {
			["name"] = "Old Church Slavonic",
		},
		["egy"] = {
			["name"] = "Egyptian",
		},
		["frp"] = {
			["name"] = "Franco-Provençal",
		},
		["gem-x-proto"] = {
			["name"] = "Proto-Germanic",
			["type"] = "reconstructed",
		},
		["gmw-x-proto"] = {
			["name"] = "Proto-West Germanic",
			["type"] = "reconstructed",
		},
		["gmq-x-gut"] = {
			["name"] = "Gutnish",
		},
		["goh"] = {
			-- Remove macrons, circumflexes, and diaereses
			["replacements"] = {
				decompose = true,
				from = {
					"[" .. macron .. circumflex .. diaeresis .. "]",
				},
			},
		},
		["got"] = {
			["name"] = "Gothic",
			["replacements"] = {
				-- Latin to Gothic since people will not want to have to copy
				-- and paste Gothic letters in
				["[AÁaáĀā]"] = "𐌰",
				["[Bb]"]     = "𐌱",
				["[Gg]"]     = "𐌲",
				["[Dd]"]     = "𐌳",
				["[EeĒē]"]   = "𐌴",
				["[Qq]"]     = "𐌵",
				["[Zz]"]     = "𐌶",
				["[Hh]"]     = "𐌷",
				["[Þþ]"]     = "𐌸",
				["[IiÍí]"]   = "𐌹",
				["[Kk]"]     = "𐌺",
				["[Ll]"]     = "𐌻",
				["[Mm]"]     = "𐌼",
				["[Nn]"]     = "𐌽",
				["[Jj]"]     = "𐌾",
				["[UuÚúŪū]"] = "𐌿",
				["[Pp]"]     = "𐍀",
				["[Rr]"]     = "𐍂",
				["[Ss]"]     = "𐍃",
				["[Tt]"]     = "𐍄",
				["[WwYy]"]   = "𐍅",
				["[Ff]"]     = "𐍆",
				["[Xx]"]     = "𐍇",
				["[Ƕƕ]"]     = "𐍈", -- Not sure if "hw" and "hv" can safely be converted
				["[OoŌō]"]   = "𐍉",
			},
		},
		["grc"] = {
			["name"] = "Ancient Greek",
			["replacements"] = {
				decompose = true,
				from = {
					-- Replace variant letterforms with standard ones.
					"ϐ", "ϵ", "ϑ", "ϰ", "ϱ", "ϲ", "ϕ",
					-- Remove macrons, breves, and underties
					-- (this pattern has no counterpart in `to`).
					"[" .. macron .. breve .. undertie .. "]",
				},
				to   = {
					"β", "ε", "θ", "κ", "ρ", "σ", "φ",
				},
			},
		},
		["grk-x-proto"] = {
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["ha"] = {
			["name"] = "Hausa",
			-- remove tilde, grave, acute, macron, circumflex
			["replacements"] = {
				decompose = true,
				from = { "[" .. grave .. circumflex .. macron .. acute .. tilde .. "]" },
			},
		},
		["ine-x-bsproto"] = {
			["name"] = "Proto-Balto-Slavic",
			["type"] = "reconstructed",
		},
		["ine-x-proto"] = {
			["name"] = "Proto-Indo-European",
			["type"] = "reconstructed",
		},
		["jbo"] = { -- Lojban
			["type"] = "appendix",
		},
		["la"] = {
			["name"] = "Latin",
			-- Remove macrons, breves, and diaereses
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. breve .. diaeresis .. "]" },
			},
		},
		["lt"] = {
			["name"] = "Lithuanian",
			-- remove acute, tilde, grave
			["replacements"] = {
				decompose = true,
				from = { "[" .. acute .. tilde .. grave .. "]" },
			},
		},
		["moe"] = {
			["name"] = "Cree",
		},
		["mul"] = {
			["name"] = "Translingual",
		},
		["nci"] = {
			["name"] = "Classical Nahuatl",
			["replacements"] = {
				decompose = true,
				-- Remove macrons, acutes, circumflexes, graves, and saltillo;
				-- see [[Saltillo (linguistics)]].
				from = { "[" .. grave .. acute .. macron .. circumflex .. "Ꞌꞌʻʼ'ʔ]" },
			},
		},
		["nds-de"] = {
			["name"] = "German Low German",
		},
		["non-x-proto"] = {
			["name"] = "Proto-Norse",
		},
		["orv"] = {
			["name"] = "Old East Slavic",
			["replacements"] = {
				[U(0x484)] = "",
			},
		},
		["poz-x-polproto"] = { -- is this even in use?
			["name"] = "Proto-Nuclear Polynesian",
			["type"] = "reconstructed",
		},
		["ru"] = {
			["name"] = "Russian",
			-- Remove acutes (stress marks)
			["replacements"] = {
				[acute] = "",
			},
		},
		["rw"] = {
			["name"] = "Rwanda-Rundi",
		},
		["se"] = {
			["replacements"] = {
				-- Consonant + apostrophe + the same consonant becomes the
				-- plain doubled consonant.
				["([đflmnŋrsšŧv])'%1"] = "%1%1",
			},
		},
		["sem-x-proto"] = {
			["name"] = "Proto-Semitic",
			["type"] = "reconstructed",
		},
		["sh"] = {
			["replacements"] = {
				decompose = true,
				-- Keep the base letter (Latin or Cyrillic vowel or r) and
				-- drop the accent mark that follows it.
				from =  { "([AaEeIiOoUuRrАаЕеИиОоУуРр])[" .. double_grave
					.. grave .. invbreve .. acute .. macron .. tilde .. "]" },
				to   = { "%1" },
			},
		},
		["sl"] = {
			["name"] = "Slovene",
			["replacements"] = {
				decompose = true,
				-- remove tonal orthography
				from = {"ł", "[" .. grave .. acute .. macron .. double_grave .. invbreve .. circumflex .. dot_below .. "]"},
				to = {"l"},
			},
		},
		["sla-x-proto"] = {
			["name"] = "Proto-Slavic", -- also Common Slavic
			["type"] = "reconstructed",
			["replacements"] = {
				["[ÀÁÃĀȀȂ]"] = "A",
				["[àáãāȁȃ]"] = "a",
				["[ÈÉẼĒȄȆ]"] = "E",
				["[èéẽēȅȇ]"] = "e",
				["[ÌÍĨĪȈȊ]"] = "I",
				["[ìíĩīȉȋ]"] = "i",
				["[ÒÓÕŌȌȎŐ]"] = "O",
				["[òóõōȍȏő]"] = "o",
				["[ÙÚŨŪȔȖŰ]"] = "U",
				["[ùúũūȕȗű]"] = "u",
				["[ỲÝỸȲ]"] = "Y",
				["[ỳýỹȳ]"] = "y",
				["Ǭ"] = "Ǫ",
				["ǭ"] = "ǫ",
				["[" .. grave .. acute .. double_acute .. tilde .. macron .. double_grave .. invbreve .. "]"] = "",
				["ĭ"] = "ь",
				["ŭ"] = "ъ",
			},
		},
		["tts"] = {
			["name"] = "Isan", -- also "Northeastern Thai"
		},
		["uk"] = {
			-- Remove acutes (stress marks)
			["replacements"] = { [acute] = "", },
		},
		["xcl"] = {
			["name"] = "Old Armenian",
			["replacements"] = {
				["[՞՜՛՟]"] = "",
				["և"] = "եւ",
			},
		},
		["xgf"] = {
			["name"] = "Tongva", -- not ISO name "Gabrielino-Fernandeño"
			["replacements"] = {
				["['`ʔ]"] = "ʼ",
			},
		},
		["xlu"] = {
			["name"] = "Luwian", -- not ISO name "Cuneiform Luwian"
		},
		["zle-x-ort"] = {
			["name"] = "Old Ruthenian",
			-- Remove acutes (stress marks)
			["replacements"] = {
				[acute] = "",
			},
		},
	},
}

return data