模組:Conversion rule extractor/Extractor

-- Module:Conversion_rule_extractor/Extractor
-- Submodule: responsible for extracting conversion rules from a given page.

local Extractor = {}

local Tpv = require('Module:Template parameter value')
local Arguments = require('Module:Arguments') -- For parsing arguments this submodule may need (in this version it is mainly driven by the main module); not referenced elsewhere in the visible part of this file

-- Template names to search for: NoteTA and its redirects, followed by
-- NoteTA-lite and its redirects. The list is passed as-is to Tpv.getTemplate.
local NOTE_TA_TEMPLATES = {
    'NoteTA', 'TA', 'NoteAT', 'NoteTA/default', 'NOTETA', 'Note TA', 'Noteta', 'NoteTa', 'NoteTA/lua', '全文字词转换',
    'NoteTA-lite', 'TA-lite', 'TAL', 'TAl'
}

--- Utility: normalize a conversion-rule string.
-- Surrounding whitespace is trimmed, then one trailing semicolon is removed
-- together with any whitespace around it, so that "a;", "a ;" and "a" all
-- normalize to the same string.
-- @param rule string|nil raw rule text
-- @return string|nil the normalized rule, or nil when `rule` is nil/false
local function cleanRuleString(rule)
    if not rule then return nil end
    local trimmed = mw.text.trim(rule)
    -- The leading %s* also consumes whitespace that sat before the semicolon;
    -- stripping only ';%s*$' would turn "a ;" into "a " and break exact-match
    -- de-duplication of rules. The outer parentheses discard gsub's second
    -- return value (the substitution count) so exactly one value is returned.
    return (trimmed:gsub('%s*;%s*$', ''))
end

--- Utility: parse the parameters of a single NoteTA template invocation.
-- @param templateWikitext string wikitext of one template call
-- @return table { titleRule = string|nil, manualRules = {string...}, groupNames = {string...} }
local function parseNoteTATemplate(templateWikitext)
    -- Parameters are obtained through Tpv.getParameters; per the original
    -- author's note this is meant to keep nested templates from interfering
    -- with simple rule extraction.
    local args = Tpv.getParameters(templateWikitext)

    local title = nil
    local manual = {}
    local groups = {}

    -- Title rule: parameter "T", kept only when non-empty after cleaning.
    local rawTitle = args['T']
    if rawTitle then
        local rule = cleanRuleString(rawTitle)
        if rule and rule ~= '' then
            title = rule
        end
    end

    -- Manual full-text rules: numbered parameters "1" through "30"
    -- (looked up by string key).
    for n = 1, 30 do
        local raw = args[tostring(n)]
        if raw then
            local rule = cleanRuleString(raw)
            if rule and rule ~= '' then
                manual[#manual + 1] = rule
            end
        end
    end

    -- Public conversion group names: parameters "G1" through "G30".
    for n = 1, 30 do
        local raw = args['G' .. n]
        if raw then
            local groupName = mw.text.trim(raw)
            if groupName and groupName ~= '' then
                groups[#groups + 1] = groupName
            end
        end
    end

    return { titleRule = title, manualRules = manual, groupNames = groups }
end

--- Collect the rules of every NoteTA-family template found on a page.
-- Templates are requested one at a time from Tpv.getTemplate by increasing
-- `template_index` until it reports failure.
-- @param pageTitle title of the page to inspect (passed to mw.title.new)
-- @return table { titleRule = string|nil, manualRules = {string...}, groupNames = {string...} };
--         all lists are empty when the title is invalid or the page does not exist
function Extractor.getNoteTARules(pageTitle)
    local merged = { titleRule = nil, manualRules = {}, groupNames = {} }

    local page = mw.title.new(pageTitle)
    if not (page and page.exists) then
        return merged -- empty rule set
    end

    local seenGroups = {}
    local index = 0
    repeat
        index = index + 1
        local found, wikitext = Tpv.getTemplate(pageTitle, NOTE_TA_TEMPLATES, { template_index = index })
        if found then
            local parsed = parseNoteTATemplate(wikitext)

            -- Title rule: a later template overrides an earlier one.
            if parsed.titleRule then
                merged.titleRule = parsed.titleRule
            end

            -- Manual rules: accumulate all of them, in order.
            for _, rule in ipairs(parsed.manualRules) do
                merged.manualRules[#merged.manualRules + 1] = rule
            end

            -- Group names: accumulate, keeping only the first occurrence of each.
            for _, groupName in ipairs(parsed.groupNames) do
                if not seenGroups[groupName] then
                    seenGroups[groupName] = true
                    merged.groupNames[#merged.groupNames + 1] = groupName
                end
            end
        end
    until not found -- no more matching templates

    return merged
end

-- Append the cleaned, non-empty `rule` of every entry whose `type` is 'item'
-- in `content` to the list `dest`.
local function appendItemRules(dest, content)
    for _, entry in ipairs(content) do
        if entry.type == 'item' and entry.rule then
            local rule = cleanRuleString(entry.rule)
            if rule and rule ~= '' then
                dest[#dest + 1] = rule
            end
        end
    end
end

--- Load the rules of the given public conversion groups.
-- Each name is loaded from 'Module:CGroup/<name>' via mw.loadData. Groups that
-- fail to load, or whose data has no `content` field, are skipped silently
-- (no fallback to Template:CGroup/<name> is attempted).
-- @param groupNames table|nil list of group names
-- @return table list of cleaned rule strings, in load order (not de-duplicated)
function Extractor.getGroupRules(groupNames)
    local collected = {}
    if not groupNames or #groupNames == 0 then
        return collected
    end

    for _, name in ipairs(groupNames) do
        local modulePath = 'Module:CGroup/' .. name
        local ok, data = pcall(mw.loadData, modulePath)
        if ok and data and data.content then
            appendItemRules(collected, data.content)
        end
    end

    return collected
end

--- Main entry point: gather every rule for a page (NoteTA + public groups).
-- @param pageTitle title of the page to inspect
-- @return table { titleRule = string|nil, contentRules = {string...} } where
--         contentRules holds the manual rules followed by the group rules,
--         with exact duplicates removed (first occurrence kept)
function Extractor.getAllRules(pageTitle)
    local fromNoteTA = Extractor.getNoteTARules(pageTitle)
    local fromGroups = Extractor.getGroupRules(fromNoteTA.groupNames)

    local contentRules = {}
    local seen = {}

    -- Append each not-yet-seen rule from `list`; manual and group rules may overlap.
    local function addUnique(list)
        for _, rule in ipairs(list) do
            if not seen[rule] then
                seen[rule] = true
                contentRules[#contentRules + 1] = rule
            end
        end
    end

    addUnique(fromNoteTA.manualRules)
    addUnique(fromGroups)

    return {
        titleRule = fromNoteTA.titleRule, -- may be nil
        contentRules = contentRules,
    }
end

return Extractor
模組:Conversion rule extractor/Extractor

公共函数

getAllRules

内部函数

getNoteTARules

getGroupRules

cleanRuleString