-- Módulo:generar-pron/ang
-- Apariencia
-- La documentación para este módulo puede ser creada en Módulo:generar-pron/ang/doc
--[=[
Implementation of pronunciation-generation module from spelling for
Old English.
Author: Benwing
Adaptado por Tmagc
Generally, the user should supply the spelling, properly marked up with
macrons for long vowels, and ċ ġ ċġ sċ for soft versions of these consonants.
In addition, the following symbols can be used:
-- acute accent on a vowel to override the position of primary stress
-- (in a diphthong, put it over the first vowel)
-- grave accent to add secondary stress
-- circumflex to force no stress on the word or prefix (e.g. in a compound)
-- . (period) to force a syllable boundary
-- - (hyphen) to force a prefix/word or word/word boundary in a compound word;
-- the result will be displayed as a single word but the consonants on
-- either side treated as if they occurred at the beginning/end of the word
-- + (plus) is the opposite of -; it forces a prefix/word or word/word boundary
-- to *NOT* occur when it otherwise would
-- _ (underscore) to force the letters on either side to be interpreted
-- independently, when the combination of the two would normally have a
-- special meaning
]=]
local export = {}
local insert = table.insert
local concat = table.concat
local m_table = require("Módulo:tabla")
local m_str = require("Módulo:String")
local u = m_str.char
local strsubb = m_str.gsubb
local strsubn = m_str.gsub
local strsubrep = m_str.gsub_rep
local strstrip = m_str.strip
local strfind = m_str.find
local strmatch = m_str.match
local strsplit = m_str.split
local strlen = m_str.len
local strlower = m_str.lower
local strnfd = m_str.toNFD
local strnfc = m_str.toNFC
local strhtml = m_str.encode_html
-- Wrapper around strsubn() that hands back only its first return value (the
-- substituted string); every further return value is dropped.
local function strsub(term, foo, bar, n)
	-- the extra parentheses truncate the call to exactly one result
	return (strsubn(term, foo, bar, n))
end
-- Byte-oriented counterpart built on the native string method: runs
-- term:gsub() and returns only the resulting string, not the match count.
local function gsub(term, foo, bar, n)
	-- the extra parentheses truncate the call to exactly one result
	return (term:gsub(foo, bar, n))
end
-- Punctuation that delimits fragments; normalizar() replaces each match with " | ".
local PUNTUACION = "[%(%)%[%]%{%}¡!¿?.,;:–—]"
-- Superset that also covers quotation marks; normalizar() deletes whatever still matches.
local PUNTUACION_EXTRA = "[%(%)%[%]%{%}¡!¿?.,;:–—\"“”„‟‘’«»»«‹››‹]"
-- Combining diacritics, built from their Unicode code points.
local ACUTE = u(0x0301)
local GRAVE = u(0x0300)
local CFLEX = u(0x0302)
local MACRON = u(0x0304)
local DOTABOVE = u(0x0307)
local SYLLABIC = u(0x0329)
local CEDILLA = u(0x0327)
local DOUBLE_BREVE_BELOW = u(0x035C)
-- Numeric part-of-speech codes; they appear in the optional third element of
-- the pronunciation rules and in the `cg` list handed to generar_pron().
local SUST = 1 -- covers nouns and adjectives
local VERB = 2
local VERBAL = 3
-- Replacement table used by decompose(): maps a base letter followed by a
-- combining mark back to the single precomposed character.
local recomposer = {
["g" .. DOTABOVE] = "ġ",
["G" .. DOTABOVE] = "Ġ",
["c" .. DOTABOVE] = "ċ",
["C" .. DOTABOVE] = "Ċ",
-- used in "explicit allophone" notation in [[Module:ang-pron]]
["c" .. CEDILLA] = "ç",
["C" .. CEDILLA] = "Ç",
}
-- Decompose macron, acute, grave and circumflex into combining marks, but keep
-- ġ, ċ, ç (and their uppercase equivalents) as single precomposed characters.
--
-- NFD splits every accented letter, so the base + dot-above and base + cedilla
-- pairs listed in `recomposer` are put back together afterwards. The pattern
-- has to mention both combining marks: with dot-above only, the cedilla
-- entries of `recomposer` could never be reached and ç would stay decomposed.
-- A matched pair without an entry in `recomposer` is expected to stay as it is
-- (standard gsub behaviour for a table replacement with no value).
--
-- @param text string to normalize
-- @return the normalized string
local function decompose(text)
	text = strnfd(text)
	text = strsub(text, ".[" .. DOTABOVE .. CEDILLA .. "]", recomposer)
	return text
end
-- We use the following syllable-splitting algorithm.
-- (1) A single consonant goes with the following syllable.
-- (2) Two consonants are split down the middle.
-- (3) For three or more consonants, check for clusters ending in
-- onsets_3 then onsets_2, with at least one preceding consonant.
-- If so, split between the onset and the preceding consonant(s).
-- (4) Check similarly for secondary_onsets_2. If seen, then check
-- the preceding consonant; if it's not an l or r, split before
-- the onset.
-- (5) Otherwise, split before the last consonant (i.e. the last
-- consonant goes with the following syllable, and all preceding
-- consonants go with the preceding syllable).
-- Two-consonant clusters accepted as a syllable onset. Consulted by
-- split_into_syllables() and, when deciding whether a prefix can be split
-- off, by split_on_word_boundaries().
local onsets_2 = m_table.listToSet({
"pr", "pl",
"br", "bl",
"tr", "tw",
"dr", "dw",
"cr", "cl", "cw", --skip "cn"
"kr", "kl", "kw", --skip "kn"
"gr", "gl", -- skip "gn"
"sm", "sn", "sl", "sw",
"sp",
"st",
"sc", "sk", "sċ",
"fr", "fl", --skip "fn",
"þr", "þw",
"ðr", "ðw",
"hr", "hl", "hw", -- skip "hn"
"wr", "wl",
})
-- The consonant + n clusters skipped above. split_into_syllables() treats them
-- as an onset only when the consonant before them is not l or r.
local secondary_onsets_2 = m_table.listToSet({
"cn", "kn",
"gn",
"fn",
"hn",
})
-- Three-consonant onsets; split_into_syllables() tries these before the
-- two-consonant sets.
local onsets_3 = m_table.listToSet({
"spr", "spl",
"str",
"scr", "skr", "sċr",
})
-- Vowel pairs that break_vowels() keeps together as a single nucleus. The
-- macron-bearing variants are stored decomposed (macron on either vowel),
-- which matches the decomposed text that break_vowels() looks up.
local diphthongs = m_table.listToSet({
"ea", decompose("ēa"), decompose("eā"),
"eo", decompose("ēo"), decompose("eō"),
"io", decompose("īo"), decompose("iō"),
"ie", decompose("īe"), decompose("iē"),
})
-- Accent marks that may not start the remainder in the `restriction` patterns below.
local accent_ = MACRON .. ACUTE .. GRAVE .. CFLEX
-- Prefixes that split_on_word_boundaries() may split off. Each entry is
-- {pattern, spec}:
--   * pattern is matched anchored at the start of the word;
--   * spec maps the string keys "verb", "noun" and "verbal" to one of
--     "unstressed", "secstressed" or "stressed"; a missing key means the
--     prefix is not recognized for that part of speech;
--   * spec.restriction, when present, is a pattern the remainder of the word
--     must match for the split to happen.
-- The list is walked in order and the first entry that can be split wins, so
-- longer prefixes are listed before shorter ones that share the same start.
local prefixes = {
{decompose("ā"), {verb = "unstressed", noun = "stressed"}},
{"æt", {verb = "unstressed"}},
{"æfter", {verb = "secstressed", noun = "stressed"}}, -- not very common
{"and", {verb = "unstressed", noun = "stressed"}},
{"an", {verb = "unstressed", noun = "stressed"}},
{"be", {verb = "unstressed", noun = "unstressed", restriction = "^[^" .. accent_ .. "ao]"}},
{decompose("bī"), {noun = "stressed"}},
{"ed", {verb = "unstressed", noun = "stressed"}}, -- not very common
{"fore", {verb = "unstressed", noun = "stressed", restriction = "^[^" .. accent_ .. "ao]"}},
{"for[þð]", {verb = "unstressed", noun = "stressed"}},
{"for", {verb = "unstressed", noun = "unstressed"}},
{"fram", {verb = "unstressed", noun = "stressed"}}, -- not very common
-- following is rare as a noun, mostly from verbal forms
{"ġeond", {verb = "unstressed"}},
{"ġe", {verb = "unstressed", noun = "unstressed", restriction = "^[^" .. accent_ .. "ao]"}},
{"in", {verb = "unstressed", noun = "stressed"}}, -- not very common
{"mis", {verb = "unstressed"}},
{"ofer", {verb = "secstressed", noun = "stressed"}},
{"of", {verb = "unstressed", noun = "stressed"}},
{"on", {verb = "unstressed", noun = "stressed"}},
{"or", {noun = "stressed"}},
{"o[þð]", {verb = "unstressed"}},
{decompose("stēop"), {noun = "stressed"}},
{decompose("tō"), {verb = "unstressed", noun = "stressed"}},
{"under", {verb = "secstressed", noun = "stressed"}},
{"un", {verb = "unstressed", noun = "stressed", verbal = "stressed"}}, -- uncommon as verb
{"up", {verb = "unstressed", noun = "stressed"}},
{decompose("ūt"), {verb = "unstressed", noun = "stressed"}},
{decompose("ū[þð]"), {noun = "stressed"}},
{"[wƿ]i[þð]er", {verb = "secstressed", noun = "stressed"}},
{"[wƿ]i[þð]", {verb = "unstressed"}},
{"ymb", {verb = "unstressed", noun = "stressed"}},
{"[þð]urh", {verb = "unstressed", noun = "stressed"}},
}
-- Suffixes that split_on_word_boundaries() may split off. Same {pattern, spec}
-- layout as the prefix list, except that the pattern is anchored at the end of
-- the word. A suffix may be "unstressed" or "secstressed"; "stressed" makes
-- split_on_word_boundaries() raise an error.
local suffixes = {
{decompose("bǣre"), {noun = "secstressed"}},
{"fæst", {noun = "secstressed"}},
{"feald", {noun = "secstressed"}},
{"full?", {noun = "unstressed"}},
{decompose("lēas"), {noun = "secstressed"}},
-- These can be VERBAL if following a verbal past participle or similar
{decompose("līċe"), {noun = "secstressed", verb = "secstressed"}},
-- ī is decomposed into two chars so can't combine into [īi]
{decompose("li[ċc]"), {noun = "unstressed", verb = "unstressed"}},
{decompose("lī[ċc]"), {noun = "unstressed", verb = "unstressed"}},
{"n[eiy]ss?", {noun = "unstressed", verb = "unstressed"}},
{"sum", {noun = "unstressed"}},
}
-- When auto-generating primary and secondary stress accents, we use these
-- special characters, and later convert to normal IPA accent marks, so
-- we can distinguish auto-generated stress from user-specified stress.
local AUTOACUTE = u(0xFFF0)
local AUTOGRAVE = u(0xFFF1)
-- When the user uses the "explicit allophone" notation such as [z] or [ç] to
-- force a particular allophone, we internally convert that notation into a
-- single special character.
-- One placeholder per allophone; the lookup tables that follow translate them
-- back into phonemes or phones.
local EXPLICIT_TH = u(0xFFF2)
local EXPLICIT_DH = u(0xFFF3)
local EXPLICIT_S = u(0xFFF4)
local EXPLICIT_Z = u(0xFFF5)
local EXPLICIT_F = u(0xFFF6)
local EXPLICIT_V = u(0xFFF7)
local EXPLICIT_G = u(0xFFF8)
local EXPLICIT_GH = u(0xFFF9)
local EXPLICIT_H = u(0xFFFA)
local EXPLICIT_X = u(0xFFFB)
local EXPLICIT_C = u(0xFFFC)
local EXPLICIT_I = u(0xFFFD)
-- All consonantal placeholders in one string, appended to the obstruent class.
-- EXPLICIT_I is left out here; it is added to the front-vowel class instead.
local explicit_cons = EXPLICIT_TH .. EXPLICIT_DH .. EXPLICIT_S .. EXPLICIT_Z ..
EXPLICIT_F .. EXPLICIT_V .. EXPLICIT_G .. EXPLICIT_GH .. EXPLICIT_H ..
EXPLICIT_X .. EXPLICIT_C
-- Map "explicit allophone" notation into special char. See above.
-- NOTE(review): in the code visible here this table is only referenced from a
-- commented-out line in normalizar(); confirm whether it is still needed.
local char_to_explicit_char = {
["þ"] = EXPLICIT_TH,
["ð"] = EXPLICIT_DH,
["s"] = EXPLICIT_S,
["z"] = EXPLICIT_Z,
["f"] = EXPLICIT_F,
["v"] = EXPLICIT_V,
["g"] = EXPLICIT_G,
["ɣ"] = EXPLICIT_GH,
["h"] = EXPLICIT_H,
["x"] = EXPLICIT_X,
["ç"] = EXPLICIT_C,
["i"] = EXPLICIT_I,
}
-- Map "explicit allophone" notation into normal spelling, for supporting ann=.
-- NOTE(review): no use of this table is visible in this file; confirm before removing.
local char_to_spelling = {
["þ"] = "þ",
["ð"] = "þ",
["s"] = "s",
["z"] = "s",
["f"] = "f",
["v"] = "f",
["g"] = "g",
["ɣ"] = "g",
["h"] = "h",
["x"] = "h",
["ç"] = "h",
["i"] = "i",
}
-- Map "explicit allophone" notation into phonemes, for phonemic output.
-- Voiced and voiceless members of a pair collapse onto the same phoneme.
-- generar_pron() applies this table to the final string when phonetic output
-- was not requested.
local explicit_char_to_phonemic = {
[EXPLICIT_TH] = "θ",
[EXPLICIT_DH] = "θ",
[EXPLICIT_S] = "s",
[EXPLICIT_Z] = "s",
[EXPLICIT_F] = "f",
[EXPLICIT_V] = "f",
[EXPLICIT_G] = "ɡ", -- IPA ɡ!
[EXPLICIT_GH] = "ɡ", -- IPA ɡ!
[EXPLICIT_H] = "x",
[EXPLICIT_X] = "x",
[EXPLICIT_C] = "x",
[EXPLICIT_I] = "i",
}
-- Map "explicit allophone" notation into IPA phones, for phonetic output.
-- Used as the last entry of phonetic_rules and again by generar_pron() when
-- phonetic output was requested.
local explicit_char_to_phonetic = {
[EXPLICIT_TH] = "θ",
[EXPLICIT_DH] = "ð",
[EXPLICIT_S] = "s",
[EXPLICIT_Z] = "z",
[EXPLICIT_F] = "f",
[EXPLICIT_V] = "v",
[EXPLICIT_G] = "ɡ", -- IPA ɡ!
[EXPLICIT_GH] = "ɣ",
[EXPLICIT_H] = "h",
[EXPLICIT_X] = "x",
[EXPLICIT_C] = "ç",
[EXPLICIT_I] = "i",
}
-- Character inventories used to build patterns. Names ending in _c are ready
-- to use as a bracketed character class; the others are raw character lists
-- meant to be embedded inside a class.
-- Every mark that can follow a vowel: length (macron) plus all stress marks.
local accent = MACRON .. ACUTE .. GRAVE .. CFLEX .. AUTOACUTE .. AUTOGRAVE
local accent_c = "[" .. accent .. "]"
-- Stress-related marks only (user-supplied and auto-generated), without the macron.
local stress_accent = ACUTE .. GRAVE .. CFLEX .. AUTOACUTE .. AUTOGRAVE
local stress_accent_c = "[" .. stress_accent .. "]"
local back_vowel = "aɑou"
local front_vowel = "eiyæœø" .. EXPLICIT_I
local vowel = back_vowel .. front_vowel
local vowel_or_accent = vowel .. accent
local vowel_c = "[" .. vowel .. "]"
local vowel_or_accent_c = "[" .. vowel_or_accent .. "]"
-- Negated class: matches anything that is not a vowel letter, which includes
-- accent marks and boundary symbols as well as consonants.
local non_vowel_c = "[^" .. vowel .. "]"
local front_vowel_c = "[" .. front_vowel .. "]"
-- The following include both IPA symbols and letters (including regular g and IPA ɡ)
-- so it can be used at any step of the process.
local obstruent = "bcċçdfgɡɣhkpqstvxzþðθʃʒ" .. explicit_cons
local resonant = "lmnŋrɫ"
local glide = "ġjwƿ"
local cons = obstruent .. resonant .. glide
local cons_c = "[" .. cons .. "]"
-- Sounds that count as voiced context for the fricative-voicing rule in phonetic_rules.
local voiced_sound = vowel .. "lrmnwjbdɡ" -- WARNING, IPA ɡ used here
-- These rules operate in order, and apply to the actual spelling,
-- after (1) macron decomposition, (2) syllable and prefix splitting,
-- (3) placement of primary and secondary stresses at the beginning
-- of the syllable. Each syllable will be separated either by ˈ
-- (if the following syllable is stressed), by ˌ (if the following
-- syllable has secondary stress), or by . (otherwise). In addition,
-- morpheme boundaries where the consonants on either side should be
-- treated as at the beginning/end of word (i.e. between prefix and
-- word, or between words in a compound word) will be marked with ⁀
-- before the syllable separator, and the beginning and end of text
-- will be marked by ⁀⁀. The output of this is fed into phonetic_rules,
-- and then is used to generate the displayed phonemic pronunciation
-- by removing ⁀ symbols.
-- Rule format, as consumed by apply_rules(): {pattern, replacement, pos_list}.
-- The replacement may be a string or a lookup table keyed by the matched text.
-- pos_list is optional; when present the rule only runs if the word's
-- part-of-speech code is in the list.
local phonemic_rules = {
-- vowel length: the combining macron becomes the IPA length mark
{MACRON, "ː"},
{"eoː", "oː"}, -- e.g. ġeōmor
{"eaː", "aː"},
-- diphthongs; sequences without an entry in the table are left as they are
{"[ei]ː?[aeo]", {
-- Alternative notation for short diphthongs: iu̯, eo̯, æɑ̯
-- Alternative notation for long diphthongs: iːu̯, eːo̯, æːɑ̯
["ea"] = "æ͜ɑ",
["eːa"] = "æ͜ɑː",
["eo"] = "e͜o",
["eːo"] = "e͜oː",
["io"] = "i͜u",
["iːo"] = "i͜uː",
["ie"] = "i͜y",
["iːe"] = "i͜yː",
}},
-- sċ between vowels when at the beginning of a syllable should be ʃ.ʃ
{"(" .. vowel_c .. "ː?)([.ˈˌ]?)sċ(" .. vowel_c .. ")", "%1ʃ%2ʃ%3"},
-- other sċ should be ʃ; note that sċ divided between syllables becomes s.t͡ʃ
{"sċ", "ʃ"},
-- x between vowels when at the beginning of a syllable should be k.s;
-- remaining x handled below
{"(" .. vowel_c .. "ː?)([.ˈˌ]?)x(" .. vowel_c .. ")", "%1k%2s%3"},
-- z between vowels when at the beginning of a syllable should be t.s;
-- remaining z handled below
{"(" .. vowel_c .. "ː?)([.ˈˌ]?)z(" .. vowel_c .. ")", "%1t%2s%3"},
-- short front vowel + -rian, -riend, -rienne, -riende in verb or verbal is
-- rendered with /j/; we need to carefully change the syllable structure
-- when doing this
{"(" .. front_vowel_c .. ")%.ri%.(an⁀)", "%1r.ġ%2", {VERB}},
{"(" .. front_vowel_c .. ")%.ri%.(end⁀)", "%1r.ġ%2", {VERB, VERBAL}},
{"(" .. front_vowel_c .. ")%.ri%.(en%.[nd]e⁀)", "%1r.ġ%2", {VERB, VERBAL}},
-- consonant pairs (possibly straddling a syllable mark) that need joint treatment
{"nċ([.ˈˌ]?)ġ", "n%1j"},
{"ċ([.ˈˌ]?)ġ", "j%1j"},
{"c([.ˈˌ]?)g", "g%1g"},
{"ċ([.ˈˌ]?)ċ", "t%1t͡ʃ"},
-- finally, single letters to phonemes; letters not listed stay unchanged
{".", {
["ċ"] = "t͡ʃ",
["c"] = "k",
["ġ"] = "j",
["h"] = "x",
["þ"] = "θ",
["ð"] = "θ",
["ƿ"] = "w",
["x"] = "ks",
["z"] = "ts",
["g"] = "ɡ", -- map to IPA ɡ
["a"] = "ɑ",
["œ"] = "ø",
}},
}
-- Voiceless fricative -> voiced counterpart; used by the voicing rule in phonetic_rules.
local fricative_to_voiced = {
["f"] = "v",
["s"] = "z",
["θ"] = "ð",
}
-- Inverse mapping; used by the devoicing rule in phonetic_rules.
local fricative_to_unvoiced = {
["v"] = "f",
["z"] = "s",
["ð"] = "θ",
}
-- These rules operate in order, on the output of phonemic_rules.
-- The output of this is used to generate the displayed phonetic
-- pronunciation by removing ⁀ symbols.
-- Same rule format as phonemic_rules: {pattern, replacement, pos_list}, where
-- the replacement may be a string, a lookup table or a function and pos_list
-- is optional.
local phonetic_rules = {
-- Fricative voicing between voiced sounds. Note, the following operates
-- across a ⁀ boundary for a fricative before the boundary but not after.
{"([" .. voiced_sound .. "][ː.ˈˌ]*)([fsθ])([ː.ˈˌ⁀]*[" .. voiced_sound .. "])",
function(s1, c, s2)
return s1 .. fricative_to_voiced[c] .. s2
end
},
-- Fricative between unstressed vowels should be devoiced.
-- Note that unstressed syllables are preceded by . while stressed
-- syllables are preceded by a stress mark.
{"(%.[^.⁀][" .. vowel .. DOUBLE_BREVE_BELOW .. "ː]*%.)([vzð])",
function(s1, c)
return s1 .. fricative_to_unvoiced[c]
end
},
-- Final -sian, -siend, -sienne, -siende (and variants such as -siġan,
-- -siġend, etc.) in verb or verbal is rendered with [s]; clǣnsian will
-- have to be special-cased with ''[z]''
{"(" .. cons_c .. "ː?" .. "%.)z(i%.j?ɑn⁀)", "%1s%2", {VERB}},
{"(" .. cons_c .. "ː?" .. "%.)z(i%.j?end⁀)", "%1s%2", {VERB, VERBAL}},
{"(" .. cons_c .. "ː?" .. "%.)z(i%.j?en%.[nd]e⁀)", "%1s%2", {VERB, VERBAL}},
-- Final unstressed -þu/-þo after a consonant should be devoiced.
{"(" .. cons_c .. "ː?" .. "%.)ð([uo]⁀)",
function(s1, s2)
return s1 .. "θ" .. s2
end
},
-- x before w, n, l or r merges with it into a single voiceless sound
{"x[wnlr]", {
["xw"] = "ʍ",
["xl"] = "l̥",
["xn"] = "n̥",
["xr"] = "r̥",
}},
-- Note, the following will not operate across a ⁀ boundary.
{"n([.ˈˌ]?[ɡk])", "ŋ%1"}, -- WARNING, IPA ɡ used here
{"n([.ˈˌ]?)j", "n%1d͡ʒ"},
{"j([.ˈˌ]?)j", "d%1d͡ʒ"},
{"([^x][⁀.ˈˌ])x", "%1h"}, -- [h] occurs as a syllable-initial allophone
{"(" .. front_vowel_c .. ")x", "%1ç"}, -- [ç] occurs after front vowels
-- An IPA ɡ after a word/prefix boundary, after another ɡ or after n
-- (previously converted to ŋ in this circumstance) should remain as ɡ,
-- while all other ɡ's should be converted to ɣ except that word-final ɡ
-- becomes x. We do this by converting the ɡ's that should remain to regular
-- g (which should never occur otherwise), convert the remaining IPA ɡ's to ɣ
-- or x, and then convert the regular g's back to IPA ɡ.
-- NOTE(review): no rule in this list turns word-final ɡ into x; the rules
-- below only produce ɣ. Confirm whether that step was dropped on purpose.
{"ɡ([.ˈˌ]?)ɡ", "g%1g"}, -- WARNING, IPA ɡ on the left, regular g on the right
{"([ŋ⁀])([.ˈˌ]?)ɡ", "%1%2g"}, -- WARNING, IPA ɡ on the left, regular g on the right
{"ɡ", "ɣ"},
{"g", "ɡ"}, -- WARNING, regular g on the left, IPA ɡ on the right
-- l and r become ɫ and rˠ when geminate or before another consonant
{"l([.ˈˌ]?)l", "ɫ%1ɫ"},
{"r([.ˈˌ]?)r", "rˠ%1rˠ"},
{"l([.ˈˌ]?" .. cons_c .. ")", "ɫ%1"},
{"r([.ˈˌ]?" .. cons_c .. ")", "rˠ%1"},
-- Geminate consonants within a single syllable are pronounced singly.
-- Does not apply e.g. to ''ǣttren'', which will be divided as ''ǣt.tren''.
{"(" .. cons_c .. ")%1", "%1"},
{"rˠrˠ", "rˠ"},
-- [In the sequence vowel + obstruent + resonant in a single syllable,
-- the resonant should become syllabic, e.g. ādl [ˈɑːdl̩], blōstm [bloːstm̩],
-- fæþm [fæðm̩], bēacn [ˈbæːɑ̯kn̩]. We allow anything but a syllable or word
-- boundary between the vowel and the obstruent.] [BASED ON INPUT FROM
-- [[User:Urszag]], I'VE DECIDED AGAINST THIS]
-- {"(" .. vowel_c .. "[^.ˈˌ⁀]*[" .. obstruent .. "]ː?[" .. resonant .. "])", "%1" .. SYLLABIC},
-- also -mn e.g stemn /ˈstemn̩/; same for m + other resonants except m
-- {"(" .. vowel_c .. "[^.ˈˌ⁀]*mː?[lnŋrɫ])", "%1" .. SYLLABIC},
-- last step: expand the explicit-allophone placeholders into their phones
{".", explicit_char_to_phonetic},
}
-- Spelled-out names for single characters (Latin letters in both cases, then
-- runes). procesar_pron_args() looks up the first help string here and, on a
-- hit, uses the one-element list as the text to pronounce.
local pron_abc = {
-- uppercase Latin letters
["A"] = {"a"},
["B"] = {"bee"},
["C"] = {"cee"},
["D"] = {"dee"},
["E"] = {"e"},
["F"] = {"eff"},
["G"] = {"gee"},
["H"] = {"aitch"},
["I"] = {"i"},
["J"] = {"jay"},
["K"] = {"kay"},
["L"] = {"el"},
["M"] = {"em"},
["N"] = {"en"},
["O"] = {"o"},
["P"] = {"pee"},
["Q"] = {"cue"},
["R"] = {"ar"},
["S"] = {"ess"},
["T"] = {"tee"},
["U"] = {"u"},
["V"] = {"vee"},
["W"] = {"double-u"},
["X"] = {"ex"},
["Y"] = {"wye"},
["Z"] = {"zed"},
-- lowercase Latin letters (same names as the uppercase ones)
["a"] = {"a"},
["b"] = {"bee"},
["c"] = {"cee"},
["d"] = {"dee"},
["e"] = {"e"},
["f"] = {"eff"},
["g"] = {"gee"},
["h"] = {"aitch"},
["i"] = {"i"},
["j"] = {"jay"},
["k"] = {"kay"},
["l"] = {"el"},
["m"] = {"em"},
["n"] = {"en"},
["o"] = {"o"},
["p"] = {"pee"},
["q"] = {"cue"},
["r"] = {"ar"},
["s"] = {"ess"},
["t"] = {"tee"},
["u"] = {"u"},
["v"] = {"vee"},
["w"] = {"double-u"},
["x"] = {"ex"},
["y"] = {"wye"},
["z"] = {"zed"},
-- runes, mapped to their Old English names
["ᚠ"] = {"feoh"},
["ᚢ"] = {"ūr"},
["ᚦ"] = {"þorn"},
["ᚩ"] = {"ōs"},
["ᚱ"] = {"rād"},
["ᚳ"] = {"cēn"},
["ᚷ"] = {"gyfu"},
["ᚹ"] = {"wynn"},
["ᚻ"] = {"hægl"},
["ᚾ"] = {"nēod"},
["ᛁ"] = {"īs"},
["ᛡ"] = {"gēar"},
["ᛄ"] = {"gēar"},
["ᛇ"] = {"īw"},
["ᛈ"] = {"peorð"},
["ᛉ"] = {"ilcs"},
["ᛋ"] = {"sigel"},
["ᚴ"] = {"sigel"},
["ᛏ"] = {"Tīw"},
["ᛒ"] = {"beorc"},
["ᛖ"] = {"eh"},
["ᛗ"] = {"mann"},
["ᛚ"] = {"lagu"},
["ᛝ"] = {"ing"},
["ᛟ"] = {"ēðel"},
["ᛞ"] = {"dæg"},
["ᚪ"] = {"āc"},
["ᚫ"] = {"æsc"},
["ᛠ"] = {"ēar"},
["ᚣ"] = {"ȳr"},
}
-- Normalize raw input text before it is converted to a pronunciation.
--
-- Lowercases the text, turns fragment-delimiting punctuation into " | ",
-- deletes the remaining punctuation, strips one trailing and one leading
-- hyphen (remembering that they were there) and collapses redundant bars and
-- whitespace.
--
-- @param texto raw input string
-- @return the normalized text
-- @return is_prefix: second result of strsubb() for the trailing-hyphen
--         removal (presumably true when a hyphen was stripped, i.e. the input
--         was written as a prefix — confirm against Módulo:String)
-- @return is_suffix: same for the leading hyphen
local function normalizar(texto)
	-- Both flags are locals. The original assigned the second one to a
	-- misspelled name (`is_sufix`), which silently created a global variable.
	local is_prefix, is_suffix
	texto = strlower(texto)
	--t = strsub(t, "%[(.)%]", char_to_explicit_char)
	texto = strsubrep(texto, PUNTUACION, " | ") -- turn fragment delimiters into IPA foot boundaries |
	texto = strsubrep(texto, PUNTUACION_EXTRA, "") -- drop whatever punctuation is still left
	texto, is_prefix = strsubb(texto, "[%-‐]$", "")
	texto, is_suffix = strsubb(texto, "^[%-‐]", "")
	-- texto = strsubrep(texto, "[%-‐]", " ") -- hyphens would become spaces (austro-húngaro, franco-italiano)
	texto = strsubrep(texto, "%s*|%s*|%s*", " | ") -- finally, collapse repeated bars and surplus spaces
	texto = strsubrep(texto, "%s+", " ")
	texto = strstrip(texto, "[%s|]+")
	return texto, is_prefix, is_suffix
end
-- Run an ordered rule list over a word and return the rewritten word.
--
-- Each rule is {pattern, replacement, pos_list}. A rule without pos_list
-- always applies; otherwise it applies only when `pos` is contained in
-- pos_list.
local function apply_rules(word, rules, pos)
	local result = word
	for _, entry in ipairs(rules) do
		local pattern, replacement, pos_filter = entry[1], entry[2], entry[3]
		local skip = pos_filter and not m_table.contains(pos_filter, pos)
		if not skip then
			result = strsub(result, pattern, replacement)
		end
	end
	return result
end
-- Look up the stress treatment that a prefix/suffix spec assigns to a part of
-- speech.
--
-- The spec tables are keyed by the strings "noun", "verb" and "verbal", while
-- the rest of this module passes the numeric codes SUST, VERB and VERBAL.
-- Indexing the spec directly with the numeric code can never hit, which made
-- every prefix and suffix look "not recognized for this POS"; the code is
-- therefore translated to the matching key first. A string key passed
-- directly is still accepted.
--
-- @param stress_spec table such as {verb = "unstressed", noun = "stressed"}
-- @param pos part-of-speech code (SUST, VERB or VERBAL)
-- @return "unstressed", "secstressed", "stressed", or nil when the affix is
--         not recognized for this part of speech; a verbal form falls back to
--         the "verb" entry when there is no "verbal" entry
local function lookup_stress_spec(stress_spec, pos)
	local key = pos
	if pos == SUST then
		key = "noun"
	elseif pos == VERB then
		key = "verb"
	elseif pos == VERBAL then
		key = "verbal"
	end
	local found = stress_spec[key]
	if not found and key == "verbal" then
		found = stress_spec["verb"]
	end
	return found or nil
end

-- Split a (decomposed) word into its component parts: the pieces separated by
-- the explicit boundary symbols "-", "<" and ">", plus any recognized prefixes
-- and suffixes split off automatically. Stress marks are added on the way:
-- an affix gets AUTOACUTE/AUTOGRAVE after its first vowel according to its
-- stress spec, and a remaining part without any stress mark gets AUTOACUTE
-- (first stressed part) or AUTOGRAVE (later parts).
--
-- A part followed by "<" or preceded by ">" is neither auto-split nor
-- auto-stressed.
--
-- @param word decomposed spelling of one word
-- @param pos part-of-speech code (SUST, VERB or VERBAL)
-- @return list of parts in left-to-right order, with every "+" removed
local function split_on_word_boundaries(word, pos)
	local retparts = {}
	-- odd entries are the parts, even entries the captured separators
	local parts = strsplit(word, "([<>%-])")
	local i = 1
	local saw_primary_stress = false
	while i <= #parts do
		-- Prefixes are inserted here and push the position forward; suffixes
		-- and finally the stem are inserted at the (updated) position, which
		-- keeps everything in left-to-right order.
		local insert_position = #retparts + 1
		if parts[i + 1] ~= "<" and parts[i - 1] ~= ">" then
			-- Split off any prefixes.
			while true do
				local broke_prefix = false
				for _, prefixspec in ipairs(prefixes) do
					local prefix_pattern = prefixspec[1]
					local stress_spec = prefixspec[2]
					local pos_stress = lookup_stress_spec(stress_spec, pos)
					local prefix, rest = strmatch(parts[i], "^(" .. prefix_pattern .. ")(.*)$")
					if prefix then
						if not pos_stress then
							-- prefix not recognized for this POS, don't split here
						elseif stress_spec.restriction and not strfind(rest, stress_spec.restriction) then
							-- restriction not met, don't split here
						elseif strfind(rest, "^%+") then
							-- explicit non-boundary here, so don't split here
						elseif not strfind(rest, vowel_c) then
							-- no vowels, don't split here
						elseif strfind(rest, "^..?$") then
							-- only two letters, unlikely to be a word, probably an ending, so don't split
							-- here
						else
							local initial_cluster, after_cluster = strmatch(rest, "^(" .. non_vowel_c .. "*)(.-)$")
							if strfind(initial_cluster, "..") and (
								not (onsets_2[initial_cluster] or secondary_onsets_2[initial_cluster] or
									onsets_3[initial_cluster])) then
								-- initial cluster isn't a possible onset, don't split here
							elseif strfind(initial_cluster, "^x") then
								-- initial cluster isn't a possible onset, don't split here
							elseif strfind(after_cluster, "^" .. vowel_c .. "$") then
								-- remainder is a cluster + short vowel,
								-- unlikely to be a word so don't split here
							else
								-- break the word in two; next iteration we process
								-- the rest, which may need breaking again
								parts[i] = rest
								if pos_stress == "unstressed" then
									-- don't do anything
								elseif pos_stress == "secstressed" or (saw_primary_stress and pos_stress == "stressed") then
									prefix = strsub(prefix, "(" .. vowel_c .. ")", "%1" .. AUTOGRAVE, 1)
								elseif pos_stress == "stressed" then
									prefix = strsub(prefix, "(" .. vowel_c .. ")", "%1" .. AUTOACUTE, 1)
									saw_primary_stress = true
								else
									error("Unrecognized stress spec for pos=" .. pos .. ", prefix=" .. prefix .. ": " .. pos_stress)
								end
								insert(retparts, insert_position, prefix)
								insert_position = insert_position + 1
								broke_prefix = true
								break
							end
						end
					end
				end
				if not broke_prefix then
					break
				end
			end
			-- Now do the same for suffixes.
			while true do
				local broke_suffix = false
				for _, suffixspec in ipairs(suffixes) do
					local suffix_pattern = suffixspec[1]
					local stress_spec = suffixspec[2]
					local pos_stress = lookup_stress_spec(stress_spec, pos)
					local rest, suffix = strmatch(parts[i], "^(.-)(" .. suffix_pattern .. ")$")
					if suffix then
						if not pos_stress then
							-- suffix not recognized for this POS, don't split here
						elseif stress_spec.restriction and not strfind(rest, stress_spec.restriction) then
							-- restriction not met, don't split here
						elseif strfind(rest, "%+$") then
							-- explicit non-boundary here, so don't split here
						elseif not strfind(rest, vowel_c) then
							-- no vowels, don't split here
						else
							local before_cluster, final_cluster = strmatch(rest, "^(.-)(" .. non_vowel_c .. "*)$")
							if strfind(final_cluster, "%..") then
								-- syllable division within or before final
								-- cluster, don't split here
							else
								-- break the word in two; next iteration we process
								-- the rest, which may need breaking again
								parts[i] = rest
								if pos_stress == "unstressed" then
									-- don't do anything
								elseif pos_stress == "secstressed" then
									suffix = strsub(suffix, "(" .. vowel_c .. ")", "%1" .. AUTOGRAVE, 1)
								elseif pos_stress == "stressed" then
									error("Primary stress not allowed for suffixes (suffix=" .. suffix .. ")")
								else
									error("Unrecognized stress spec for pos=" .. pos .. ", suffix=" .. suffix .. ": " .. pos_stress)
								end
								-- inserted at the stem position, so a suffix found
								-- later (further left) ends up before this one
								insert(retparts, insert_position, suffix)
								broke_suffix = true
								break
							end
						end
					end
				end
				if not broke_suffix then
					break
				end
			end
		end
		-- We need the accent character itself, so capture it with strmatch();
		-- the index returned by a find could never equal CFLEX or ACUTE below.
		local acc = strmatch(parts[i], "(" .. stress_accent_c .. ")")
		if acc == CFLEX then
			-- remove circumflex but don't accent
			parts[i] = gsub(parts[i], CFLEX, "")
		elseif acc == ACUTE or acc == AUTOACUTE then
			saw_primary_stress = true
		elseif not acc and parts[i + 1] ~= "<" and parts[i - 1] ~= ">" then
			-- Add primary or secondary stress on the part; primary stress if no primary
			-- stress yet, otherwise secondary stress.
			acc = saw_primary_stress and AUTOGRAVE or AUTOACUTE
			saw_primary_stress = true
			parts[i] = strsub(parts[i], "(" .. vowel_c .. ")", "%1" .. acc, 1)
		end
		insert(retparts, insert_position, parts[i])
		-- skip over the separator that follows this part
		i = i + 2
	end
	-- remove any +, which has served its purpose
	-- (indexed by the loop's own counter; reusing the outer `i` here wrote
	-- every result into a single slot past the end of the parts)
	for idx, part in ipairs(retparts) do
		retparts[idx] = gsub(part, "%+", "")
	end
	return retparts
end
-- Split a run of vowels (with their accent marks) into syllable nuclei.
-- Two neighbouring vowels that form a known diphthong (compared with stress
-- marks removed) are returned as one nucleus; every other vowel is a nucleus
-- of its own. Raises an error if anything other than vowels and accents is
-- found in the run.
local function break_vowels(vowelseq)
	-- odd cells: text between vowels (must be empty); even cells: vowel + accents
	local pieces = strsplit(vowelseq, "(" .. vowel_c .. accent_c .. "*)")
	local total = #pieces
	local nuclei = {}

	local function require_empty(text)
		if text ~= "" then
			error("Something wrong, non-vowel '" .. text .. "' seen in vowel sequence '" .. vowelseq .. "'")
		end
	end

	local function without_stress(text)
		return strsub(text, stress_accent_c, "")
	end

	local pos = 1
	while pos <= total do
		if pos % 2 ~= 0 then
			require_empty(pieces[pos])
			pos = pos + 1
		elseif pos < total - 1
			and diphthongs[without_stress(pieces[pos]) .. without_stress(pieces[pos + 2])] then
			-- diphthong: consume both vowels and the (empty) gap between them
			require_empty(pieces[pos + 1])
			nuclei[#nuclei + 1] = pieces[pos] .. pieces[pos + 2]
			pos = pos + 3
		else
			nuclei[#nuclei + 1] = pieces[pos]
			pos = pos + 1
		end
	end
	return nuclei
end
-- Turn a word into an alternating list of C and V components. A C component
-- is a (possibly empty) run of consonants; a V component is one vowel or
-- diphthong. The list always has odd length: odd positions hold C components,
-- even positions hold V components. Vowels in hiatus are separated by an
-- empty C component.
local function break_into_c_and_v_components(word)
	local runs = strsplit(word, "(" .. vowel_or_accent_c .. "+)")
	local components = {}
	for idx = 1, #runs do
		local run = runs[idx]
		if idx % 2 == 0 then
			-- vowel run: may contain several nuclei
			for n, nucleus in ipairs(break_vowels(run)) do
				if n > 1 then
					components[#components + 1] = ""
				end
				components[#components + 1] = nucleus
			end
		else
			-- consonant run, copied through unchanged
			components[#components + 1] = run
		end
	end
	return components
end
-- Split a word part into syllables and return them as a list of strings.
-- Works on the alternating C/V list from break_into_c_and_v_components():
-- every consonant cluster (odd index) is distributed over the vowel cells on
-- either side of it, and at the end only the vowel cells (even indices), which
-- by then carry their consonants, are returned. A part without any vowel is
-- returned as a one-element list.
local function split_into_syllables(word)
local cons_vowel = break_into_c_and_v_components(word)
if #cons_vowel == 1 then
return cons_vowel
end
for i = 1, #cons_vowel do
if i % 2 == 1 then
-- consonant
local cluster = cons_vowel[i]
local len = strlen(cluster)
if i == 1 then
-- word-initial cluster: all of it is the onset of the first syllable
cons_vowel[i + 1] = cluster .. cons_vowel[i + 1]
elseif i == #cons_vowel then
-- word-final cluster: all of it is the coda of the last syllable
cons_vowel[i - 1] = cons_vowel[i - 1] .. cluster
elseif strfind(cluster, "%.") then
-- explicit syllable break: split at the first period
local before_break, after_break = strmatch(cluster, "^(.-)%.(.*)$")
cons_vowel[i - 1] = cons_vowel[i - 1] .. before_break
cons_vowel[i + 1] = after_break .. cons_vowel[i + 1]
elseif len == 0 then
-- do nothing
elseif len == 1 then
-- a single consonant goes with the following syllable
cons_vowel[i + 1] = cluster .. cons_vowel[i + 1]
elseif len == 2 then
-- two consonants are split down the middle, except sċ which stays together
local c1, c2 = strmatch(cluster, "^(.)(.)$")
if c1 == "s" and c2 == "ċ" then
cons_vowel[i + 1] = "sċ" .. cons_vowel[i + 1]
else
cons_vowel[i - 1] = cons_vowel[i - 1] .. c1
cons_vowel[i + 1] = c2 .. cons_vowel[i + 1]
end
else
-- check for onset_3 preceded by consonant(s).
local first3, last3 = strmatch(cluster, "^(.-)(...)$")
if #first3 > 0 and onsets_3[last3] then
cons_vowel[i - 1] = cons_vowel[i - 1] .. first3
cons_vowel[i + 1] = last3 .. cons_vowel[i + 1]
else
-- next try a two-consonant onset; a secondary onset only counts when
-- the consonant before it is not l or r
local first2, last2 = strmatch(cluster, "^(.-)(..)$")
if onsets_2[last2] or (secondary_onsets_2[last2] and not first2:find("[lr]$")) then
cons_vowel[i - 1] = cons_vowel[i - 1] .. first2
cons_vowel[i + 1] = last2 .. cons_vowel[i + 1]
else
-- fallback: only the last consonant starts the following syllable
local first, last = strmatch(cluster, "^(.-)(.)$")
cons_vowel[i - 1] = cons_vowel[i - 1] .. first
cons_vowel[i + 1] = last .. cons_vowel[i + 1]
end
end
end
end
end
-- collect the vowel cells, which now hold complete syllables
local retval = {}
for i = 1, #cons_vowel do
if i % 2 == 0 then
-- remove any stray periods.
insert(retval, strsub(cons_vowel[i], "%.", ""))
end
end
return retval
end
-- Join syllables into one string. Each syllable is prefixed with ˈ when it
-- carries primary stress, ˌ when it carries secondary stress, and otherwise
-- with "." unless it is the first syllable; the stress accent characters
-- themselves are removed. Auto-generated stress marks are honoured only when
-- no_auto_stress is falsy; user-supplied acute/grave always count.
local function combine_syllables_moving_stress(syllables, no_auto_stress)
	local auto_allowed = not no_auto_stress
	local pieces = {}
	for idx, syl in ipairs(syllables) do
		local primary = syl:find(ACUTE) or (auto_allowed and syl:find(AUTOACUTE))
		local secondary = syl:find(GRAVE) or (auto_allowed and syl:find(AUTOGRAVE))
		local marker = ""
		if primary then
			marker = "ˈ"
		elseif secondary then
			marker = "ˌ"
		elseif idx > 1 then
			marker = "."
		end
		pieces[#pieces + 1] = marker .. strsub(syl, stress_accent_c, "")
	end
	return concat(pieces)
end
-- Join word parts (split-off prefixes and suffixes, or members of a compound)
-- into one word. Parts are separated by ⁀ and the whole word is wrapped in
-- ⁀⁀ on both sides. Every part after the first that does not already begin
-- with a stress mark gets a leading "." as its syllable boundary.
local function combine_parts(parts)
	local pieces = {}
	for idx, part in ipairs(parts) do
		local needs_dot = idx > 1 and not strfind(part, "^[ˈˌ]")
		pieces[#pieces + 1] = needs_dot and ("." .. part) or part
	end
	return "⁀⁀" .. concat(pieces, "⁀") .. "⁀⁀"
end
-- Convert one spelled word into its internal representation: decomposed,
-- split into parts and syllables, with stress and boundary symbols in place.
-- Auto-generated stress is suppressed when the caller asks for it, or when
-- the word is a single part consisting of a single syllable.
local function transform_word(word, pos, no_auto_stress)
	local parts = split_on_word_boundaries(decompose(word), pos)
	for idx, part in ipairs(parts) do
		local syllables = split_into_syllables(part)
		local suppress_auto = no_auto_stress or (#parts == 1 and #syllables == 1)
		parts[idx] = combine_syllables_moving_stress(syllables, suppress_auto)
	end
	return combine_parts(parts)
end
-- Guess the part of speech of a word from its ending, for use when the caller
-- gave none. Returns VERB, VERBAL or SUST.
local function default_pos(word)
	-- verbs in -an/-ōn/-ēon, inflected infinitives in -enne
	for _, ending in ipairs({"[aāō]n$", "ēon$", "enne$"}) do
		if strfind(word, ending) then
			return VERB
		end
	end
	-- adjectives in -līċ, adverbs in -līċe and nouns in -nes can follow
	-- nouns or participles (which are VERBAL); strip such an ending (the stem
	-- has to keep at least one vowel) and look at what precedes it
	local stem = strsub(word, "^(.*" .. vowel_c .. ".*)l[iī][cċ]e?$", "%1")
	stem = strsub(stem, "^(.*" .. vowel_c .. ".*)n[eiy]ss?$", "%1")
	-- participles in -end(e)/-en/-ed/-od, verbal nouns in -ing/-ung
	for _, ending in ipairs({"ende?$", "[eo]d$", "en$", "[iu]ng$"}) do
		if strfind(stem, ending) then
			return VERBAL
		end
	end
	return SUST
end
-- Generate the pronunciation of a text.
--
-- @param t1 raw text (one or more words, possibly with punctuation)
-- @param fone truthy for phonetic output (phonetic rules applied on top of
--        the phonemic ones), falsy for phonemic output
-- @param cg list of part-of-speech codes, one per word counted across the
--        whole text; a missing entry falls back to default_pos()
-- @return a list containing one list with the resulting string: words joined
--         by spaces, fragments joined by " | ", boundary symbols removed and
--         the result passed through strhtml()
local function generar_pron(t1, fone, cg)
	local t, is_prefix, is_suffix = normalizar(t1)
	local affix_marked = is_prefix or is_suffix
	local explicit_map = fone and explicit_char_to_phonetic or explicit_char_to_phonemic
	local convertido = {}
	local fragmentos = strsplit(t, "%s*|%s*")
	local k = 0 -- running word counter, used to index cg
	for _, fragmento in ipairs(fragmentos) do
		local palabras = strsplit(fragmento, "%s")
		-- auto stress is suppressed only for a lone word written as an affix
		local is_prefix_suffix = affix_marked and #palabras == 1
		local palabras_convertidas = {}
		for _, palabra in ipairs(palabras) do
			k = k + 1
			local pos = cg[k] or default_pos(palabra)
			local pron = transform_word(palabra, pos, is_prefix_suffix)
			pron = apply_rules(pron, phonemic_rules, pos)
			if fone then
				pron = apply_rules(pron, phonetic_rules, pos)
			end
			palabras_convertidas[#palabras_convertidas + 1] = pron
		end
		convertido[#convertido + 1] = concat(palabras_convertidas, " ")
	end
	-- expand explicit-allophone placeholders, then drop the boundary symbols
	local result = strsub(concat(convertido, " | "), ".", explicit_map)
	result = strsub(result, "⁀", "")
	result = strhtml(result)
	return {{result}}
end
-- Fill in the pronunciation-related entries of `args` for a page.
--
-- When no help string was supplied, the page title is used as the text to
-- pronounce. When neither args["fone"] nor args["fono"] holds anything,
-- phonetic pronunciations are generated from the help strings (at most 9 in
-- total) and appended to args["fone"]. Finally, when a first pronunciation
-- exists, args["rima"][1] is derived from it.
--
-- @param titulo page title
-- @param args argument table; the code reads the list-valued entries "ayuda",
--        "ayudaextra", "fone" and "fono" and writes to "ayuda", "tl", "fgraf",
--        "fone" and "rima"
-- @return the same `args` table, modified in place
function export.procesar_pron_args(titulo, args)
	-- The original also computed whether the title contains whitespace, but
	-- never used the result; that dead local has been removed.
	local vino_ayuda = #args["ayuda"] >= 1
	if not vino_ayuda then
		args["ayuda"][1] = titulo
	end
	if #args["fone"] < 1 and #args["fono"] < 1 then
		-- single letters and runes are pronounced by their name
		local x = pron_abc[args["ayuda"][1]]
		if x then
			args["ayuda"] = x
			args["tl"] = x
		end
		local A = #args["ayuda"]
		local j = 1 -- index into the help strings
		local k = 1 -- number of pronunciations inserted so far (at most 9)
		while k <= 9 and j <= A do
			-- per-word part-of-speech codes, taken from the ";"-separated flags
			local cg = {}
			local flags = args["ayudaextra"][j] and strsplit(args["ayudaextra"][j], ";") or {}
			for _, flag in ipairs(flags) do
				if flag == "s" or flag == "sust" or flag == "sustantivo" or flag == "a" or flag == "adj" or flag == "adjetivo" then
					insert(cg, SUST)
				elseif flag == "v" or flag == "verb" or flag == "verbo" then
					insert(cg, VERB)
				elseif flag == "l" or flag == "verbal" then
					insert(cg, VERBAL)
				end
			end
			if vino_ayuda then
				args["fgraf"][j] = {args["ayuda"][j]}
			end
			local fone = generar_pron(args["ayuda"][j], true, cg)
			for i, _ in ipairs(fone) do
				insert(args["fone"], fone[i])
				k = k + 1
				if k > 9 then
					break
				end
			end
			j = j + 1
		end
	end
	if args["fone"][1] and args["fone"][1][1] then
		local rim = strsub(args["fone"][1][1], ".*%s([^%s]+)$", "%1") -- keep only the last word
		rim = strsub(rim, "^.*ˈ(.-)$", "%1") -- keep what follows the last primary stress mark
		-- the rhyme starts at the first vowel of what is left
		args["rima"][1] = strsub(rim, ".-" .. "([" .. vowel .. "].*" .. ")" .. "$", "%1")
	end
	return args
end
return export