-- La ducumintazzioni di stu mòdulu si pò criari nta Mòdulu:Sicilian/doc

-- Stu mòdulu cunteni funzioni utili pi manipulari palori in Sicilianu,
-- in particulari è adupiratu pi criari autumaticamenti tavuli dâ
-- cugnugazzioni dî verbi rigulari e dâ flissioni di aggittivi e sustantivi
-- rigulari.

-- This module contains functions useful to manipulate Sicilian words,
-- in particular it's used to automatically create tables for the
-- conjugation of regular verbs and for the inflection of regular adjectives
-- and nouns.

-- Export table of the module: the wikicode entry points are stored as fields
-- of this table, which is returned at the end of the file.
local p = {}

-- Strip the grave accent from every accented vowel of the given word and
-- return the resulting string (a single value).
-- The Sicilian rule for unstressed vowels is applied: an "è" that loses its
-- accent becomes "i" and an "ò" becomes "u", while "à", "ì" and "ù" simply
-- drop the accent. An accented "i with diaeresis" keeps only the diaeresis.
function remove_accents(word)
	-- Ordered list of { accented form, replacement } pairs.
	local substitutions = {
		{ "à", "a" },
		{ "è", "i" },
		{ "ì", "i" },
		{ "ò", "u" },
		{ "ù", "u" },
		{ "ï̀", "ï" },
	}
	local plain = word
	for _, rule in ipairs(substitutions) do
		-- Assigning to a single variable discards gsub's replacement count.
		plain = mw.ustring.gsub(plain, rule[1], rule[2])
	end
	return plain
end

-- Return the given word unchanged, except that every accented
-- "i with diaeresis" (not an acceptable character) is replaced with a plain
-- accented "ì".
-- Exactly one value is returned: the outer parentheses drop the replacement
-- count that mw.ustring.gsub returns as its second result, so the count can
-- never leak into a caller's argument list.
local function leave_accents(word)
	return (mw.ustring.gsub(word, "ï̀", "ì"))
end

-- Return true if the given word contains any accented vowel, false otherwise.
-- The accented "i with diaeresis" is a multi-character sequence, so it is
-- searched for as a plain substring: inside a [...] set its components would
-- be separate members and a merely diaeresised, unaccented "ï" would wrongly
-- be reported as accented.
function is_accented(word)
	if mw.ustring.find(word, "[àèìòù]") then
		return true
	end
	-- Fourth argument `true` requests a plain (non-pattern) search.
	return mw.ustring.find(word, "ï̀", 1, true) ~= nil
end

-- Cut the given word at character position pos (1-based, inclusive) and
-- return the leading fragment.
-- The fragment is normalised so that no phonetic information is lost and it
-- can later be glued to a tail with join_words():
--   * a fragment ending in a "soft" c/g ends with the bare consonant,
--     without the following "i";
--   * a fragment ending in a "hard" c/g ends with an added "h".
function truncate_word(word, pos)
	if pos > 1 then
		-- Only the *unaccented* "ci"/"gi" is collapsed. Accented "cì"/"gì" are
		-- often written instead of "cij"/"gij" or a diaeresis to prevent a
		-- diphthong: "caccì-ari" must stay "caccì-ari" (it conjugates to
		-- "caccì-u"), not become "cacc-ari" ("càcc-iu" is a different verb).
		local ending = mw.ustring.sub(word, pos - 1, pos)
		if ending == "ci" or ending == "gi" then
			return mw.ustring.sub(word, 1, pos - 1)
		end
	end

	if pos < mw.ustring.len(word) then
		local cut_letter = mw.ustring.sub(word, pos, pos)
		if cut_letter == "c" or cut_letter == "g" then
			-- The consonant is "hard" unless an e/i (accented or not) follows.
			local follower = remove_accents(mw.ustring.sub(word, pos + 1, pos + 1))
			if not (follower == "e" or follower == "i") then
				return mw.ustring.sub(word, 1, pos) .. "h"
			end
		end
	end

	return mw.ustring.sub(word, 1, pos)
end

-- Join two word fragments and return a word that is phonetically equivalent
-- to the two fragments read one after the other.
-- word1 is the head (typically produced by truncate_word), word2 the tail.
-- Rules, checked in this order:
--   1. head ends in "c"/"g" (soft): an "i" is inserted unless the tail
--      begins with "e" or "i";
--   2. head ends in "ch"/"gh" (hard): the "h" is dropped unless the tail
--      begins with "e" or "i";
--   3. head ends in "i" and the tail begins with "i": the two merge into "î";
--   4. head ends in "i" and the tail begins with "ì": the head's "i" is
--      dropped;
--   5. head ends in "ï": the diaeresis is kept only when the tail begins with
--      a vowel other than i, otherwise it is replaced with a plain "i";
--   6. otherwise the fragments are simply concatenated.
function join_words(word1, word2)
	-- True when the tail begins with "e" or "i", accented or not.
	-- remove_accents() turns "è" into "i", which is still a front vowel, so
	-- the result is correct for this purpose.
	local function tail_starts_with_front_vowel()
		local first = remove_accents(mw.ustring.sub(word2, 1, 1))
		return first == "e" or first == "i"
	end

	local last = mw.ustring.sub(word1, -1)

	-- Rule 1: soft consonant needs an "i" before a, o, u and consonants.
	if last == "c" or last == "g" then
		if tail_starts_with_front_vowel() then
			return word1 .. word2
		end
		return word1 .. "i" .. word2
	end

	-- Rule 2: hard consonant only needs its "h" before e and i.
	local last_two = mw.ustring.sub(word1, -2)
	if last_two == "ch" or last_two == "gh" then
		if tail_starts_with_front_vowel() then
			return word1 .. word2
		end
		return mw.ustring.sub(word1, 1, -2) .. word2
	end

	if last == "i" then
		local first = mw.ustring.sub(word2, 1, 1)
		-- Rule 3: two consecutive unaccented "i" are never written as-is in
		-- Sicilian orthography; they become a single "i with circumflex".
		if first == "i" then
			return mw.ustring.sub(word1, 1, -2) .. "î" .. mw.ustring.sub(word2, 2)
		end
		-- Rule 4: "i with circumflex and grave" is not an acceptable
		-- representation of "ji", so only the accented "ì" survives. Replacing
		-- the first "i" with "j" would be more accurate but is not common in
		-- modern typography.
		if first == "ì" then
			return mw.ustring.sub(word1, 1, -2) .. word2
		end
	end

	-- Rule 5: the diaeresis is only meaningful before a vowel other than i.
	-- The raw first letter is tested here, NOT remove_accents() of it:
	-- remove_accents() maps "è" to "i", which would wrongly strip the
	-- diaeresis in front of an accented "è".
	if last == "ï" then
		local first = mw.ustring.sub(word2, 1, 1)
		if mw.ustring.find(first, "[aeouàèòù]") then
			return word1 .. word2
		end
		return mw.ustring.sub(word1, 1, -2) .. "i" .. word2
	end

	-- Rule 6: the common case.
	return word1 .. word2
end

-- Conjugation helper used by a wikicode template.
-- The last three characters of the infinitive `verb` are cut off and
-- replaced with one of the supplied endings, chosen by conjugation class:
--   verb ends in "àri" -> `ari`
--   verb ends in "iri" -> `iri`   ("îri" is first expanded to "i" + "iri")
--   verb ends in "ìri" -> `iiri`
-- When the chosen ending carries its own accent, the accent is removed from
-- the head of the verb; otherwise the head keeps its accents.
-- A non-empty `override` is returned as-is and nothing else is computed.
-- Raises an error when the verb has none of the recognised infinitive tails.
local function build_verb(verb, ari, iri, iiri, override)
	if override and mw.ustring.len(override) > 0 then
		return override
	end

	local verb_len = mw.ustring.len(verb)
	local infinitive_tail = mw.ustring.sub(verb, verb_len - 2)
	local head = truncate_word(verb, verb_len - 3)

	-- "î" stands for a contracted double "i": give one back to the head.
	if infinitive_tail == "îri" then
		head = head .. "i"
		infinitive_tail = "iri"
	end

	-- Pick the replacement ending for this conjugation class.
	local ending
	if infinitive_tail == "àri" then
		ending = ari
	elseif infinitive_tail == "iri" then
		ending = iri
	elseif infinitive_tail == "ìri" then
		ending = iiri
	else
		error "did not specify a verb in infinitive mode"
	end

	-- A word carries a single accent: drop the head's one if the ending has
	-- its own.
	local stem
	if is_accented(ending) then
		stem = remove_accents(head)
	else
		stem = leave_accents(head)
	end
	return join_words(stem, ending)
end

-- Inflect a regular adjective, given in its masculine singular form, into
-- the requested gender/number form.
-- adj_sm: the adjective, masculine singular.
-- gennum: the wanted form:
--	"sm" for the singular masculine (adj_sm is returned as-is);
--	"sf" for the singular feminine;
--	"pm" for the plural masculine;
--	"pf" for the plural feminine.
-- Any other gennum value fails an assertion.
-- Recognised patterns are adjectives ending in "icu", in "ìsi" and in "u";
-- an adjective matching none of them is returned unchanged.
function turn_adjective(adj_sm, gennum)
	local valid_forms = { sm = true, sf = true, pm = true, pf = true }
	assert(valid_forms[gennum], "invalid argument as gender/number")

	if gennum == "sm" then
		return adj_sm
	end

	local length = mw.ustring.len(adj_sm)
	local last_three = mw.ustring.sub(adj_sm, -3)

	-- "-icu" adjectives: the whole "cu" is replaced, with its own endings.
	if last_three == "icu" then
		local icu_endings = { sf = "ca", pm = "ci", pf = "chi" }
		local stem = truncate_word(adj_sm, length - 2)
		return join_words(stem, icu_endings[gennum])
	end

	-- "-ìsi" and plain "-u" adjectives: only the final vowel changes, and
	-- both plurals share the "i" ending.
	if last_three == "ìsi" or mw.ustring.sub(adj_sm, -1) == "u" then
		local stem = truncate_word(adj_sm, length - 1)
		if gennum == "sf" then
			return join_words(stem, "a")
		end
		return join_words(stem, "i")
	end

	return adj_sm
end

-- Adjective helper used by a wikicode template.
-- Behaves like turn_adjective(adj_sm, gennum), except that a non-empty
-- `override` string short-circuits the computation and is returned as-is.
local function build_adjective(adj_sm, gennum, override)
	local has_override = override and mw.ustring.len(override) > 0
	if not has_override then
		return turn_adjective(adj_sm, gennum)
	end
	return override
end

-- Worker functions used by turn_noun() to turn regular nouns across various
-- gender/number combinations. Not all combinations are implemented, only those
-- required by currently used templates, but adding new ones is trivial.

-- Turn a regular noun from singular masculine into plural masculine.
-- Handled endings: "-icu" (miccànicu -> miccànici), "-a" (puèta -> puèti)
-- and "-u" (lùpu -> lùpi). Any other noun is returned unchanged.
function turn_noun_sm_to_pm(noun)
	local length = mw.ustring.len(noun)

	-- miccànicu -> miccànici: the final "cu" is swapped for "ci".
	if mw.ustring.sub(noun, -3) == "icu" then
		return join_words(truncate_word(noun, length - 2), "ci")
	end

	-- puèta -> puèti, lùpu -> lùpi: the final vowel is swapped for "i".
	local final = mw.ustring.sub(noun, -1)
	if final == "a" or final == "u" then
		return join_words(truncate_word(noun, length - 1), "i")
	end

	return noun
end

-- Turn a regular noun from singular masculine into singular feminine.
-- Handled endings: "-ìsi" (catanìsi -> catanìsa) and "-u" (lùpu -> lùpa);
-- in both cases the final vowel is swapped for "a".
-- Any other noun is returned unchanged.
function turn_noun_sm_to_sf(noun)
	local matches_pattern = mw.ustring.sub(noun, -3) == "ìsi"
		or mw.ustring.sub(noun, -1) == "u"
	if not matches_pattern then
		return noun
	end

	local stem = truncate_word(noun, mw.ustring.len(noun) - 1)
	return join_words(stem, "a")
end

-- Turn a regular noun from singular neuter into plural neuter.
-- Handled endings: "-u" (pùgnu -> pùgna) and "-i" (casciùni -> casciùna);
-- in both cases the final vowel is swapped for "a".
-- Any other noun is returned unchanged.
function turn_noun_sn_to_pn(noun)
	local final = mw.ustring.sub(noun, -1)
	if final ~= "u" and final ~= "i" then
		return noun
	end

	local stem = truncate_word(noun, mw.ustring.len(noun) - 1)
	return join_words(stem, "a")
end

-- Turn a regular noun from singular masculine into plural feminine.
-- Only nouns ending in "-u" are handled: the final vowel is swapped for "i"
-- (mèdicu -> mèdichi). Any other noun is returned unchanged.
function turn_noun_sm_to_pf(noun)
	if mw.ustring.sub(noun, -1) ~= "u" then
		return noun
	end

	local stem = truncate_word(noun, mw.ustring.len(noun) - 1)
	return join_words(stem, "i")
end

-- Turn a regular noun from singular feminine into plural feminine.
-- Only nouns ending in "-a" are handled: the final vowel is swapped for "i"
-- (ròsa -> ròsi). Any other noun is returned unchanged.
function turn_noun_sf_to_pf(noun)
	if mw.ustring.sub(noun, -1) ~= "a" then
		return noun
	end

	local stem = truncate_word(noun, mw.ustring.len(noun) - 1)
	return join_words(stem, "i")
end

-- Turn a regular noun, given in one of its singular forms, into the
-- requested singular or plural, masculine or feminine form.
-- The result is only correct if the noun is regular and actually has the
-- requested form.
-- noun: the noun to convert.
-- source: the form the noun is currently in:
--	"sm" for the singular masculine;
--	"sn" for the singular neuter-masculine;
--	"sf" for the singular feminine.
-- dest: the form the noun should be switched into:
--	"sm" or "sn" for the singular masculine;
--	"sf" for the singular feminine;
--	"pm" or "pn" for the plural masculine;
--	"pf" for the plural feminine.
-- For a feminine source the code also accepts "sn" as a synonym of "sf" and
-- "pn" as a synonym of "pf".
-- The noun is returned unchanged if the worker function can't figure out
-- which regular noun pattern it belongs to.
-- Feminine to masculine conversion (and neuter to feminine) is not
-- implemented, as it wasn't needed by the current users of this function.
-- Raises an error for an unknown source, or for a destination that is not
-- supported for the given source.
function turn_noun(noun, source, dest)
	if source == "sm" then
		-- "sn" is accepted as a synonym of "sm", as documented above and in
		-- line with the "pm"/"pn" pair below; it used to raise an error.
		if dest == "sm" or dest == "sn" then return noun end
		if dest == "sf" then return turn_noun_sm_to_sf(noun) end
		if dest == "pm" or dest == "pn" then return turn_noun_sm_to_pm(noun) end
		if dest == "pf" then return turn_noun_sm_to_pf(noun) end
		error "invalid argument as destination gender / number"
	elseif source == "sn" then
		if dest == "sm" or dest == "sn" then return noun end
		if dest == "pm" or dest == "pn" then return turn_noun_sn_to_pn(noun) end
		error "invalid argument as destination gender / number"
	elseif source == "sf" then
		if dest == "sf" or dest == "sn" then return noun end
		if dest == "pf" or dest == "pn" then return turn_noun_sf_to_pf(noun) end
		error "invalid argument as destination gender / number"
	end
	error "invalid argument as source gender / number"
end

-- Noun helper used by a wikicode template.
-- Behaves like turn_noun(noun, source, dest), except that a non-empty
-- `override` string short-circuits the computation and is returned as-is.
local function build_noun(noun, source, dest, override)
	local has_override = override and mw.ustring.len(override) > 0
	if not has_override then
		return turn_noun(noun, source, dest)
	end
	return override
end

-- Set of the consonant groups that, when followed by a vowel, can begin a new
-- syllable according to the Italian language. A group qualifies if some word
-- begins with it; the comment next to each entry lists example words.
-- Keys are two- or three-letter groups and every value is `true`, so the
-- table is used as a membership test (see can_begin_word, which consults it
-- when its scn_rule argument is false).
local syll_starters_ita = {
	["bl"] = true,	-- blòcco, blù
	["br"] = true,	-- bràttea, bràvo
	["ch"] = true,	-- chièsa, chìna
	["cl"] = true,	-- clàsse, clòro
	["cr"] = true,	-- crèsceri, cròce
	["ḍḍ"] = true,	-- needed for Sicilian words with Italian rules
	["dr"] = true,	-- drìtto, dràgo
	["fl"] = true,	-- flèmma, flùsso
	["fr"] = true,	-- frèddo, Frància
	["gh"] = true,	-- ghiànda, ghìro
	["gl"] = true,	-- glòria, glàssa
	["gn"] = true,	-- gnòcco, gnòmo
	["gr"] = true,	-- grèco, grànde
	["pl"] = true,	-- plàgio, plàstica
	["pn"] = true,	-- pneumàtico
	["pr"] = true,	-- pròprio, pràtico
	["ps"] = true,	-- psicòlogo
	["sb"] = true,	-- sbàrra, sbàttere
	["sbl"] = true,	-- sbloccàre
	["sbr"] = true,	-- sbrinàre, sbrottàre
	["sc"] = true,	-- scìvolo, scèndere
	["sch"] = true,	-- schìfo, schèma
	["scl"] = true,	-- sclèra
	["scr"] = true,	-- scrostàre, scremàre
	["sd"] = true,	-- sdentàto, sdoganàre
	-- sdl: no words
	["sdr"] = true,	-- sdraiàre, sdrùcciolo
	["sf"] = true,	-- sfondàre, sfìda
	["sfl"]	= true,	-- word-internal only: (tran-SFLu-èn-za)
	["sfr"] = true,	-- sfrontàto, sfruttàre
	["sg"] = true,	-- sgòzzare, sguazzàre
	["sgh"] = true,	-- sghèrro, sghèmbo
	-- sgl: no words
	["sgr"] = true,	-- sgranàre, sgrassàre
	["sl"] = true,	-- Slovènia, slàvo
	["sm"] = true,	-- smèttere, smània
	["sn"] = true,	-- snòdo, snaturàre
	["sp"] = true,	-- sperànza, sparàre
	["spl"] = true,	-- splèndere, splènico
	["spr"] = true,	-- sprèmere, spropòsito
	["sq"] = true,	-- squàdra, squàllido
	["sr"] = true,	-- sradicàre, sregolàto
	["st"] = true,	-- stùfo, stèmma
	-- stl: no words, "post-lu-dio" would break but it's semantic anyway
	["str"] = true,	-- stràda, strappàre
	["sv"] = true,	-- sventàre, svèndere
	["tl"] = true,	-- word-internal only: (a-TLè-ta)
	["tr"] = true,	-- tròppo, tràino
	["vr"] = true,	-- originally annotated "nop"; presumably no Italian example word — TODO confirm
}

-- Set of the consonant groups that, when followed by a vowel, can begin a new
-- syllable in the Sicilian language. A group qualifies if some word begins
-- with it; the comment next to each entry lists example words.
-- Keys are two- or three-letter groups and every value is `true`, so the
-- table is used as a membership test (see can_begin_word, which consults it
-- when its scn_rule argument is true).
-- NOTE(review): "sm" and "sn" are listed in the Italian table but not here —
-- confirm whether the omission is intentional.
local syll_starters_scn = {
	["bl"] = true,	-- blòccu, blè
	["br"] = true,	-- bràtta, Bràsi
	["bbl"] = true,	-- bblòccu, bblè
	["bbr"] = true,	-- bbràtta, Bbràsi
	["ch"] = true,	-- chiànu, chèccu
	["cch"] = true,	-- cchiù'
	["cl"] = true,	-- clàssi, clòru
	["cr"] = true,	-- crìsciri, crùci
	["dr"] = true,	-- drìtta, dràgu
	["ddr"] = true,	-- ddrìtta, ddrummintàri
	["ḍḍ"] = true,	-- ḍḍumari, ḍḍù
	["fl"] = true,	-- flèmma, flùssu
	["fr"] = true,	-- frìddu, Frància
	["gh"] = true,	-- ghiànna, ghiàcciu
	["ggh"] = true,	-- gghiòvu, gghiànu
	["gl"] = true,	-- glòria, glàssa
	["gn"] = true,	-- gnòccu, gnizziòni
	["gr"] = true,	-- grècu, grànni
	["mb"] = true,	-- mballàri, mbìviri
	["mbr"] = true,	-- mbriàcu, mbrattàri
	["mp"] = true,	-- mpizzàri, mpajàri
	["mpr"] = true,	-- mprìsa, mprinàri
	["nc"] = true,	-- nciràta, ncinnirìri
	["nch"] = true,	-- nchianàri, nchiajàtu
	["ncl"] = true,	-- nclùdiri, nclinàri
	["ncr"] = true,	-- ncruccàri, ncrustàri
	["nd"] = true,	-- ndùja, nduràri
	["ndr"] = true,	-- ndrizzàri, ndrìna
	["nf"] = true,	-- nfàmi, nfussàri
	["nfl"] = true,	-- nfluènza, nflatàrisi
	["nfr"] = true,	-- nfriddulùtu, nfrascàti
	["ng"] = true,	-- ngissàri, ngignùsu
	["ngh"] = true,	-- nghiùttiri
	["ngl"] = true,	-- nglìssi
	["ngr"] = true,	-- ngrasciàri, ngrìsi
	["nq"] = true,	-- nquatràri, nquilìnu
	["ns"] = true,	-- nsirtàri, nsivàtu
	["nt"] = true,	-- ntènniri, Ntòni
	["ntr"] = true,	-- ntràsiri, ntrallàzzu
	["nv"] = true,	-- nvèci, nvìdia
	["nz"] = true,	-- nzavanàri, nzìgna
	["pl"] = true,	-- planèta, plàstica
	["pn"] = true,	-- pneumàticu
	["pr"] = true,	-- pròpia, prùppu
	["ps"] = true,	-- psicòlogu
	["sb"] = true,	-- sbàrra, sbàttiri
	["sbl"] = true,	-- sbluccàri
	["sbr"] = true,	-- sbrizziàri, sbarazzàri
	["sc"] = true,	-- sciàrra, scìnniri
	["sch"] = true,	-- schìfu, schèma
	["scl"] = true,	-- sclamàri, sclùdiri
	["scr"] = true,	-- scrustàri, scrimàri
	["sd"] = true,	-- sdintàtu, sdisanuràtu
	["sdr"] = true,	-- sdrajàri, sdrùcciulu
	["sf"] = true,	-- sfunnàri, sfìda
	["sfl"] = true,	-- sflàvidu, sflavidìri
	["sfr"] = true,	-- sfruntàtu, sfruttàri
	["sg"] = true,	-- sgàrru, sguazzàri
	["sgh"] = true,	-- sghèrru, sghìcciu
	["sgr"] = true,	-- sgranàri, sgrasciàri
	["sl"] = true,	-- Slùvenia, slàvu
	["sp"] = true,	-- spirànza, sparàri
	["spl"] = true,	-- splènniri, splènnidu
	["spr"] = true,	-- sprèmiri, spropòsitu
	["sq"] = true,	-- squatrïàri, squaddàtu
	["sr"] = true,	-- sradicàri, sregulàtu
	["st"] = true,	-- stutàri, stèmma
	["str"] = true,	-- stràta, strazzàri
	["sv"] = true,	-- svintàri, svìnniri
	["tl"] = true,	-- word-internal only: (a-tlèta)
	["tr"] = true,	-- tròppu, tràsiri
	["ttr"] = true,	-- ttraccàri
	["vr"] = true,	-- vràzzu, vròdu
}

-- Return true if the argument is a single character that matches a Sicilian
-- vowel: unaccented, with a grave accent, with a circumflex or with a
-- diaeresis. Return false in all other cases, including the empty string and
-- strings longer than one character.
-- The result is always a real boolean: the match position (or nil) returned
-- by mw.ustring.find is converted instead of being handed to the caller.
local function is_vowel(l)
	if mw.ustring.len(l) ~= 1 then
		return false
	end
	return mw.ustring.find(l, "[aeiouàèìòùâêîôûäëïöü]") ~= nil
end

-- Return true if the argument is a single character that matches a Sicilian
-- consonant, except ḍ (any lowercase letter from b to z that is not one of
-- a, e, i, o, u). Return false in all other cases, including the empty
-- string and strings longer than one character.
-- The result is always a real boolean: the match position (or nil) returned
-- by mw.ustring.find is converted instead of being handed to the caller.
local function is_consonant(l)
	if mw.ustring.len(l) ~= 1 then
		return false
	end
	return mw.ustring.find(l, "[b-df-hj-np-tv-z]") ~= nil
end

-- Return true if the passed string could be the beginning of a word, false
-- otherwise.
-- str: the candidate word beginning (only its first four characters matter).
-- scn_rule: when true (the default if nil) the Sicilian set of syllable
--   starters is consulted and geminated consonants are accepted; otherwise
--   the Italian set is consulted.
local function can_begin_word(str, scn_rule)
	if scn_rule == nil then scn_rule = true end

	local sub = mw.ustring.sub
	local first = sub(str, 1, 1)
	local second = sub(str, 2, 2)
	local third = sub(str, 3, 3)

	-- A vowel can always begin a word.
	if is_vowel(first) then
		return true
	end

	if is_consonant(first) then
		-- Any consonant followed by a vowel can begin a word.
		if is_vowel(second) then
			return true
		end
		-- Sicilian only: so can a geminated consonant followed by a vowel.
		if scn_rule and first == second and is_vowel(third) then
			return true
		end
	end

	-- Look the leading two-letter, then three-letter, group up in the set of
	-- valid syllable starters; a group only counts when a vowel follows it.
	local starters = scn_rule and syll_starters_scn or syll_starters_ita
	for size = 2, 3 do
		if starters[sub(str, 1, size)]
			and is_vowel(sub(str, size + 1, size + 1))
		then
			return true
		end
	end

	-- None of the tests succeeded: the string can't begin a word.
	return false
end

-- Return true if the character at the 1-based position pos of str is the
-- first vowel of a hiatus, false in all other cases.
-- An assertion fails when pos lies outside the string.
--
-- Following the Italian grammar, two adjacent vowels form a hiatus when:
-- 1) neither of them is a weak vowel (I or U);
-- 2) one is an accented weak vowel and the other an unaccented strong vowel
--    (A, E or O).
-- A hiatus is also signalled:
-- 3) between two identical weak vowels, one accented and the other not;
-- 4) when the second vowel is an unaccented I followed by another vowel,
--    i.e. a semiconsonant I written in place of J;
-- 5) when either vowel is explicitly marked with a diaeresis.
-- A "u" right after "g" or "q" never starts a hiatus.
function is_hiatus(str, pos)
	assert(pos > 0 and pos <= mw.ustring.len(str),
		"invalid position specified")

	local find, sub = mw.ustring.find, mw.ustring.sub
	local before = sub(str, pos - 1, pos - 1)
	local first = sub(str, pos, pos)
	local second = sub(str, pos + 1, pos + 1)
	local after = sub(str, pos + 2, pos + 2)

	-- A hiatus can only happen between two vowels.
	if not (is_vowel(first) and is_vowel(second)) then
		return false
	end

	-- The "u" of the qu- and gu- groups never starts a hiatus. This check
	-- belongs to the detection of the semiconsonant I below.
	if first == "u" and find(before, "[gq]") then
		return false
	end

	local strong = "[aàeèoò]"
	local plain_strong = "[aeo]"
	local accented_weak = "[ìù]"
	local diaeresis = "[äëïöü]"

	-- Case 1: strong vowel against strong vowel.
	if find(first, strong) and find(second, strong) then
		return true
	end

	-- Case 2: accented weak vowel against strong vowel, in either order.
	if find(first, accented_weak) and find(second, plain_strong) then
		return true
	end
	if find(first, plain_strong) and find(second, accented_weak) then
		return true
	end

	-- Case 3: accented weak vowel against the same unaccented vowel.
	-- Both characters are single vowels here, so comparing the two-character
	-- string is the same as comparing each character on its own.
	local pair = first .. second
	if pair == "ìi" or pair == "iì" or pair == "ùu" or pair == "uù" then
		return true
	end

	-- Case 4: semiconsonant I (unaccented I between two vowels).
	if second == "i" and is_vowel(after) then
		return true
	end

	-- Case 5: explicit diaeresis over either vowel.
	if find(first, diaeresis) or find(second, diaeresis) then
		return true
	end

	-- All checks failed, therefore we are not inside a hiatus.
	return false
end

-- Return true if the character at the 1-based position pos of str is the
-- first vowel of a diphthong, false in all other cases.
-- Two adjacent vowels form a diphthong exactly when they do not form a
-- hiatus (see is_hiatus).
function is_diphthong(str, pos)
	local here = mw.ustring.sub(str, pos, pos)
	local following = mw.ustring.sub(str, pos + 1, pos + 1)

	-- A diphthong can only happen between two vowels.
	if is_vowel(here) and is_vowel(following) then
		return not is_hiatus(str, pos)
	end
	return false
end

-- Find the end of the first syllable of a Sicilian word, starting from a
-- given position; calling it repeatedly splits the word into syllables.
-- word: the word to split (it is lowercased before being analysed).
-- start_index: 1-based index into the word where the syllable begins; an
--   assertion fails when it lies outside the word.
-- scn_rule: optional; true (the default if nil) to judge whether a syllable
--   is legal using Sicilian words as a template, false to use Italian words.
-- Returns the 1-based index of the last character of the syllable.
-- The function won't return meaningful results for non-Sicilian words.
--
-- A syllable is the shortest segment starting at start_index such that:
-- a) it contains a vowel;
-- b) the breaking point does not fall inside a diphthong;
-- c) what follows is legitimate as the beginning of a word, i.e. some word
--    begins with those letters (Sicilian or Italian, depending on scn_rule).
-- If no such segment exists the syllable extends to the end of the word.
function next_syllable(word, start_index, scn_rule)
	if scn_rule == nil then scn_rule = true end

	local len = mw.ustring.len(word)
	assert(start_index >= 1 and start_index <= len,
		"invalid start index specified")
	word = mw.ustring.lower(word)

	local seen_vowel = false
	local cut = start_index - 1
	while true do
		cut = cut + 1
		-- Condition a): remember whether a vowel was met so far.
		if is_vowel(mw.ustring.sub(word, cut, cut)) then
			seen_vowel = true
		end
		-- Condition c): can the rest of the word start a new syllable?
		local rest_ok = can_begin_word(mw.ustring.sub(word, cut + 1), scn_rule)
		-- Condition b): would cutting here split a diphthong?
		local splits_diphthong = is_diphthong(word, cut)

		if cut == len or (seen_vowel and rest_ok and not splits_diphthong) then
			return cut
		end
	end
end

-- Return a hyphenated version of the given word.
-- word: the word to split into syllables.
-- separator: optional string inserted between the syllables; defaults to "-".
-- scn_rule: optional; true to split the word using the Sicilian vocabulary
--   as a template, false (the default when nil) to use the Italian
--   vocabulary instead.
-- An empty word yields an empty string.
function hyphenate(word, separator, scn_rule)
	if scn_rule == nil then scn_rule = false end
	separator = separator or "-"

	local syllables = {}
	local len = mw.ustring.len(word)
	local i = 1
	while i <= len do
		-- scn_rule must be forwarded explicitly: next_syllable() applies its
		-- own default (Sicilian rules) when it receives nil. An undefined
		-- variable used to be passed here, so the caller's choice and the
		-- documented Italian default were both ignored.
		local n = next_syllable(word, i, scn_rule)
		syllables[#syllables + 1] = mw.ustring.sub(word, i, n)
		i = n + 1
	end
	return table.concat(syllables, separator)
end

-- Wikicode entry point for truncate_word().
-- Argument 1 is the word; argument 2 is the cut position, converted from
-- its textual form to a number.
function p.truncate_word(frame)
	local args = frame.args
	local pos = tonumber(args[2])
	return truncate_word(args[1], pos)
end

-- Wikicode entry point for join_words().
-- Argument 1 is the leading fragment, argument 2 the trailing fragment.
function p.join_words(frame)
	local args = frame.args
	return join_words(args[1], args[2])
end

-- Wikicode entry point for remove_accents().
-- Argument 1 is the word whose accents are to be removed.
function p.remove_accents(frame)
	local word = frame.args[1]
	return remove_accents(word)
end

-- Wikicode entry point for build_verb().
-- Arguments, in order: the infinitive verb, the ending for "àri" verbs, the
-- ending for "iri" verbs, the ending for "ìri" verbs, and the optional
-- override.
function p.build_verb(frame)
	local args = frame.args
	return build_verb(args[1], args[2], args[3], args[4], args[5])
end

-- Wikicode entry point for build_adjective().
-- Arguments, in order: the masculine singular adjective, the wanted
-- gender/number code, and the optional override.
function p.build_adjective(frame)
	local args = frame.args
	return build_adjective(args[1], args[2], args[3])
end

-- Wikicode entry point for build_noun().
-- Arguments, in order: the noun, the code of the form it is in, the code of
-- the wanted form, and the optional override.
function p.build_noun(frame)
	local args = frame.args
	return build_noun(args[1], args[2], args[3], args[4])
end

-- Wikicode entry point for hyphenate().
-- Arguments, in order: the word, the optional separator, and the optional
-- flag choosing the Sicilian (true) or Italian (false) syllable rules.
-- The flag arrives from wikicode as text, and in Lua every string, even
-- "false" or "", is truthy. It is therefore converted before being passed
-- on:
--   * missing, empty or blank       -> nil (hyphenate() applies its default);
--   * "false", "0" or "no" (any case, surrounding blanks ignored) -> false;
--   * anything else                 -> true.
function p.hyphenate(frame)
	local args = frame.args
	local scn_rule = args[3]
	if type(scn_rule) == "string" then
		local flag = string.lower(string.match(scn_rule, "^%s*(.-)%s*$"))
		if flag == "" then
			scn_rule = nil
		elseif flag == "false" or flag == "0" or flag == "no" then
			scn_rule = false
		else
			scn_rule = true
		end
	end
	return hyphenate(args[1], args[2], scn_rule)
end

-- Hand the table of wikicode entry points back to whoever loads the module.
return p