-- Module:Lib UTF8

-- from touhouwiki
-- Helper module; this file uses common.isset and common.partialTableCompare.
local common = require("Module:Common")

local utf8 = {}

-- Iterator that returns the start, end and the current character.
-- Required by utf8.explode.
--
-- @param str  value to iterate over; it is converted with tostring() first.
-- @return     an iterator function. Each call yields three values: the byte
--             index of the character's first byte, the byte index of its last
--             byte, and the character itself (a substring of str).
--
-- The length of a character is derived from its lead byte only; continuation
-- bytes are not validated. The iterator returns nil (ending a generic `for`)
-- at the end of the string, when a continuation byte (0x80-0xbf) is found in
-- lead position, or when the byte is 0xfe/0xff.
function utf8.iter(str)
  str = tostring(str)
  local n = #str
  local i = 1 -- byte index at which the next character starts

  return function()
    if i > n then
      return nil
    end

    local j = i
    local lead = str:byte(j)
    local len
    if lead < 0x80 then     -- 0xxxxxxx
      len = 1
    elseif lead < 0xc0 then -- 10xxxxxx
      return nil -- error, we're in the middle of a character
    elseif lead < 0xe0 then -- 110xxxxx 10xxxxxx
      len = 2
    elseif lead < 0xf0 then -- 1110xxxx 10xxxxxx 10xxxxxx
      len = 3
    elseif lead < 0xf8 then -- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      len = 4
    elseif lead < 0xfc then -- 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      len = 5
    elseif lead < 0xfe then -- 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      len = 6
    else
      return nil -- 0xfe and 0xff are invalid UTF-8 values
    end

    i = j + len
    -- A sequence truncated by the end of the string must not report an end
    -- index beyond the last byte; the returned substring is unaffected
    -- because string.sub already stops at the end of the string.
    if i > n + 1 then
      i = n + 1
    end

    -- TODO: parsing of a character?
    return j, i - 1, str:sub(j, i - 1)
  end
end

-- Splits a UTF-8 text (encoding used by Lua) into single character parts.
--
-- @param text  the string to split.
-- @return      a new table holding one character per entry, in order.
--              Anything that is not a string (including nil) gives an empty
--              table; the empty string gives a table with one entry, ''.
function utf8.explode(text)
  local pts = {}
  if type(text) ~= 'string' then
    return pts
  end

  if #text == 0 then
    -- technically there is one part - the empty string
    pts[1] = ''
    return pts
  end

  -- utf8.iter yields start index, end index and character; only the
  -- character is needed here.
  for _, _, v in utf8.iter(text) do
    pts[#pts + 1] = v
  end
  return pts
end

-- Replaces whole characters, including multi-byte (non-ASCII) ones.
-- 'reps' should be a table in format ['find'] = 'replace', like
--   {['A'] = 'a', ['B'] = 'b', ['犬'] = '猫', ...},
-- where table keys have to be single characters and the replacement can be
-- any string.
--
-- @param text  the string to process; it is split with utf8.explode.
-- @param reps  lookup table from a single character to its replacement.
-- @return      the text with every character that has an entry in reps
--              accepted by common.isset swapped for that entry.
function utf8.replace_char(text, reps)
  local parts = utf8.explode(text)
  for i = 1, #parts do
    local replacement = reps[parts[i]]
    -- NOTE(review): common.isset is defined in Module:Common; presumably it
    -- rejects nil (and possibly empty) values - confirm there.
    if common.isset(replacement) then
      parts[i] = replacement
    end
  end
  return table.concat(parts)
end

-- Replaces non-ASCII strings.
-- 'reps' should be a table of string pairs, like:
--   { {'find', 'replace'}, {'bird', 'cat'}, {'fly', 'walk'}, {'八雲　藍', '式神'}, ... }
-- Only the first found replacement is being executed.
-- Note that this is a relatively slow solution, so it should be used only for
-- replacement of non-ASCII texts.
-- For ASCII text it's better to use string.gsub.
--
-- @param text  the string to process; it is split with utf8.explode.
-- @param reps  table of {find, replace} pairs.
-- @return      the text with every matched 'find' replaced by its 'replace'.
function utf8.replace(text, reps)
  local parts = utf8.explode(text)
  local ret = {}

  -- Pre-split every pair into character tables so matching works on whole
  -- characters instead of bytes.
  -- NOTE(review): pairs() is kept so that a 'reps' table with non-sequence
  -- keys keeps working, but it does not guarantee traversal order; if 'reps'
  -- is always a plain list, ipairs() would make the priority explicit.
  local reps2 = {}
  for _, pair in pairs(reps) do
    local find = utf8.explode(pair[1])
    -- utf8.explode gives an empty table for a non-string search text. Such
    -- an entry has nothing to match, and a zero-length match would leave
    -- the scan position unchanged, so it is skipped.
    if #find > 0 then
      reps2[#reps2 + 1] = { find, utf8.explode(pair[2]) }
    end
  end

  local i = 1
  while i <= #parts do
    local found = false

    for _, rep in ipairs(reps2) do
      local find, replacement = rep[1], rep[2]
      -- NOTE(review): common.partialTableCompare is defined in
      -- Module:Common; presumably it compares #find entries of 'parts'
      -- starting at i with 'find' starting at 1 - confirm there.
      if common.partialTableCompare(parts, find, i, 1, #find) then
        found = true
        -- found match, perform swap
        for _, ch in ipairs(replacement) do
          ret[#ret + 1] = ch
        end
        -- skip the matched characters (the loop adds the final +1 below)
        i = i + #find - 1
        break
      end
    end

    if not found then
      ret[#ret + 1] = parts[i]
    end
    i = i + 1
  end

  return table.concat(ret)
end

return utf8