Given a source text like
nin2 hao3 ma
(which is a typical way to write ASCII Pinyin, without proper accented characters) and given a (UTF-8) conversion table like
a1;ā
e1;ē
i1;ī
o1;ō
u1;ū
ü1;ǖ
A1;Ā
E1;Ē
...
how would I convert the source text into
nín hǎo ma
?
For what it's worth, I'm using PHP. Is a regex what I should be looking into?
<?php
$in = 'nin2 hao3 ma';
$out = 'nín hǎo ma';

/**
 * preg_replace_callback() handler that swaps plain vowels for accented ones.
 *
 * @param array $match [0] => whole match, [1] => the letters, [2] => the tone digits.
 * @return string The letters with every vowel listed for that tone replaced by
 *                its accented form, or the whole match unchanged when the
 *                table has no entry for the tone.
 */
function replacer($match) {
    // tone number => (plain vowel => accented vowel). Only the entries listed
    // here are known; extend the table to cover further tones and vowels.
    static $trTable = array(
        1 => array(
            'a' => 'ā',
            'e' => 'ē',
            'i' => 'ī',
            'o' => 'ō',
            'u' => 'ū',
            'ü' => 'ǖ',
            'A' => 'Ā',
            'E' => 'Ē'),
        2 => array('i' => 'í'),
        3 => array('a' => 'ǎ')
    );
    list($whole, $word, $i) = $match;
    // Without this guard an unlisted tone (e.g. "ma4") indexes a missing key
    // and array_keys() receives null.
    if (!isset($trTable[$i])) {
        return $whole;
    }
    // str_replace() rewrites EVERY listed vowel found in the word, not just one.
    return str_replace(
        array_keys($trTable[$i]),
        array_values($trTable[$i]),
        $word);
}
// Outputs: bool(true)
var_dump(preg_replace_callback('~(\w+)(\d+)~', 'replacer', $in) === $out);
Ollie's algorithm was a nice start, but it didn't apply the marks correctly. For example, qiao1 became qīāō. This one is correct and complete. You can easily see how the replacement rules are defined.
It does the whole thing for tone 5 as well, although it doesn't affect the output, except for deleting the number. I left it in, in case you want to do something with tone 5.
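The algorithm works as follows:
1. Find every run of letters that is followed by a tone digit.
2. Look up the vowel cluster in that word and put a star after the vowel that should carry the tone mark.
3. Replace the starred vowel with the accented letter for that tone; the digit is dropped.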
Example:
qiao3 => (iao becomes ia*o) => qia*o => qiǎo
This strategy, and the use of strtr
(which prioritizes longer replacements), makes sure that this won't happen:
qiao1 => qīāō
function pinyin_addaccents($string) {
    # A syllable is a run of letters (ü/Ü included) directly followed by one
    # digit; every such match is handed to the callback for rewriting.
    $syllablePattern = '~([a-zA-ZüÜ]+)(\d)~';
    $converted = preg_replace_callback($syllablePattern, 'pinyin_addaccents_cb', $string);
    return $converted;
}
# Helper callback: converts one matched syllable.
#
# $match is a preg match array: [0] the whole match, [1] the letters,
# [2] the tone digit. Returns the letters with the tone mark placed on the
# vowel chosen by the cluster table below. Tone 5 only drops the digit.
# A digit outside 1-5 has no accent row, so the match is returned unchanged.
function pinyin_addaccents_cb($match) {
    static $accentmap = null;

    if( $accentmap === null ) {
        # Where to place the accent marks: the star follows the vowel that
        # takes the mark in each vowel cluster.
        $stars =
            'a* e* i* o* u* ü* '.
            'A* E* I* O* U* Ü* '.
            'a*i a*o e*i ia* ia*o ie* io* iu* '.
            'A*I A*O E*I IA* IA*O IE* IO* IU* '.
            'o*u ua* ua*i ue* ui* uo* üe* '.
            'O*U UA* UA*I UE* UI* UO* ÜE* '.
            # Capitalised syllables ("Ai4", "Ou1"): without these entries the
            # cluster is not found and BOTH vowels get starred and accented.
            'A*i A*o E*i O*u';
        $nostars = str_replace('*', '', $stars);

        # Build an array like Array('a' => 'a*') and store statically
        $accentmap = array_combine(explode(' ',$nostars), explode(' ', $stars));
        unset($stars, $nostars);
    }

    static $vowels =
        Array('a*','e*','i*','o*','u*','ü*','A*','E*','I*','O*','U*','Ü*');

    # One row per tone, in the same order as $vowels
    static $pinyin = Array(
        1 => Array('ā','ē','ī','ō','ū','ǖ','Ā','Ē','Ī','Ō','Ū','Ǖ'),
        2 => Array('á','é','í','ó','ú','ǘ','Á','É','Í','Ó','Ú','Ǘ'),
        3 => Array('ǎ','ě','ǐ','ǒ','ǔ','ǚ','Ǎ','Ě','Ǐ','Ǒ','Ǔ','Ǚ'),
        4 => Array('à','è','ì','ò','ù','ǜ','À','È','Ì','Ò','Ù','Ǜ'),
        5 => Array('a','e','i','o','u','ü','A','E','I','O','U','Ü')
    );

    list($whole, $word, $tone) = $match;

    # The pattern in pinyin_addaccents() accepts any digit; 0 and 6-9 have no
    # row in $pinyin, so leave such text exactly as it was.
    if( !isset($pinyin[$tone]) ) {
        return $whole;
    }

    # Add star to vowelcluster (strtr tries the longest key first)
    $word = strtr($word, $accentmap);

    # Replace starred letter with accented
    $word = str_replace($vowels, $pinyin[$tone], $word);

    return $word;
}
For a .NET solution try Pinyin4j.NET
Features: converts Chinese (both Simplified and Traditional) to the most popular pinyin systems. The supported pinyin systems are listed below.
VB Macro (Libre)Office : Convert pinyin tone numbers to accents
Hopefully the algorithm is correct according to the pinyin rules, especially for i and u.
rem Replaces numbered pinyin in the current document with tone-marked pinyin.
rem Each call rewrites "<vowel><following letters><digit>" so that the vowel
rem carries the mark for that tone and the digit disappears; 0 just removes
rem the digit. The vowels are tried in the order a, o, e, u, i, ü: once a rule
rem has consumed the digit of a syllable, the later rules no longer match it.
sub replaceNumberByTones
rem a
call PinyinTonesNumber("a([a-z]*[a-z]*)0", "a$1")
call PinyinTonesNumber("a([a-z]*[a-z]*)1", "ā$1")
call PinyinTonesNumber("a([a-z]*[a-z]*)2", "á$1")
call PinyinTonesNumber("a([a-z]*[a-z]*)3", "ǎ$1")
call PinyinTonesNumber("a([a-z]*[a-z]*)4", "à$1")
rem o
call PinyinTonesNumber("o([a-z]*[a-z]*)0", "o$1")
call PinyinTonesNumber("o([a-z]*[a-z]*)1", "ō$1")
call PinyinTonesNumber("o([a-z]*[a-z]*)2", "ó$1")
call PinyinTonesNumber("o([a-z]*[a-z]*)3", "ǒ$1")
call PinyinTonesNumber("o([a-z]*[a-z]*)4", "ò$1")
rem e
call PinyinTonesNumber("e([a-z]*[a-z]*)0", "e$1")
call PinyinTonesNumber("e([a-z]*[a-z]*)1", "ē$1")
call PinyinTonesNumber("e([a-z]*[a-z]*)2", "é$1")
call PinyinTonesNumber("e([a-z]*[a-z]*)3", "ě$1")
call PinyinTonesNumber("e([a-z]*[a-z]*)4", "è$1")
rem u: the letter class leaves out "i", so in "ui" the mark is left for the i rules
call PinyinTonesNumber("u([a-hj-z]*[a-hj-z]*)0", "u$1")
call PinyinTonesNumber("u([a-hj-z]*[a-hj-z]*)1", "ū$1")
call PinyinTonesNumber("u([a-hj-z]*[a-hj-z]*)2", "ú$1")
call PinyinTonesNumber("u([a-hj-z]*[a-hj-z]*)3", "ǔ$1")
call PinyinTonesNumber("u([a-hj-z]*[a-hj-z]*)4", "ù$1")
rem i
call PinyinTonesNumber("i([a-z]*[a-z]*)0", "i$1")
call PinyinTonesNumber("i([a-z]*[a-z]*)1", "ī$1")
call PinyinTonesNumber("i([a-z]*[a-z]*)2", "í$1")
call PinyinTonesNumber("i([a-z]*[a-z]*)3", "ǐ$1")
call PinyinTonesNumber("i([a-z]*[a-z]*)4", "ì$1")
rem ü: only reached when it is the sole vowel left (e.g. "lü4", "nü3")
call PinyinTonesNumber("ü([a-z]*[a-z]*)0", "ü$1")
call PinyinTonesNumber("ü([a-z]*[a-z]*)1", "ǖ$1")
call PinyinTonesNumber("ü([a-z]*[a-z]*)2", "ǘ$1")
call PinyinTonesNumber("ü([a-z]*[a-z]*)3", "ǚ$1")
call PinyinTonesNumber("ü([a-z]*[a-z]*)4", "ǜ$1")
End sub
rem Runs one search-and-replace over the current document through the
rem ".uno:ExecuteSearch" dispatch command.
rem   expression  - text stored as SearchItem.SearchString
rem   replacement - text stored as SearchItem.ReplaceString
sub PinyinTonesNumber(expression, replacement)
rem ----------------------------------------------------------------------
rem define variables
dim document as object
dim dispatcher as object
rem ----------------------------------------------------------------------
rem get access to the document
rem (note: "document" actually holds the frame of the current controller)
document = ThisComponent.CurrentController.Frame
dispatcher = createUnoService("com.sun.star.frame.DispatchHelper")
rem ----------------------------------------------------------------------
rem Search descriptor: 19 name/value pairs, indices 0 to 18.
rem NOTE(review): the numeric values look like macro-recorder output; their
rem meanings are not visible in this file - confirm against the SearchItem docs.
dim args1(18) as new com.sun.star.beans.PropertyValue
args1(0).Name = "SearchItem.StyleFamily"
args1(0).Value = 2
args1(1).Name = "SearchItem.CellType"
args1(1).Value = 0
args1(2).Name = "SearchItem.RowDirection"
args1(2).Value = true
args1(3).Name = "SearchItem.AllTables"
args1(3).Value = false
args1(4).Name = "SearchItem.Backward"
args1(4).Value = false
args1(5).Name = "SearchItem.Pattern"
args1(5).Value = false
args1(6).Name = "SearchItem.Content"
args1(6).Value = false
args1(7).Name = "SearchItem.AsianOptions"
args1(7).Value = false
rem NOTE(review): 1 presumably selects regular-expression matching, which the
rem "$1" back-references used by the callers would need - confirm
args1(8).Name = "SearchItem.AlgorithmType"
args1(8).Value = 1
args1(9).Name = "SearchItem.SearchFlags"
args1(9).Value = 65536
rem the two caller-supplied values
args1(10).Name = "SearchItem.SearchString"
args1(10).Value = expression
args1(11).Name = "SearchItem.ReplaceString"
args1(11).Value = replacement
args1(12).Name = "SearchItem.Locale"
args1(12).Value = 255
args1(13).Name = "SearchItem.ChangedChars"
args1(13).Value = 2
args1(14).Name = "SearchItem.DeletedChars"
args1(14).Value = 2
args1(15).Name = "SearchItem.InsertedChars"
args1(15).Value = 2
rem NOTE(review): 1280 may include a case-insensitive flag, in which case
rem capital vowels would be replaced by lowercase accented ones - confirm
args1(16).Name = "SearchItem.TransliterateFlags"
args1(16).Value = 1280
rem NOTE(review): 3 presumably means "replace all" - confirm
args1(17).Name = "SearchItem.Command"
args1(17).Value = 3
args1(18).Name = "Quiet"
args1(18).Value = true
rem hand the descriptor to the dispatcher, which performs the operation
dispatcher.executeDispatch(document, ".uno:ExecuteSearch", "", 0, args1())
end sub
Hope this helps someone
François
To add a javascript solution:
This code places tone marks according to the official placement rules (see Wikipedia).
Hope that helps some of you; suggestions and improvements welcome!
// Tone-mark lookup: ACCENTED[tone][vowel] is the vowel written with the mark
// for that tone. Tone '5' maps every vowel to itself (no mark).
var ACCENTED = (function () {
    var vowels = ['a', 'e', 'i', 'o', 'u', 'ü'];
    // One row per tone, listed in the same order as `vowels`.
    var marked = {
        '1': ['\u0101', '\u0113', '\u012B', '\u014D', '\u016B', '\u01D6'],
        '2': ['\u00E1', '\u00E9', '\u00ED', '\u00F3', '\u00FA', '\u01D8'],
        '3': ['\u01CE', '\u011B', '\u01D0', '\u01D2', '\u01D4', '\u01DA'],
        '4': ['\u00E0', '\u00E8', '\u00EC', '\u00F2', '\u00F9', '\u01DC'],
        '5': vowels
    };
    var table = {};
    Object.keys(marked).forEach(function (tone) {
        var row = {};
        vowels.forEach(function (vowel, idx) {
            row[vowel] = marked[tone][idx];
        });
        table[tone] = row;
    });
    return table;
})();
/**
 * Find the index of the vowel that carries the tone mark in a pinyin syllable.
 *
 * Placement order: 'a', then 'e', then 'o' take the mark wherever they occur;
 * otherwise, of 'i' and 'u', the one that comes later gets it ("iu" -> u,
 * "ui" -> i); 'ü' is used when none of the other vowels is present.
 *
 * @param {string} token One lowercase pinyin syllable; a trailing tone digit
 *                       does not disturb the search.
 * @returns {number} Index of the vowel to accent, 0 for a one-character token,
 *                   or -1 when the token contains none of a, e, i, o, u, ü.
 */
function getPos (token) {
    if (token.length === 1){
        // only one letter, nothing to differentiate
        return 0;
    }
    var precedence = ['a', 'e', 'o'];
    // own counter: the original loop shared `i` with the i-vowel index below
    for (var k = 0; k < precedence.length; k += 1){
        var pos = token.indexOf(precedence[k]);
        // checking a before o takes care of "ao" automatically
        if (pos >= 0){
            return pos;
        }
    }
    var u = token.indexOf('u');
    var i = token.indexOf('i');
    if (u >= 0 || i >= 0){
        // indexOf gives -1 for a missing letter, so the larger index is both
        // "the only one present" and "the second letter of iu / ui"
        return Math.max(u, i);
    }
    // the only vowel left is ü (this used to be unreachable); -1 if absent
    return token.indexOf('ü');
}
/**
 * Convert numbered pinyin ("han4 zi4") into pinyin with tone marks ("hàn zì").
 *
 * Every run of letters directly followed by a tone digit 1-5 is treated as one
 * syllable: getPos() picks the vowel, ACCENTED supplies the marked letter, and
 * the digit is dropped. Everything else (spaces, punctuation, syllables
 * without a digit) is copied through unchanged. A syllable in which no vowel
 * can be marked only loses its digit.
 *
 * The input is assumed to be valid pinyin; no further validation is done.
 *
 * @param {string} numbered_PinYin One or more syllables with tone numbers.
 * @returns {string} The same text with tone marks instead of tone numbers.
 */
function placeTone(numbered_PinYin){
    return numbered_PinYin.replace(/([a-zA-ZüÜ]+)([1-5])/g, function (syllable, letters, tone) {
        // getPos() and ACCENTED only know lowercase vowels
        var lower = letters.toLowerCase();
        var accentpos = getPos(lower);
        var plain = lower.charAt(accentpos);
        var accented_Char = ACCENTED[tone][plain];
        if (accented_Char === undefined){
            // nothing to mark (no vowel found): keep the letters, drop the digit
            return letters;
        }
        if (letters.charAt(accentpos) !== plain){
            // the vowel was a capital in the input: keep it a capital
            accented_Char = accented_Char.toUpperCase();
        }
        // substring(start) runs to the end of the letters; the digit is not
        // part of `letters`, so it can no longer leak into the result
        return letters.substring(0, accentpos) + accented_Char + letters.substring(accentpos + 1);
    });
}
// Demo: print what placeTone() returns for a two-syllable numbered-pinyin string.
console.log(placeTone('han4 zi4'));