Friday, May 6, 2011

Javascript localization hell

Not long ago I visited hell. This hell was called JavaScript localization. The headache started when I wanted to sort some strings, and to do that I first needed to compare them. This would seem a simple task if you were an English gentleman with no fear of diacritics. But as I live and work in a country where diacritics are ever present, I needed to handle them properly. The localeCompare method worked differently in every browser. Also, when indexOf was used to search through text containing diacritics with a search expression written without diacritics, it did not work properly.

So once again I ended up with something I don't like very much: I wrote my own JavaScript utility, which can be used like this:
localeHelper.diacritiqueComparison('čerešne', 'citron'); // = 1
localeHelper.diacritiqueComparison('čerešne', 'hrozno'); // = -1

// this is case insensitive
localeHelper.containsDiacritiqueText('štart', 'st'); // = true
localeHelper.containsDiacritiqueText('štart', 'št'); // = true
localeHelper.containsDiacritiqueText('štart', 'tar'); // = true
localeHelper.containsDiacritiqueText('trend', 'tr'); // = true
localeHelper.containsDiacritiqueText('trend', 'ťr'); // = false (as we explicitly want the symbol 'ť' and not 't')
localeHelper.containsDiacritiqueText('trend', 'b'); // = false (symbol 'b' is not present)
The final code looked like this:
/**
 * Diacritic-aware helper for comparing and searching strings written in
 * Latin-derived alphabets.
 *
 * The constructor builds `this.charMap`, a dictionary that maps every
 * supported accented character to its plain Latin base letter
 * (e.g. 'č' -> 'c', 'Ž' -> 'Z').
 */
function LocaleHelper() { // constructor

    // source: http://en.wikipedia.org/wiki/Alphabets_derived_from_the_Latin
    // supported languages :
    // austro-bavarian, belarusian, croatian, czech, dutch, estonian, finnish, french, german, hungarian, irish, italian,
    // latvian, lithuanian, polish, portuguese, romanian, slovak, sorbian, spanish, swedish, turkish
    //
    // todo: not sure how to or if I should process german letter 'ß'
    //
    // Each "local" list is paired with a "latin" list of the same length:
    // the character at index i of the former maps to index i of the latter.
    var localUpperVowelList = "ÁÀÂÄĂĀÃÅĄÆÉÈĖÊËĚĒĘÍÌİÎÏĪĮÓÒÔÖÕŐŒÚÙÛÜŬŪŰŮŲÝŸ";
    var latinUpperVowelList = "AAAAAAAAAAEEEEEEEEIIIIIIIOOOOOOOUUUUUUUUUYY";

    var localLowerVowelList = "áàâäăāãåąæéèėêëěēęıíìîïīįóòôöõőœúùûüŭūűůųýÿ";
    var latinLowerVowelList = "aaaaaaaaaaeeeeeeeeiiiiiiiooooooouuuuuuuuuyy";

    var localUpperConsonantList = "ĆČÇĎĐĞĢĶĹĻŁĽŃŇÑŅŔŘŚŠŞȘŤŢṬŹŻŽ";
    var latinUpperConsonantList = "CCCDDGGKLLLLNNNNRRSSSSTTTZZZ";

    var localLowerConsonantList = "ćčçďđğģķĺļłľńňñņŕřśšşșťţṭźżž";
    var latinLowerConsonantList = "cccddggkllllnnnnrrsssstttzzz";

    // A plain object (not an Array) is the right container for string keys:
    // an Array carries own properties such as 'length' that would leak
    // through a lookup.
    var charMap = {};

    // Registers localList[i] -> latinList[i] for every index of localList.
    function addMappings(localList, latinList) {
        for (var i = 0; i < localList.length; i++) {
            charMap[localList.charAt(i)] = latinList.charAt(i);
        }
    }

    addMappings(localUpperVowelList, latinUpperVowelList);
    addMappings(localLowerVowelList, latinLowerVowelList);
    addMappings(localUpperConsonantList, latinUpperConsonantList);
    addMappings(localLowerConsonantList, latinLowerConsonantList);

    this.charMap = charMap;
}


LocaleHelper.prototype = {

    // Assigning a fresh object to `prototype` drops the default back-reference,
    // so restore it.
    constructor : LocaleHelper,

    /**
     * Lower-cases a single character and always returns at most one character.
     *
     * 'İ' (U+0130) lower-cases to 'i' followed by a combining dot above (two
     * code units) in most locales; only the base letter is kept so that the
     * result can still be compared and looked up as one character.
     *
     * NOTE: toLocaleLowerCase() follows the runtime's default locale, so a few
     * letters (e.g. 'I' under a Turkish locale) may lower-case differently
     * between environments.
     *
     * @param {string} character - a single character
     * @returns {string} the lower-cased character
     */
    toLowerChar : function(character) {
        var lower = character.toLocaleLowerCase();
        return (lower.length > 1) ? lower.charAt(0) : lower;
    },

    /**
     * Returns the plain Latin base letter for an accented character.
     *
     * @param {string} charToProcess - a single character
     * @returns {string} the mapped base letter, or the input unchanged when
     *                   the character has no mapping
     */
    removeCharDiacritique : function(charToProcess) {
        // Own-property check so that only real table entries are returned.
        if (Object.prototype.hasOwnProperty.call(this.charMap, charToProcess)) {
            return this.charMap[charToProcess];
        }
        return charToProcess;
    },

    /**
     * Compares two characters by their base letters (diacritics ignored).
     *
     * @param {string} charA
     * @param {string} charB
     * @returns {number} 0 when the base letters are equal, otherwise -1 or 1
     *                   according to the code-unit order of the base letters
     */
    localeCharCompare : function(charA, charB) {
        var newCharA = this.removeCharDiacritique(charA);
        var newCharB = this.removeCharDiacritique(charB);

        if (newCharA === newCharB) {
            return 0;
        }
        return (newCharA < newCharB) ? -1 : 1;

        // removed: doesn't work on every browser
        // return charA.localeCompare(charB);
    },

    /**
     * Tells whether a character is an unaccented Latin letter (a-z or A-Z).
     *
     * @param {string} character - expected to be a single character
     * @returns {boolean}
     */
    isLatinLetter : function(character) {
        return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
    },


    /**
     * Three-way comparison of two strings, usable as an Array#sort comparator.
     *
     * Characters are compared position by position, case-insensitively:
     *   1. different base letters decide immediately;
     *   2. the same base letter with different diacritics also decides
     *      immediately (by code-unit order of the lower-cased characters),
     *      so e.g. every word starting with 'č' sorts after every word
     *      starting with 'c';
     *   3. if one text is a prefix of the other, the shorter one comes first;
     *   4. case sensitivity is used only when the words have the same letters
     *      (they are read the same way): the left-most case difference decides.
     *
     * @param {string} textA
     * @param {string} textB
     * @returns {number} -1, 0 or 1
     */
    diacritiqueComparison : function(textA, textB) {
        // todo: note: in Lithuanian alphabet the 'y' character is just before 'j' - so this algorithm won't work properly

        var caseDiff = 0; // difference in case sensitiveness
        var minLength = Math.min(textA.length, textB.length);

        for (var i = 0; i < minLength; i++) {
            var charA = textA.charAt(i);
            var charB = textB.charAt(i);
            var lowerA = this.toLowerChar(charA);
            var lowerB = this.toLowerChar(charB);

            var baseOrder = this.localeCharCompare(lowerA, lowerB);
            if (baseOrder !== 0) {
                return baseOrder;
            }

            // same base letter, but the diacritics differ
            if (lowerA !== lowerB) {
                return (lowerA < lowerB) ? -1 : 1;
            }

            // first most left difference in case is the only important one
            if (caseDiff === 0 && charA !== charB) {
                caseDiff = (charA < charB) ? -1 : 1;
            }
        }

        if (textA.length !== textB.length) {
            return (textA.length < textB.length) ? -1 : 1;
        }

        // if the strings are identical let the case sensitive difference decide
        return caseDiff;
    },

    /**
     * Case-insensitive substring search that tolerates missing diacritics in
     * the search text.
     *
     * A plain Latin letter in `searchText` matches the same letter with any
     * diacritic in `fullText` ('s' finds 'š'), whereas an accented letter in
     * `searchText` only matches exactly that accented letter ('ť' does not
     * find 't').
     *
     * @param {string} fullText   - text to search in
     * @param {string} searchText - text to look for; an empty string always matches
     * @returns {boolean} true when `searchText` occurs in `fullText`
     */
    containsDiacritiqueText : function (fullText, searchText) {
        if (searchText.length === 0) {
            return true;
        }

        // Negative when fullText is shorter than searchText: the loop is skipped.
        var lastStart = fullText.length - searchText.length;

        for (var start = 0; start <= lastStart; start++) {
            var found = true;

            for (var offset = 0; offset < searchText.length; offset++) {
                var charA = this.toLowerChar(fullText.charAt(start + offset));
                var charB = this.toLowerChar(searchText.charAt(offset));

                var matches = (charA === charB) ||
                    (this.localeCharCompare(charA, charB) === 0 && this.isLatinLetter(charB));

                if (!matches) {
                    found = false;
                    break;
                }
            }

            if (found) {
                return true;
            }
        }

        return false;
    }
};


// Shared instance used by the usage examples.
var localeHelper = new LocaleHelper();

1 comment:

  1. To make the translation of Java strings a less hellish process, you could use the software localization platform https://poeditor.com/ to collaboratively translate the strings.

    ReplyDelete